Example no. 1
def table_to_blockmanager(options,
                          table,
                          memory_pool,
                          nthreads=1,
                          categoricals=None):
    import ast
    import json
    from itertools import zip_longest

    import pandas as pd
    import pandas.core.internals as _int
    import pyarrow.lib as lib

    index_columns = []
    columns = []
    column_indexes = []
    index_arrays = []
    index_names = []
    schema = table.schema
    row_count = table.num_rows
    metadata = schema.metadata
    columns_metadata = None

    has_pandas_metadata = metadata is not None and b'pandas' in metadata

    if has_pandas_metadata:
        pandas_metadata = json.loads(metadata[b'pandas'].decode('utf8'))
        index_columns = pandas_metadata['index_columns']
        columns = pandas_metadata['columns']
        column_indexes = pandas_metadata.get('column_indexes', [])
        table = _add_any_metadata(table, pandas_metadata)
        columns_metadata = pandas_metadata.get('columns', None)

    block_table = table

    # Build up a list of index columns and names while removing those columns
    # from the original table
    logical_index_names = [c['name'] for c in columns[-len(index_columns):]]
    for raw_name, logical_name in zip(index_columns, logical_index_names):
        i = schema.get_field_index(raw_name)
        if i != -1:
            col = table.column(i)
            col_pandas = col.to_pandas()
            values = col_pandas.values
            if hasattr(values, 'flags') and not values.flags.writeable:
                # ARROW-1054: in pandas 0.19.2, factorize will reject
                # non-writeable arrays when calling MultiIndex.from_arrays
                values = values.copy()

            index_arrays.append(pd.Series(values, dtype=col_pandas.dtype))
            index_names.append(
                backwards_compatible_index_name(raw_name, logical_name))
            block_table = block_table.remove_column(
                block_table.schema.get_field_index(raw_name))

    # Convert the Arrow table to blocks using the internal pandas API
    result = lib.table_to_blocks(options, block_table, nthreads, memory_pool)

    # Construct the individual blocks, converting dictionary types to pandas
    # categorical types and timestamp-with-timezone types to the proper
    # pandas Blocks
    blocks = []
    for item in result:
        block_arr = item['block']
        placement = item['placement']
        if 'dictionary' in item:
            cat = pd.Categorical(block_arr,
                                 categories=item['dictionary'],
                                 ordered=item['ordered'],
                                 fastpath=True)
            block = _int.make_block(cat,
                                    placement=placement,
                                    klass=_int.CategoricalBlock,
                                    fastpath=True)
        elif 'timezone' in item:
            dtype = make_datetimetz(item['timezone'])
            block = _int.make_block(block_arr,
                                    placement=placement,
                                    klass=_int.DatetimeTZBlock,
                                    dtype=dtype,
                                    fastpath=True)
        else:
            block = _int.make_block(block_arr, placement=placement)
        blocks.append(block)

    # Construct the row index
    if len(index_arrays) > 1:
        index = pd.MultiIndex.from_arrays(index_arrays, names=index_names)
    elif len(index_arrays) == 1:
        index = pd.Index(index_arrays[0], name=index_names[0])
    else:
        index = pd.RangeIndex(row_count)

    column_strings = [x.name for x in block_table.itercolumns()]
    if columns_metadata is not None:
        columns_name_dict = dict(
            (str(x['name']), x['name']) for x in columns_metadata)
        columns_values = [
            columns_name_dict[y] if y in columns_name_dict.keys() else y
            for y in column_strings
        ]
    else:
        columns_values = column_strings

    # If we're passed multiple column indexes then evaluate with
    # ast.literal_eval, since the column index values show up as a list of
    # tuples
    to_pair = ast.literal_eval if len(column_indexes) > 1 else lambda x: (x, )

    # Create the column index

    # Construct the base index
    if not columns_values:
        columns = pd.Index(columns_values)
    else:
        columns = pd.MultiIndex.from_tuples(
            list(map(to_pair, columns_values)),
            names=[col_index['name'] for col_index in column_indexes] or None,
        )

    # if we're reconstructing the index
    if has_pandas_metadata:

        # Get levels and labels, and provide sane defaults if the index has a
        # single level to avoid if/else spaghetti.
        levels = getattr(columns, 'levels', None) or [columns]
        labels = getattr(columns, 'labels', None) or [
            pd.RangeIndex(len(level)) for level in levels
        ]

        # Convert each level to the dtype provided in the metadata
        levels_dtypes = [(level, col_index.get('numpy_type', level.dtype))
                         for level, col_index in zip_longest(
                             levels, column_indexes, fillvalue={})]
        new_levels = [
            _level if _level.dtype == _dtype else _level.astype(_dtype)
            for _level, _dtype in levels_dtypes
        ]

        columns = pd.MultiIndex(levels=new_levels,
                                labels=labels,
                                names=columns.names)

    # ARROW-1751: flatten a single level column MultiIndex for pandas 0.21.0
    columns = _flatten_single_level_multiindex(columns)

    axes = [columns, index]
    return _int.BlockManager(blocks, axes)
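
For context, here is a minimal sketch (assuming only that pyarrow and pandas are installed) of the public round trip that ends in a block-manager reconstruction like the one above: a DataFrame with a MultiIndex goes through pa.Table.from_pandas, the index columns and the b'pandas' schema metadata travel with the table, and Table.to_pandas rebuilds the blocks and the index.

import pandas as pd
import pyarrow as pa

# A small frame with a two-level row index.
df = pd.DataFrame(
    {"value": [1.0, 2.0, 3.0]},
    index=pd.MultiIndex.from_arrays([["a", "a", "b"], [0, 1, 0]],
                                    names=["group", "step"]),
)

table = pa.Table.from_pandas(df)
assert b"pandas" in table.schema.metadata  # index description stored as metadata

roundtripped = table.to_pandas()  # internally rebuilds blocks and the MultiIndex
assert roundtripped.index.names == ["group", "step"]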
Example no. 2
 def test_make_index_list_multiple_different_levels(self):
     idx = self.sel.make_index([['foo', [0, 1, 2]], ['bar']])
     assert_index_equal(idx, pd.MultiIndex(levels=[['bar', 'foo'],
                                                   [0, 1, 2, '']],
                                           labels=[[1, 1, 1, 0],
                                                   [0, 1, 2, 3]]))
Example no. 3
 def test_get_index_list(self):
     idx = self.sel.get_index(self.df, [['foo', 'mof', '*']])
     assert_index_equal(
         idx,
         pd.MultiIndex(levels=[['foo'], ['mof'], [0, 1, 2]],
                       labels=[[0, 0, 0], [0, 0, 0], [0, 1, 2]]))
Example no. 4
                    'spam', 'ham', 'ham', 'spam', 'spam',
                    'ham', 'ham', 'spam', 'ham', 'ham',
                    'ham', 'spam', 'ham', 'spam', 'spam']

ac = Counter(actual_labels)
pc = Counter(predicted_labels)

print('Actual counts:', ac.most_common())
print('Predicted counts:', pc.most_common())

cm = metrics.confusion_matrix(y_true=actual_labels,
                              y_pred=predicted_labels,
                              labels=['spam', 'ham'])
print(pd.DataFrame(data=cm,
                   columns=pd.MultiIndex(levels=[['Predicted:'],
                                                 ['spam', 'ham']],
                                         labels=[[0, 0], [0, 1]]),
                   index=pd.MultiIndex(levels=[['Actual:'],
                                               ['spam', 'ham']],
                                       labels=[[0, 0], [0, 1]])))

positive_class = 'spam'

true_positive = 5.
false_positive = 6.
false_negative = 5.
true_negative = 4.
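
The snippet stops right after defining the raw counts, so as a hedged continuation (plain arithmetic on the variables defined above, nothing library-specific) the usual derived metrics would look like this:

# Derived metrics from the counts defined above.
accuracy = (true_positive + true_negative) / (
    true_positive + false_positive + false_negative + true_negative)
precision = true_positive / (true_positive + false_positive)
recall = true_positive / (true_positive + false_negative)
f1_score = 2 * precision * recall / (precision + recall)

print('Accuracy:', round(accuracy, 2))    # (5 + 4) / 20 = 0.45
print('Precision:', round(precision, 2))  # 5 / 11 = 0.45
print('Recall:', round(recall, 2))        # 5 / 10 = 0.5
print('F1 score:', round(f1_score, 2))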
Example no. 5
 def test_make_index_str_multiple_different_levels(self):
     idx = self.sel.make_index('/foo[0:3],/bar')
     assert_index_equal(idx, pd.MultiIndex(levels=[['bar', 'foo'],
                                                   [0, 1, 2, '']],
                                           labels=[[1, 1, 1, 0],
                                                   [0, 1, 2, 3]]))
Example no. 6
def test_filter_meta_index(test_df):
    obs = test_df.filter(scenario="scen_b").meta.index
    exp = pd.MultiIndex(levels=[["model_a"], ["scen_b"]],
                        codes=[[0], [0]],
                        names=["model", "scenario"])
    pd.testing.assert_index_equal(obs, exp)
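
The examples in this file mix two spellings of the same constructor argument: pandas releases older than 0.24 accepted labels=, newer releases use codes=. A small sketch of the equivalent modern constructions (MultiIndex.from_arrays sidesteps the keyword entirely):

import pandas as pd

mi = pd.MultiIndex(levels=[["model_a"], ["scen_b"]],
                   codes=[[0], [0]],
                   names=["model", "scenario"])
same = pd.MultiIndex.from_arrays([["model_a"], ["scen_b"]],
                                 names=["model", "scenario"])
assert mi.equals(same)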
Example no. 7
def ds_to_df(inputs):
    """
    Function that converts the dispaset data format into a dictionary of dataframes

    :param inputs: input file
    :return: dictionary of dataframes
    """

    sets, parameters = inputs['sets'], inputs['parameters']

    # config = parameters['Config']['val']
    try:
        config = inputs['config']
        first_day = pd.Timestamp(config['StartDate'][0], config['StartDate'][1],
                                 config['StartDate'][2], 0)
        last_day = pd.Timestamp(config['StopDate'][0], config['StopDate'][1],
                                config['StopDate'][2], 23)
        dates = pd.date_range(start=first_day, end=last_day, freq='1h')
        timeindex = True
    except Exception:
        logging.warning(
            'Could not find the start/stop date information in the inputs. Using an integer index'
        )
        dates = range(1, len(sets['z']) + 1)
        timeindex = False
    if len(dates) > len(sets['h']):
        logging.error('The provided index has a length of ' + str(len(dates)) +
                      ' while the data only comprises ' + str(len(sets['h'])) +
                      ' time elements')
        sys.exit(1)
    elif len(dates) != len(sets['z']):
        logging.warning('The provided index has a length of ' + str(len(dates)) +
                        ' while the simulation was designed for ' +
                        str(len(sets['z'])) + ' time elements')

    idx = range(len(dates))

    out = {}
    out['sets'] = sets

    # Printing each parameter in a separate sheet and workbook:
    for p in parameters:
        var = parameters[p]
        dim = len(var['sets'])
        if var['sets'][-1] == 'h' and timeindex and dim > 1:
            # if len(dates) != var['val'].shape[-1]:
            #    sys.exit('The date range in the Config variable (' + str(len(dates)) + ' time steps) does not match the length of the time index (' + str(var['val'].shape[-1]) + ') for variable ' + p)
            var['firstrow'] = 5
        else:
            var['firstrow'] = 1
        if dim == 1:
            if var['sets'][0] == 'h':
                out[p] = pd.DataFrame(var['val'][idx],
                                      columns=[p],
                                      index=dates)
            else:
                out[p] = pd.DataFrame(var['val'],
                                      columns=[p],
                                      index=sets[var['sets'][0]])
        elif dim == 2:
            values = var['val']
            list_sets = [sets[var['sets'][0]], sets[var['sets'][1]]]
            if var['sets'][1] == 'h':
                out[p] = pd.DataFrame(values.transpose()[idx, :],
                                      index=dates,
                                      columns=list_sets[0])
            else:
                out[p] = pd.DataFrame(values.transpose(),
                                      index=list_sets[1],
                                      columns=list_sets[0])
        elif dim == 3:
            list_sets = [
                sets[var['sets'][0]], sets[var['sets'][1]],
                sets[var['sets'][2]]
            ]
            values = var['val']
            values2 = np.zeros(
                [len(list_sets[0]) * len(list_sets[1]),
                 len(list_sets[2])])
            cols = np.zeros([2, len(list_sets[0]) * len(list_sets[1])])
            for i in range(len(list_sets[0])):
                values2[i * len(list_sets[1]):(i + 1) *
                        len(list_sets[1]), :] = values[i, :, :]
                cols[0, i * len(list_sets[1]):(i + 1) * len(list_sets[1])] = i
                cols[1, i * len(list_sets[1]):(i + 1) *
                     len(list_sets[1])] = range(len(list_sets[1]))

            columns = pd.MultiIndex(levels=[list_sets[0], list_sets[1]],
                                    labels=[cols[0].astype(int),
                                            cols[1].astype(int)])
            if var['sets'][2] == 'h':
                out[p] = pd.DataFrame(values2.transpose()[idx, :],
                                      index=dates,
                                      columns=columns)
            else:
                out[p] = pd.DataFrame(values2.transpose(),
                                      index=list_sets[2],
                                      columns=columns)
        else:
            logging.error(
                'Only three dimensions currently supported. Parameter ' + p +
                ' has ' + str(dim) + ' dimensions.')
            sys.exit(1)
    return out
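
The dim == 3 branch above flattens a 3-D value array into 2-D and builds the two-level column index by hand. A hedged sketch of the same reshaping with pd.MultiIndex.from_product, using made-up set names and sizes:

import numpy as np
import pandas as pd

# Hypothetical sets: 2 zones x 3 units, 4 time steps.
zones, units, hours = ['z1', 'z2'], ['u1', 'u2', 'u3'], range(4)
values = np.arange(2 * 3 * 4).reshape(len(zones), len(units), len(hours))

# A C-order reshape matches the (zone, unit) order of the column product.
columns = pd.MultiIndex.from_product([zones, units])
flat = values.reshape(len(zones) * len(units), len(hours))
df = pd.DataFrame(flat.transpose(), index=list(hours), columns=columns)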
Example no. 8
    def read_daily_impressions_insights_into_df(self, year, month):
        df_page = pd.DataFrame(
            columns=pd.MultiIndex(levels=[[], []], labels=[[], []]))

        # The number of people who had any content from your Page or about your Page enter their screen. This includes
        # posts, check-ins, ads, social information from people who interact with your Page and more. (Unique Users).
        # Daily
        unique = self.daily_insights_for_month("page_impressions_unique", year,
                                               month)
        end_time, value = list(), list()

        for item in unique["data"][0]["values"]:
            end_time.append(item["end_time"][:10])
            try:
                value.append(item["value"])
            except KeyError:
                value.append(0)

        df_page["impressions_unique", "value"] = pd.Series(data=value,
                                                           index=end_time,
                                                           name="value")

        # The number of times any content from your Page or about your Page entered a person's screen.
        # This includes posts, check-ins, ads, social information from people who interact with your Page and more.
        # (Total Count). Daily
        impressions = self.daily_insights_for_month("page_impressions", year,
                                                    month)
        end_time, value = list(), list()

        for item in impressions["data"][0]["values"]:
            end_time.append(item["end_time"][:10])
            try:
                value.append(item["value"])
            except KeyError:
                value.append(0)

        df_page["impressions", "value"] = pd.Series(data=value,
                                                    index=end_time,
                                                    name="value")

        # Total number of people who saw a story about your Page by story type. (Unique Users). Daily
        by_story_unique = self.daily_insights_for_month(
            "page_impressions_by_story_type_unique", year, month)
        end_time, mention, other, fan, page_post, checkin = (
            list(),
            list(),
            list(),
            list(),
            list(),
            list(),
        )

        for item in by_story_unique["data"][0]["values"]:
            end_time.append(item["end_time"][:10])
            try:
                mention.append(item["value"]["mention"])
            except KeyError:
                mention.append(0)
            try:
                other.append(item["value"]["other"])
            except KeyError:
                other.append(0)
            try:
                fan.append(item["value"]["fan"])
            except KeyError:
                fan.append(0)
            try:
                page_post.append(item["value"]["page post"])
            except KeyError:
                page_post.append(0)
            try:
                checkin.append(item["value"]["checkin"])
            except KeyError:
                checkin.append(0)

        df_page["impressions_by_story_unique",
                "mention"] = pd.Series(data=mention,
                                       index=end_time,
                                       name="mention")
        df_page["impressions_by_story_unique",
                "other"] = pd.Series(data=other, index=end_time, name="other")
        df_page["impressions_by_story_unique",
                "fan"] = pd.Series(data=fan, index=end_time, name="fan")
        df_page["impressions_by_story_unique",
                "page_post"] = pd.Series(data=page_post,
                                         index=end_time,
                                         name="page_post")
        df_page["impressions_by_story_unique",
                "checkin"] = pd.Series(data=checkin,
                                       index=end_time,
                                       name="checkin")

        #  Total impressions of stories published by a friend about your Page by story type. (Total Count). Daily
        by_story = self.daily_insights_for_month(
            "page_impressions_by_story_type", year, month)
        end_time, mention, other, fan, page_post, checkin = (
            list(),
            list(),
            list(),
            list(),
            list(),
            list(),
        )

        for item in by_story["data"][0]["values"]:
            end_time.append(item["end_time"][:10])
            try:
                mention.append(item["value"]["mention"])
            except KeyError:
                mention.append(0)
            try:
                other.append(item["value"]["other"])
            except KeyError:
                other.append(0)
            try:
                fan.append(item["value"]["fan"])
            except KeyError:
                fan.append(0)
            try:
                page_post.append(item["value"]["page post"])
            except KeyError:
                page_post.append(0)
            try:
                checkin.append(item["value"]["checkin"])
            except KeyError:
                checkin.append(0)

        df_page["impressions_by_story", "mention"] = pd.Series(data=mention,
                                                               index=end_time,
                                                               name="mention")
        df_page["impressions_by_story", "other"] = pd.Series(data=other,
                                                             index=end_time,
                                                             name="other")
        df_page["impressions_by_story", "fan"] = pd.Series(data=fan,
                                                           index=end_time,
                                                           name="fan")
        df_page["impressions_by_story",
                "page_post"] = pd.Series(data=page_post,
                                         index=end_time,
                                         name="page_post")
        df_page["impressions_by_story", "checkin"] = pd.Series(data=checkin,
                                                               index=end_time,
                                                               name="checkin")

        # Total Page Reach by user country. (Unique Users). Daily
        by_country = self.daily_insights_for_month(
            "page_impressions_by_country_unique", year, month)
        end_time = list()
        gb, us, ru, it = list(), list(), list(), list()

        for item in by_country["data"]:
            end_time.append(item[0]["values"]["end_time"][:10])
            try:
                gb.append(item[0]["values"]["value"]["GB"])
            except KeyError:
                gb.append(0)
            except IndexError:
                gb.append(0)
            try:
                us.append(item[0]["values"]["value"]["US"])
            except KeyError:
                us.append(0)
            except IndexError:
                us.append(0)
            try:
                ru.append(item[0]["values"]["value"]["RU"])
            except KeyError:
                ru.append(0)
            except IndexError:
                ru.append(0)

        df_page["impressions_by_country", "GB"] = pd.Series(data=gb,
                                                            index=end_time,
                                                            name="GB")
        df_page["impressions_by_country", "US"] = pd.Series(data=us,
                                                            index=end_time,
                                                            name="US")
        df_page["impressions_by_country", "RU"] = pd.Series(data=ru,
                                                            index=end_time,
                                                            name="RU")

        # Total Page Reach by age and gender. (Unique Users). Daily
        by_age_gender_unique = self.daily_insights_for_month(
            "page_impressions_by_age_gender_unique", year, month)
        end_time = list()
        f_13_17, f_18_24, f_25_34, f_35_44, f_45_54, f_55_64 = (
            list(),
            list(),
            list(),
            list(),
            list(),
            list(),
        )
        f_65_plus = list()
        m_13_17, m_18_24, m_25_34, m_35_44, m_45_54, m_55_64 = (
            list(),
            list(),
            list(),
            list(),
            list(),
            list(),
        )
        m_65_plus = list()

        for item in by_age_gender_unique["data"]:
            end_time.append(item[0]["values"]["end_time"][:10])
            try:
                f_13_17.append(item[0]["values"]["value"]["F.13-17"])
            except KeyError:
                f_13_17.append(0)
            except IndexError:
                f_13_17.append(0)
            try:
                f_18_24.append(item[0]["values"]["value"]["F.18-24"])
            except KeyError:
                f_18_24.append(0)
            except IndexError:
                f_18_24.append(0)
            try:
                f_25_34.append(item[0]["values"]["value"]["F.25-34"])
            except KeyError:
                f_25_34.append(0)
            except IndexError:
                f_25_34.append(0)
            try:
                f_35_44.append(item[0]["values"]["value"]["F.35-44"])
            except KeyError:
                f_35_44.append(0)
            except IndexError:
                f_35_44.append(0)
            try:
                f_45_54.append(item[0]["values"]["value"]["F.45-54"])
            except KeyError:
                f_45_54.append(0)
            except IndexError:
                f_45_54.append(0)
            try:
                f_55_64.append(item[0]["values"]["value"]["F.55-64"])
            except KeyError:
                f_55_64.append(0)
            except IndexError:
                f_55_64.append(0)
            try:
                f_65_plus.append(item[0]["values"]["value"]["F.65+"])
            except KeyError:
                f_65_plus.append(0)
            except IndexError:
                f_65_plus.append(0)
            try:
                m_13_17.append(item[0]["values"]["value"]["M.13-17"])
            except KeyError:
                m_13_17.append(0)
            except IndexError:
                m_13_17.append(0)
            try:
                m_18_24.append(item[0]["values"]["value"]["M.18-24"])
            except KeyError:
                m_18_24.append(0)
            except IndexError:
                m_18_24.append(0)
            try:
                m_25_34.append(item[0]["values"]["value"]["M.25-34"])
            except KeyError:
                m_25_34.append(0)
            except IndexError:
                m_25_34.append(0)
            try:
                m_35_44.append(item[0]["values"]["value"]["M.35-44"])
            except KeyError:
                m_35_44.append(0)
            except IndexError:
                m_35_44.append(0)
            try:
                m_45_54.append(item[0]["values"]["value"]["M.45-54"])
            except KeyError:
                m_45_54.append(0)
            except IndexError:
                m_45_54.append(0)
            try:
                m_55_64.append(item[0]["values"]["value"]["M.55-64"])
            except KeyError:
                m_55_64.append(0)
            except IndexError:
                m_55_64.append(0)
            try:
                m_65_plus.append(item[0]["values"]["value"]["M.65+"])
            except KeyError:
                m_65_plus.append(0)
            except IndexError:
                m_65_plus.append(0)

        df_page["impressions_by_age_gender_unique",
                "F.13-17"] = pd.Series(data=f_13_17,
                                       index=end_time,
                                       name="F.13-17")
        df_page["impressions_by_age_gender_unique",
                "F.18-24"] = pd.Series(data=f_18_24,
                                       index=end_time,
                                       name="F.18-24")
        df_page["impressions_by_age_gender_unique",
                "F.25-34"] = pd.Series(data=f_25_34,
                                       index=end_time,
                                       name="F.25-34")
        df_page["impressions_by_age_gender_unique",
                "F.35-44"] = pd.Series(data=f_35_44,
                                       index=end_time,
                                       name="F.35-44")
        df_page["impressions_by_age_gender_unique",
                "F.45-54"] = pd.Series(data=f_45_54,
                                       index=end_time,
                                       name="F.45-54")
        df_page["impressions_by_age_gender_unique",
                "F.55-64"] = pd.Series(data=f_55_64,
                                       index=end_time,
                                       name="F.55-64")
        df_page["impressions_by_age_gender_unique",
                "F.65+"] = pd.Series(data=f_65_plus,
                                     index=end_time,
                                     name="F.65+")
        df_page["impressions_by_age_gender_unique",
                "M.13-17"] = pd.Series(data=m_13_17,
                                       index=end_time,
                                       name="M.13-17")
        df_page["impressions_by_age_gender_unique",
                "M.18-24"] = pd.Series(data=m_18_24,
                                       index=end_time,
                                       name="M.18-24")
        df_page["impressions_by_age_gender_unique",
                "M.25-34"] = pd.Series(data=m_25_34,
                                       index=end_time,
                                       name="M.25-34")
        df_page["impressions_by_age_gender_unique",
                "M.35-44"] = pd.Series(data=m_35_44,
                                       index=end_time,
                                       name="M.35-44")
        df_page["impressions_by_age_gender_unique",
                "M.45-54"] = pd.Series(data=m_45_54,
                                       index=end_time,
                                       name="M.45-54")
        df_page["impressions_by_age_gender_unique",
                "M.55-64"] = pd.Series(data=m_55_64,
                                       index=end_time,
                                       name="M.55-64")
        df_page["impressions_by_age_gender_unique",
                "M.65+"] = pd.Series(data=m_65_plus,
                                     index=end_time,
                                     name="M.65+")

        return df_page
Example no. 9
    def read_daily_demographics_insights_into_df(self, year, month):
        df_page = pd.DataFrame(
            columns=pd.MultiIndex(levels=[[], []], labels=[[], []]))

        # The total number of people who have liked your Page. (Unique Users). Lifetime
        fans = self.daily_insights_for_month("page_fans", year, month)
        end_time, value = list(), list()

        for item in fans["data"]:
            end_time.append(item[0]["values"]["end_time"][:10])
            try:
                value.append(item[0]["values"]["value"])
            except KeyError:
                value.append(0)
            except IndexError:
                value.append(0)

        df_page["fans", "value"] = pd.Series(data=value,
                                             index=end_time,
                                             name="value")

        # The number of people who liked your Page, broken down by the most common places where people can like
        # your Page.(Unique Users). Daily
        fans_by_like_source_unique = self.daily_insights_for_month(
            "page_fans_by_like_source_unique", year, month)
        end_time, news_feed, other = list(), list(), list()
        page_suggestions, restored_likes, search, your_page = (
            list(),
            list(),
            list(),
            list(),
        )

        for item in fans_by_like_source_unique["data"][0]["values"]:
            end_time.append(item["end_time"][:10])
            try:
                news_feed.append(item["value"]["News Feed"])
            except KeyError:
                news_feed.append(0)
            try:
                other.append(item["value"]["Other"])
            except KeyError:
                other.append(0)
            try:
                page_suggestions.append(item["value"]["Page Suggestions"])
            except KeyError:
                page_suggestions.append(0)
            try:
                restored_likes.append(
                    item["value"]["Restored Likes from Reactivated Accounts"])
            except KeyError:
                restored_likes.append(0)
            try:
                search.append(item["value"]["Search"])
            except KeyError:
                search.append(0)
            try:
                your_page.append(item["value"]["Your Page"])
            except KeyError:
                your_page.append(0)

        df_page["fans_by_like_source_unique",
                "news_feed"] = pd.Series(data=news_feed,
                                         index=end_time,
                                         name="news_feed")
        df_page["fans_by_like_source_unique",
                "other"] = pd.Series(data=other, index=end_time, name="other")
        df_page["fans_by_like_source_unique",
                "page_suggestions"] = pd.Series(data=page_suggestions,
                                                index=end_time,
                                                name="page_suggestions")
        df_page["fans_by_like_source_unique",
                "restored_likes"] = pd.Series(data=restored_likes,
                                              index=end_time,
                                              name="restored_likes")
        df_page["fans_by_like_source_unique",
                "search"] = pd.Series(data=search,
                                      index=end_time,
                                      name="search")
        df_page["fans_by_like_source_unique",
                "your_page"] = pd.Series(data=your_page,
                                         index=end_time,
                                         name="your_page")

        # This is a breakdown of the number of Page likes from the most common places where people can like your Page.
        # (Total Count). Daily
        fans_by_like_source = self.daily_insights_for_month(
            "page_fans_by_like_source", year, month)
        end_time, news_feed, other = list(), list(), list()
        page_suggestions, restored_likes, search, your_page = (
            list(),
            list(),
            list(),
            list(),
        )

        for item in fans_by_like_source["data"][0]["values"]:
            end_time.append(item["end_time"][:10])
            try:
                news_feed.append(item["value"]["News Feed"])
            except KeyError:
                news_feed.append(0)
            try:
                other.append(item["value"]["Other"])
            except KeyError:
                other.append(0)
            try:
                page_suggestions.append(item["value"]["Page Suggestions"])
            except KeyError:
                page_suggestions.append(0)
            try:
                restored_likes.append(
                    item["value"]["Restored Likes from Reactivated Accounts"])
            except KeyError:
                restored_likes.append(0)
            try:
                search.append(item["value"]["Search"])
            except KeyError:
                search.append(0)
            try:
                your_page.append(item["value"]["Your Page"])
            except KeyError:
                your_page.append(0)

        df_page["fans_by_like_source",
                "news_feed"] = pd.Series(data=news_feed,
                                         index=end_time,
                                         name="news_feed")
        df_page["fans_by_like_source", "other"] = pd.Series(data=other,
                                                            index=end_time,
                                                            name="other")
        df_page["fans_by_like_source",
                "page_suggestions"] = pd.Series(data=page_suggestions,
                                                index=end_time,
                                                name="page_suggestions")
        df_page["fans_by_like_source",
                "restored_likes"] = pd.Series(data=restored_likes,
                                              index=end_time,
                                              name="restored_likes")
        df_page["fans_by_like_source", "search"] = pd.Series(data=search,
                                                             index=end_time,
                                                             name="search")
        df_page["fans_by_like_source",
                "your_page"] = pd.Series(data=your_page,
                                         index=end_time,
                                         name="your_page")

        return df_page
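
Both methods above start from an empty MultiIndex-column frame and assign one (level_0, level_1) column at a time. As a hedged aside with invented sample values: passing tuple keys to the DataFrame constructor produces the same two-level columns directly.

import pandas as pd

# Tuple keys become a two-level column MultiIndex automatically.
dates = ['2021-03-01', '2021-03-02']
df_page = pd.DataFrame(
    {
        ('fans', 'value'): [10, 12],
        ('fans_by_like_source', 'news_feed'): [3, 4],
    },
    index=dates,
)
assert isinstance(df_page.columns, pd.MultiIndex)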
Example no. 10
def test_non_cython_api():

    # GH5610
    # non-cython calls should not include the grouper

    df = DataFrame(
        [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]], columns=["A", "B", "C"]
    )
    g = df.groupby("A")
    gni = df.groupby("A", as_index=False)

    # mad
    expected = DataFrame([[0], [np.nan]], columns=["B"], index=[1, 3])
    expected.index.name = "A"
    result = g.mad()
    tm.assert_frame_equal(result, expected)

    expected = DataFrame([[1, 0.0], [3, np.nan]], columns=["A", "B"], index=[0, 1])
    result = gni.mad()
    tm.assert_frame_equal(result, expected)

    # describe
    expected_index = Index([1, 3], name="A")
    expected_col = pd.MultiIndex(
        levels=[["B"], ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]],
        codes=[[0] * 8, list(range(8))],
    )
    expected = DataFrame(
        [
            [1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0],
            [0.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
        ],
        index=expected_index,
        columns=expected_col,
    )
    result = g.describe()
    tm.assert_frame_equal(result, expected)

    expected = pd.concat(
        [
            df[df.A == 1].describe().unstack().to_frame().T,
            df[df.A == 3].describe().unstack().to_frame().T,
        ]
    )
    expected.index = Index([0, 1])
    result = gni.describe()
    tm.assert_frame_equal(result, expected)

    # any
    expected = DataFrame(
        [[True, True], [False, True]], columns=["B", "C"], index=[1, 3]
    )
    expected.index.name = "A"
    result = g.any()
    tm.assert_frame_equal(result, expected)

    # idxmax
    expected = DataFrame([[0.0], [np.nan]], columns=["B"], index=[1, 3])
    expected.index.name = "A"
    result = g.idxmax()
    tm.assert_frame_equal(result, expected)
Example no. 11
class TestToCSV:
    def test_to_csv_with_single_column(self):
        # see gh-18676, https://bugs.python.org/issue32255
        #
        # Python's CSV library adds an extraneous '""'
        # before the newline when the NaN-value is in
        # the first row. Otherwise, only the newline
        # character is added. This behavior is inconsistent
        # and was patched in https://bugs.python.org/pull_request4672.
        df1 = DataFrame([None, 1])
        expected1 = """\
""
1.0
"""
        with tm.ensure_clean("test.csv") as path:
            df1.to_csv(path, header=None, index=None)
            with open(path) as f:
                assert f.read() == expected1

        df2 = DataFrame([1, None])
        expected2 = """\
1.0
""
"""
        with tm.ensure_clean("test.csv") as path:
            df2.to_csv(path, header=None, index=None)
            with open(path) as f:
                assert f.read() == expected2

    def test_to_csv_default_encoding(self):
        # GH17097
        df = DataFrame({"col": ["AAAAA", "ÄÄÄÄÄ", "ßßßßß", "聞聞聞聞聞"]})

        with tm.ensure_clean("test.csv") as path:
            # the default to_csv encoding is utf-8.
            df.to_csv(path)
            tm.assert_frame_equal(pd.read_csv(path, index_col=0), df)

    def test_to_csv_quotechar(self):
        df = DataFrame({"col": [1, 2]})
        expected = """\
"","col"
"0","1"
"1","2"
"""

        with tm.ensure_clean("test.csv") as path:
            df.to_csv(path, quoting=1)  # 1=QUOTE_ALL
            with open(path) as f:
                assert f.read() == expected

        expected = """\
$$,$col$
$0$,$1$
$1$,$2$
"""

        with tm.ensure_clean("test.csv") as path:
            df.to_csv(path, quoting=1, quotechar="$")
            with open(path) as f:
                assert f.read() == expected

        with tm.ensure_clean("test.csv") as path:
            with pytest.raises(TypeError, match="quotechar"):
                df.to_csv(path, quoting=1, quotechar=None)

    def test_to_csv_doublequote(self):
        df = DataFrame({"col": ['a"a', '"bb"']})
        expected = '''\
"","col"
"0","a""a"
"1","""bb"""
'''

        with tm.ensure_clean("test.csv") as path:
            df.to_csv(path, quoting=1, doublequote=True)  # QUOTE_ALL
            with open(path) as f:
                assert f.read() == expected

        from _csv import Error

        with tm.ensure_clean("test.csv") as path:
            with pytest.raises(Error, match="escapechar"):
                df.to_csv(path, doublequote=False)  # no escapechar set

    def test_to_csv_escapechar(self):
        df = DataFrame({"col": ['a"a', '"bb"']})
        expected = """\
"","col"
"0","a\\"a"
"1","\\"bb\\""
"""

        with tm.ensure_clean("test.csv") as path:  # QUOTE_ALL
            df.to_csv(path, quoting=1, doublequote=False, escapechar="\\")
            with open(path) as f:
                assert f.read() == expected

        df = DataFrame({"col": ["a,a", ",bb,"]})
        expected = """\
,col
0,a\\,a
1,\\,bb\\,
"""

        with tm.ensure_clean("test.csv") as path:
            df.to_csv(path, quoting=3, escapechar="\\")  # QUOTE_NONE
            with open(path) as f:
                assert f.read() == expected

    def test_csv_to_string(self):
        df = DataFrame({"col": [1, 2]})
        expected_rows = [",col", "0,1", "1,2"]
        expected = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df.to_csv() == expected

    def test_to_csv_decimal(self):
        # see gh-781
        df = DataFrame({"col1": [1], "col2": ["a"], "col3": [10.1]})

        expected_rows = [",col1,col2,col3", "0,1,a,10.1"]
        expected_default = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df.to_csv() == expected_default

        expected_rows = [";col1;col2;col3", "0;1;a;10,1"]
        expected_european_excel = tm.convert_rows_list_to_csv_str(
            expected_rows)
        assert df.to_csv(decimal=",", sep=";") == expected_european_excel

        expected_rows = [",col1,col2,col3", "0,1,a,10.10"]
        expected_float_format_default = tm.convert_rows_list_to_csv_str(
            expected_rows)
        assert df.to_csv(float_format="%.2f") == expected_float_format_default

        expected_rows = [";col1;col2;col3", "0;1;a;10,10"]
        expected_float_format = tm.convert_rows_list_to_csv_str(expected_rows)
        assert (df.to_csv(decimal=",", sep=";",
                          float_format="%.2f") == expected_float_format)

        # see gh-11553: testing if decimal is taken into account for '0.0'
        df = DataFrame({"a": [0, 1.1], "b": [2.2, 3.3], "c": 1})

        expected_rows = ["a,b,c", "0^0,2^2,1", "1^1,3^3,1"]
        expected = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df.to_csv(index=False, decimal="^") == expected

        # same but for an index
        assert df.set_index("a").to_csv(decimal="^") == expected

        # same for a multi-index
        assert df.set_index(["a", "b"]).to_csv(decimal="^") == expected

    def test_to_csv_float_format(self):
        # testing if float_format is taken into account for the index
        # GH 11553
        df = DataFrame({"a": [0, 1], "b": [2.2, 3.3], "c": 1})

        expected_rows = ["a,b,c", "0,2.20,1", "1,3.30,1"]
        expected = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df.set_index("a").to_csv(float_format="%.2f") == expected

        # same for a multi-index
        assert df.set_index(["a", "b"]).to_csv(float_format="%.2f") == expected

    def test_to_csv_na_rep(self):
        # see gh-11553
        #
        # Testing if NaN values are correctly represented in the index.
        df = DataFrame({"a": [0, np.NaN], "b": [0, 1], "c": [2, 3]})
        expected_rows = ["a,b,c", "0.0,0,2", "_,1,3"]
        expected = tm.convert_rows_list_to_csv_str(expected_rows)

        assert df.set_index("a").to_csv(na_rep="_") == expected
        assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected

        # now with an index containing only NaNs
        df = DataFrame({"a": np.NaN, "b": [0, 1], "c": [2, 3]})
        expected_rows = ["a,b,c", "_,0,2", "_,1,3"]
        expected = tm.convert_rows_list_to_csv_str(expected_rows)

        assert df.set_index("a").to_csv(na_rep="_") == expected
        assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected

        # check if na_rep parameter does not break anything when no NaN
        df = DataFrame({"a": 0, "b": [0, 1], "c": [2, 3]})
        expected_rows = ["a,b,c", "0,0,2", "0,1,3"]
        expected = tm.convert_rows_list_to_csv_str(expected_rows)

        assert df.set_index("a").to_csv(na_rep="_") == expected
        assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected

        csv = pd.Series(["a", pd.NA, "c"]).to_csv(na_rep="ZZZZZ")
        expected = tm.convert_rows_list_to_csv_str(
            [",0", "0,a", "1,ZZZZZ", "2,c"])
        assert expected == csv

    def test_to_csv_na_rep_nullable_string(self, nullable_string_dtype):
        # GH 29975
        # Make sure full na_rep shows up when a dtype is provided
        expected = tm.convert_rows_list_to_csv_str(
            [",0", "0,a", "1,ZZZZZ", "2,c"])
        csv = pd.Series(["a", pd.NA, "c"],
                        dtype=nullable_string_dtype).to_csv(na_rep="ZZZZZ")
        assert expected == csv

    def test_to_csv_date_format(self):
        # GH 10209
        df_sec = DataFrame(
            {"A": pd.date_range("20130101", periods=5, freq="s")})
        df_day = DataFrame(
            {"A": pd.date_range("20130101", periods=5, freq="d")})

        expected_rows = [
            ",A",
            "0,2013-01-01 00:00:00",
            "1,2013-01-01 00:00:01",
            "2,2013-01-01 00:00:02",
            "3,2013-01-01 00:00:03",
            "4,2013-01-01 00:00:04",
        ]
        expected_default_sec = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df_sec.to_csv() == expected_default_sec

        expected_rows = [
            ",A",
            "0,2013-01-01 00:00:00",
            "1,2013-01-02 00:00:00",
            "2,2013-01-03 00:00:00",
            "3,2013-01-04 00:00:00",
            "4,2013-01-05 00:00:00",
        ]
        expected_ymdhms_day = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df_day.to_csv(
            date_format="%Y-%m-%d %H:%M:%S") == expected_ymdhms_day

        expected_rows = [
            ",A",
            "0,2013-01-01",
            "1,2013-01-01",
            "2,2013-01-01",
            "3,2013-01-01",
            "4,2013-01-01",
        ]
        expected_ymd_sec = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df_sec.to_csv(date_format="%Y-%m-%d") == expected_ymd_sec

        expected_rows = [
            ",A",
            "0,2013-01-01",
            "1,2013-01-02",
            "2,2013-01-03",
            "3,2013-01-04",
            "4,2013-01-05",
        ]
        expected_default_day = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df_day.to_csv() == expected_default_day
        assert df_day.to_csv(date_format="%Y-%m-%d") == expected_default_day

        # see gh-7791
        #
        # Testing if date_format parameter is taken into account
        # for multi-indexed DataFrames.
        df_sec["B"] = 0
        df_sec["C"] = 1

        expected_rows = ["A,B,C", "2013-01-01,0,1.0"]
        expected_ymd_sec = tm.convert_rows_list_to_csv_str(expected_rows)

        df_sec_grouped = df_sec.groupby([pd.Grouper(key="A", freq="1h"), "B"])
        assert df_sec_grouped.mean().to_csv(
            date_format="%Y-%m-%d") == expected_ymd_sec

    def test_to_csv_different_datetime_formats(self):
        # GH#21734
        df = DataFrame({
            "date":
            pd.to_datetime("1970-01-01"),
            "datetime":
            pd.date_range("1970-01-01", periods=2, freq="H"),
        })
        expected_rows = [
            "date,datetime",
            "1970-01-01,1970-01-01 00:00:00",
            "1970-01-01,1970-01-01 01:00:00",
        ]
        expected = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df.to_csv(index=False) == expected

    def test_to_csv_date_format_in_categorical(self):
        # GH#40754
        ser = pd.Series(
            pd.to_datetime(["2021-03-27", pd.NaT], format="%Y-%m-%d"))
        ser = ser.astype("category")
        expected = tm.convert_rows_list_to_csv_str(["0", "2021-03-27", '""'])
        assert ser.to_csv(index=False) == expected

        ser = pd.Series(
            pd.date_range(start="2021-03-27",
                          freq="D",
                          periods=1,
                          tz="Europe/Berlin").append(pd.DatetimeIndex([pd.NaT
                                                                       ])))
        ser = ser.astype("category")
        assert ser.to_csv(index=False, date_format="%Y-%m-%d") == expected

    def test_to_csv_multi_index(self):
        # see gh-6618
        df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1], [2]]))

        exp_rows = [",1", ",2", "0,1"]
        exp = tm.convert_rows_list_to_csv_str(exp_rows)
        assert df.to_csv() == exp

        exp_rows = ["1", "2", "1"]
        exp = tm.convert_rows_list_to_csv_str(exp_rows)
        assert df.to_csv(index=False) == exp

        df = DataFrame(
            [1],
            columns=pd.MultiIndex.from_arrays([[1], [2]]),
            index=pd.MultiIndex.from_arrays([[1], [2]]),
        )

        exp_rows = [",,1", ",,2", "1,2,1"]
        exp = tm.convert_rows_list_to_csv_str(exp_rows)
        assert df.to_csv() == exp

        exp_rows = ["1", "2", "1"]
        exp = tm.convert_rows_list_to_csv_str(exp_rows)
        assert df.to_csv(index=False) == exp

        df = DataFrame([1],
                       columns=pd.MultiIndex.from_arrays([["foo"], ["bar"]]))

        exp_rows = [",foo", ",bar", "0,1"]
        exp = tm.convert_rows_list_to_csv_str(exp_rows)
        assert df.to_csv() == exp

        exp_rows = ["foo", "bar", "1"]
        exp = tm.convert_rows_list_to_csv_str(exp_rows)
        assert df.to_csv(index=False) == exp

    @pytest.mark.parametrize(
        "ind,expected",
        [
            (
                pd.MultiIndex(levels=[[1.0]], codes=[[0]], names=["x"]),
                "x,data\n1.0,1\n",
            ),
            (
                pd.MultiIndex(
                    levels=[[1.0], [2.0]], codes=[[0], [0]], names=["x", "y"]),
                "x,y,data\n1.0,2.0,1\n",
            ),
        ],
    )
    @pytest.mark.parametrize("klass", [DataFrame, pd.Series])
    def test_to_csv_single_level_multi_index(self, ind, expected, klass):
        # see gh-19589
        obj = klass(pd.Series([1], ind, name="data"))

        with tm.assert_produces_warning(FutureWarning, match="lineterminator"):
            # GH#9568 standardize on lineterminator matching stdlib
            result = obj.to_csv(line_terminator="\n", header=True)
        assert result == expected

    def test_to_csv_string_array_ascii(self):
        # GH 10813
        str_array = [{"names": ["foo", "bar"]}, {"names": ["baz", "qux"]}]
        df = DataFrame(str_array)
        expected_ascii = """\
,names
0,"['foo', 'bar']"
1,"['baz', 'qux']"
"""
        with tm.ensure_clean("str_test.csv") as path:
            df.to_csv(path, encoding="ascii")
            with open(path) as f:
                assert f.read() == expected_ascii

    def test_to_csv_string_array_utf8(self):
        # GH 10813
        str_array = [{"names": ["foo", "bar"]}, {"names": ["baz", "qux"]}]
        df = DataFrame(str_array)
        expected_utf8 = """\
,names
0,"['foo', 'bar']"
1,"['baz', 'qux']"
"""
        with tm.ensure_clean("unicode_test.csv") as path:
            df.to_csv(path, encoding="utf-8")
            with open(path) as f:
                assert f.read() == expected_utf8

    def test_to_csv_string_with_lf(self):
        # GH 20353
        data = {"int": [1, 2, 3], "str_lf": ["abc", "d\nef", "g\nh\n\ni"]}
        df = DataFrame(data)
        with tm.ensure_clean("lf_test.csv") as path:
            # case 1: The default line terminator(=os.linesep)(PR 21406)
            os_linesep = os.linesep.encode("utf-8")
            expected_noarg = (b"int,str_lf" + os_linesep + b"1,abc" +
                              os_linesep + b'2,"d\nef"' + os_linesep +
                              b'3,"g\nh\n\ni"' + os_linesep)
            df.to_csv(path, index=False)
            with open(path, "rb") as f:
                assert f.read() == expected_noarg
        with tm.ensure_clean("lf_test.csv") as path:
            # case 2: LF as line terminator
            expected_lf = b'int,str_lf\n1,abc\n2,"d\nef"\n3,"g\nh\n\ni"\n'
            df.to_csv(path, lineterminator="\n", index=False)
            with open(path, "rb") as f:
                assert f.read() == expected_lf
        with tm.ensure_clean("lf_test.csv") as path:
            # case 3: CRLF as line terminator
            # 'lineterminator' should not change inner element
            expected_crlf = b'int,str_lf\r\n1,abc\r\n2,"d\nef"\r\n3,"g\nh\n\ni"\r\n'
            df.to_csv(path, lineterminator="\r\n", index=False)
            with open(path, "rb") as f:
                assert f.read() == expected_crlf

    def test_to_csv_string_with_crlf(self):
        # GH 20353
        data = {
            "int": [1, 2, 3],
            "str_crlf": ["abc", "d\r\nef", "g\r\nh\r\n\r\ni"]
        }
        df = DataFrame(data)
        with tm.ensure_clean("crlf_test.csv") as path:
            # case 1: The default line terminator(=os.linesep)(PR 21406)
            os_linesep = os.linesep.encode("utf-8")
            expected_noarg = (b"int,str_crlf" + os_linesep + b"1,abc" +
                              os_linesep + b'2,"d\r\nef"' + os_linesep +
                              b'3,"g\r\nh\r\n\r\ni"' + os_linesep)
            df.to_csv(path, index=False)
            with open(path, "rb") as f:
                assert f.read() == expected_noarg
        with tm.ensure_clean("crlf_test.csv") as path:
            # case 2: LF as line terminator
            expected_lf = b'int,str_crlf\n1,abc\n2,"d\r\nef"\n3,"g\r\nh\r\n\r\ni"\n'
            df.to_csv(path, lineterminator="\n", index=False)
            with open(path, "rb") as f:
                assert f.read() == expected_lf
        with tm.ensure_clean("crlf_test.csv") as path:
            # case 3: CRLF as line terminator
            # 'lineterminator' should not change inner element
            expected_crlf = (b"int,str_crlf\r\n"
                             b"1,abc\r\n"
                             b'2,"d\r\nef"\r\n'
                             b'3,"g\r\nh\r\n\r\ni"\r\n')
            df.to_csv(path, lineterminator="\r\n", index=False)
            with open(path, "rb") as f:
                assert f.read() == expected_crlf

    def test_to_csv_stdout_file(self, capsys):
        # GH 21561
        df = DataFrame([["foo", "bar"], ["baz", "qux"]],
                       columns=["name_1", "name_2"])
        expected_rows = [",name_1,name_2", "0,foo,bar", "1,baz,qux"]
        expected_ascii = tm.convert_rows_list_to_csv_str(expected_rows)

        df.to_csv(sys.stdout, encoding="ascii")
        captured = capsys.readouterr()

        assert captured.out == expected_ascii
        assert not sys.stdout.closed

    @pytest.mark.xfail(
        compat.is_platform_windows(),
        reason=("Especially in Windows, file stream should not be passed"
                "to csv writer without newline='' option."
                "(https://docs.python.org/3.6/library/csv.html#csv.writer)"),
    )
    def test_to_csv_write_to_open_file(self):
        # GH 21696
        df = DataFrame({"a": ["x", "y", "z"]})
        expected = """\
manual header
x
y
z
"""
        with tm.ensure_clean("test.txt") as path:
            with open(path, "w") as f:
                f.write("manual header\n")
                df.to_csv(f, header=None, index=None)
            with open(path) as f:
                assert f.read() == expected

    def test_to_csv_write_to_open_file_with_newline_py3(self):
        # see gh-21696
        # see gh-20353
        df = DataFrame({"a": ["x", "y", "z"]})
        expected_rows = ["x", "y", "z"]
        expected = "manual header\n" + tm.convert_rows_list_to_csv_str(
            expected_rows)
        with tm.ensure_clean("test.txt") as path:
            with open(path, "w", newline="") as f:
                f.write("manual header\n")
                df.to_csv(f, header=None, index=None)

            with open(path, "rb") as f:
                assert f.read() == bytes(expected, "utf-8")

    @pytest.mark.parametrize("to_infer", [True, False])
    @pytest.mark.parametrize("read_infer", [True, False])
    def test_to_csv_compression(self, compression_only, read_infer, to_infer):
        # see gh-15008
        compression = compression_only

        # We'll complete file extension subsequently.
        filename = "test."
        filename += icom._compression_to_extension[compression]

        df = DataFrame({"A": [1]})

        to_compression = "infer" if to_infer else compression
        read_compression = "infer" if read_infer else compression

        with tm.ensure_clean(filename) as path:
            df.to_csv(path, compression=to_compression)
            result = pd.read_csv(path,
                                 index_col=0,
                                 compression=read_compression)
            tm.assert_frame_equal(result, df)

    def test_to_csv_compression_dict(self, compression_only):
        # GH 26023
        method = compression_only
        df = DataFrame({"ABC": [1]})
        filename = "to_csv_compress_as_dict."
        extension = {
            "gzip": "gz",
            "zstd": "zst",
        }.get(method, method)
        filename += extension
        with tm.ensure_clean(filename) as path:
            df.to_csv(path, compression={"method": method})
            read_df = pd.read_csv(path, index_col=0)
            tm.assert_frame_equal(read_df, df)

    def test_to_csv_compression_dict_no_method_raises(self):
        # GH 26023
        df = DataFrame({"ABC": [1]})
        compression = {"some_option": True}
        msg = "must have key 'method'"

        with tm.ensure_clean("out.zip") as path:
            with pytest.raises(ValueError, match=msg):
                df.to_csv(path, compression=compression)

    @pytest.mark.parametrize("compression", ["zip", "infer"])
    @pytest.mark.parametrize("archive_name",
                             ["test_to_csv.csv", "test_to_csv.zip"])
    def test_to_csv_zip_arguments(self, compression, archive_name):
        # GH 26023
        df = DataFrame({"ABC": [1]})
        with tm.ensure_clean("to_csv_archive_name.zip") as path:
            df.to_csv(path,
                      compression={
                          "method": compression,
                          "archive_name": archive_name
                      })
            with ZipFile(path) as zp:
                assert len(zp.filelist) == 1
                archived_file = zp.filelist[0].filename
                assert archived_file == archive_name

    @pytest.mark.parametrize(
        "filename,expected_arcname",
        [
            ("archive.csv", "archive.csv"),
            ("archive.tsv", "archive.tsv"),
            ("archive.csv.zip", "archive.csv"),
            ("archive.tsv.zip", "archive.tsv"),
            ("archive.zip", "archive"),
        ],
    )
    def test_to_csv_zip_infer_name(self, filename, expected_arcname):
        # GH 39465
        df = DataFrame({"ABC": [1]})
        with tm.ensure_clean_dir() as dir:
            path = Path(dir, filename)
            df.to_csv(path, compression="zip")
            with ZipFile(path) as zp:
                assert len(zp.filelist) == 1
                archived_file = zp.filelist[0].filename
                assert archived_file == expected_arcname

    @pytest.mark.parametrize("df_new_type", ["Int64"])
    def test_to_csv_na_rep_long_string(self, df_new_type):
        # see gh-25099
        df = DataFrame({"c": [float("nan")] * 3})
        df = df.astype(df_new_type)
        expected_rows = ["c", "mynull", "mynull", "mynull"]
        expected = tm.convert_rows_list_to_csv_str(expected_rows)

        result = df.to_csv(index=False, na_rep="mynull", encoding="ascii")

        assert expected == result

    def test_to_csv_timedelta_precision(self):
        # GH 6783
        s = pd.Series([1, 1]).astype("timedelta64[ns]")
        buf = io.StringIO()
        s.to_csv(buf)
        result = buf.getvalue()
        expected_rows = [
            ",0",
            "0,0 days 00:00:00.000000001",
            "1,0 days 00:00:00.000000001",
        ]
        expected = tm.convert_rows_list_to_csv_str(expected_rows)
        assert result == expected

    def test_na_rep_truncated(self):
        # https://github.com/pandas-dev/pandas/issues/31447
        result = pd.Series(range(8, 12)).to_csv(na_rep="-")
        expected = tm.convert_rows_list_to_csv_str(
            [",0", "0,8", "1,9", "2,10", "3,11"])
        assert result == expected

        result = pd.Series([True, False]).to_csv(na_rep="nan")
        expected = tm.convert_rows_list_to_csv_str([",0", "0,True", "1,False"])
        assert result == expected

        result = pd.Series([1.1, 2.2]).to_csv(na_rep=".")
        expected = tm.convert_rows_list_to_csv_str([",0", "0,1.1", "1,2.2"])
        assert result == expected

    @pytest.mark.parametrize("errors", ["surrogatepass", "ignore", "replace"])
    def test_to_csv_errors(self, errors):
        # GH 22610
        data = ["\ud800foo"]
        ser = pd.Series(data, index=pd.Index(data))
        with tm.ensure_clean("test.csv") as path:
            ser.to_csv(path, errors=errors)
        # No use in reading back the data as it is not the same anymore
        # due to the error handling

    @pytest.mark.parametrize("mode", ["wb", "w"])
    def test_to_csv_binary_handle(self, mode):
        """
        Binary file objects should work if 'mode' contains a 'b', and in most
        cases even without it.

        GH 35058 and GH 19827
        """
        df = tm.makeDataFrame()
        with tm.ensure_clean() as path:
            with open(path, mode="w+b") as handle:
                df.to_csv(handle, mode=mode)
            tm.assert_frame_equal(df, pd.read_csv(path, index_col=0))

    @pytest.mark.parametrize("mode", ["wb", "w"])
    def test_to_csv_encoding_binary_handle(self, mode):
        """
        Binary file objects should honor a specified encoding.

        GH 23854 and GH 13068 with binary handles
        """
        # example from GH 23854
        content = "a, b, 🐟".encode("utf-8-sig")
        buffer = io.BytesIO(content)
        df = pd.read_csv(buffer, encoding="utf-8-sig")

        buffer = io.BytesIO()
        df.to_csv(buffer, mode=mode, encoding="utf-8-sig", index=False)
        buffer.seek(0)  # tests whether file handle wasn't closed
        assert buffer.getvalue().startswith(content)

        # example from GH 13068
        with tm.ensure_clean() as path:
            with open(path, "w+b") as handle:
                DataFrame().to_csv(handle, mode=mode, encoding="utf-8-sig")

                handle.seek(0)
                assert handle.read().startswith(b'\xef\xbb\xbf""')
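The utf-8-sig round trip exercised above also holds outside the test harness. A minimal sketch, assuming a pandas version (>= 1.2) that accepts binary handles in to_csv:

import io

import pandas as pd

buf = io.BytesIO()
pd.DataFrame({"a": ["x"]}).to_csv(buf, mode="wb", encoding="utf-8-sig", index=False)
# utf-8-sig prepends a byte-order mark to whatever is written
assert buf.getvalue().startswith(b"\xef\xbb\xbf")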


class TestToCSV(object):
    @pytest.mark.xfail((3, 6, 5) > sys.version_info >= (3, 5),
                       reason=("Python csv library bug "
                               "(see https://bugs.python.org/issue32255)"))
    def test_to_csv_with_single_column(self):
        # see gh-18676, https://bugs.python.org/issue32255
        #
        # Python's CSV library adds an extraneous '""'
        # before the newline when the NaN-value is in
        # the first row. Otherwise, only the newline
        # character is added. This behavior is inconsistent
        # and was patched in https://bugs.python.org/pull_request4672.
        df1 = DataFrame([None, 1])
        expected1 = """\
""
1.0
"""
        with tm.ensure_clean('test.csv') as path:
            df1.to_csv(path, header=None, index=None)
            with open(path, 'r') as f:
                assert f.read() == expected1

        df2 = DataFrame([1, None])
        expected2 = """\
1.0
""
"""
        with tm.ensure_clean('test.csv') as path:
            df2.to_csv(path, header=None, index=None)
            with open(path, 'r') as f:
                assert f.read() == expected2

    def test_to_csv_default_encoding(self):
        # GH17097
        df = DataFrame({'col': [u"AAAAA", u"ÄÄÄÄÄ", u"ßßßßß", u"聞聞聞聞聞"]})

        with tm.ensure_clean('test.csv') as path:
            # The default to_csv encoding in Python 2 is ascii, and in
            # Python 3 it is utf-8.
            if pd.compat.PY2:
                # Without encoding='utf-8', the non-ASCII data raises here.
                with pytest.raises(UnicodeEncodeError, match='ascii'):
                    df.to_csv(path)
            else:
                df.to_csv(path)
                tm.assert_frame_equal(pd.read_csv(path, index_col=0), df)

    def test_to_csv_quotechar(self):
        df = DataFrame({'col': [1, 2]})
        expected = """\
"","col"
"0","1"
"1","2"
"""

        with tm.ensure_clean('test.csv') as path:
            df.to_csv(path, quoting=1)  # 1=QUOTE_ALL
            with open(path, 'r') as f:
                assert f.read() == expected

        expected = """\
$$,$col$
$0$,$1$
$1$,$2$
"""

        with tm.ensure_clean('test.csv') as path:
            df.to_csv(path, quoting=1, quotechar="$")
            with open(path, 'r') as f:
                assert f.read() == expected

        with tm.ensure_clean('test.csv') as path:
            with pytest.raises(TypeError, match='quotechar'):
                df.to_csv(path, quoting=1, quotechar=None)

    def test_to_csv_doublequote(self):
        df = DataFrame({'col': ['a"a', '"bb"']})
        expected = '''\
"","col"
"0","a""a"
"1","""bb"""
'''

        with tm.ensure_clean('test.csv') as path:
            df.to_csv(path, quoting=1, doublequote=True)  # QUOTE_ALL
            with open(path, 'r') as f:
                assert f.read() == expected

        from _csv import Error
        with tm.ensure_clean('test.csv') as path:
            with pytest.raises(Error, match='escapechar'):
                df.to_csv(path, doublequote=False)  # no escapechar set

    def test_to_csv_escapechar(self):
        df = DataFrame({'col': ['a"a', '"bb"']})
        expected = '''\
"","col"
"0","a\\"a"
"1","\\"bb\\""
'''

        with tm.ensure_clean('test.csv') as path:  # QUOTE_ALL
            df.to_csv(path, quoting=1, doublequote=False, escapechar='\\')
            with open(path, 'r') as f:
                assert f.read() == expected

        df = DataFrame({'col': ['a,a', ',bb,']})
        expected = """\
,col
0,a\\,a
1,\\,bb\\,
"""

        with tm.ensure_clean('test.csv') as path:
            df.to_csv(path, quoting=3, escapechar='\\')  # QUOTE_NONE
            with open(path, 'r') as f:
                assert f.read() == expected

    def test_csv_to_string(self):
        df = DataFrame({'col': [1, 2]})
        expected_rows = [',col', '0,1', '1,2']
        expected = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df.to_csv() == expected

    def test_to_csv_decimal(self):
        # see gh-781
        df = DataFrame({'col1': [1], 'col2': ['a'], 'col3': [10.1]})

        expected_rows = [',col1,col2,col3', '0,1,a,10.1']
        expected_default = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df.to_csv() == expected_default

        expected_rows = [';col1;col2;col3', '0;1;a;10,1']
        expected_european_excel = tm.convert_rows_list_to_csv_str(
            expected_rows)
        assert df.to_csv(decimal=',', sep=';') == expected_european_excel

        expected_rows = [',col1,col2,col3', '0,1,a,10.10']
        expected_float_format_default = tm.convert_rows_list_to_csv_str(
            expected_rows)
        assert df.to_csv(float_format='%.2f') == expected_float_format_default

        expected_rows = [';col1;col2;col3', '0;1;a;10,10']
        expected_float_format = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df.to_csv(decimal=',', sep=';',
                         float_format='%.2f') == expected_float_format

        # see gh-11553: testing if decimal is taken into account for '0.0'
        df = pd.DataFrame({'a': [0, 1.1], 'b': [2.2, 3.3], 'c': 1})

        expected_rows = ['a,b,c', '0^0,2^2,1', '1^1,3^3,1']
        expected = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df.to_csv(index=False, decimal='^') == expected

        # same but for an index
        assert df.set_index('a').to_csv(decimal='^') == expected

        # same for a multi-index
        assert df.set_index(['a', 'b']).to_csv(decimal="^") == expected

    def test_to_csv_float_format(self):
        # testing if float_format is taken into account for the index
        # GH 11553
        df = pd.DataFrame({'a': [0, 1], 'b': [2.2, 3.3], 'c': 1})

        expected_rows = ['a,b,c', '0,2.20,1', '1,3.30,1']
        expected = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df.set_index('a').to_csv(float_format='%.2f') == expected

        # same for a multi-index
        assert df.set_index(['a', 'b']).to_csv(float_format='%.2f') == expected

    def test_to_csv_na_rep(self):
        # see gh-11553
        #
        # Testing if NaN values are correctly represented in the index.
        df = DataFrame({'a': [0, np.NaN], 'b': [0, 1], 'c': [2, 3]})
        expected_rows = ['a,b,c', '0.0,0,2', '_,1,3']
        expected = tm.convert_rows_list_to_csv_str(expected_rows)

        assert df.set_index('a').to_csv(na_rep='_') == expected
        assert df.set_index(['a', 'b']).to_csv(na_rep='_') == expected

        # now with an index containing only NaNs
        df = DataFrame({'a': np.NaN, 'b': [0, 1], 'c': [2, 3]})
        expected_rows = ['a,b,c', '_,0,2', '_,1,3']
        expected = tm.convert_rows_list_to_csv_str(expected_rows)

        assert df.set_index('a').to_csv(na_rep='_') == expected
        assert df.set_index(['a', 'b']).to_csv(na_rep='_') == expected

        # check if na_rep parameter does not break anything when no NaN
        df = DataFrame({'a': 0, 'b': [0, 1], 'c': [2, 3]})
        expected_rows = ['a,b,c', '0,0,2', '0,1,3']
        expected = tm.convert_rows_list_to_csv_str(expected_rows)

        assert df.set_index('a').to_csv(na_rep='_') == expected
        assert df.set_index(['a', 'b']).to_csv(na_rep='_') == expected

    def test_to_csv_date_format(self):
        # GH 10209
        df_sec = DataFrame(
            {'A': pd.date_range('20130101', periods=5, freq='s')})
        df_day = DataFrame(
            {'A': pd.date_range('20130101', periods=5, freq='d')})

        expected_rows = [
            ',A', '0,2013-01-01 00:00:00', '1,2013-01-01 00:00:01',
            '2,2013-01-01 00:00:02', '3,2013-01-01 00:00:03',
            '4,2013-01-01 00:00:04'
        ]
        expected_default_sec = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df_sec.to_csv() == expected_default_sec

        expected_rows = [
            ',A', '0,2013-01-01 00:00:00', '1,2013-01-02 00:00:00',
            '2,2013-01-03 00:00:00', '3,2013-01-04 00:00:00',
            '4,2013-01-05 00:00:00'
        ]
        expected_ymdhms_day = tm.convert_rows_list_to_csv_str(expected_rows)
        assert (df_day.to_csv(
            date_format='%Y-%m-%d %H:%M:%S') == expected_ymdhms_day)

        expected_rows = [
            ',A', '0,2013-01-01', '1,2013-01-01', '2,2013-01-01',
            '3,2013-01-01', '4,2013-01-01'
        ]
        expected_ymd_sec = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df_sec.to_csv(date_format='%Y-%m-%d') == expected_ymd_sec

        expected_rows = [
            ',A', '0,2013-01-01', '1,2013-01-02', '2,2013-01-03',
            '3,2013-01-04', '4,2013-01-05'
        ]
        expected_default_day = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df_day.to_csv() == expected_default_day
        assert df_day.to_csv(date_format='%Y-%m-%d') == expected_default_day

        # see gh-7791
        #
        # Testing if date_format parameter is taken into account
        # for multi-indexed DataFrames.
        df_sec['B'] = 0
        df_sec['C'] = 1

        expected_rows = ['A,B,C', '2013-01-01,0,1']
        expected_ymd_sec = tm.convert_rows_list_to_csv_str(expected_rows)

        df_sec_grouped = df_sec.groupby([pd.Grouper(key='A', freq='1h'), 'B'])
        assert (df_sec_grouped.mean().to_csv(
            date_format='%Y-%m-%d') == expected_ymd_sec)

    def test_to_csv_multi_index(self):
        # see gh-6618
        df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1], [2]]))

        exp_rows = [',1', ',2', '0,1']
        exp = tm.convert_rows_list_to_csv_str(exp_rows)
        assert df.to_csv() == exp

        exp_rows = ['1', '2', '1']
        exp = tm.convert_rows_list_to_csv_str(exp_rows)
        assert df.to_csv(index=False) == exp

        df = DataFrame([1],
                       columns=pd.MultiIndex.from_arrays([[1], [2]]),
                       index=pd.MultiIndex.from_arrays([[1], [2]]))

        exp_rows = [',,1', ',,2', '1,2,1']
        exp = tm.convert_rows_list_to_csv_str(exp_rows)
        assert df.to_csv() == exp

        exp_rows = ['1', '2', '1']
        exp = tm.convert_rows_list_to_csv_str(exp_rows)
        assert df.to_csv(index=False) == exp

        df = DataFrame([1],
                       columns=pd.MultiIndex.from_arrays([['foo'], ['bar']]))

        exp_rows = [',foo', ',bar', '0,1']
        exp = tm.convert_rows_list_to_csv_str(exp_rows)
        assert df.to_csv() == exp

        exp_rows = ['foo', 'bar', '1']
        exp = tm.convert_rows_list_to_csv_str(exp_rows)
        assert df.to_csv(index=False) == exp

    @pytest.mark.parametrize("ind,expected", [
        (pd.MultiIndex(levels=[[1.0]], codes=[[0]],
                       names=["x"]), "x,data\n1.0,1\n"),
        (pd.MultiIndex(levels=[[1.], [2.]], codes=[[0], [0]],
                       names=["x", "y"]), "x,y,data\n1.0,2.0,1\n")
    ])
    @pytest.mark.parametrize("klass", [pd.DataFrame, pd.Series])
    def test_to_csv_single_level_multi_index(self, ind, expected, klass):
        # see gh-19589
        result = klass(pd.Series([1], ind,
                                 name="data")).to_csv(line_terminator="\n",
                                                      header=True)
        assert result == expected

    def test_to_csv_string_array_ascii(self):
        # GH 10813
        str_array = [{'names': ['foo', 'bar']}, {'names': ['baz', 'qux']}]
        df = pd.DataFrame(str_array)
        expected_ascii = '''\
,names
0,"['foo', 'bar']"
1,"['baz', 'qux']"
'''
        with tm.ensure_clean('str_test.csv') as path:
            df.to_csv(path, encoding='ascii')
            with open(path, 'r') as f:
                assert f.read() == expected_ascii

    @pytest.mark.xfail
    def test_to_csv_string_array_utf8(self):
        # GH 10813
        str_array = [{'names': ['foo', 'bar']}, {'names': ['baz', 'qux']}]
        df = pd.DataFrame(str_array)
        expected_utf8 = '''\
,names
0,"[u'foo', u'bar']"
1,"[u'baz', u'qux']"
'''
        with tm.ensure_clean('unicode_test.csv') as path:
            df.to_csv(path, encoding='utf-8')
            with open(path, 'r') as f:
                assert f.read() == expected_utf8

    def test_to_csv_string_with_lf(self):
        # GH 20353
        data = {'int': [1, 2, 3], 'str_lf': ['abc', 'd\nef', 'g\nh\n\ni']}
        df = pd.DataFrame(data)
        with tm.ensure_clean('lf_test.csv') as path:
            # case 1: The default line terminator(=os.linesep)(PR 21406)
            os_linesep = os.linesep.encode('utf-8')
            expected_noarg = (b'int,str_lf' + os_linesep + b'1,abc' +
                              os_linesep + b'2,"d\nef"' + os_linesep +
                              b'3,"g\nh\n\ni"' + os_linesep)
            df.to_csv(path, index=False)
            with open(path, 'rb') as f:
                assert f.read() == expected_noarg
        with tm.ensure_clean('lf_test.csv') as path:
            # case 2: LF as line terminator
            expected_lf = (b'int,str_lf\n'
                           b'1,abc\n'
                           b'2,"d\nef"\n'
                           b'3,"g\nh\n\ni"\n')
            df.to_csv(path, line_terminator='\n', index=False)
            with open(path, 'rb') as f:
                assert f.read() == expected_lf
        with tm.ensure_clean('lf_test.csv') as path:
            # case 3: CRLF as line terminator
            # 'line_terminator' should not change inner element
            expected_crlf = (b'int,str_lf\r\n'
                             b'1,abc\r\n'
                             b'2,"d\nef"\r\n'
                             b'3,"g\nh\n\ni"\r\n')
            df.to_csv(path, line_terminator='\r\n', index=False)
            with open(path, 'rb') as f:
                assert f.read() == expected_crlf

    def test_to_csv_string_with_crlf(self):
        # GH 20353
        data = {
            'int': [1, 2, 3],
            'str_crlf': ['abc', 'd\r\nef', 'g\r\nh\r\n\r\ni']
        }
        df = pd.DataFrame(data)
        with tm.ensure_clean('crlf_test.csv') as path:
            # case 1: The default line terminator(=os.linesep)(PR 21406)
            os_linesep = os.linesep.encode('utf-8')
            expected_noarg = (b'int,str_crlf' + os_linesep + b'1,abc' +
                              os_linesep + b'2,"d\r\nef"' + os_linesep +
                              b'3,"g\r\nh\r\n\r\ni"' + os_linesep)
            df.to_csv(path, index=False)
            with open(path, 'rb') as f:
                assert f.read() == expected_noarg
        with tm.ensure_clean('crlf_test.csv') as path:
            # case 2: LF as line terminator
            expected_lf = (b'int,str_crlf\n'
                           b'1,abc\n'
                           b'2,"d\r\nef"\n'
                           b'3,"g\r\nh\r\n\r\ni"\n')
            df.to_csv(path, line_terminator='\n', index=False)
            with open(path, 'rb') as f:
                assert f.read() == expected_lf
        with tm.ensure_clean('crlf_test.csv') as path:
            # case 3: CRLF as line terminator
            # 'line_terminator' should not change inner element
            expected_crlf = (b'int,str_crlf\r\n'
                             b'1,abc\r\n'
                             b'2,"d\r\nef"\r\n'
                             b'3,"g\r\nh\r\n\r\ni"\r\n')
            df.to_csv(path, line_terminator='\r\n', index=False)
            with open(path, 'rb') as f:
                assert f.read() == expected_crlf

    def test_to_csv_stdout_file(self, capsys):
        # GH 21561
        df = pd.DataFrame([['foo', 'bar'], ['baz', 'qux']],
                          columns=['name_1', 'name_2'])
        expected_rows = [',name_1,name_2', '0,foo,bar', '1,baz,qux']
        expected_ascii = tm.convert_rows_list_to_csv_str(expected_rows)

        df.to_csv(sys.stdout, encoding='ascii')
        captured = capsys.readouterr()

        assert captured.out == expected_ascii
        assert not sys.stdout.closed

    @pytest.mark.xfail(
        compat.is_platform_windows(),
        reason=("Especially in Windows, file stream should not be passed"
                "to csv writer without newline='' option."
                "(https://docs.python.org/3.6/library/csv.html#csv.writer)"))
    def test_to_csv_write_to_open_file(self):
        # GH 21696
        df = pd.DataFrame({'a': ['x', 'y', 'z']})
        expected = '''\
manual header
x
y
z
'''
        with tm.ensure_clean('test.txt') as path:
            with open(path, 'w') as f:
                f.write('manual header\n')
                df.to_csv(f, header=None, index=None)
            with open(path, 'r') as f:
                assert f.read() == expected

    @pytest.mark.skipif(compat.PY2, reason="Test case for python3")
    def test_to_csv_write_to_open_file_with_newline_py3(self):
        # see gh-21696
        # see gh-20353
        df = pd.DataFrame({'a': ['x', 'y', 'z']})
        expected_rows = ["x", "y", "z"]
        expected = ("manual header\n" +
                    tm.convert_rows_list_to_csv_str(expected_rows))
        with tm.ensure_clean('test.txt') as path:
            with open(path, 'w', newline='') as f:
                f.write('manual header\n')
                df.to_csv(f, header=None, index=None)

            with open(path, 'rb') as f:
                assert f.read() == bytes(expected, 'utf-8')

    @pytest.mark.skipif(compat.PY3, reason="Test case for python2")
    def test_to_csv_write_to_open_file_with_newline_py2(self):
        # see gh-21696
        # see gh-20353
        df = pd.DataFrame({'a': ['x', 'y', 'z']})
        expected_rows = ["x", "y", "z"]
        expected = ("manual header\n" +
                    tm.convert_rows_list_to_csv_str(expected_rows))
        with tm.ensure_clean('test.txt') as path:
            with open(path, 'wb') as f:
                f.write('manual header\n')
                df.to_csv(f, header=None, index=None)

            with open(path, 'rb') as f:
                assert f.read() == expected

    @pytest.mark.parametrize("to_infer", [True, False])
    @pytest.mark.parametrize("read_infer", [True, False])
    def test_to_csv_compression(self, compression_only, read_infer, to_infer):
        # see gh-15008
        compression = compression_only

        if compression == "zip":
            pytest.skip("{compression} is not supported "
                        "for to_csv".format(compression=compression))

        # We'll complete file extension subsequently.
        filename = "test."

        if compression == "gzip":
            filename += "gz"
        else:
            # xz --> .xz
            # bz2 --> .bz2
            filename += compression

        df = DataFrame({"A": [1]})

        to_compression = "infer" if to_infer else compression
        read_compression = "infer" if read_infer else compression

        with tm.ensure_clean(filename) as path:
            df.to_csv(path, compression=to_compression)
            result = pd.read_csv(path,
                                 index_col=0,
                                 compression=read_compression)
            tm.assert_frame_equal(result, df)
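For reference, the dict form of compression exercised in the tests above behaves the same way outside pytest. A minimal sketch, assuming pandas >= 0.25 for the dict form with archive_name:

import os
import tempfile
from zipfile import ZipFile

import pandas as pd

df = pd.DataFrame({"A": [1, 2, 3]})
with tempfile.TemporaryDirectory() as tmp:
    path = os.path.join(tmp, "data.zip")
    # archive_name controls the file name stored inside the zip archive
    df.to_csv(path, compression={"method": "zip", "archive_name": "data.csv"})
    with ZipFile(path) as zp:
        assert zp.namelist() == ["data.csv"]
    # read_csv infers zip compression from the .zip extension
    roundtrip = pd.read_csv(path, index_col=0)
    pd.testing.assert_frame_equal(roundtrip, df)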
Example no. 13
def table_model_regions(path, year=2014):
    table = pd.DataFrame(
        columns=pd.MultiIndex(levels=[[], []], codes=[[], []]))

    # table = pd.read_excel(
    #     os.path.join(path, 'kennzahlen_modellregionen' + '.xlsx'),
    #     index_col=[0], header=[0, 1])

    # inhabitants
    ew = inhabitants.get_ew_by_federal_states(year)
    ew_bln = friedrichshagen.calculate_inhabitants_districts(year)['EW'].sum()
    fhg_ew = friedrichshagen.calculate_inhabitants_friedrichshagen(year)
    ew_de01 = deflex.inhabitants.get_ew_by_deflex(2014, rmap='de21')['DE01']

    # electricity_demand
    fhg_elec = friedrichshagen.calculate_elec_demand_friedrichshagen(
        year).sum()
    bln_share = deflex.demand.openego_demand_share()['DE22']
    de01_share = deflex.demand.openego_demand_share()[['DE22', 'DE01']].sum()
    bln_usage = berlin_hp.electricity.get_electricity_demand(year).sum()[
        'usage']
    de_demand = bmwi.get_annual_electricity_demand_bmwi(2014) * 1000

    # heat demand
    bln_heat = berlin_hp.heat.create_heat_profiles(2014).sum().sum() / 1000
    fhg_heat = berlin_hp.heat.create_heat_profiles(
        2014, region=90517).sum().sum() / 1000
    heat_states = deflex.demand.get_heat_profiles_by_state(2014).groupby(
        level=0, axis=1).sum().sum().div(3.6)
    de01_heat = deflex.demand.get_heat_profiles_deflex(
        2014, separate_regions=['DE01'])['DE01'].sum().sum() / 1000

    sec = 'Bevölkerung'
    table.loc[sec, ('Berlin (deflex)', 'absolut')] = int(ew['BE'])
    table.loc[sec, ('Berlin', 'absolut')] = ew_bln
    table.loc[sec, ('Modellregion', 'absolut')] = int(fhg_ew)
    table.loc[sec, ('Deutschland', 'absolut')] = int(ew.sum())
    table.loc[sec, ('DE01 (de21)', 'absolut')] = int(ew_de01)

    sec = 'Strombedarf [GWh]'
    table.loc[sec, ('Berlin (deflex)', 'absolut')] = int(bln_share * de_demand)
    table.loc[sec, ('Berlin', 'absolut')] = bln_usage
    table.loc[sec, ('Modellregion', 'absolut')] = int(fhg_elec.sum())
    table.loc[sec, ('Deutschland', 'absolut')] = int(de_demand)
    table.loc[sec, ('DE01 (de21)', 'absolut')] = int(de01_share * de_demand)

    sec = 'Wärmebedarf [GWh]'
    table.loc[sec, ('Berlin (deflex)', 'absolut')] = int(heat_states['BE'])
    table.loc[sec, ('Berlin', 'absolut')] = int(bln_heat)
    table.loc[sec, ('Modellregion', 'absolut')] = int(fhg_heat)
    table.loc[sec, ('Deutschland', 'absolut')] = int(heat_states.sum())
    table.loc[sec, ('DE01 (de21)', 'absolut')] = int(de01_heat)

    for c in table.columns.get_level_values(0).unique():
        table[c, '%'] = round(table[c, 'absolut'].div(
                table['Deutschland', 'absolut']).multiply(100), 2)

    table = table[['Modellregion', 'Berlin', 'Berlin (deflex)', 'DE01 (de21)',
                   'Deutschland']]
    print(table)
    table.to_csv(os.path.join(path, 'kennzahlen_modellregionen' + '.csv'))
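The final loop above is a general pattern for two-level columns: divide each top-level group's 'absolut' column by a reference column and store the result under a new second-level key. A standalone sketch of just that pattern, with made-up numbers instead of the model data:

import pandas as pd

cols = pd.MultiIndex.from_product([['Modellregion', 'Deutschland'], ['absolut']])
tbl = pd.DataFrame([[25, 100], [10, 40]], columns=cols, index=['Strom', 'Waerme'])

for c in tbl.columns.get_level_values(0).unique():
    tbl[c, '%'] = round(tbl[c, 'absolut'].div(
        tbl['Deutschland', 'absolut']).multiply(100), 2)

print(tbl.sort_index(axis=1))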
def delete_multiindex_for_missing_conditions(missing_combos, data_index):
    """Return a new MultiIndex with the specified missing conditions removed.

    `missing_combos` is a list of tuples of the form
    (level 1 name, item missing from level 1, level 2 name, item missing
    from level 2, ...). For instance, if strain ABC is missing technical
    replicate 3, you would pass ("strain", "ABC", "tech_rep", "TR3").
    """
    # This could be replaced with something like:
    #
    # data_index_df = data_index.to_frame()
    # data_index_df.drop(('WT', '0.5x', 'TR1'), inplace=True)
    # data_index_df.drop(('WT', '1x', 'TR2'), inplace=True)
    # data_index_adjusted = pd.MultiIndex.from_frame(data_index_df)
    #
    # data_index_adjusted, inds_to_remove = \
    #     plate_reader_tools.delete_multiindex_for_missing_conditions(
    #         missing_combos, data_index)

    inds_to_remove = []
    for missing_combo in missing_combos:
        layers_involved = [
            missing_combo[jj]
            for jj in 2 * np.array(range(int(len(missing_combo) / 2)))
        ]
        layers_involved_inds = []
        for layer in layers_involved:
            layers_involved_inds.append([
                i for i, name in enumerate(data_index.names) if name == layer
            ][0])
        missing_layers = [
            missing_combo[jj]
            for jj in 2 * np.array(range(int(len(missing_combo) / 2))) + 1
        ]
        missing_layers_inds = []
        for jj, layer in enumerate(missing_layers):
            missing_layers_inds.append([
                i for i, name in enumerate(data_index.levels[
                    layers_involved_inds[jj]]) if name == layer
            ][0])

        combined_labels = zip(
            *[data_index.labels[jj] for jj in layers_involved_inds])

        test_label = tuple(missing_layers_inds)
        inds_to_remove_for_missing_combo = []
        for ii, label in enumerate(combined_labels):
            if label == test_label:
                inds_to_remove_for_missing_combo.append(ii)

        inds_to_remove.append(inds_to_remove_for_missing_combo)

    #flatten out list of indices and remove duplicates.
    inds_to_remove = list(set(chain.from_iterable(inds_to_remove)))

    new_labels = [(np.delete(label_level, inds_to_remove))
                  for label_level in data_index.labels]
    data_index_adjusted = pd.MultiIndex(levels=data_index.levels,
                                        labels=new_labels,
                                        names=data_index.names)

    return data_index_adjusted, inds_to_remove
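A hedged usage sketch for the function above. It relies on MultiIndex.labels, so it assumes an older pandas (roughly < 1.0) where that attribute still exists, and it assumes numpy, pandas and itertools.chain are imported at module level as the function expects:

import numpy as np
import pandas as pd
from itertools import chain  # used inside delete_multiindex_for_missing_conditions

data_index = pd.MultiIndex.from_product(
    [['WT', 'ABC'], ['0.5x', '1x'], ['TR1', 'TR2', 'TR3']],
    names=['strain', 'concentration', 'tech_rep'])

# strain ABC is missing technical replicate 3
missing_combos = [('strain', 'ABC', 'tech_rep', 'TR3')]

new_index, removed = delete_multiindex_for_missing_conditions(
    missing_combos, data_index)
print(len(data_index), '->', len(new_index))  # 12 -> 10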
Example no. 15
    def add(self, data, header, row=None, subheader=None):
        """Filter `data` by arguments of this SummaryStats instance,
        then apply `pd.describe()` and format the statistics

        Parameters
        ----------
        data : pd.DataFrame or pd.Series
            data for which summary statistics should be computed
        header : str
            column name for descriptive statistics
        row : str
            row name for descriptive statistics
            (required if `pyam.Statistics(rows=True)`)
        subheader : str, optional
            column name (level=1) if data is an unnamed `pd.Series`
        """
        # verify validity of specifications
        if self.rows is not None and row is None:
            raise ValueError('row specification required')
        if self.rows is None and row is not None:
            raise ValueError('row arg illegal for this `Statistics` instance')
        if isinstance(data, pd.Series):
            if subheader is not None:
                data.name = subheader
            elif data.name is None:
                msg = '`data` must be named `pd.Series` or provide `subheader`'
                raise ValueError(msg)
            data = pd.DataFrame(data)

        if self.rows is not None and row not in self.rows:
            self.rows.append(row)

        _stats = None

        # describe with groupby feature
        if self.groupby is not None:
            filter_args = dict(data=data, df=self.df, join_meta=True)
            filter_args.update(self.groupby)
            _stats = (filter_by_meta(**filter_args).groupby(
                self.col).describe(percentiles=self.percentiles))
            _stats = pd.concat([_stats], keys=[self.col], names=[''], axis=0)
            if self.rows:
                _stats['row'] = row
                _stats.set_index('row', append=True, inplace=True)
            _stats.index.names = [''] * 3 if self.rows else [''] * 2

        # describe with filter feature
        for (idx, _filter) in self.filters:
            filter_args = dict(data=data, df=self.df)
            filter_args.update(_filter)
            _stats_f = (filter_by_meta(**filter_args).describe(
                percentiles=self.percentiles))
            _stats_f = pd.DataFrame(_stats_f.unstack()).T
            if self.idx_depth == 1:
                levels = [[idx]]
            else:
                levels = [[idx[0]], [idx[1]]]
            lvls, lbls = (levels, [[0]] * self.idx_depth) if not self.rows \
                else (levels + [[row]], [[0]] * (self.idx_depth + 1))
            _stats_f.index = pd.MultiIndex(levels=lvls, labels=lbls)
            _stats = _stats_f if _stats is None else _stats.append(_stats_f)

        # add header
        _stats = pd.concat([_stats], keys=[header], names=[''], axis=1)
        subheader = _stats.columns.get_level_values(1).unique()
        self._add_to_header(header, subheader)

        # set statistics
        if self.stats is None:
            self.stats = _stats
        else:
            self.stats = _stats.combine_first(self.stats)
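Stripped of the pyam-specific filtering, the core of the method above is a describe-then-relabel pattern: compute descriptive statistics, flatten them into a single row, and push the header in as an extra column level. A minimal plain-pandas sketch of that pattern (all names here are illustrative):

import pandas as pd

data = pd.DataFrame({'value': [1.0, 2.0, 3.0, 4.0]})

_stats = data.describe(percentiles=[0.25, 0.75])
_stats = pd.DataFrame(_stats.unstack()).T   # one row of statistics
_stats.index = pd.Index(['my_row'])
_stats = pd.concat([_stats], keys=['my_header'], names=[''], axis=1)

# level 1 now holds the sub-headers, mirroring what _add_to_header receives
print(_stats.columns.get_level_values(1).unique())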
Example no. 16
    def read_daily_engagement_insights_into_df(self, year, month):
        df_page = pd.DataFrame(
            columns=pd.MultiIndex(levels=[[], []], labels=[[], []]))

        # The number of people who engaged with your Page. Engagement includes any click or story created.
        # (Unique Users). Daily
        engaged_users = self.daily_insights_for_month("page_engaged_users",
                                                      year, month)
        end_time, value = list(), list()

        for item in engaged_users["data"][0]["values"]:
            end_time.append(item["end_time"][:10])
            try:
                value.append(item["value"])
            except KeyError:
                value.append(0)

        df_page["engaged_users", "value"] = pd.Series(data=value,
                                                      index=end_time,
                                                      name="value")

        # The number of people who clicked on any of your content, by type. Stories that are created without clicking
        # on Page content (e.g., liking the Page from timeline) are not included. (Unique Users). Daily
        by_consumption_type_unique = self.daily_insights_for_month(
            "page_consumptions_by_consumption_type_unique", year, month)
        end_time, video_play, other_clicks, photo_view, link_clicks = (
            list(),
            list(),
            list(),
            list(),
            list(),
        )

        for item in by_consumption_type_unique["data"][0]["values"]:
            end_time.append(item["end_time"][:10])
            try:
                video_play.append(item["value"]["video play"])
            except KeyError:
                video_play.append(0)
            try:
                other_clicks.append(item["value"]["other clicks"])
            except KeyError:
                other_clicks.append(0)
            try:
                photo_view.append(item["value"]["photo view"])
            except KeyError:
                photo_view.append(0)
            try:
                link_clicks.append(item["value"]["link clicks"])
            except KeyError:
                link_clicks.append(0)

        df_page["consumptions_by_type_unique",
                "video_play"] = pd.Series(data=video_play,
                                          index=end_time,
                                          name="video_play")
        df_page["consumptions_by_type_unique",
                "other_clicks"] = pd.Series(data=other_clicks,
                                            index=end_time,
                                            name="other_clicks")
        df_page["consumptions_by_type_unique",
                "photo_view"] = pd.Series(data=photo_view,
                                          index=end_time,
                                          name="photo_view")
        df_page["consumptions_by_type_unique",
                "link_clicks"] = pd.Series(data=link_clicks,
                                           index=end_time,
                                           name="link_clicks")

        # The number of clicks on any of your content, by type. Stories generated without clicks on page content
        # (e.g., liking the page in Timeline) are not included. (Total Count). Daily
        by_consumption_type = self.daily_insights_for_month(
            "page_consumptions_by_consumption_type", year, month)
        end_time, video_play, other_clicks, photo_view, link_clicks = (
            list(),
            list(),
            list(),
            list(),
            list(),
        )

        for item in by_consumption_type["data"][0]["values"]:
            end_time.append(item["end_time"][:10])
            try:
                video_play.append(item["value"]["video play"])
            except KeyError:
                video_play.append(0)
            try:
                other_clicks.append(item["value"]["other clicks"])
            except KeyError:
                other_clicks.append(0)
            try:
                photo_view.append(item["value"]["photo view"])
            except KeyError:
                photo_view.append(0)
            try:
                link_clicks.append(item["value"]["link clicks"])
            except KeyError:
                link_clicks.append(0)

        df_page["consumptions_by_type",
                "video_play"] = pd.Series(data=video_play,
                                          index=end_time,
                                          name="video_play")
        df_page["consumptions_by_type",
                "other_clicks"] = pd.Series(data=other_clicks,
                                            index=end_time,
                                            name="other_clicks")
        df_page["consumptions_by_type",
                "photo_view"] = pd.Series(data=photo_view,
                                          index=end_time,
                                          name="photo_view")
        df_page["consumptions_by_type",
                "link_clicks"] = pd.Series(data=link_clicks,
                                           index=end_time,
                                           name="link_clicks")

        # Total check-ins at your Place (Unique Users). Daily
        places_checkin = self.daily_insights_for_month(
            "page_places_checkin_total_unique", year, month)
        end_time, value = list(), list()

        for item in places_checkin["data"][0]["values"]:
            end_time.append(item["end_time"][:10])
            try:
                value.append(item["value"])
            except KeyError:
                value.append(0)

        df_page["places_checkin", "value"] = pd.Series(data=value,
                                                       index=end_time,
                                                       name="value")

        # The number of people who have given negative feedback to your Page, by type. (Unique Users). Daily
        negative_feedback_by_type_unique = self.daily_insights_for_month(
            "page_negative_feedback_by_type_unique", year, month)
        end_time, hide_all_clicks, hide_clicks, unlike_page_clicks = (
            list(),
            list(),
            list(),
            list(),
        )
        report_spam_clicks = list()

        for item in negative_feedback_by_type_unique["data"][0]["values"]:
            end_time.append(item["end_time"][:10])
            try:
                hide_all_clicks.append(item["value"]["hide_all_clicks"])
            except KeyError:
                hide_all_clicks.append(0)
            try:
                hide_clicks.append(item["value"]["hide_clicks"])
            except KeyError:
                hide_clicks.append(0)
            try:
                unlike_page_clicks.append(item["value"]["unlike_page_clicks"])
            except KeyError:
                unlike_page_clicks.append(0)
            try:
                report_spam_clicks.append(item["value"]["report_spam_clicks"])
            except KeyError:
                report_spam_clicks.append(0)

        df_page["negative_feedback_by_type_unique",
                "hide_all_clicks"] = pd.Series(data=hide_all_clicks,
                                               index=end_time,
                                               name="hide_all_clicks")
        df_page["negative_feedback_by_type_unique",
                "hide_clicks"] = pd.Series(data=hide_clicks,
                                           index=end_time,
                                           name="hide_clicks")
        df_page["negative_feedback_by_type_unique",
                "unlike_page_clicks"] = pd.Series(data=unlike_page_clicks,
                                                  index=end_time,
                                                  name="unlike_page_clicks")
        df_page["negative_feedback_by_type_unique",
                "report_spam_clicks"] = pd.Series(data=report_spam_clicks,
                                                  index=end_time,
                                                  name="report_spam_clicks")

        # The number of times people have given negative feedback to your Page, by type. (Total Count). Daily
        negative_feedback_by_type = self.daily_insights_for_month(
            "page_negative_feedback_by_type", year, month)
        end_time, hide_all_clicks, hide_clicks, unlike_page_clicks = (
            list(),
            list(),
            list(),
            list(),
        )
        report_spam_clicks = list()

        for item in negative_feedback_by_type["data"][0]["values"]:
            end_time.append(item["end_time"][:10])
            try:
                hide_all_clicks.append(item["value"]["hide_all_clicks"])
            except KeyError:
                hide_all_clicks.append(0)
            try:
                hide_clicks.append(item["value"]["hide_clicks"])
            except KeyError:
                hide_clicks.append(0)
            try:
                unlike_page_clicks.append(item["value"]["unlike_page_clicks"])
            except KeyError:
                unlike_page_clicks.append(0)
            try:
                report_spam_clicks.append(item["value"]["report_spam_clicks"])
            except KeyError:
                report_spam_clicks.append(0)

        df_page["negative_feedback_by_type",
                "hide_all_clicks"] = pd.Series(data=hide_all_clicks,
                                               index=end_time,
                                               name="hide_all_clicks")
        df_page["negative_feedback_by_type",
                "hide_clicks"] = pd.Series(data=hide_clicks,
                                           index=end_time,
                                           name="hide_clicks")
        df_page["negative_feedback_by_type",
                "unlike_page_clicks"] = pd.Series(data=unlike_page_clicks,
                                                  index=end_time,
                                                  name="unlike_page_clicks")
        df_page["negative_feedback_by_type",
                "report_spam_clicks"] = pd.Series(data=report_spam_clicks,
                                                  index=end_time,
                                                  name="report_spam_clicks")

        # The number of times people have given positive feedback to your Page, by type. (Unique Users). Daily
        positive_feedback_by_type_unique = self.daily_insights_for_month(
            "page_positive_feedback_by_type_unique", year, month)
        end_time, link, like, comment, other = (list(), list(), list(),
                                                list(), list())

        for item in positive_feedback_by_type_unique["data"][0]["values"]:
            end_time.append(item["end_time"][:10])
            try:
                link.append(item["value"]["link"])
            except KeyError:
                link.append(0)
            try:
                like.append(item["value"]["like"])
            except KeyError:
                like.append(0)
            try:
                comment.append(item["value"]["comment"])
            except KeyError:
                comment.append(0)
            try:
                other.append(item["value"]["other"])
            except KeyError:
                other.append(0)

        df_page["positive_feedback_by_type_unique",
                "link"] = pd.Series(data=link, index=end_time, name="link")
        df_page["positive_feedback_by_type_unique",
                "like"] = pd.Series(data=like, index=end_time, name="like")
        df_page["positive_feedback_by_type_unique",
                "comment"] = pd.Series(data=comment,
                                       index=end_time,
                                       name="comment")
        df_page["positive_feedback_by_type_unique",
                "other"] = pd.Series(data=other, index=end_time, name="other")

        # The number of times people have given positive feedback to your Page, by type. (Total Count). Daily
        positive_feedback_by_type = self.daily_insights_for_month(
            "page_positive_feedback_by_type", year, month)
        end_time, link, like, comment, other = (list(), list(), list(),
                                                list(), list())

        for item in positive_feedback_by_type["data"][0]["values"]:
            end_time.append(item["end_time"][:10])
            try:
                link.append(item["value"]["link"])
            except KeyError:
                link.append(0)
            try:
                like.append(item["value"]["like"])
            except KeyError:
                like.append(0)
            try:
                comment.append(item["value"]["comment"])
            except KeyError:
                comment.append(0)
            try:
                other.append(item["value"]["other"])
            except KeyError:
                other.append(0)

        df_page["positive_feedback_by_type",
                "link"] = pd.Series(data=link, index=end_time, name="link")
        df_page["positive_feedback_by_type",
                "like"] = pd.Series(data=like, index=end_time, name="like")
        df_page["positive_feedback_by_type",
                "comment"] = pd.Series(data=comment,
                                       index=end_time,
                                       name="comment")
        df_page["positive_feedback_by_type",
                "other"] = pd.Series(data=other, index=end_time, name="other")

        return df_page
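All of the try/except blocks above extract one key per day from an insights payload. A small helper along these lines (a sketch, not part of the original class) would cover both the single-value metrics and the by-type metrics:

import pandas as pd


def metric_to_series(insight, key=None, name="value"):
    """Turn one insights response into a date-indexed Series.

    `insight` is expected to look like the payloads returned by
    daily_insights_for_month above; missing keys default to 0.
    """
    end_time, values = [], []
    for item in insight["data"][0]["values"]:
        end_time.append(item["end_time"][:10])
        value = item.get("value", 0)
        if key is not None and isinstance(value, dict):
            value = value.get(key, 0)
        values.append(value)
    return pd.Series(values, index=end_time, name=name)

# e.g. df_page["consumptions_by_type", "video_play"] = metric_to_series(
#          by_consumption_type, key="video play", name="video_play")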


import pandas as pd
import numpy as np

# multiindex rows
frame1 = pd.DataFrame(data=np.random.randint(0, high=10, size=(4, 2)), columns=['a', 'b'], index=pd.MultiIndex([['s', 'd'], [2, 3]], [[0, 0, 1, 1], [0, 1, 0, 1]]))
print(frame1) #line 6

# multiindex columns
frame2 = pd.DataFrame(np.random.random((4, 4)))
frame2.columns = pd.MultiIndex.from_product([[1, 2], [1, 'B']])
print(frame2) # line 11
Example no. 18
    def read_daily_reactions_insights_into_df(self, year, month):

        df_page = pd.DataFrame(
            columns=pd.MultiIndex(levels=[[], []], labels=[[], []]))

        # Total post like reactions of a page. Daily
        reactions_like_total = self.daily_insights_for_month(
            "page_actions_post_reactions_like_total", year, month)
        end_time, value = list(), list()

        for item in reactions_like_total["data"][0]["values"]:
            end_time.append(item["end_time"][:10])
            try:
                value.append(item["value"])
            except KeyError:
                value.append(0)

        df_page["reactions_like", "value"] = pd.Series(data=value,
                                                       index=end_time,
                                                       name="value")

        # Total post love reactions of a page. Daily
        reactions_love_total = self.daily_insights_for_month(
            "page_actions_post_reactions_love_total", year, month)
        end_time, value = list(), list()

        for item in reactions_love_total["data"][0]["values"]:
            end_time.append(item["end_time"][:10])
            try:
                value.append(item["value"])
            except KeyError:
                value.append(0)

        df_page["reactions_love", "value"] = pd.Series(data=value,
                                                       index=end_time,
                                                       name="value")

        # Total post wow reactions of a page. Daily
        reactions_wow_total = self.daily_insights_for_month(
            "page_actions_post_reactions_wow_total", year, month)
        end_time, value = list(), list()

        for item in reactions_wow_total["data"][0]["values"]:
            end_time.append(item["end_time"][:10])
            try:
                value.append(item["value"])
            except KeyError:
                value.append(0)

        df_page["reactions_wow", "value"] = pd.Series(data=value,
                                                      index=end_time,
                                                      name="value")

        # Total post haha reactions of a page. Daily
        reactions_haha_total = self.daily_insights_for_month(
            "page_actions_post_reactions_haha_total", year, month)
        end_time, value = list(), list()

        for item in reactions_haha_total["data"][0]["values"]:
            end_time.append(item["end_time"][:10])
            try:
                value.append(item["value"])
            except KeyError:
                value.append(0)

        df_page["reactions_haha", "value"] = pd.Series(data=value,
                                                       index=end_time,
                                                       name="value")

        # Total post sorry reactions of a page. Daily
        reactions_sorry_total = self.daily_insights_for_month(
            "page_actions_post_reactions_sorry_total", year, month)
        end_time, value = list(), list()

        for item in reactions_sorry_total["data"][0]["values"]:
            end_time.append(item["end_time"][:10])
            try:
                value.append(item["value"])
            except KeyError:
                value.append(0)

        df_page["reactions_sorry", "value"] = pd.Series(data=value,
                                                        index=end_time,
                                                        name="value")

        # Total post anger reactions of a page. Daily
        reactions_anger_total = self.daily_insights_for_month(
            "page_actions_post_reactions_anger_total", year, month)
        end_time, value = list(), list()

        for item in reactions_anger_total["data"][0]["values"]:
            end_time.append(item["end_time"][:10])
            try:
                value.append(item["value"])
            except KeyError:
                value.append(0)

        df_page["reactions_anger", "value"] = pd.Series(data=value,
                                                        index=end_time,
                                                        name="value")

        return df_page
Example no. 19
def _nonempty_index(idx):
    typ = type(idx)
    if typ is pd.RangeIndex:
        return pd.RangeIndex(2, name=idx.name)
    elif typ in _numeric_index_types:
        return typ([1, 2], name=idx.name)
    elif typ is pd.Index:
        return pd.Index(["a", "b"], name=idx.name)
    elif typ is pd.DatetimeIndex:
        start = "1970-01-01"
        # Need a non-monotonic decreasing index to avoid issues with
        # partial string indexing see https://github.com/dask/dask/issues/2389
        # and https://github.com/pandas-dev/pandas/issues/16515
        # This doesn't mean `_meta_nonempty` should ever rely on
        # `self.monotonic_increasing` or `self.monotonic_decreasing`
        try:
            return pd.date_range(start=start,
                                 periods=2,
                                 freq=idx.freq,
                                 tz=idx.tz,
                                 name=idx.name)
        except ValueError:  # older pandas versions
            data = [start, "1970-01-02"] if idx.freq is None else None
            return pd.DatetimeIndex(data,
                                    start=start,
                                    periods=2,
                                    freq=idx.freq,
                                    tz=idx.tz,
                                    name=idx.name)
    elif typ is pd.PeriodIndex:
        return pd.period_range(start="1970-01-01",
                               periods=2,
                               freq=idx.freq,
                               name=idx.name)
    elif typ is pd.TimedeltaIndex:
        start = np.timedelta64(1, "D")
        try:
            return pd.timedelta_range(start=start,
                                      periods=2,
                                      freq=idx.freq,
                                      name=idx.name)
        except ValueError:  # older pandas versions
            start = np.timedelta64(1, "D")
            data = [start, start + 1] if idx.freq is None else None
            return pd.TimedeltaIndex(data,
                                     start=start,
                                     periods=2,
                                     freq=idx.freq,
                                     name=idx.name)
    elif typ is pd.CategoricalIndex:
        if len(idx.categories) == 0:
            data = pd.Categorical(_nonempty_index(idx.categories),
                                  ordered=idx.ordered)
        else:
            data = pd.Categorical.from_codes([-1, 0],
                                             categories=idx.categories,
                                             ordered=idx.ordered)
        return pd.CategoricalIndex(data, name=idx.name)
    elif typ is pd.MultiIndex:
        levels = [_nonempty_index(l) for l in idx.levels]
        codes = [[0, 0] for i in idx.levels]
        try:
            return pd.MultiIndex(levels=levels, codes=codes, names=idx.names)
        except TypeError:  # older pandas versions
            return pd.MultiIndex(levels=levels, labels=codes, names=idx.names)

    raise TypeError("Don't know how to handle index of "
                    "type {0}".format(typename(type(idx))))
Example no. 20
def test_groupby_aggregate_empty_key_empty_return():
    # GH: 32580 Check if everything works, when return is empty
    df = pd.DataFrame({"a": [1, 1, 2], "b": [1, 2, 3], "c": [1, 2, 4]})
    result = df.groupby("a").agg({"b": []})
    expected = pd.DataFrame(columns=pd.MultiIndex(levels=[["b"], []], codes=[[], []]))
    tm.assert_frame_equal(result, expected)
Example no. 21
index

s = pd.Series(np.random.randn(8), index=index)
s

#!!! BUT
pd.Series(np.random.randn(8), index=tuples)

#%% .from_product()
iterables = [['bar', 'baz', 'foo', 'qux'], ['one', 'two']]
pd.MultiIndex.from_product(iterables, names=['first', 'second'])

pd.MultiIndex.from_product([range(2), range(3), range(3)], names=['d', 'p', 'q'])

#%% 'directly'
midx = pd.MultiIndex(levels=[['zero', 'one'], ['x', 'y']],
                     codes=[[1, 1, 0, 0], [1, 0, 1, 0]])
midx
#%%
midx = pd.MultiIndex(levels=[['zero', 'one'], ['x', 'y']],
                     codes=[[1, 1, 0, 0, 1], [1, 0, 1, 0, 1]])
midx

#%%
index.names
s.index.names
df.index.names

index.levels

dir(index)  # ! a lot !
Example no. 22
"""
City   :  Institute
Delhi   : AIIT
Delhi   : ABS
Chandigarh : AIIT
Chandigarh  : ABS
Chandigarh  : ALS
"""

city = ['Delhi', 'Delhi', 'Chandigarh', 'Chandigarh', 'Chandigarh']
institute = ['AIIT', 'ABS', 'AIIT', 'ABS', 'ALS']
len(city), len(institute)
#------
dataForIndex = pd.DataFrame({'city':['Delhi','Delhi','Chandigarh', 'Chandigarh','Chandigarh'], 'institute':['AIIT', 'ABS', 'AIIT', 'ABS', 'ALS']})
dataForIndex
indexFromDF = pd.MultiIndex.from_frame(dataForIndex)  # one entry per (city, institute) row
midx = pd.MultiIndex(levels=[['Delhi', 'Chandigarh'], ['AIIT', 'ABS', 'ALS']],codes=[[0,0,1,1,1], [0, 1, 0, 1, 2]])
midx
d = [[100,120],[75,70],[120,105],[90,65],[80,55]]
d
df = pd.DataFrame(index=midx, columns=['Male', 'Female'], data=d)
df
df.index
df.drop(index='ABS')
df.drop(columns=['Male'], axis=1)
df.drop(index=['Delhi'], axis=0)
df.drop(index='ABS',level=1,axis=0)
df.drop(index='Delhi',level=0,axis=0)
df.drop(index=['Delhi','Chandigarh'],level=1,axis=0)  #no effect
df.drop(index=['Delhi','Chandigarh'],level=0,axis=0)  #no data
df.drop(index=['AIIT'],level=1,axis=0)  #no data
Example no. 23
 def test_make_index_str_multiple_levels(self):
     idx = self.sel.make_index('/[foo,bar]/[0:3]')
     assert_index_equal(idx, pd.MultiIndex(levels=[['bar', 'foo'],
                                                   [0, 1, 2]],
                                           labels=[[1, 1, 1, 0, 0, 0],
                                                   [0, 1, 2, 0, 1, 2]]))
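For reference, the labels= keyword used in this expected index was renamed to codes= in pandas 0.24 (and labels= was removed later), so on newer pandas the same index would be built as:

import pandas as pd

pd.MultiIndex(levels=[['bar', 'foo'], [0, 1, 2]],
              codes=[[1, 1, 1, 0, 0, 0], [0, 1, 2, 0, 1, 2]])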
Example no. 24
        levels=[[1, 3, 4, 5], [1, 2, 5]],
        codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]],
        names=["x", "y"],
    )
    pmidx = midx.to_pandas()

    assert_eq(midx.values_host, pmidx.values)


@pytest.mark.parametrize(
    "pdi, fill_value, expected",
    [
        (
            pd.MultiIndex(
                levels=[[1, 3, 4, None], [1, 2, 5]],
                codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]],
                names=["x", "y"],
            ),
            5,
            pd.MultiIndex(
                levels=[[1, 3, 4, 5], [1, 2, 5]],
                codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]],
                names=["x", "y"],
            ),
        ),
        (
            pd.MultiIndex(
                levels=[[1, 3, 4, None], [1, None, 5]],
                codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]],
                names=["x", "y"],
            ),
Esempio n. 25
0
 def test_make_index_list_multiple_levels(self):
     idx = self.sel.make_index([[['foo', 'bar'], slice(0, 3)]])
     assert_index_equal(idx, pd.MultiIndex(levels=[['bar', 'foo'],
                                                   [0, 1, 2]],
                                           labels=[[1, 1, 1, 0, 0, 0],
                                                   [0, 1, 2, 0, 1, 2]]))
Esempio n. 26
0
# Creating a MultiIndex
print('Creating a MultiIndex')
df = pd.DataFrame(np.random.rand(6, 3),
                  index=[['a', 'a', 'b', 'b', 'c', 'c'], [1, 2, 1, 2, 1, 2]],
                  columns=['c1', 'c2', 'c3'])
print(df)
print(
    pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b', 'c', 'c'],
                               [1, 2, 1, 2, 1, 2]]))
print(
    pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2),
                               ('c', 1), ('c', 2)]))
print(pd.MultiIndex.from_product([['a', 'b', 'c'], [1, 2]]))
print(
    pd.MultiIndex(levels=[['a', 'b', 'c'], [1, 2]],
                  codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]]))
print()

population.index.names = ['행정구역', '년도']  # 'administrative district', 'year'; `population` is built earlier in the original script
print(population)
print()

idx = pd.MultiIndex.from_product([['a', 'b', 'c'], [1, 2]],
                                 names=['name1', 'name2'])
cols = pd.MultiIndex.from_product([['c1', 'c2', 'c3'], [1, 2]],
                                  names=['col_names1', 'col_names2'])
data = np.round(np.random.rand(6, 6), 2)
print(idx)
print(cols)
print(data)
mdf = pd.DataFrame(data, index=idx, columns=cols)
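# A short added follow-up (not in the original): partial indexing on the
# hierarchical axes of mdf.
print(mdf['c1'])                     # all columns under the first column level 'c1'
print(mdf.loc['a'])                  # all rows under the first row level 'a'
print(mdf.loc[('a', 1), ('c1', 1)])  # a single scalar via full keys on both axes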
Esempio n. 27
0
 def test_get_index_str(self):
     idx = self.sel.get_index(self.df, '/foo/mof/*')
     assert_index_equal(
         idx,
         pd.MultiIndex(levels=[['foo'], ['mof'], [0, 1, 2]],
                       labels=[[0, 0, 0], [0, 0, 0], [0, 1, 2]]))
Esempio n. 28
0
class TestToCSV:
    @pytest.mark.xfail(
        (3, 6, 5) > sys.version_info,
        reason=(
            "Python csv library bug (see https://bugs.python.org/issue32255)"),
    )
    def test_to_csv_with_single_column(self):
        # see gh-18676, https://bugs.python.org/issue32255
        #
        # Python's CSV library adds an extraneous '""'
        # before the newline when the NaN-value is in
        # the first row. Otherwise, only the newline
        # character is added. This behavior is inconsistent
        # and was patched in https://bugs.python.org/pull_request4672.
        df1 = DataFrame([None, 1])
        expected1 = """\
""
1.0
"""
        with tm.ensure_clean("test.csv") as path:
            df1.to_csv(path, header=None, index=None)
            with open(path, "r") as f:
                assert f.read() == expected1

        df2 = DataFrame([1, None])
        expected2 = """\
1.0
""
"""
        with tm.ensure_clean("test.csv") as path:
            df2.to_csv(path, header=None, index=None)
            with open(path, "r") as f:
                assert f.read() == expected2

    def test_to_csv_default_encoding(self):
        # GH17097
        df = DataFrame({"col": ["AAAAA", "ÄÄÄÄÄ", "ßßßßß", "聞聞聞聞聞"]})

        with tm.ensure_clean("test.csv") as path:
            # the default to_csv encoding is utf-8.
            df.to_csv(path)
            tm.assert_frame_equal(pd.read_csv(path, index_col=0), df)

    def test_to_csv_quotechar(self):
        df = DataFrame({"col": [1, 2]})
        expected = """\
"","col"
"0","1"
"1","2"
"""

        with tm.ensure_clean("test.csv") as path:
            df.to_csv(path, quoting=1)  # 1=QUOTE_ALL
            with open(path, "r") as f:
                assert f.read() == expected

        expected = """\
$$,$col$
$0$,$1$
$1$,$2$
"""

        with tm.ensure_clean("test.csv") as path:
            df.to_csv(path, quoting=1, quotechar="$")
            with open(path, "r") as f:
                assert f.read() == expected

        with tm.ensure_clean("test.csv") as path:
            with pytest.raises(TypeError, match="quotechar"):
                df.to_csv(path, quoting=1, quotechar=None)

    def test_to_csv_doublequote(self):
        df = DataFrame({"col": ['a"a', '"bb"']})
        expected = '''\
"","col"
"0","a""a"
"1","""bb"""
'''

        with tm.ensure_clean("test.csv") as path:
            df.to_csv(path, quoting=1, doublequote=True)  # QUOTE_ALL
            with open(path, "r") as f:
                assert f.read() == expected

        from _csv import Error

        with tm.ensure_clean("test.csv") as path:
            with pytest.raises(Error, match="escapechar"):
                df.to_csv(path, doublequote=False)  # no escapechar set

    def test_to_csv_escapechar(self):
        df = DataFrame({"col": ['a"a', '"bb"']})
        expected = """\
"","col"
"0","a\\"a"
"1","\\"bb\\""
"""

        with tm.ensure_clean("test.csv") as path:  # QUOTE_ALL
            df.to_csv(path, quoting=1, doublequote=False, escapechar="\\")
            with open(path, "r") as f:
                assert f.read() == expected

        df = DataFrame({"col": ["a,a", ",bb,"]})
        expected = """\
,col
0,a\\,a
1,\\,bb\\,
"""

        with tm.ensure_clean("test.csv") as path:
            df.to_csv(path, quoting=3, escapechar="\\")  # QUOTE_NONE
            with open(path, "r") as f:
                assert f.read() == expected

    def test_csv_to_string(self):
        df = DataFrame({"col": [1, 2]})
        expected_rows = [",col", "0,1", "1,2"]
        expected = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df.to_csv() == expected

    def test_to_csv_decimal(self):
        # see gh-781
        df = DataFrame({"col1": [1], "col2": ["a"], "col3": [10.1]})

        expected_rows = [",col1,col2,col3", "0,1,a,10.1"]
        expected_default = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df.to_csv() == expected_default

        expected_rows = [";col1;col2;col3", "0;1;a;10,1"]
        expected_european_excel = tm.convert_rows_list_to_csv_str(
            expected_rows)
        assert df.to_csv(decimal=",", sep=";") == expected_european_excel

        expected_rows = [",col1,col2,col3", "0,1,a,10.10"]
        expected_float_format_default = tm.convert_rows_list_to_csv_str(
            expected_rows)
        assert df.to_csv(float_format="%.2f") == expected_float_format_default

        expected_rows = [";col1;col2;col3", "0;1;a;10,10"]
        expected_float_format = tm.convert_rows_list_to_csv_str(expected_rows)
        assert (df.to_csv(decimal=",", sep=";",
                          float_format="%.2f") == expected_float_format)

        # see gh-11553: testing if decimal is taken into account for '0.0'
        df = pd.DataFrame({"a": [0, 1.1], "b": [2.2, 3.3], "c": 1})

        expected_rows = ["a,b,c", "0^0,2^2,1", "1^1,3^3,1"]
        expected = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df.to_csv(index=False, decimal="^") == expected

        # same but for an index
        assert df.set_index("a").to_csv(decimal="^") == expected

        # same for a multi-index
        assert df.set_index(["a", "b"]).to_csv(decimal="^") == expected

    def test_to_csv_float_format(self):
        # testing if float_format is taken into account for the index
        # GH 11553
        df = pd.DataFrame({"a": [0, 1], "b": [2.2, 3.3], "c": 1})

        expected_rows = ["a,b,c", "0,2.20,1", "1,3.30,1"]
        expected = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df.set_index("a").to_csv(float_format="%.2f") == expected

        # same for a multi-index
        assert df.set_index(["a", "b"]).to_csv(float_format="%.2f") == expected

    def test_to_csv_na_rep(self):
        # see gh-11553
        #
        # Testing if NaN values are correctly represented in the index.
        df = DataFrame({"a": [0, np.NaN], "b": [0, 1], "c": [2, 3]})
        expected_rows = ["a,b,c", "0.0,0,2", "_,1,3"]
        expected = tm.convert_rows_list_to_csv_str(expected_rows)

        assert df.set_index("a").to_csv(na_rep="_") == expected
        assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected

        # now with an index containing only NaNs
        df = DataFrame({"a": np.NaN, "b": [0, 1], "c": [2, 3]})
        expected_rows = ["a,b,c", "_,0,2", "_,1,3"]
        expected = tm.convert_rows_list_to_csv_str(expected_rows)

        assert df.set_index("a").to_csv(na_rep="_") == expected
        assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected

        # check if na_rep parameter does not break anything when no NaN
        df = DataFrame({"a": 0, "b": [0, 1], "c": [2, 3]})
        expected_rows = ["a,b,c", "0,0,2", "0,1,3"]
        expected = tm.convert_rows_list_to_csv_str(expected_rows)

        assert df.set_index("a").to_csv(na_rep="_") == expected
        assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected

        # GH 29975
        # Make sure full na_rep shows up when a dtype is provided
        csv = pd.Series(["a", pd.NA, "c"]).to_csv(na_rep="ZZZZZ")
        expected = tm.convert_rows_list_to_csv_str(
            [",0", "0,a", "1,ZZZZZ", "2,c"])
        assert expected == csv
        csv = pd.Series(["a", pd.NA, "c"],
                        dtype="string").to_csv(na_rep="ZZZZZ")
        assert expected == csv

    def test_to_csv_date_format(self):
        # GH 10209
        df_sec = DataFrame(
            {"A": pd.date_range("20130101", periods=5, freq="s")})
        df_day = DataFrame(
            {"A": pd.date_range("20130101", periods=5, freq="d")})

        expected_rows = [
            ",A",
            "0,2013-01-01 00:00:00",
            "1,2013-01-01 00:00:01",
            "2,2013-01-01 00:00:02",
            "3,2013-01-01 00:00:03",
            "4,2013-01-01 00:00:04",
        ]
        expected_default_sec = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df_sec.to_csv() == expected_default_sec

        expected_rows = [
            ",A",
            "0,2013-01-01 00:00:00",
            "1,2013-01-02 00:00:00",
            "2,2013-01-03 00:00:00",
            "3,2013-01-04 00:00:00",
            "4,2013-01-05 00:00:00",
        ]
        expected_ymdhms_day = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df_day.to_csv(
            date_format="%Y-%m-%d %H:%M:%S") == expected_ymdhms_day

        expected_rows = [
            ",A",
            "0,2013-01-01",
            "1,2013-01-01",
            "2,2013-01-01",
            "3,2013-01-01",
            "4,2013-01-01",
        ]
        expected_ymd_sec = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df_sec.to_csv(date_format="%Y-%m-%d") == expected_ymd_sec

        expected_rows = [
            ",A",
            "0,2013-01-01",
            "1,2013-01-02",
            "2,2013-01-03",
            "3,2013-01-04",
            "4,2013-01-05",
        ]
        expected_default_day = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df_day.to_csv() == expected_default_day
        assert df_day.to_csv(date_format="%Y-%m-%d") == expected_default_day

        # see gh-7791
        #
        # Testing if date_format parameter is taken into account
        # for multi-indexed DataFrames.
        df_sec["B"] = 0
        df_sec["C"] = 1

        expected_rows = ["A,B,C", "2013-01-01,0,1"]
        expected_ymd_sec = tm.convert_rows_list_to_csv_str(expected_rows)

        df_sec_grouped = df_sec.groupby([pd.Grouper(key="A", freq="1h"), "B"])
        assert df_sec_grouped.mean().to_csv(
            date_format="%Y-%m-%d") == expected_ymd_sec

    def test_to_csv_multi_index(self):
        # see gh-6618
        df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1], [2]]))

        exp_rows = [",1", ",2", "0,1"]
        exp = tm.convert_rows_list_to_csv_str(exp_rows)
        assert df.to_csv() == exp

        exp_rows = ["1", "2", "1"]
        exp = tm.convert_rows_list_to_csv_str(exp_rows)
        assert df.to_csv(index=False) == exp

        df = DataFrame(
            [1],
            columns=pd.MultiIndex.from_arrays([[1], [2]]),
            index=pd.MultiIndex.from_arrays([[1], [2]]),
        )

        exp_rows = [",,1", ",,2", "1,2,1"]
        exp = tm.convert_rows_list_to_csv_str(exp_rows)
        assert df.to_csv() == exp

        exp_rows = ["1", "2", "1"]
        exp = tm.convert_rows_list_to_csv_str(exp_rows)
        assert df.to_csv(index=False) == exp

        df = DataFrame([1],
                       columns=pd.MultiIndex.from_arrays([["foo"], ["bar"]]))

        exp_rows = [",foo", ",bar", "0,1"]
        exp = tm.convert_rows_list_to_csv_str(exp_rows)
        assert df.to_csv() == exp

        exp_rows = ["foo", "bar", "1"]
        exp = tm.convert_rows_list_to_csv_str(exp_rows)
        assert df.to_csv(index=False) == exp

    @pytest.mark.parametrize(
        "ind,expected",
        [
            (
                pd.MultiIndex(levels=[[1.0]], codes=[[0]], names=["x"]),
                "x,data\n1.0,1\n",
            ),
            (
                pd.MultiIndex(
                    levels=[[1.0], [2.0]], codes=[[0], [0]], names=["x", "y"]),
                "x,y,data\n1.0,2.0,1\n",
            ),
        ],
    )
    @pytest.mark.parametrize("klass", [pd.DataFrame, pd.Series])
    def test_to_csv_single_level_multi_index(self, ind, expected, klass):
        # see gh-19589
        result = klass(pd.Series([1], ind,
                                 name="data")).to_csv(line_terminator="\n",
                                                      header=True)
        assert result == expected

    def test_to_csv_string_array_ascii(self):
        # GH 10813
        str_array = [{"names": ["foo", "bar"]}, {"names": ["baz", "qux"]}]
        df = pd.DataFrame(str_array)
        expected_ascii = """\
,names
0,"['foo', 'bar']"
1,"['baz', 'qux']"
"""
        with tm.ensure_clean("str_test.csv") as path:
            df.to_csv(path, encoding="ascii")
            with open(path, "r") as f:
                assert f.read() == expected_ascii

    def test_to_csv_string_array_utf8(self):
        # GH 10813
        str_array = [{"names": ["foo", "bar"]}, {"names": ["baz", "qux"]}]
        df = pd.DataFrame(str_array)
        expected_utf8 = """\
,names
0,"['foo', 'bar']"
1,"['baz', 'qux']"
"""
        with tm.ensure_clean("unicode_test.csv") as path:
            df.to_csv(path, encoding="utf-8")
            with open(path, "r") as f:
                assert f.read() == expected_utf8

    def test_to_csv_string_with_lf(self):
        # GH 20353
        data = {"int": [1, 2, 3], "str_lf": ["abc", "d\nef", "g\nh\n\ni"]}
        df = pd.DataFrame(data)
        with tm.ensure_clean("lf_test.csv") as path:
            # case 1: The default line terminator(=os.linesep)(PR 21406)
            os_linesep = os.linesep.encode("utf-8")
            expected_noarg = (b"int,str_lf" + os_linesep + b"1,abc" +
                              os_linesep + b'2,"d\nef"' + os_linesep +
                              b'3,"g\nh\n\ni"' + os_linesep)
            df.to_csv(path, index=False)
            with open(path, "rb") as f:
                assert f.read() == expected_noarg
        with tm.ensure_clean("lf_test.csv") as path:
            # case 2: LF as line terminator
            expected_lf = b'int,str_lf\n1,abc\n2,"d\nef"\n3,"g\nh\n\ni"\n'
            df.to_csv(path, line_terminator="\n", index=False)
            with open(path, "rb") as f:
                assert f.read() == expected_lf
        with tm.ensure_clean("lf_test.csv") as path:
            # case 3: CRLF as line terminator
            # 'line_terminator' should not change inner element
            expected_crlf = b'int,str_lf\r\n1,abc\r\n2,"d\nef"\r\n3,"g\nh\n\ni"\r\n'
            df.to_csv(path, line_terminator="\r\n", index=False)
            with open(path, "rb") as f:
                assert f.read() == expected_crlf

    def test_to_csv_string_with_crlf(self):
        # GH 20353
        data = {
            "int": [1, 2, 3],
            "str_crlf": ["abc", "d\r\nef", "g\r\nh\r\n\r\ni"]
        }
        df = pd.DataFrame(data)
        with tm.ensure_clean("crlf_test.csv") as path:
            # case 1: The default line terminator(=os.linesep)(PR 21406)
            os_linesep = os.linesep.encode("utf-8")
            expected_noarg = (b"int,str_crlf" + os_linesep + b"1,abc" +
                              os_linesep + b'2,"d\r\nef"' + os_linesep +
                              b'3,"g\r\nh\r\n\r\ni"' + os_linesep)
            df.to_csv(path, index=False)
            with open(path, "rb") as f:
                assert f.read() == expected_noarg
        with tm.ensure_clean("crlf_test.csv") as path:
            # case 2: LF as line terminator
            expected_lf = b'int,str_crlf\n1,abc\n2,"d\r\nef"\n3,"g\r\nh\r\n\r\ni"\n'
            df.to_csv(path, line_terminator="\n", index=False)
            with open(path, "rb") as f:
                assert f.read() == expected_lf
        with tm.ensure_clean("crlf_test.csv") as path:
            # case 3: CRLF as line terminator
            # 'line_terminator' should not change inner element
            expected_crlf = (b"int,str_crlf\r\n"
                             b"1,abc\r\n"
                             b'2,"d\r\nef"\r\n'
                             b'3,"g\r\nh\r\n\r\ni"\r\n')
            df.to_csv(path, line_terminator="\r\n", index=False)
            with open(path, "rb") as f:
                assert f.read() == expected_crlf

    def test_to_csv_stdout_file(self, capsys):
        # GH 21561
        df = pd.DataFrame([["foo", "bar"], ["baz", "qux"]],
                          columns=["name_1", "name_2"])
        expected_rows = [",name_1,name_2", "0,foo,bar", "1,baz,qux"]
        expected_ascii = tm.convert_rows_list_to_csv_str(expected_rows)

        df.to_csv(sys.stdout, encoding="ascii")
        captured = capsys.readouterr()

        assert captured.out == expected_ascii
        assert not sys.stdout.closed

    @pytest.mark.xfail(
        compat.is_platform_windows(),
        reason=("Especially in Windows, file stream should not be passed"
                "to csv writer without newline='' option."
                "(https://docs.python.org/3.6/library/csv.html#csv.writer)"),
    )
    def test_to_csv_write_to_open_file(self):
        # GH 21696
        df = pd.DataFrame({"a": ["x", "y", "z"]})
        expected = """\
manual header
x
y
z
"""
        with tm.ensure_clean("test.txt") as path:
            with open(path, "w") as f:
                f.write("manual header\n")
                df.to_csv(f, header=None, index=None)
            with open(path, "r") as f:
                assert f.read() == expected

    def test_to_csv_write_to_open_file_with_newline_py3(self):
        # see gh-21696
        # see gh-20353
        df = pd.DataFrame({"a": ["x", "y", "z"]})
        expected_rows = ["x", "y", "z"]
        expected = "manual header\n" + tm.convert_rows_list_to_csv_str(
            expected_rows)
        with tm.ensure_clean("test.txt") as path:
            with open(path, "w", newline="") as f:
                f.write("manual header\n")
                df.to_csv(f, header=None, index=None)

            with open(path, "rb") as f:
                assert f.read() == bytes(expected, "utf-8")

    @pytest.mark.parametrize("to_infer", [True, False])
    @pytest.mark.parametrize("read_infer", [True, False])
    def test_to_csv_compression(self, compression_only, read_infer, to_infer):
        # see gh-15008
        compression = compression_only

        if compression == "zip":
            pytest.skip(f"{compression} is not supported for to_csv")

        # We'll complete file extension subsequently.
        filename = "test."

        if compression == "gzip":
            filename += "gz"
        else:
            # xz --> .xz
            # bz2 --> .bz2
            filename += compression

        df = DataFrame({"A": [1]})

        to_compression = "infer" if to_infer else compression
        read_compression = "infer" if read_infer else compression

        with tm.ensure_clean(filename) as path:
            df.to_csv(path, compression=to_compression)
            result = pd.read_csv(path,
                                 index_col=0,
                                 compression=read_compression)
            tm.assert_frame_equal(result, df)

    def test_to_csv_compression_dict(self, compression_only):
        # GH 26023
        method = compression_only
        df = DataFrame({"ABC": [1]})
        filename = "to_csv_compress_as_dict."
        filename += "gz" if method == "gzip" else method
        with tm.ensure_clean(filename) as path:
            df.to_csv(path, compression={"method": method})
            read_df = pd.read_csv(path, index_col=0)
            tm.assert_frame_equal(read_df, df)

    def test_to_csv_compression_dict_no_method_raises(self):
        # GH 26023
        df = DataFrame({"ABC": [1]})
        compression = {"some_option": True}
        msg = "must have key 'method'"

        with tm.ensure_clean("out.zip") as path:
            with pytest.raises(ValueError, match=msg):
                df.to_csv(path, compression=compression)

    @pytest.mark.parametrize("compression", ["zip", "infer"])
    @pytest.mark.parametrize("archive_name",
                             [None, "test_to_csv.csv", "test_to_csv.zip"])
    def test_to_csv_zip_arguments(self, compression, archive_name):
        # GH 26023
        from zipfile import ZipFile

        df = DataFrame({"ABC": [1]})
        with tm.ensure_clean("to_csv_archive_name.zip") as path:
            df.to_csv(path,
                      compression={
                          "method": compression,
                          "archive_name": archive_name
                      })
            zp = ZipFile(path)
            expected_arcname = path if archive_name is None else archive_name
            expected_arcname = os.path.basename(expected_arcname)
            assert len(zp.filelist) == 1
            archived_file = os.path.basename(zp.filelist[0].filename)
            assert archived_file == expected_arcname

    @pytest.mark.parametrize("df_new_type", ["Int64"])
    def test_to_csv_na_rep_long_string(self, df_new_type):
        # see gh-25099
        df = pd.DataFrame({"c": [float("nan")] * 3})
        df = df.astype(df_new_type)
        expected_rows = ["c", "mynull", "mynull", "mynull"]
        expected = tm.convert_rows_list_to_csv_str(expected_rows)

        result = df.to_csv(index=False, na_rep="mynull", encoding="ascii")

        assert expected == result

    def test_to_csv_timedelta_precision(self):
        # GH 6783
        s = pd.Series([1, 1]).astype("timedelta64[ns]")
        buf = io.StringIO()
        s.to_csv(buf)
        result = buf.getvalue()
        expected_rows = [
            ",0",
            "0,0 days 00:00:00.000000001",
            "1,0 days 00:00:00.000000001",
        ]
        expected = tm.convert_rows_list_to_csv_str(expected_rows)
        assert result == expected

    def test_na_rep_truncated(self):
        # https://github.com/pandas-dev/pandas/issues/31447
        result = pd.Series(range(8, 12)).to_csv(na_rep="-")
        expected = tm.convert_rows_list_to_csv_str(
            [",0", "0,8", "1,9", "2,10", "3,11"])
        assert result == expected

        result = pd.Series([True, False]).to_csv(na_rep="nan")
        expected = tm.convert_rows_list_to_csv_str([",0", "0,True", "1,False"])
        assert result == expected

        result = pd.Series([1.1, 2.2]).to_csv(na_rep=".")
        expected = tm.convert_rows_list_to_csv_str([",0", "0,1.1", "1,2.2"])
        assert result == expected
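# A quick interactive sketch (not part of the test suite above) of the quoting
# behaviour those tests exercise, using the csv module constants instead of the
# magic number 1:
import csv
import pandas as pd

df = pd.DataFrame({"col": [1, 2]})
print(df.to_csv(quoting=csv.QUOTE_ALL))                 # "","col" / "0","1" / "1","2"
print(df.to_csv(quoting=csv.QUOTE_ALL, quotechar="$"))  # $$,$col$ / $0$,$1$ / $1$,$2$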
Esempio n. 29
0
def test_filter_meta_index(test_df):
    obs = test_df.filter(scenario='scen_b').meta.index
    exp = pd.MultiIndex(levels=[['model_a'], ['scen_b']],
                        codes=[[0], [0]],
                        names=['model', 'scenario'])
    pd.testing.assert_index_equal(obs, exp)
Esempio n. 30
0
    def to_index(self, ordered_dims: Sequence[Hashable] = None) -> pd.Index:
        """Convert all index coordinates into a :py:class:`pandas.Index`.

        Parameters
        ----------
        ordered_dims : sequence of hashable, optional
            Possibly reordered version of this object's dimensions indicating
            the order in which dimensions should appear on the result.

        Returns
        -------
        pandas.Index
            Index subclass corresponding to the outer-product of all dimension
            coordinates. This will be a MultiIndex if this object has more
            than one dimension.
        """
        if ordered_dims is None:
            ordered_dims = list(self.dims)
        elif set(ordered_dims) != set(self.dims):
            raise ValueError("ordered_dims must match dims, but does not: "
                             "{} vs {}".format(ordered_dims, self.dims))

        if len(ordered_dims) == 0:
            raise ValueError("no valid index for a 0-dimensional object")
        elif len(ordered_dims) == 1:
            (dim, ) = ordered_dims
            return self._data.get_index(dim)  # type: ignore
        else:
            indexes = [self._data.get_index(k)
                       for k in ordered_dims]  # type: ignore

            # compute the sizes of the repeat and tile for the cartesian product
            # (taken from pandas.core.reshape.util)
            index_lengths = np.fromiter((len(index) for index in indexes),
                                        dtype=np.intp)
            cumprod_lengths = np.cumproduct(index_lengths)

            if cumprod_lengths[-1] != 0:
                # sizes of the repeats
                repeat_counts = cumprod_lengths[-1] / cumprod_lengths
            else:
                # if any factor is empty, the cartesian product is empty
                repeat_counts = np.zeros_like(cumprod_lengths)

            # sizes of the tiles
            tile_counts = np.roll(cumprod_lengths, 1)
            tile_counts[0] = 1

            # loop over the indexes
            # for each MultiIndex or Index compute the cartesian product of the codes

            code_list = []
            level_list = []
            names = []

            for i, index in enumerate(indexes):
                if isinstance(index, pd.MultiIndex):
                    codes, levels = index.codes, index.levels
                else:
                    code, level = pd.factorize(index)
                    codes = [code]
                    levels = [level]

                # compute the cartesian product
                code_list += [
                    np.tile(np.repeat(code, repeat_counts[i]), tile_counts[i])
                    for code in codes
                ]
                level_list += levels
                names += index.names

            return pd.MultiIndex(level_list, code_list, names=names)
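# A small self-contained sketch (assumed inputs, not from the original) of the same
# repeat/tile cartesian-product-of-codes construction for two plain indexes; the
# result matches pd.MultiIndex.from_product:
import numpy as np
import pandas as pd

a = pd.Index([10, 20], name='a')
b = pd.Index(['x', 'y', 'z'], name='b')

codes_a, levels_a = pd.factorize(a)
codes_b, levels_b = pd.factorize(b)

# level `a` is repeated len(b) times per label, level `b` is tiled len(a) times
code_list = [np.repeat(codes_a, len(b)), np.tile(codes_b, len(a))]
result = pd.MultiIndex(levels=[levels_a, levels_b], codes=code_list, names=['a', 'b'])
assert result.equals(pd.MultiIndex.from_product([a, b], names=['a', 'b']))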