Example #1
    def test_ambiguous_warns(self):
        df = DataFrame({"A": [1, 2]})
        with tm.assert_produces_warning(FutureWarning):
            df.rename(id, id)

        with tm.assert_produces_warning(FutureWarning):
            df.rename({0: 10}, {"A": "B"})
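A minimal sketch (not part of the test suite above) of the keyword form that the FutureWarning steers callers toward; explicit index= and columns= arguments remove the ambiguity of two positional mappers.

import pandas as pd

df = pd.DataFrame({"A": [1, 2]})
# Keyword arguments make it explicit which axis each mapper applies to.
renamed = df.rename(index={0: 10}, columns={"A": "B"})
print(renamed)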
Example #2
def returns_customer_product_time(pwunsale_tidy, pw_ytdcust, pw_cusattr):
    '''
    Meant to feed into a Pivot requested by Mitch Turner.
    
    Aggregates the same as above but includes time and product data.
    '''
    dat = pwunsale_tidy['Date'].tolist()
    pwunsale_tidy['Month'] = [d.strftime('%B') for d in dat]    
    
    print('Aggregating custom pivot for Mitch.')
    len_unique = lambda x: len(pd.unique(x))
    agg_funcs_returns = {'ExtCost': {'DollarsReturned|avg':np.mean, 'DollarsReturned|sum':np.sum},
                         'CasesReturned': {'CasesReturned|avg':np.mean, 'CasesReturned|sum':np.sum},
                         'Invoice':len_unique }
    
    custom_cols = ['Month','CustomerId','Customer','ProductId','Product']    
    
    customer_returns = DataFrame(pwunsale_tidy.groupby(custom_cols)[['ExtCost','CasesReturned']].agg(agg_funcs_returns)).reset_index(drop=False)
    customer_returns.rename(columns={'<lambda>':'Returns|count'}, inplace=True) 
    customer_returns.drop('Customer', inplace=True, axis=1)
    
    print('Merging in YTD sales by Customer')
    customer_returns = customer_returns.merge(pw_ytdcust, on='CustomerId', how='left')
    
    print('Deriving returns as a percent of sales for each Customer.')
    customer_returns['PercentSales'] = np.divide(customer_returns['DollarsReturned|sum'], customer_returns['DollarSales|bycustomer'])
    
    print('Merge in customer attributes.')
    customer_returns = customer_returns.merge(pw_cusattr, on='CustomerId', how='left')
    
    print('Sorting in descending order on Dollars returned.')
    customer_returns.sort_values('DollarsReturned|sum', ascending=False, inplace=True)

    return customer_returns
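The nested-dict aggregation spec and the '<lambda>' rename used above were deprecated and later removed from pandas; a hedged equivalent using named aggregation (toy data, illustrative column names) might look like this.

import pandas as pd

toy = pd.DataFrame({"CustomerId": [1, 1, 2],
                    "ExtCost": [10.0, 30.0, 5.0],
                    "Invoice": ["A", "B", "B"]})
# Named aggregation replaces the removed nested-dict spec; ** unpacking allows
# result column names that are not valid Python identifiers.
summary = toy.groupby("CustomerId").agg(
    **{"DollarsReturned|avg": ("ExtCost", "mean"),
       "DollarsReturned|sum": ("ExtCost", "sum"),
       "Returns|count": ("Invoice", "nunique")}
).reset_index()
print(summary)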
Example #3
class Rename(object):

    def setup(self):
        N = 10**3
        self.df = DataFrame(np.random.randn(N * 10, N))
        self.idx = np.arange(4 * N, 7 * N)
        self.dict_idx = {k: k for k in self.idx}
        self.df2 = DataFrame(
            {c: {0: np.random.randint(0, 2, N).astype(np.bool_),
                 1: np.random.randint(0, N, N).astype(np.int16),
                 2: np.random.randint(0, N, N).astype(np.int32),
                 3: np.random.randint(0, N, N).astype(np.int64)}
                [np.random.randint(0, 4)] for c in range(N)})

    def time_rename_single(self):
        self.df.rename({0: 0})

    def time_rename_axis0(self):
        self.df.rename(self.dict_idx)

    def time_rename_axis1(self):
        self.df.rename(columns=self.dict_idx)

    def time_rename_both_axes(self):
        self.df.rename(index=self.dict_idx, columns=self.dict_idx)

    def time_dict_rename_both_axes(self):
        self.df.rename(index=self.dict_idx, columns=self.dict_idx)
Example #4
    def trialToOneRow(dfs, lonumberfields, oldnewsamsecnamepairs):
        collapsedFields = {field: [dfs[field].values]
                        for field in lonumberfields}
        collapsedTrial = DataFrame(collapsedFields)
        collapsedTrial.rename(columns=oldnewsamsecnamepairs, inplace=True)

        return collapsedTrial
Example #5
def change_axis01():
    data = DataFrame(np.arange(12).reshape((3, 4)), index=['Ohio', 'Colorado', 'New York'], columns=['one', 'two', 'three', 'four'])
    print data
    print data.index.map(str.upper)
    data.index=data.index.map(str.upper)
    print data
    print data.rename(index=str.title,columns=str.upper)
    # rename(..., inplace=True) returns None, so modify in place and print the result afterwards
    data.rename(index={'OHIO':'INDIANA'},columns={'three':'peekaboo'},inplace=True)
    print data
Example #6
def featureize(df, vectorizers, verbose=False):
    """
      Featurize an enhanced http dataframe

      Parameters
      ----------
      df : dataframe
          The enhanced HTTP log dataframe
      vectorizers : {String -> TfidfVectorizer}
            A map of feature -> vectorizer
      verbose: boolean, optional
          Controls Verbosity level

      Returns
      -------
      featureMatrix : numeric dataframe
            A featurized dataframe

    """
    if verbose: print('\nExtracting features')
    
    bow_features = []
    #featurize using the vectorizers.
    
    for feature in ['user_agent','uri','referrer','host', 'subdomain', 'method','status_code','resp_p_str', 'URIparams', 'browser_string', 'tld']:
        if verbose: print('Featurizing %s' % feature)
        single_feature_matrix = vectorizers[feature].transform(df[feature].astype(str))
        if verbose: print('  Dim of %s: %s' % (feature,single_feature_matrix.shape[1]))
        single_df = DataFrame(single_feature_matrix.toarray())
        single_df.rename(columns=lambda x: feature+"."+vectorizers[feature].get_feature_names()[x], inplace=True)
        bow_features.append(single_df)

    featureMatrix = pd.concat(bow_features, axis=1)
    
    #add some other numeric features that are functions of columns
    featureMatrix['domainNameLength'] = df['host'].apply(len)
    featureMatrix['domainNameDots'] = df['host'].apply(lambda dn: dn.count('.'))
    featureMatrix['uriSlashes'] = df['uri'].apply(lambda dn: dn.count('/'))
    featureMatrix['userAgentLength'] = df['user_agent'].apply(len)
    featureMatrix['userAgentEntropy'] = df['user_agent'].apply(H)
    featureMatrix['subdomainEntropy'] = df['subdomain'].apply(H)
    featureMatrix['request_body_len'] = df['request_body_len']
    featureMatrix['response_body_len'] = df['response_body_len']
    featureMatrix['referrerPresent'] = df['referrer'].apply(lambda r: 0.0 if (r=='-') else 1.0)
    
    def countParams(uri):
        fullUri = 'http://bogus.com/'+uri
        parseResult = parse_qs(urlparse(fullUri).query)
        return len(parseResult)
    
    featureMatrix['numURIParams'] = df['uri'].apply(countParams)
    featureMatrix['URIParamsKeyEntropy'] = df['URIparams'].apply(H)
    featureMatrix['URIParamsTokensEntropy'] = df['URItokens'].apply(H)

    if verbose: print('Feature matrix generated with %s columns' % featureMatrix.shape[1])

    return featureMatrix
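A toy sketch (hypothetical "uri" feature, three fake requests) of the column-renaming step above, which prefixes every TF-IDF term with its source feature; note that recent scikit-learn exposes get_feature_names_out() where the code above calls get_feature_names().

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

vec = TfidfVectorizer()
matrix = vec.fit_transform(["GET /index", "POST /login", "GET /login"])
toy = pd.DataFrame(matrix.toarray())
# Rename the integer TF-IDF columns to "<feature>.<term>", mirroring the
# convention used in featureize above.
toy.rename(columns=lambda i: "uri." + vec.get_feature_names_out()[i], inplace=True)
print(toy.columns.tolist())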
Example #7
def imputation_loyer(year):

    erf = create_comparable_erf_data_frame(year)
    erf = erf[['logt', 'hnph2', 'iaat', 'mdiplo', 'mtybd', 'tu99_recoded', 'magtr', 'mcs8', 'deci', 'wprm', 'ident']]
    erf = erf.dropna(how = 'any')  # TODO: check before dropping rows with NAs

    Logt = create_comparable_logement_data_frame(year)

    Logt = Logt.dropna(how = 'any')

    allvars = ['logt', 'hnph2', 'iaat', 'mdiplo', 'mtybd', 'tu99_recoded', 'magtr', 'mcs8', 'deci']
    classes = ['magtr', 'tu99_recoded']
    matchvars = list(set(allvars) - set(classes))

    for variable in allvars:
        count_NA(variable, Logt)
        count_NA(variable, erf)

    erf['mcs8'] = erf['mcs8'].astype(int)

    rpy2.robjects.pandas2ri.activate()  # lets rpy2 convert dataframes (pandas2ri no longer exists in newer rpy2)
#    com.convert_to_r_dataframe()  # TODO: probably to be removed
    try:
        sm = importr("StatMatch")  # Launch R you need to have StatMatch installed in R
    except:
        sm = importr("StatMatch", lib_loc = STATMATCH_LIB_LOCATION)
    out_nnd = sm.NND_hotdeck(data_rec = erf,
                             data_don = Logt,
                             match_vars = vectors.StrVector(matchvars),
                             don_class = vectors.StrVector(classes),
                             dist_fun = "Gower",
                             )
    fill_erf_nnd = sm.create_fused(data_rec = erf,
                                   data_don = Logt,
                                   mtc_ids = out_nnd[0],
                                   z_vars = vectors.StrVector(["lmlm"]),
                                   )
    del allvars, matchvars, classes, out_nnd
    gc.collect()

    fill_erf_nnd = com.convert_robj(fill_erf_nnd)
    fill_erf_nnd = DataFrame(fill_erf_nnd)
    fill_erf_nnd.rename(columns={'lmlm': 'loym'}, inplace = True)

    loy_imput = fill_erf_nnd[['ident', 'loym']]

    erfmenm = load_temp(name = "menagem", year = year)

    for var in ["loym", "loym_x", "loym_y", "loym_z"]:
        if var in erfmenm:
            del erfmenm[var]
            log.info("{} has been deleted".format(var))

    erfmenm = erfmenm.merge(loy_imput, on='ident', how='left')
    assert 'loym' in erfmenm.columns, u"The variable loym is not present in erfmenm"
    save_temp(erfmenm, name = "menagem", year=year)
Example #8
    def test_rename_axis_style(self):
        # https://github.com/pandas-dev/pandas/issues/12392
        df = DataFrame({"A": [1, 2], "B": [1, 2]}, index=['X', 'Y'])
        expected = DataFrame({"a": [1, 2], "b": [1, 2]}, index=['X', 'Y'])

        result = df.rename(str.lower, axis=1)
        tm.assert_frame_equal(result, expected)

        result = df.rename(str.lower, axis='columns')
        tm.assert_frame_equal(result, expected)

        result = df.rename({"A": 'a', 'B': 'b'}, axis=1)
        tm.assert_frame_equal(result, expected)

        result = df.rename({"A": 'a', 'B': 'b'}, axis='columns')
        tm.assert_frame_equal(result, expected)

        # Index
        expected = DataFrame({"A": [1, 2], "B": [1, 2]}, index=['x', 'y'])
        result = df.rename(str.lower, axis=0)
        tm.assert_frame_equal(result, expected)

        result = df.rename(str.lower, axis='index')
        tm.assert_frame_equal(result, expected)

        result = df.rename({'X': 'x', 'Y': 'y'}, axis=0)
        tm.assert_frame_equal(result, expected)

        result = df.rename({'X': 'x', 'Y': 'y'}, axis='index')
        tm.assert_frame_equal(result, expected)

        result = df.rename(mapper=str.lower, axis='index')
        tm.assert_frame_equal(result, expected)
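A quick sketch (toy frame) of the equivalence these assertions exercise: the axis-style call and the keyword-style call of rename produce the same frame.

import pandas as pd

df = pd.DataFrame({"A": [1, 2], "B": [1, 2]}, index=["X", "Y"])
# axis='columns' with a positional mapper is equivalent to columns=mapper
assert df.rename(str.lower, axis="columns").equals(df.rename(columns=str.lower))
# axis='index' with a positional mapper is equivalent to index=mapper
assert df.rename(str.lower, axis="index").equals(df.rename(index=str.lower))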
Example #9
def append_village_areas(divname):
    im_vil = pd.read_csv('../data/%s_village_images.csv' % divname.lower())
    shape_helper = ShapeHelper('../data/shapefiles/fixed_village_shapefiles/%s/%s.shp' % (divname.lower(), divname.lower()),
                               lat_offset, lon_offset)
    areas = shape_helper.get_shape_areas('village')
    areas_df = DataFrame(areas, index=['area'])
    areas_df = areas_df.transpose()
    areas_df.reset_index(inplace=True)
    areas_df.rename(columns={'index': 'village'}, inplace=True)
    im_vil_areas = pd.merge(im_vil, areas_df, how='left')
    im_vil_areas.set_index('image', inplace=True)
    im_vil_areas.to_csv('../data/%s_village_areas_images.csv' % divname.lower())
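A toy illustration of the reshaping idiom above: a dict becomes a one-row frame, transposing turns the keys into the index, and reset_index plus rename turns that index into a properly named column.

import pandas as pd

areas = {"village_a": 12.5, "village_b": 3.8}
# One row named 'area', one column per village; transpose flips that around.
areas_df = pd.DataFrame(areas, index=["area"]).transpose()
areas_df.reset_index(inplace=True)
areas_df.rename(columns={"index": "village"}, inplace=True)
print(areas_df)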
Example #10
def slide_21():
    import json
    db = json.load(open(FOODJSONPATH))
    print len(db)

    print db[0].keys()
    print db[0]['nutrients'][0]

    nutrients = DataFrame(db[0]['nutrients'])
    print nutrients[:7]

    info_keys = ['description', 'group', 'id', 'manufacturer']
    info = DataFrame(db, columns=info_keys)
    print info[:5]

    print pd.value_counts(info.group)[:10]

    print "今から全部のnutrientsを扱うよ"
    nutrients = []

    for rec in db:
        fnuts = DataFrame(rec['nutrients'])
        fnuts['id'] = rec['id']
        nutrients.append(fnuts)

    nutrients = pd.concat(nutrients, ignore_index=True)
    print "なんか重複多い"
    print nutrients.duplicated().sum()
    nutrients = nutrients.drop_duplicates()

    print "infoとnutrients両方にdescriptionとgroupがあるから変えよう"
    col_mapping = {'description': 'food', 'group': 'fgroup'}
    info = info.rename(columns=col_mapping, copy=False)

    col_mapping = {'description': 'nutrient', 'group': 'nutgroup'}
    nutrients = nutrients.rename(columns=col_mapping, copy=False)

    ndata = pd.merge(nutrients, info, on='id', how='outer')
    print ndata.ix[30000]

    result = ndata.groupby(['nutrient', 'fgroup'])['value'].quantile(0.5)
    result['Zinc, Zn'].order().plot(kind='barh')
    plt.show()

    by_nutrient = ndata.groupby(['nutgroup', 'nutrient'])
    get_maximum = lambda x: x.xs(x.value.idxmax())
    get_minimum = lambda x: x.xs(x.value.idxmin())

    max_foods = by_nutrient.apply(get_maximum)[['value', 'food']]

    max_foods.food = max_foods.food.str[:50]

    print max_foods.ix['Amino Acids']['food']
Example #11
    def obs_for_station_as_df(self, station_id, datatype, limit):
        obs_results = self.obs_for_station_q(station_id, datatype, limit)
        if not FbStationsApi.check_ok(obs_results, "obs_for_station_q"):
            return None

        obs_data = obs_results.json()['results']
        if bool(obs_data):
            obs_results_df = DataFrame(obs_data)
            obs_results_df.rename(columns={'value': datatype}, inplace=True)
            obs_results_df['date'] = obs_results_df['date'].astype('datetime64[ns]')
            return obs_results_df
        else:
            return None
Example #12
    def forecast_as_df(self, lat, lon, datatype, limit):
        forecast_results = self.forecast_q(lat, lon, limit)
        if not FbForecastApi.check_ok(forecast_results, "forecast_q"):
            return None

        forecast_data = forecast_results.json()['results']
        if bool(forecast_data):
            forecast_results_df = DataFrame(forecast_data)
            forecast_results_df.rename(columns={'value': datatype}, inplace=True)
            forecast_results_df['forecast_date'] = forecast_results_df['forecast_date'].astype('datetime64[ns]')
            forecast_results_df['model_date'] = forecast_results_df['model_date'].astype('datetime64[ns]')
            return forecast_results_df
        else:
            return None
Example #13
def get_cpu_sw_map(dfds, cap_time_usec, task_re):
    df_list = []
    dfsw_list = []
    for dfd in dfds:
        df = filter_df_core(dfd.df, task_re, True)
        # at this point we have a set of df that look like this:
        #         task_name  duration
        # 0     ASA.1.vcpu0      7954
        # 1     ASA.1.vcpu0      5475
        # 2     ASA.1.vcpu0      4151
        if df.empty:
            continue
        gb = df.groupby("task_name", as_index=False)

        # sum all duration for each task
        df = gb.aggregate(np.sum)
        if dfd.multiplier > 1.0:
            df["duration"] = (df["duration"] * dfd.multiplier).astype(int)
        df["percent"] = ((df["duration"] * 100 * 10) // cap_time_usec) / 10
        if len(dfds) > 1:
            df["task_name"] = df["task_name"] + "." + dfd.short_name
        df_list.append(df)

        # count number of rows with same task and cpu
        dfsw = DataFrame(gb.size())
        dfsw.reset_index(inplace=True)
        dfsw.rename(columns={0: "count"}, inplace=True)

        if dfd.multiplier > 1.0:
            dfsw["count"] = (dfsw["count"] * dfd.multiplier).astype(int)
        else:
            dfsw["count"] = dfsw["count"].astype(int)
        dfsw_list.append(dfsw)

    if not df_list:
        return None

    df = pandas.concat(df_list)
    df = df.drop("duration", axis=1)
    dfsw = pandas.concat(dfsw_list)
    df = pandas.merge(df, dfsw, on="task_name")
    # Result:
    #             task_name  percent  count
    # 0  ASA.01.vcpu0.1x218     72.0  1998
    # 1  ASA.01.vcpu0.2x208     61.8  2128
    # 2  ASA.02.vcpu0.2x208     58.9  2177

    # transform this into a dict where the key is the task_name and the value
    # is a list [percent, count]
    return df.set_index("task_name").T.to_dict("list")
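The DataFrame(gb.size()) followed by rename(columns={0: "count"}) step above relies on the unnamed 0 column that wrapping an unnamed Series produces; a toy sketch of that idiom follows (newer pandas may instead return a column already named 'size' when as_index=False is used).

import pandas as pd

toy = pd.DataFrame({"task_name": ["vcpu0", "vcpu0", "vcpu1"],
                    "duration": [7954, 5475, 4151]})
# size() on a plain groupby returns an unnamed Series, so the DataFrame
# wrapper yields a column literally named 0, which the rename makes readable.
counts = pd.DataFrame(toy.groupby("task_name").size())
counts.reset_index(inplace=True)
counts.rename(columns={0: "count"}, inplace=True)
print(counts)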
Example #14
 def fix_tickets(
         self, ticket_frame: pd.DataFrame, path_fixes) -> pd.DataFrame:
     ticket_frame.rename(
         columns={'Total changed lines': 'ChangedLines'}, inplace=True)
     ticket_frame = ticket_frame[
         ticket_frame.ChangedLines < 100000]
     ticket_frame = ticket_frame.assign(
         ChangedFiles=ticket_frame['Changed files'].apply(
         partial(self.fix_path_prefixes, path_fixes)))
     fixed_frame = ticket_frame.drop(
         'Changed files', axis=1).sort_values(
         by='CommitDate').reset_index(drop=True)
     fixed_frame.fillna(value={'Found': ''}, axis=0, inplace=True)
     return fixed_frame
Example #15
    def test_rename(self):
        mapping = {
            'A': 'a',
            'B': 'b',
            'C': 'c',
            'D': 'd'
        }

        renamed = self.frame.rename(columns=mapping)
        renamed2 = self.frame.rename(columns=str.lower)

        assert_frame_equal(renamed, renamed2)
        assert_frame_equal(renamed2.rename(columns=str.upper),
                           self.frame, check_names=False)

        # index
        data = {
            'A': {'foo': 0, 'bar': 1}
        }

        # gets sorted alphabetically
        df = DataFrame(data)
        renamed = df.rename(index={'foo': 'bar', 'bar': 'foo'})
        tm.assert_index_equal(renamed.index, pd.Index(['foo', 'bar']))

        renamed = df.rename(index=str.upper)
        tm.assert_index_equal(renamed.index, pd.Index(['BAR', 'FOO']))

        # have to pass something
        pytest.raises(TypeError, self.frame.rename)

        # partial columns
        renamed = self.frame.rename(columns={'C': 'foo', 'D': 'bar'})
        tm.assert_index_equal(renamed.columns,
                              pd.Index(['A', 'B', 'foo', 'bar']))

        # other axis
        renamed = self.frame.T.rename(index={'C': 'foo', 'D': 'bar'})
        tm.assert_index_equal(renamed.index,
                              pd.Index(['A', 'B', 'foo', 'bar']))

        # index with name
        index = Index(['foo', 'bar'], name='name')
        renamer = DataFrame(data, index=index)
        renamed = renamer.rename(index={'foo': 'bar', 'bar': 'foo'})
        tm.assert_index_equal(renamed.index,
                              pd.Index(['bar', 'foo'], name='name'))
        assert renamed.index.name == renamer.index.name
Example #16
    def test_convert_dummies(self):
        df = DataFrame(
            {
                "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
                "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
                "C": np.random.randn(8),
                "D": np.random.randn(8),
            }
        )

        with tm.assert_produces_warning(FutureWarning):
            result = convert_dummies(df, ["A", "B"])
            result2 = convert_dummies(df, ["A", "B"], prefix_sep=".")

        expected = DataFrame(
            {
                "A_foo": [1, 0, 1, 0, 1, 0, 1, 1],
                "A_bar": [0, 1, 0, 1, 0, 1, 0, 0],
                "B_one": [1, 1, 0, 0, 0, 0, 1, 0],
                "B_two": [0, 0, 1, 0, 1, 1, 0, 0],
                "B_three": [0, 0, 0, 1, 0, 0, 0, 1],
                "C": df["C"].values,
                "D": df["D"].values,
            },
            columns=result.columns,
            dtype=float,
        )
        expected2 = expected.rename(columns=lambda x: x.replace("_", "."))

        tm.assert_frame_equal(result, expected)
        tm.assert_frame_equal(result2, expected2)
Example #17
def merge_by_year(year):
	"merge the two dataframe based on Country"

	data1 = Df(transIncome.ix[year])
	data1 = data1.rename(columns = {year:'Income'})
	data1['Country'] = transIncome.columns
	mergedData = pd.merge(data1,countries,on = ['Country'])
	return mergedData
Example #18
    def group_by_time_transform(self):
        dp_df_columns = ['data_date','indicator_id','location_id','value']
        time_grouping =  self.parsed_params['group_by_time']

        # HACKK
        if self.parsed_params['chart_uuid'] ==\
            '5599c516-d2be-4ed0-ab2c-d9e7e5fe33be':

            self.parsed_params['show_missing_data'] = 1
            return self.handle_polio_case_table(dp_df_columns)

        cols = ['data_date','indicator_id','location_id','value']
        dp_df = DataFrame(list(DataPoint.objects.filter(
            location_id__in = self.location_ids,
            indicator_id__in = self.parsed_params['indicator__in']
        ).values(*cols)),columns=cols)

        if not dp_df.empty:
            dp_df = self.get_time_group_series(dp_df)
            gb_df = DataFrame(dp_df\
                .groupby(['indicator_id','time_grouping','location_id'])['value']\
                .sum())\
                .reset_index()
            return gb_df
        # need to look at sublocations if the data isn't available at the current level
        else:
            depth_level, max_depth, sub_location_ids = 0, 3, self.location_ids
            while dp_df.empty and depth_level < max_depth:
                sub_location_ids = Location.objects\
                    .filter(parent_location_id__in=sub_location_ids)\
                    .values_list('id', flat=True)

                dp_df = DataFrame(list(DataPoint.objects.filter(
                    location_id__in = sub_location_ids,
                    indicator_id__in = self.parsed_params['indicator__in']
                ).values(*cols)),columns=cols)
                depth_level += 1

            dp_df = self.get_time_group_series(dp_df)
            if dp_df.empty:
                return []
            location_tree_df = DataFrame(list(LocationTree.objects\
                .filter(location_id__in = sub_location_ids)\
                .values_list('location_id','parent_location_id')),\
                    columns=['location_id','parent_location_id'])

            merged_df = dp_df.merge(location_tree_df)
            filtered_df = merged_df[merged_df['parent_location_id']\
                .isin(self.location_ids)]

            gb_df = DataFrame(filtered_df\
                .groupby(['indicator_id','time_grouping','parent_location_id'])['value']\
                .sum())\
                .reset_index()

            gb_df = gb_df.rename(columns={'parent_location_id' : 'location_id'})
            return gb_df
Example #19
def customer_return_summary(pw_cusattr, pwunsale_tidy, pw_ytdcust):
    '''
    Derives intelligence out of MTC1 data 
    on customer returns. 
    '''
    print('*'*100)
    print('Creating summary of returns.')
    print('*'*100)
    
    len_unique = lambda x: len(pd.unique(x))
    agg_funcs_returns = {'ExtCost': {'DollarsReturned|avg':np.mean, 'DollarsReturned|sum':np.sum},
                         'CasesReturned': {'CasesReturned|avg':np.mean, 'CasesReturned|sum':np.sum},
                         'Invoice':len_unique }
    
    print('\n\n\nAggregating tidy dataset.')
    customer_returns = DataFrame(pwunsale_tidy.groupby(['CustomerId','Customer'])[['ExtCost','CasesReturned']].agg(agg_funcs_returns)).reset_index(drop=False)
    customer_returns.rename(columns={'<lambda>':'Returns|count'}, inplace=True) 
    customer_returns.drop('Customer', inplace=True, axis=1)
    
    print('Merging in YTD sales by Customer')
    customer_returns = customer_returns.merge(pw_ytdcust, on='CustomerId', how='left')
    
    print('Deriving returns as a percent of sales for each Customer.')
    customer_returns['PercentSales'] = np.divide(customer_returns['DollarsReturned|sum'], customer_returns['DollarSales|bycustomer'])
    
    print('Merge in customer attributes.')
    customer_returns = customer_returns.merge(pw_cusattr, on='CustomerId', how='left')
    
    print('Sorting in descending order on Dollars returned.')
    customer_returns.sort_values('DollarsReturned|sum', ascending=False, inplace=True)
    
    print('Reorder columns for readability.\n\n\n')
    reorder_cols = ['CustomerId','Customer','Returns|count',
                    'PercentSales','DollarSales|bycustomer',
                    'DollarsReturned|sum','DollarsReturned|avg',
                    'CasesReturned|sum','CasesReturned|avg',
                    'OnPremise','Latitude','Longitude']
    customer_returns = customer_returns[reorder_cols]
    
    print('*'*100)
    print('Finished summarizing returns.')
    print('*'*100)
    
    return customer_returns
Example #20
    def test_frame_describe_tupleindex(self):

        # GH 14848 - regression from 0.19.0 to 0.19.1
        df1 = DataFrame({'x': [1, 2, 3, 4, 5] * 3,
                         'y': [10, 20, 30, 40, 50] * 3,
                         'z': [100, 200, 300, 400, 500] * 3})
        df1['k'] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5
        df2 = df1.rename(columns={'k': 'key'})
        pytest.raises(ValueError, lambda: df1.groupby('k').describe())
        pytest.raises(ValueError, lambda: df2.groupby('key').describe())
Example #21
    def test_insert_column_bug_4032(self):

        # GH4032, inserting a column and renaming caused errors
        df = DataFrame({'b': [1.1, 2.2]})
        df = df.rename(columns={})
        df.insert(0, 'a', [1, 2])

        result = df.rename(columns={})
        str(result)
        expected = DataFrame([[1, 1.1], [2, 2.2]], columns=['a', 'b'])
        assert_frame_equal(result, expected)
        df.insert(0, 'c', [1.3, 2.3])

        result = df.rename(columns={})
        str(result)

        expected = DataFrame([[1.3, 1, 1.1], [2.3, 2, 2.2]],
                             columns=['c', 'a', 'b'])
        assert_frame_equal(result, expected)
Example #22
 def test_rename_positional(self):
     df = DataFrame(columns=['A', 'B'])
     with tm.assert_produces_warning(FutureWarning) as rec:
         result = df.rename(None, str.lower)
     expected = DataFrame(columns=['a', 'b'])
     tm.assert_frame_equal(result, expected)
     assert len(rec) == 1
     message = str(rec[0].message)
     assert 'rename' in message
     assert 'Use named arguments' in message
Example #23
    def test_rename_bug2(self):
        # GH 19497
        # rename was changing Index to MultiIndex if Index contained tuples

        df = DataFrame(data=np.arange(3), index=[(0, 0), (1, 1), (2, 2)],
                       columns=["a"])
        df = df.rename({(1, 1): (5, 4)}, axis="index")
        expected = DataFrame(data=np.arange(3), index=[(0, 0), (5, 4), (2, 2)],
                             columns=["a"])
        assert_frame_equal(df, expected)
Example #24
def nearestNeighborsSetup(filename, stateList):
  df_specimens = formatChecker(filename)
  print 'Getting the weather stations'
  with open('input/acis_station_ID.pickle') as f:
      weatherStationsMetaData = cPickle.load(f)

  # weatherStationsMetaData = weatherStations(stateList)
  # weatherStationsMetaData = read_csv('weatherStation/acis_station_ID.csv')
  df_stations = DataFrame.from_dict(weatherStationsMetaData, orient='index', dtype=None)
  '''Loads the lat/long coordinates of the specimens and weather stations into numpy arrays.
  nearestNeighborsResults() will return the k nearest stations along with their index values.
  The index is then replaced by the UID to match the ACIS data server.'''
  #Number of points
  np1 = np.array(df_specimens['longitude']).size
  np2 = np.array(df_stations['longitude']).size

  #Search radius
  r = .25

  #Number of nearest stations returned
  k = 10

  d1 = np.empty((np1, 2))
  d2 = np.empty((np2, 2))
  d1[:, 0] = np.array(df_specimens['latitude'])
  d1[:, 1] = np.array(df_specimens['longitude'])

  d2[:, 0] = np.array(df_stations['latitude'])
  d2[:, 1] = np.array(df_stations['longitude'])
 
  result, distance = nearestNeighborsResults(d1.copy(), d2.copy(), r, k)
  columnindex = []
  closestStationList = [nearestNeighborsColumnString(x) for x in range(k)]
  for f in closestStationList: columnindex.append(f()),
  #temp variable for 0-N array
  t1 = np.arange(np2)
  #temp variable for 'uid' ID
  t2 = np.array(df_stations['uid'])
  df_results = DataFrame(result, columns=columnindex)
  #Creates a Pandas DataFrame
  uid_index = DataFrame({'0_closest_weather_station':  t1,
    'uid': t2})

  for index, column_name in enumerate(columnindex):
    temp = uid_index.rename(columns={'0_closest_weather_station': column_name, 'uid': column_name + "s"})
    df_results = df_results.reset_index().merge(temp, how='left', on= column_name, sort=False).sort('index')
    
    if index != 0:
      del df_results['level_0']

    del df_results[column_name]

  del df_results['index']
  df_results = df_results.reset_index()
  return concat([df_specimens, df_results], axis=1), distance, weatherStationsMetaData
Example #25
def deal_string02():
    import json
    db=json.load(open(u'D:\study\书籍\python\pydata-book-master\pydata-book-master\ch07\\foods-2011-10-03.json'))
    print len(db)
    print db[0]
    print db[0].keys()
    print db[0]['nutrients'][0]
    nutrients=DataFrame(db[0]['nutrients'])
    print nutrients[:7]
    info_keys=['description','group','id','manufacturer']
    info=DataFrame(db,columns=info_keys)
    print info[:5]
    print pd.value_counts(info.group)[:10]

    nutrients=[]
    for rec in db:
        fnuts=DataFrame(rec['nutrients'])
        fnuts['id']=rec['id']
        nutrients.append(fnuts)
    nutrients=pd.concat(nutrients,ignore_index=True)
    print nutrients
    print nutrients.duplicated().sum()
    nutrients=nutrients.drop_duplicates()
    col_mapping={'description':'food','group':'fgroup'}
    info=info.rename(columns=col_mapping,copy=False)
    print info
    col_mapping={'description':'nutrient','group':'nutgroup'}
    nutrients=nutrients.rename(columns=col_mapping,copy=False)
    print nutrients
    ndata=pd.merge(nutrients,info,on='id',how='outer')
    print ndata
    print ndata.ix[3000]
    result=ndata.groupby(['nutrient','fgroup'])['value'].quantile(0.5)
    # print result
    result['Zinc, Zn'].sort_values().plot(kind='barh')
    by_nutrient=ndata.groupby(['nutgroup','nutrient'])
    get_maximum=lambda x:x.xs(x.value.idxmax())
    get_minimum=lambda x:x.xs(x.value.idxmin())
    max_foods=by_nutrient.apply(get_maximum)[['value','food']]
    max_foods.food=max_foods.food.str[:50]
    print max_foods.ix['Amino Acids']['food']
Example #26
def get_plate_data(path,c):
    """ Get plate data, drop empty columns, drop selected columns, 
        rename columns, add normalized columns. """
    return thread_first(path,
                        from_file,
                        (str.replace,'\r',''),
                        StringIO,
                        pd.read_csv(delimiter=c['delimiter'], skiprows=c['skiprows']),
                        df.dropna(axis=1,how='all'),
                        (drop_matching_columns,c['dropcols']),
                        df.rename(columns=c['colrename']),
                        (add_normalized_columns,c['normcols']))
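The thread_first pipeline above is hard to read in isolation; below is a hedged, sequential sketch of what it appears to do. from_file, drop_matching_columns and add_normalized_columns are the repo's own helpers and are only referenced in comments as assumptions.

from io import StringIO
import pandas as pd

def get_plate_data_sequential(path, c):
    # Read the raw file and strip carriage returns, as the pipeline does.
    with open(path) as handle:
        text = handle.read().replace('\r', '')
    frame = pd.read_csv(StringIO(text),
                        delimiter=c['delimiter'],
                        skiprows=c['skiprows'])
    frame = frame.dropna(axis=1, how='all')        # drop empty columns
    # frame = drop_matching_columns(frame, c['dropcols'])    # repo-specific helper (assumed)
    frame = frame.rename(columns=c['colrename'])   # rename columns per config
    # frame = add_normalized_columns(frame, c['normcols'])   # repo-specific helper (assumed)
    return frame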
Example #27
    def parse_data(self, articles):
        """ Responsible to parse articles in order to extract data.
        Data is extracted as a DataFrame containing the following columns:
        - Article metadata: only the metadata defined in self.metadata_column are extracted
        - Article tags: all tags are extracted, the name defined in self.tags_column are used to rename columns
        Data is indexed by a generated ID (integer).

        :param articles: The articles to parse.
        """
        tags = []
        metadata = []
        # TODO: not the most efficient way to do this, I think.
        for article in articles:
            if hasattr(article, "tags"):
                # Extracting all tags name from an article and putting them in a Series
                tags.append(
                    Series([tag.name for tag in article.tags], ["tag_" + str(x) for x in range(len(article.tags))])
                )
            # Selecting metadata, only the ones specified in the columns
            metadata.append(
                Series(
                    dict([(i, article.metadata[i]) for i in self.metadata_columns if i in article.metadata]),
                    self.metadata_columns,
                )
            )
        # Creating the tags DataFrame
        tags_data_frame = DataFrame(tags)
        # Renaming columns, leaving the remaining ones with the generated name "tag_"
        # Mapping current column names to the new ones in order to make a replacement
        if self.tag_columns is not None:
            replacement = dict(zip(tags_data_frame.columns.get_values()[: len(self.tag_columns)], self.tag_columns))
            # Inplace means no copy
            tags_data_frame.rename(columns=replacement, inplace=True)
        # Creating the metadata DataFrame
        metadata_data_frame = DataFrame(metadata)
        # Replacing data in column category by its string value
        # TODO maybe a better way to do that, it seems a bit ugly
        metadata_data_frame["category"] = metadata_data_frame["category"].apply(lambda x: str(x))
        # Merging the two DataFrame together
        self.data = metadata_data_frame.join(tags_data_frame)
Example #28
    def test_pivot_integer_columns(self):
        # caused by upstream bug in unstack

        d = datetime.date.min
        data = list(product(['foo', 'bar'], ['A', 'B', 'C'], ['x1', 'x2'],
                            [d + datetime.timedelta(i) for i in range(20)], [1.0]))
        df = DataFrame(data)
        table = df.pivot_table(values=4, index=[0, 1, 3], columns=[2])

        df2 = df.rename(columns=str)
        table2 = df2.pivot_table(values='4', index=['0', '1', '3'], columns=['2'])

        tm.assert_frame_equal(table, table2, check_names=False)
Example #29
def test_frame_describe_tupleindex():

    # GH 14848 - regression from 0.19.0 to 0.19.1
    df1 = DataFrame({'x': [1, 2, 3, 4, 5] * 3,
                     'y': [10, 20, 30, 40, 50] * 3,
                     'z': [100, 200, 300, 400, 500] * 3})
    df1['k'] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5
    df2 = df1.rename(columns={'k': 'key'})
    msg = "Names should be list-like for a MultiIndex"
    with pytest.raises(ValueError, match=msg):
        df1.groupby('k').describe()
    with pytest.raises(ValueError, match=msg):
        df2.groupby('key').describe()
Example #30
    def get_results_dataframe(self, default = False, index_by_code = False):
        '''
        Formats data into a dataframe
        '''
        datas = self._compute()
        self._compute_uc()
        uc = self.uc
        dfs = dict()

        for scenario, dico in datas.iteritems():
            data = dico['data']
            data_default = dico['default']

            data_dict = dict()
            index = []
            
            if default is True:
                data = data_default
            
            for row in data:
                if row.desc not in ('root',):
                    if row.code == 'revdisp':
                        revdisp = row.vals
                    if index_by_code is True:
                        index.append(row.code)
                        data_dict[row.code] = row.vals
                    else:
                        index.append(row.desc)
                        data_dict[row.desc] = row.vals
            
            df = DataFrame(data_dict).T
            
            df = df.reindex(index)
            df = df.rename(columns = {0: scenario})
            nivvie = revdisp/uc[scenario] # TODO: include savings !!
            df = concat([df, 
                         DataFrame({scenario: nivvie}, index=['nivvie'])
                         ])
            dfs[scenario] = df
            
        
        first = True

        for df in dfs.itervalues():
            if first:
                df_final = df
                first = False
            else:
                df_final = concat([df_final, df], axis=1, join ="inner")
        
        return df_final