Example #1
    def test_ambiguous_warns(self):
        df = DataFrame({"A": [1, 2]})
        with tm.assert_produces_warning(FutureWarning):
            df.rename(id, id)

        with tm.assert_produces_warning(FutureWarning):
            df.rename({0: 10}, {"A": "B"})
Example #2
class Rename(object):

    def setup(self):
        N = 10**3
        self.df = DataFrame(np.random.randn(N * 10, N))
        self.idx = np.arange(4 * N, 7 * N)
        self.dict_idx = {k: k for k in self.idx}
        self.df2 = DataFrame(
            {c: {0: np.random.randint(0, 2, N).astype(np.bool_),
                 1: np.random.randint(0, N, N).astype(np.int16),
                 2: np.random.randint(0, N, N).astype(np.int32),
                 3: np.random.randint(0, N, N).astype(np.int64)}
                [np.random.randint(0, 4)] for c in range(N)})

    def time_rename_single(self):
        self.df.rename({0: 0})

    def time_rename_axis0(self):
        self.df.rename(self.dict_idx)

    def time_rename_axis1(self):
        self.df.rename(columns=self.dict_idx)

    def time_rename_both_axes(self):
        self.df.rename(index=self.dict_idx, columns=self.dict_idx)

    def time_dict_rename_both_axes(self):
        self.df.rename(index=self.dict_idx, columns=self.dict_idx)
Example #3
    def trialToOneRow(dfs, lonumberfields, oldnewsamsecnamepairs):
        collapsedFields = {field: [dfs[field].values]
                        for field in lonumberfields}
        collapsedTrial = DataFrame(collapsedFields)
        collapsedTrial.rename(columns=oldnewsamsecnamepairs, inplace=True)

        return collapsedTrial
def returns_customer_product_time(pwunsale_tidy, pw_ytdcust, pw_cusattr):
    '''
    Meant to feed into a Pivot requested by Mitch Turner.
    
    Aggregates the same as above but includes time and product data.
    '''
    dat = pwunsale_tidy['Date'].tolist()
    pwunsale_tidy['Month'] = [d.strftime('%B') for d in dat]    
    
    print('Aggregating custom pivot for Mitch.')
    len_unique = lambda x: len(pd.unique(x))
    agg_funcs_returns = {'ExtCost': {'DollarsReturned|avg':np.mean, 'DollarsReturned|sum':np.sum},
                         'CasesReturned': {'CasesReturned|avg':np.mean, 'CasesReturned|sum':np.sum},
                         'Invoice':len_unique }
    
    custom_cols = ['Month','CustomerId','Customer','ProductId','Product']    
    
    customer_returns = DataFrame(pwunsale_tidy.groupby(custom_cols)[['ExtCost','CasesReturned']].agg(agg_funcs_returns)).reset_index(drop=False)
    customer_returns.rename(columns={'<lambda>':'Returns|count'}, inplace=True) 
    customer_returns.drop('Customer', inplace=True, axis=1)
    
    print('Merging in YTD sales by Customer')
    customer_returns = customer_returns.merge(pw_ytdcust, on='CustomerId', how='left')
    
    print('Deriving returns as a percent of sales for each Customer.')
    customer_returns['PercentSales'] = np.divide(customer_returns['DollarsReturned|sum'], customer_returns['DollarSales|bycustomer'])
    
    print('Merge in customer attributes.')
    customer_returns = customer_returns.merge(pw_cusattr, on='CustomerId', how='left')
    
    print('Sorting in descending order on Dollars returned.')
    customer_returns.sort_values('DollarsReturned|sum', ascending=False, inplace=True)

    return customer_returns
def featureize(df, vectorizers, verbose=False):
    """
      Featurize an enhanced http dataframe

      Parameters
      ----------
      df : dataframe
          The enhanced HTTP log dataframe
      vectorizers : {String -> TfidfVectorizer}
            A map of feature -> vectorizer
      verbose: boolean, optional
          Controls Verbosity level

      Returns
      -------
      featureMatrix : numeric dataframe
            A featurized dataframe

    """
    if verbose: print('\nExtracting features')
    
    bow_features = []
    #featurize using the vectorizers.
    
    for feature in ['user_agent','uri','referrer','host', 'subdomain', 'method','status_code','resp_p_str', 'URIparams', 'browser_string', 'tld']:
        if verbose: print('Featurizing %s' % feature)
        single_feature_matrix = vectorizers[feature].transform(df[feature].astype(str))
        if verbose: print('  Dim of %s: %s' % (feature,single_feature_matrix.shape[1]))
        single_df = DataFrame(single_feature_matrix.toarray())
        single_df.rename(columns=lambda x: feature+"."+vectorizers[feature].get_feature_names()[x], inplace=True)
        bow_features.append(single_df)

    featureMatrix = pd.concat(bow_features, axis=1)
    
    #add some other numeric features that are functions of columns
    featureMatrix['domainNameLength'] = df['host'].apply(len)
    featureMatrix['domainNameDots'] = df['host'].apply(lambda dn: dn.count('.'))
    featureMatrix['uriSlashes'] = df['uri'].apply(lambda dn: dn.count('/'))
    featureMatrix['userAgentLength'] = df['user_agent'].apply(len)
    featureMatrix['userAgentEntropy'] = df['user_agent'].apply(H)
    featureMatrix['subdomainEntropy'] = df['subdomain'].apply(H)
    featureMatrix['request_body_len'] = df['request_body_len']
    featureMatrix['response_body_len'] = df['response_body_len']
    featureMatrix['referrerPresent'] = df['referrer'].apply(lambda r: 0.0 if (r=='-') else 1.0)
    
    def countParams(uri):
        fullUri = 'http://bogus.com/'+uri
        parseResult = parse_qs(urlparse(fullUri).query)
        return len(parseResult)
    
    featureMatrix['numURIParams'] = df['uri'].apply(countParams)
    featureMatrix['URIParamsKeyEntropy'] = df['URIparams'].apply(H)
    featureMatrix['URIParamsTokensEntropy'] = df['URItokens'].apply(H)

    if verbose: print('Feature matrix generated with %s columns' % featureMatrix.shape[1])

    return featureMatrix
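
# A hedged usage sketch (not part of the original module): build the
# feature -> TfidfVectorizer map that featureize() expects, one vectorizer per
# text column. The analyzer settings and frame names here are assumptions.
from sklearn.feature_extraction.text import TfidfVectorizer

def build_vectorizers(train_df, text_columns):
    vectorizers = {}
    for column in text_columns:
        vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(1, 3))
        vectorizer.fit(train_df[column].astype(str))
        vectorizers[column] = vectorizer
    return vectorizers

# The map must cover every feature name listed in featureize(), e.g.:
# vectorizers = build_vectorizers(http_df, ['user_agent', 'uri', 'referrer', 'host',
#                                           'subdomain', 'method', 'status_code',
#                                           'resp_p_str', 'URIparams', 'browser_string', 'tld'])
# feature_matrix = featureize(http_df, vectorizers, verbose=True)
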
def change_axis01():
    data = DataFrame(np.arange(12).reshape((3, 4)), index=['Ohio', 'Colorado', 'New York'], columns=['one', 'two', 'three', 'four'])
    print data
    print data.index.map(str.upper)
    data.index=data.index.map(str.upper)
    print data
    print data.rename(index=str.title,columns=str.upper)
    data.rename(index={'OHIO': 'INDIANA'}, columns={'three': 'peekaboo'}, inplace=True)  # inplace rename returns None; the frame is printed below
    print data
def imputation_loyer(year):

    erf = create_comparable_erf_data_frame(year)
    erf = erf[['logt', 'hnph2', 'iaat', 'mdiplo', 'mtybd', 'tu99_recoded', 'magtr', 'mcs8', 'deci', 'wprm', 'ident']]
    erf = erf.dropna(how = 'any')  # TODO: run a check before dropping the rows with NAs

    Logt = create_comparable_logement_data_frame(year)

    Logt = Logt.dropna(how = 'any')

    allvars = ['logt', 'hnph2', 'iaat', 'mdiplo', 'mtybd', 'tu99_recoded', 'magtr', 'mcs8', 'deci']
    classes = ['magtr', 'tu99_recoded']
    matchvars = list(set(allvars) - set(classes))

    for variable in allvars:
        count_NA(variable, Logt)
        count_NA(variable, erf)

    erf['mcs8'] = erf['mcs8'].astype(int)

    rpy2.robjects.pandas2ri.activate()  # lets rpy2 convert the dataframes; pandas2ri doesn't exist anymore in rpy2
#    com.convert_to_r_dataframe() TODO: probably to be removed
    try:
        sm = importr("StatMatch")  # Launch R you need to have StatMatch installed in R
    except:
        sm = importr("StatMatch", lib_loc = STATMATCH_LIB_LOCATION)
    out_nnd = sm.NND_hotdeck(data_rec = erf,
                             data_don = Logt,
                             match_vars = vectors.StrVector(matchvars),
                             don_class = vectors.StrVector(classes),
                             dist_fun = "Gower",
                             )
    fill_erf_nnd = sm.create_fused(data_rec = erf,
                                   data_don = Logt,
                                   mtc_ids = out_nnd[0],
                                   z_vars = vectors.StrVector(["lmlm"]),
                                   )
    del allvars, matchvars, classes, out_nnd
    gc.collect()

    fill_erf_nnd = com.convert_robj(fill_erf_nnd)
    fill_erf_nnd = DataFrame(fill_erf_nnd)
    fill_erf_nnd.rename(columns={'lmlm': 'loym'}, inplace = True)

    loy_imput = fill_erf_nnd[['ident', 'loym']]

    erfmenm = load_temp(name = "menagem", year = year)

    for var in ["loym", "loym_x", "loym_y", "loym_z"]:
        if var in erfmenm:
            del erfmenm[var]
            log.info("{} have been deleted".format(var))

    erfmenm = erfmenm.merge(loy_imput, on='ident', how='left')
    assert 'loym' in erfmenm.columns, u"La variable loym n'est pas présente dans erfmenm"
    save_temp(erfmenm, name = "menagem", year=year)
Example #8
    def test_rename_axis_style(self):
        # https://github.com/pandas-dev/pandas/issues/12392
        df = DataFrame({"A": [1, 2], "B": [1, 2]}, index=['X', 'Y'])
        expected = DataFrame({"a": [1, 2], "b": [1, 2]}, index=['X', 'Y'])

        result = df.rename(str.lower, axis=1)
        tm.assert_frame_equal(result, expected)

        result = df.rename(str.lower, axis='columns')
        tm.assert_frame_equal(result, expected)

        result = df.rename({"A": 'a', 'B': 'b'}, axis=1)
        tm.assert_frame_equal(result, expected)

        result = df.rename({"A": 'a', 'B': 'b'}, axis='columns')
        tm.assert_frame_equal(result, expected)

        # Index
        expected = DataFrame({"A": [1, 2], "B": [1, 2]}, index=['x', 'y'])
        result = df.rename(str.lower, axis=0)
        tm.assert_frame_equal(result, expected)

        result = df.rename(str.lower, axis='index')
        tm.assert_frame_equal(result, expected)

        result = df.rename({'X': 'x', 'Y': 'y'}, axis=0)
        tm.assert_frame_equal(result, expected)

        result = df.rename({'X': 'x', 'Y': 'y'}, axis='index')
        tm.assert_frame_equal(result, expected)

        result = df.rename(mapper=str.lower, axis='index')
        tm.assert_frame_equal(result, expected)
def slide_21():
    import json
    db = json.load(open(FOODJSONPATH))
    print len(db)

    print db[0].keys()
    print db[0]['nutrients'][0]

    nutrients = DataFrame(db[0]['nutrients'])
    print nutrients[:7]

    info_keys = ['description', 'group', 'id', 'manufacturer']
    info = DataFrame(db, columns=info_keys)
    print info[:5]

    print pd.value_counts(info.group)[:10]

    print "今から全部のnutrientsを扱うよ"
    nutrients = []

    for rec in db:
        fnuts = DataFrame(rec['nutrients'])
        fnuts['id'] = rec['id']
        nutrients.append(fnuts)

    nutrients = pd.concat(nutrients, ignore_index=True)
    print "なんか重複多い"
    print nutrients.duplicated().sum()
    nutrients = nutrients.drop_duplicates()

    print "infoとnutrients両方にdescriptionとgroupがあるから変えよう"
    col_mapping = {'description': 'food', 'group': 'fgroup'}
    info = info.rename(columns=col_mapping, copy=False)

    col_mapping = {'description': 'nutrient', 'group': 'nutgroup'}
    nutrients = nutrients.rename(columns=col_mapping, copy=False)

    ndata = pd.merge(nutrients, info, on='id', how='outer')
    print ndata.ix[30000]

    result = ndata.groupby(['nutrient', 'fgroup'])['value'].quantile(0.5)
    result['Zinc, Zn'].order().plot(kind='barh')
    plt.show()

    by_nutrient = ndata.groupby(['nutgroup', 'nutrient'])
    get_maximum = lambda x: x.xs(x.value.idxmax())
    get_minimum = lambda x: x.xs(x.value.idxmin())

    max_foods = by_nutrient.apply(get_maximum)[['value', 'food']]

    max_foods.food = max_foods.food.str[:50]

    print max_foods.ix['Amino Acids']['food']
Example #10
def append_village_areas(divname):
    im_vil = pd.read_csv('../data/%s_village_images.csv' % divname.lower())
    shape_helper = ShapeHelper('../data/shapefiles/fixed_village_shapefiles/%s/%s.shp' % (divname.lower(), divname.lower()),
                               lat_offset, lon_offset)
    areas = shape_helper.get_shape_areas('village')
    areas_df = DataFrame(areas, index=['area'])
    areas_df = areas_df.transpose()
    areas_df.reset_index(inplace=True)
    areas_df.rename(columns={'index': 'village'}, inplace=True)
    im_vil_areas = pd.merge(im_vil, areas_df, how='left')
    im_vil_areas.set_index('image', inplace=True)
    im_vil_areas.to_csv('../data/%s_village_areas_images.csv' % divname.lower())
    def obs_for_station_as_df(self, station_id, datatype, limit):
    	obs_results = self.obs_for_station_q(station_id, datatype, limit)
    	if not FbStationsApi.check_ok(obs_results, "obs_for_station_q"):
    		return None

    	obs_data = obs_results.json()['results']
    	if bool(obs_data):
    		obs_results_df = DataFrame(obs_data)
    		obs_results_df.rename(columns = {'value' : datatype}, inplace=True)
    		obs_results_df['date'] = obs_results_df['date'].astype('datetime64[ns]')
    		return obs_results_df
    	else:
    		return None
Example #12
    def fix_tickets(
            self, ticket_frame: pd.DataFrame, path_fixes) -> pd.DataFrame:
        ticket_frame.rename(
            columns={'Total changed lines': 'ChangedLines'}, inplace=True)
        ticket_frame = ticket_frame[
            ticket_frame.ChangedLines < 100000]
        ticket_frame = ticket_frame.assign(
            ChangedFiles=ticket_frame['Changed files'].apply(
                partial(self.fix_path_prefixes, path_fixes)))
        fixed_frame = ticket_frame.drop(
            'Changed files', axis=1).sort_values(
            by='CommitDate').reset_index(drop=True)
        fixed_frame.fillna(value={'Found': ''}, axis=0, inplace=True)
        return fixed_frame
    def forecast_as_df(self, lat, lon, datatype, limit):
    	forecast_results = self.forecast_q(lat, lon, limit)
    	if not FbForecastApi.check_ok(forecast_results, "forecast_q"):
    		return None

    	forecast_data = forecast_results.json()['results']
    	if bool(forecast_data):
    		forecast_results_df = DataFrame(forecast_data)
    		forecast_results_df.rename(columns = {'value' : datatype}, inplace=True)
    		forecast_results_df['forecast_date'] = forecast_results_df['forecast_date'].astype('datetime64[ns]')
    		forecast_results_df['model_date'] = forecast_results_df['model_date'].astype('datetime64[ns]')
    		return forecast_results_df
    	else:
    		return None
def get_cpu_sw_map(dfds, cap_time_usec, task_re):
    df_list = []
    dfsw_list = []
    for dfd in dfds:
        df = filter_df_core(dfd.df, task_re, True)
        # at this point we have a set of df that look like this:
        #         task_name  duration
        # 0     ASA.1.vcpu0      7954
        # 1     ASA.1.vcpu0      5475
        # 2     ASA.1.vcpu0      4151
        if df.empty:
            continue
        gb = df.groupby("task_name", as_index=False)

        # sum all duration for each task
        df = gb.aggregate(np.sum)
        if dfd.multiplier > 1.0:
            df["duration"] = (df["duration"] * dfd.multiplier).astype(int)
        df["percent"] = ((df["duration"] * 100 * 10) // cap_time_usec) / 10
        if len(dfds) > 1:
            df["task_name"] = df["task_name"] + "." + dfd.short_name
        df_list.append(df)

        # count number of rows with same task and cpu
        dfsw = DataFrame(gb.size())
        dfsw.reset_index(inplace=True)
        dfsw.rename(columns={0: "count"}, inplace=True)

        if dfd.multiplier > 1.0:
            dfsw["count"] = (dfsw["count"] * dfd.multiplier).astype(int)
        else:
            dfsw["count"] = dfsw["count"].astype(int)
        dfsw_list.append(dfsw)

    if not df_list:
        return None

    df = pandas.concat(df_list)
    df = df.drop("duration", axis=1)
    dfsw = pandas.concat(dfsw_list)
    df = pandas.merge(df, dfsw, on="task_name")
    # Result:
    #             task_name  percent  count
    # 0  ASA.01.vcpu0.1x218     72.0  1998
    # 1  ASA.01.vcpu0.2x208     61.8  2128
    # 2  ASA.02.vcpu0.2x208     58.9  2177

    # transform this into a dict where the key is the task_name and the value
    # is a list [percent, count]
    return df.set_index("task_name").T.to_dict("list")
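
# A minimal toy demonstration (not from the original module) of the
# set_index().T.to_dict("list") idiom used above, with values taken from the
# result comment:
import pandas

toy = pandas.DataFrame({
    "task_name": ["ASA.01.vcpu0.1x218", "ASA.01.vcpu0.2x208"],
    "percent": [72.0, 61.8],
    "count": [1998, 2128],
})
# each task_name becomes a key mapping to [percent, count]
print(toy.set_index("task_name").T.to_dict("list"))
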
Example #15
    def test_rename(self):
        mapping = {
            'A': 'a',
            'B': 'b',
            'C': 'c',
            'D': 'd'
        }

        renamed = self.frame.rename(columns=mapping)
        renamed2 = self.frame.rename(columns=str.lower)

        assert_frame_equal(renamed, renamed2)
        assert_frame_equal(renamed2.rename(columns=str.upper),
                           self.frame, check_names=False)

        # index
        data = {
            'A': {'foo': 0, 'bar': 1}
        }

        # gets sorted alphabetical
        df = DataFrame(data)
        renamed = df.rename(index={'foo': 'bar', 'bar': 'foo'})
        tm.assert_index_equal(renamed.index, pd.Index(['foo', 'bar']))

        renamed = df.rename(index=str.upper)
        tm.assert_index_equal(renamed.index, pd.Index(['BAR', 'FOO']))

        # have to pass something
        pytest.raises(TypeError, self.frame.rename)

        # partial columns
        renamed = self.frame.rename(columns={'C': 'foo', 'D': 'bar'})
        tm.assert_index_equal(renamed.columns,
                              pd.Index(['A', 'B', 'foo', 'bar']))

        # other axis
        renamed = self.frame.T.rename(index={'C': 'foo', 'D': 'bar'})
        tm.assert_index_equal(renamed.index,
                              pd.Index(['A', 'B', 'foo', 'bar']))

        # index with name
        index = Index(['foo', 'bar'], name='name')
        renamer = DataFrame(data, index=index)
        renamed = renamer.rename(index={'foo': 'bar', 'bar': 'foo'})
        tm.assert_index_equal(renamed.index,
                              pd.Index(['bar', 'foo'], name='name'))
        assert renamed.index.name == renamer.index.name
Example #16
    def test_convert_dummies(self):
        df = DataFrame(
            {
                "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
                "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
                "C": np.random.randn(8),
                "D": np.random.randn(8),
            }
        )

        with tm.assert_produces_warning(FutureWarning):
            result = convert_dummies(df, ["A", "B"])
            result2 = convert_dummies(df, ["A", "B"], prefix_sep=".")

        expected = DataFrame(
            {
                "A_foo": [1, 0, 1, 0, 1, 0, 1, 1],
                "A_bar": [0, 1, 0, 1, 0, 1, 0, 0],
                "B_one": [1, 1, 0, 0, 0, 0, 1, 0],
                "B_two": [0, 0, 1, 0, 1, 1, 0, 0],
                "B_three": [0, 0, 0, 1, 0, 0, 0, 1],
                "C": df["C"].values,
                "D": df["D"].values,
            },
            columns=result.columns,
            dtype=float,
        )
        expected2 = expected.rename(columns=lambda x: x.replace("_", "."))

        tm.assert_frame_equal(result, expected)
        tm.assert_frame_equal(result2, expected2)
Example #17
def merge_by_year(year):
	"merge the two dataframe based on Country"

	data1 = Df(transIncome.ix[year])
	data1 = data1.rename(columns = {year:'Income'})
	data1['Country'] = transIncome.columns
	mergedData = pd.merge(data1,countries,on = ['Country'])
	return mergedData
Example #18
    def group_by_time_transform(self):
        dp_df_columns = ['data_date','indicator_id','location_id','value']
        time_grouping =  self.parsed_params['group_by_time']

        # HACKK
        if self.parsed_params['chart_uuid'] ==\
            '5599c516-d2be-4ed0-ab2c-d9e7e5fe33be':

            self.parsed_params['show_missing_data'] = 1
            return self.handle_polio_case_table(dp_df_columns)

        cols = ['data_date','indicator_id','location_id','value']
        dp_df = DataFrame(list(DataPoint.objects.filter(
            location_id__in = self.location_ids,
            indicator_id__in = self.parsed_params['indicator__in']
        ).values(*cols)),columns=cols)

        if not dp_df.empty:
            dp_df = self.get_time_group_series(dp_df)
            gb_df = DataFrame(dp_df\
                .groupby(['indicator_id','time_grouping','location_id'])['value']\
                .sum())\
                .reset_index()
            return gb_df
        # need to look at sublocations if the data isn't available at the current level
        else:
            depth_level, max_depth, sub_location_ids = 0, 3, self.location_ids
            while dp_df.empty and depth_level < max_depth:
                sub_location_ids = Location.objects\
                    .filter(parent_location_id__in=sub_location_ids)\
                    .values_list('id', flat=True)

                dp_df = DataFrame(list(DataPoint.objects.filter(
                    location_id__in = sub_location_ids,
                    indicator_id__in = self.parsed_params['indicator__in']
                ).values(*cols)),columns=cols)
                depth_level += 1

            dp_df = self.get_time_group_series(dp_df)
            if dp_df.empty:
                return []
            location_tree_df = DataFrame(list(LocationTree.objects\
                .filter(location_id__in = sub_location_ids)\
                .values_list('location_id','parent_location_id')),\
                    columns=['location_id','parent_location_id'])

            merged_df = dp_df.merge(location_tree_df)
            filtered_df = merged_df[merged_df['parent_location_id']\
                .isin(self.location_ids)]

            gb_df = DataFrame(filtered_df\
                .groupby(['indicator_id','time_grouping','parent_location_id'])['value']\
                .sum())\
                .reset_index()

            gb_df = gb_df.rename(columns={'parent_location_id' : 'location_id'})
            return gb_df
def customer_return_summary(pw_cusattr, pwunsale_tidy, pw_ytdcust):
    '''
    Derives intelligence out of MTC1 data 
    on customer returns. 
    '''
    print('*'*100)
    print('Creating summary of returns.')
    print('*'*100)
    
    len_unique = lambda x: len(pd.unique(x))
    agg_funcs_returns = {'ExtCost': {'DollarsReturned|avg':np.mean, 'DollarsReturned|sum':np.sum},
                         'CasesReturned': {'CasesReturned|avg':np.mean, 'CasesReturned|sum':np.sum},
                         'Invoice':len_unique }
    
    print('\n\n\nAggregating tidy dataset.')
    customer_returns = DataFrame(pwunsale_tidy.groupby(['CustomerId','Customer'])[['ExtCost','CasesReturned']].agg(agg_funcs_returns)).reset_index(drop=False)
    customer_returns.rename(columns={'<lambda>':'Returns|count'}, inplace=True) 
    customer_returns.drop('Customer', inplace=True, axis=1)
    
    print('Merging in YTD sales by Customer')
    customer_returns = customer_returns.merge(pw_ytdcust, on='CustomerId', how='left')
    
    print('Deriving returns as a percent of sales for each Customer.')
    customer_returns['PercentSales'] = np.divide(customer_returns['DollarsReturned|sum'], customer_returns['DollarSales|bycustomer'])
    
    print('Merge in customer attributes.')
    customer_returns = customer_returns.merge(pw_cusattr, on='CustomerId', how='left')
    
    print('Sorting in descending order on Dollars returned.')
    customer_returns.sort_values('DollarsReturned|sum', ascending=False, inplace=True)
    
    print('Reorder columns for readability.\n\n\n')
    reorder_cols = ['CustomerId','Customer','Returns|count',
                    'PercentSales','DollarSales|bycustomer',
                    'DollarsReturned|sum','DollarsReturned|avg',
                    'CasesReturned|sum','CasesReturned|avg',
                    'OnPremise','Latitude','Longitude']
    customer_returns = customer_returns[reorder_cols]
    
    print('*'*100)
    print('Finished summarizing returns.')
    print('*'*100)
    
    return customer_returns
Example #20
    def test_frame_describe_tupleindex(self):

        # GH 14848 - regression from 0.19.0 to 0.19.1
        df1 = DataFrame({'x': [1, 2, 3, 4, 5] * 3,
                         'y': [10, 20, 30, 40, 50] * 3,
                         'z': [100, 200, 300, 400, 500] * 3})
        df1['k'] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5
        df2 = df1.rename(columns={'k': 'key'})
        pytest.raises(ValueError, lambda: df1.groupby('k').describe())
        pytest.raises(ValueError, lambda: df2.groupby('key').describe())
Example #21
    def test_rename_positional(self):
        df = DataFrame(columns=['A', 'B'])
        with tm.assert_produces_warning(FutureWarning) as rec:
            result = df.rename(None, str.lower)
        expected = DataFrame(columns=['a', 'b'])
        tm.assert_frame_equal(result, expected)
        assert len(rec) == 1
        message = str(rec[0].message)
        assert 'rename' in message
        assert 'Use named arguments' in message
    def test_insert_column_bug_4032(self):

        # GH4032, inserting a column and renaming causing errors
        df = DataFrame({'b': [1.1, 2.2]})
        df = df.rename(columns={})
        df.insert(0, 'a', [1, 2])

        result = df.rename(columns={})
        str(result)
        expected = DataFrame([[1, 1.1], [2, 2.2]], columns=['a', 'b'])
        assert_frame_equal(result, expected)
        df.insert(0, 'c', [1.3, 2.3])

        result = df.rename(columns={})
        str(result)

        expected = DataFrame([[1.3, 1, 1.1], [2.3, 2, 2.2]],
                             columns=['c', 'a', 'b'])
        assert_frame_equal(result, expected)
Example #23
    def test_rename_bug2(self):
        # GH 19497
        # rename was changing Index to MultiIndex if Index contained tuples

        df = DataFrame(data=np.arange(3), index=[(0, 0), (1, 1), (2, 2)],
                       columns=["a"])
        df = df.rename({(1, 1): (5, 4)}, axis="index")
        expected = DataFrame(data=np.arange(3), index=[(0, 0), (5, 4), (2, 2)],
                             columns=["a"])
        assert_frame_equal(df, expected)
def nearestNeighborsSetup(filename, stateList):
  df_specimens = formatChecker(filename)
  print 'Getting the weather stations'
  with open('input/acis_station_ID.pickle') as f:
      weatherStationsMetaData = cPickle.load(f)

  # weatherStationsMetaData = weatherStations(stateList)
  # weatherStationsMetaData = read_csv('weatherStation/acis_station_ID.csv')
  df_stations = DataFrame.from_dict(weatherStationsMetaData, orient='index', dtype=None)
  '''Loads the lat/long coordinates of the specimens and weather stations into numpy arrays.
  NearestNeighborsResults() will return he number of K (nearest stations) with the index value.
  Then index will be replaced by the UID to match the ASIC data serve.'''
	#Number of points
  np1 = np.array(df_specimens['longitude']).size
  np2 = np.array(df_stations['longitude']).size

  #Search radius
  r = .25

  #Number of nearest stations returned
  k = 10

  d1 = np.empty((np1, 2))
  d2 = np.empty((np2, 2))
  d1[:, 0] = np.array(df_specimens['latitude'])
  d1[:, 1] = np.array(df_specimens['longitude'])

  d2[:, 0] = np.array(df_stations['latitude'])
  d2[:, 1] = np.array(df_stations['longitude'])
 
  result, distance = nearestNeighborsResults(d1.copy(), d2.copy(), r, k)
  columnindex = []
  closestStationList = [nearestNeighborsColumnString(x) for x in range(k)]
  for f in closestStationList: columnindex.append(f()),
  #temp variable for 0-N array
  t1 = np.arange(np2)
  #temp variable for 'uid' ID
  t2 = np.array(df_stations['uid'])
  df_results = DataFrame(result, columns=columnindex)
  #Creates a Pandas DataFrame
  uid_index = DataFrame({'0_closest_weather_station':  t1,
    'uid': t2})

  for index, column_name in enumerate(columnindex):
    temp = uid_index.rename(columns={'0_closest_weather_station': column_name, 'uid': column_name + "s"})
    df_results = df_results.reset_index().merge(temp, how='left', on= column_name, sort=False).sort('index')
    
    if index != 0:
      del df_results['level_0']

    del df_results[column_name]

  del df_results['index']
  df_results = df_results.reset_index()
  return concat([df_specimens, df_results], axis=1), distance, weatherStationsMetaData
def deal_string02():
    import json
    db=json.load(open(u'D:\study\书籍\python\pydata-book-master\pydata-book-master\ch07\\foods-2011-10-03.json'))
    print len(db)
    print db[0]
    print db[0].keys()
    print db[0]['nutrients'][0]
    nutrients=DataFrame(db[0]['nutrients'])
    print nutrients[:7]
    info_keys=['description','group','id','manufacturer']
    info=DataFrame(db,columns=info_keys)
    print info[:5]
    print pd.value_counts(info.group)[:10]

    nutrients=[]
    for rec in db:
        fnuts=DataFrame(rec['nutrients'])
        fnuts['id']=rec['id']
        nutrients.append(fnuts)
    nutrients=pd.concat(nutrients,ignore_index=True)
    print nutrients
    print nutrients.duplicated().sum()
    nutrients=nutrients.drop_duplicates()
    col_mapping={'description':'food','group':'fgroup'}
    info=info.rename(columns=col_mapping,copy=False)
    print info
    col_mapping={'description':'nutrient','group':'nutgroup'}
    nutrients=nutrients.rename(columns=col_mapping,copy=False)
    print nutrients
    ndata=pd.merge(nutrients,info,on='id',how='outer')
    print ndata
    print ndata.ix[3000]
    result=ndata.groupby(['nutrient','fgroup'])['value'].quantile(0.5)
    # print result
    result['Zinc, Zn'].sort_values().plot(kind='barh')
    by_nutrient=ndata.groupby(['nutgroup','nutrient'])
    get_maximum=lambda x:x.xs(x.value.idxmax())
    get_minimum=lambda x:x.xs(x.value.idxmin())
    max_foods=by_nutrient.apply(get_maximum)[['value','food']]
    max_foods.food=max_foods.food.str[:50]
    print max_foods.ix['Amino Acids']['food']
Example #26
def get_plate_data(path,c):
    """ Get plate data, drop empty columns, drop selected columns, 
        rename columns, add normalized columns. """
    # Assumes thread_first from toolz, partial from functools, StringIO from io,
    # and pandas imported as pd; the pandas steps are wrapped in partial() so
    # that the threaded value is passed in as their first argument.
    return thread_first(path,
                        from_file,
                        (str.replace, '\r', ''),
                        StringIO,
                        partial(pd.read_csv, delimiter=c['delimiter'], skiprows=c['skiprows']),
                        partial(pd.DataFrame.dropna, axis=1, how='all'),
                        (drop_matching_columns, c['dropcols']),
                        partial(pd.DataFrame.rename, columns=c['colrename']),
                        (add_normalized_columns, c['normcols']))
    def parse_data(self, articles):
        """ Responsible to parse articles in order to extract data.
        Data is extracted as a DataFrame containing the following columns:
        - Article metadata: only the metadata defined in self.metadata_column are extracted
        - Article tags: all tags are extracted, the name defined in self.tags_column are used to rename columns
        Data is indexed by a generated ID (integer).

        :param articles: The articles to parse.
        """
        tags = []
        metadata = []
        # TODO not the more efficient way to do that I think.
        for article in articles:
            if hasattr(article, "tags"):
                # Extracting all tags name from an article and putting them in a Series
                tags.append(
                    Series([tag.name for tag in article.tags], ["tag_" + str(x) for x in range(len(article.tags))])
                )
            # Selecting metadata, only the ones specified in the columns
            metadata.append(
                Series(
                    dict([(i, article.metadata[i]) for i in self.metadata_columns if i in article.metadata]),
                    self.metadata_columns,
                )
            )
        # Creating the tags DataFrame
        tags_data_frame = DataFrame(tags)
        # Renaming columns, leaving the remaining ones with the generated name "tag_"
        # Mapping current column names to the new ones in order to make a replacement
        if self.tag_columns is not None:
            replacement = dict(zip(tags_data_frame.columns.get_values()[: len(self.tag_columns)], self.tag_columns))
            # Inplace means no copy
            tags_data_frame.rename(columns=replacement, inplace=True)
        # Creating the metadata DataFrame
        metadata_data_frame = DataFrame(metadata)
        # Replacing data in column category by its string value
        # TODO maybe a better way to do that, it seems a bit ugly
        metadata_data_frame["category"] = metadata_data_frame["category"].apply(lambda x: str(x))
        # Merging the two DataFrame together
        self.data = metadata_data_frame.join(tags_data_frame)
Example #28
    def test_pivot_integer_columns(self):
        # caused by upstream bug in unstack

        d = datetime.date.min
        data = list(product(['foo', 'bar'], ['A', 'B', 'C'], ['x1', 'x2'],
                            [d + datetime.timedelta(i) for i in range(20)], [1.0]))
        df = DataFrame(data)
        table = df.pivot_table(values=4, index=[0, 1, 3], columns=[2])

        df2 = df.rename(columns=str)
        table2 = df2.pivot_table(values='4', index=['0', '1', '3'], columns=['2'])

        tm.assert_frame_equal(table, table2, check_names=False)
Example #29
def test_frame_describe_tupleindex():

    # GH 14848 - regression from 0.19.0 to 0.19.1
    df1 = DataFrame({'x': [1, 2, 3, 4, 5] * 3,
                     'y': [10, 20, 30, 40, 50] * 3,
                     'z': [100, 200, 300, 400, 500] * 3})
    df1['k'] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5
    df2 = df1.rename(columns={'k': 'key'})
    msg = "Names should be list-like for a MultiIndex"
    with pytest.raises(ValueError, match=msg):
        df1.groupby('k').describe()
    with pytest.raises(ValueError, match=msg):
        df2.groupby('key').describe()
Example #30
    def get_results_dataframe(self, default = False, index_by_code = False):
        '''
        Formats data into a dataframe
        '''
        datas = self._compute()
        self._compute_uc()
        uc = self.uc
        dfs = dict()

        for scenario, dico in datas.iteritems():
            data = dico['data']
            data_default = dico['default']

            data_dict = dict()
            index = []
            
            if default is True:
                data = data_default
            
            for row in data:
                if not row.desc in ('root'):
                    if row.code == 'revdisp':
                        revdisp = row.vals
                    if index_by_code is True:
                        index.append(row.code)
                        data_dict[row.code] = row.vals
                    else:
                        index.append(row.desc)
                        data_dict[row.desc] = row.vals
            
            df = DataFrame(data_dict).T
            
            df = df.reindex(index)
            df = df.rename(columns = {0: scenario})
            nivvie = revdisp/uc[scenario] # TODO: include savings !!
            df = concat([df, 
                         DataFrame({scenario: nivvie}, index=['nivvie'])
                         ])
            dfs[scenario] = df
            
        
        first = True

        for df in dfs.itervalues():
            if first:
                df_final = df
                first = False
            else:
                df_final = concat([df_final, df], axis=1, join ="inner")
        
        return df_final
Example #31
print(df)

# max([1,2,10])
# df['score2'] = df['C'].map(max)  # running this raises an error!

# transform() and map() are used the same way -- just good to know; the part above is the key point
df['score2'] = df['C'].transform(mp)
print(df)

df['C'] = df['C'].map(lambda x: x * 2)  # update column 'C' of the dataframe; the mapping rule follows as an anonymous function
print(df)

# 2.3  The rename() function: replace the row index
# 2.3.1 First change
inds = {'张三': 'Zhang Sir', '木兰': 'MissLan'}  # key/value pairs used to replace the index labels
df.rename(index=inds)  # returns a renamed copy; the result is not assigned here


# 2.3.2 Replace the column index
def cols(x):
    if x == 'PHP':
        return 'php'
    if x == 'Python':
        return '大蟒蛇'
    else:
        return x


inds = {'张三': 'Zhang Sir', '木兰': 'MissLan'}
df.rename(index=inds, columns=cols,
          inplace=True)  # index=inds renames the index labels, columns=cols renames the column labels
    def _read_one_data(self, url, params):
        """ read one data from specified symbol """

        symbol = params["symbol"]
        del params["symbol"]
        url = url.format(symbol)

        resp = self._get_response(url, params=params)
        ptrn = r"root\.App\.main = (.*?);\n}\(this\)\);"
        try:
            j = json.loads(re.search(ptrn, resp.text, re.DOTALL).group(1))
            data = j["context"]["dispatcher"]["stores"]["HistoricalPriceStore"]
        except KeyError:
            msg = "No data fetched for symbol {} using {}"
            raise RemoteDataError(msg.format(symbol, self.__class__.__name__))

        # price data
        prices = DataFrame(data["prices"])
        prices.columns = [col.capitalize() for col in prices.columns]
        prices["Date"] = to_datetime(to_datetime(prices["Date"], unit="s").dt.date)

        if "Data" in prices.columns:
            prices = prices[prices["Data"].isnull()]
        prices = prices[["Date", "High", "Low", "Open", "Close", "Volume", "Adjclose"]]
        prices = prices.rename(columns={"Adjclose": "Adj Close"})

        prices = prices.set_index("Date")
        prices = prices.sort_index().dropna(how="all")

        if self.ret_index:
            prices["Ret_Index"] = _calc_return_index(prices["Adj Close"])
        if self.adjust_price:
            prices = _adjust_prices(prices)

        # dividends & splits data
        if self.get_actions and data["eventsData"]:

            actions = DataFrame(data["eventsData"])
            actions.columns = [col.capitalize() for col in actions.columns]
            actions["Date"] = to_datetime(
                to_datetime(actions["Date"], unit="s").dt.date
            )

            types = actions["Type"].unique()
            if "DIVIDEND" in types:
                divs = actions[actions.Type == "DIVIDEND"].copy()
                divs = divs[["Date", "Amount"]].reset_index(drop=True)
                divs = divs.set_index("Date")
                divs = divs.rename(columns={"Amount": "Dividends"})
                prices = prices.join(divs, how="outer")

            if "SPLIT" in types:

                def split_ratio(row):
                    if float(row["Numerator"]) > 0:
                        return eval(row["Splitratio"])
                    else:
                        return 1

                splits = actions[actions.Type == "SPLIT"].copy()
                splits["SplitRatio"] = splits.apply(split_ratio, axis=1)
                splits = splits.reset_index(drop=True)
                splits = splits.set_index("Date")
                splits["Splits"] = splits["SplitRatio"]
                prices = prices.join(splits["Splits"], how="outer")

                if "DIVIDEND" in types and not self.adjust_dividends:
                    # dividends are adjusted automatically by Yahoo
                    adj = (
                        prices["Splits"].sort_index(ascending=False).fillna(1).cumprod()
                    )
                    prices["Dividends"] = prices["Dividends"] / adj

        return prices
Example #33
    def test_rename_multiindex(self):

        tuples_index = [('foo1', 'bar1'), ('foo2', 'bar2')]
        tuples_columns = [('fizz1', 'buzz1'), ('fizz2', 'buzz2')]
        index = MultiIndex.from_tuples(tuples_index, names=['foo', 'bar'])
        columns = MultiIndex.from_tuples(tuples_columns,
                                         names=['fizz', 'buzz'])
        df = DataFrame([(0, 0), (1, 1)], index=index, columns=columns)

        #
        # without specifying level -> across all levels

        renamed = df.rename(index={
            'foo1': 'foo3',
            'bar2': 'bar3'
        },
                            columns={
                                'fizz1': 'fizz3',
                                'buzz2': 'buzz3'
                            })
        new_index = MultiIndex.from_tuples([('foo3', 'bar1'),
                                            ('foo2', 'bar3')],
                                           names=['foo', 'bar'])
        new_columns = MultiIndex.from_tuples([('fizz3', 'buzz1'),
                                              ('fizz2', 'buzz3')],
                                             names=['fizz', 'buzz'])
        tm.assert_index_equal(renamed.index, new_index)
        tm.assert_index_equal(renamed.columns, new_columns)
        assert renamed.index.names == df.index.names
        assert renamed.columns.names == df.columns.names

        #
        # with specifying a level (GH13766)

        # dict
        new_columns = MultiIndex.from_tuples([('fizz3', 'buzz1'),
                                              ('fizz2', 'buzz2')],
                                             names=['fizz', 'buzz'])
        renamed = df.rename(columns={
            'fizz1': 'fizz3',
            'buzz2': 'buzz3'
        },
                            level=0)
        tm.assert_index_equal(renamed.columns, new_columns)
        renamed = df.rename(columns={
            'fizz1': 'fizz3',
            'buzz2': 'buzz3'
        },
                            level='fizz')
        tm.assert_index_equal(renamed.columns, new_columns)

        new_columns = MultiIndex.from_tuples([('fizz1', 'buzz1'),
                                              ('fizz2', 'buzz3')],
                                             names=['fizz', 'buzz'])
        renamed = df.rename(columns={
            'fizz1': 'fizz3',
            'buzz2': 'buzz3'
        },
                            level=1)
        tm.assert_index_equal(renamed.columns, new_columns)
        renamed = df.rename(columns={
            'fizz1': 'fizz3',
            'buzz2': 'buzz3'
        },
                            level='buzz')
        tm.assert_index_equal(renamed.columns, new_columns)

        # function
        func = str.upper
        new_columns = MultiIndex.from_tuples([('FIZZ1', 'buzz1'),
                                              ('FIZZ2', 'buzz2')],
                                             names=['fizz', 'buzz'])
        renamed = df.rename(columns=func, level=0)
        tm.assert_index_equal(renamed.columns, new_columns)
        renamed = df.rename(columns=func, level='fizz')
        tm.assert_index_equal(renamed.columns, new_columns)

        new_columns = MultiIndex.from_tuples([('fizz1', 'BUZZ1'),
                                              ('fizz2', 'BUZZ2')],
                                             names=['fizz', 'buzz'])
        renamed = df.rename(columns=func, level=1)
        tm.assert_index_equal(renamed.columns, new_columns)
        renamed = df.rename(columns=func, level='buzz')
        tm.assert_index_equal(renamed.columns, new_columns)

        # index
        new_index = MultiIndex.from_tuples([('foo3', 'bar1'),
                                            ('foo2', 'bar2')],
                                           names=['foo', 'bar'])
        renamed = df.rename(index={'foo1': 'foo3', 'bar2': 'bar3'}, level=0)
        tm.assert_index_equal(renamed.index, new_index)
Example #34
# Data preprocessing
# Extract only the 'year' column as a list
year = list(df['year'])
# print_df(year)

# Create an empty dictionary
new_name = {}

# Iterate over the 'year' list
for i, v in enumerate(year):
    new_name[i] = v
# print(new_name)

# Change the DataFrame's index
df.rename(index=new_name, inplace=True)

# Drop the original 'year' column
df.drop('year', axis=1, inplace=True)

# Rename the columns
df.rename(columns={
    'car_vs_people': '차 대 사람',
    'car_vs_car': '차 대 차',
    'car_only': '차량 단독',
},
          inplace=True)

print_df(df)

# Global settings
Example #35
def main(cfg: DictConfig) -> None:
    print(cfg)
    logger.info(f"The current working directory is {Path().cwd()}")
    start_time = time.time()
    logger.info("initializing experimental condition..")

    # compared ope estimators
    lambdas = list(dict(cfg.estimator_hyperparams)["lambdas"])
    ope_estimators = [
        DoublyRobustWithShrinkage(lambda_=lam_,
                                  estimator_name=f"DRos ({lam_})")
        for lam_ in lambdas
    ] + [
        DoublyRobustWithShrinkageTuning(lambdas=lambdas,
                                        estimator_name="DRos (tuning)"),
    ]

    # configurations
    n_seeds = cfg.setting.n_seeds
    sample_size = cfg.setting.sample_size
    reg_model = cfg.setting.reg_model
    campaign = cfg.setting.campaign
    behavior_policy = cfg.setting.behavior_policy
    test_size = cfg.setting.test_size
    is_timeseries_split = cfg.setting.is_timeseries_split
    n_folds = cfg.setting.n_folds
    obd_path = (Path().cwd().parents[5] /
                "open_bandit_dataset" if cfg.setting.is_full_obd else None)
    random_state = cfg.setting.random_state
    np.random.seed(random_state)

    # define dataset
    dataset_ts = OpenBanditDataset(behavior_policy="bts",
                                   campaign=campaign,
                                   data_path=obd_path)
    dataset_ur = OpenBanditDataset(behavior_policy="random",
                                   campaign=campaign,
                                   data_path=obd_path)

    # prepare logged bandit feedback and evaluation policies
    if behavior_policy == "random":
        if is_timeseries_split:
            bandit_feedback_ur = dataset_ur.obtain_batch_bandit_feedback(
                test_size=test_size,
                is_timeseries_split=True,
            )[0]
        else:
            bandit_feedback_ur = dataset_ur.obtain_batch_bandit_feedback()
        bandit_feedbacks = [bandit_feedback_ur]
        # obtain the ground-truth policy value
        ground_truth_ts = OpenBanditDataset.calc_on_policy_policy_value_estimate(
            behavior_policy="bts",
            campaign=campaign,
            data_path=obd_path,
            test_size=test_size,
            is_timeseries_split=is_timeseries_split,
        )
        # obtain action choice probabilities and define evaluation policies
        policy_ts = BernoulliTS(
            n_actions=dataset_ts.n_actions,
            len_list=dataset_ts.len_list,
            random_state=random_state,
            is_zozotown_prior=True,
            campaign=campaign,
        )
        action_dist_ts = policy_ts.compute_batch_action_dist(n_rounds=1000000)
        evaluation_policies = [(ground_truth_ts, action_dist_ts)]
    else:
        if is_timeseries_split:
            bandit_feedback_ts = dataset_ts.obtain_batch_bandit_feedback(
                test_size=test_size,
                is_timeseries_split=True,
            )[0]
        else:
            bandit_feedback_ts = dataset_ts.obtain_batch_bandit_feedback()
        bandit_feedbacks = [bandit_feedback_ts]
        # obtain the ground-truth policy value
        ground_truth_ur = OpenBanditDataset.calc_on_policy_policy_value_estimate(
            behavior_policy="random",
            campaign=campaign,
            data_path=obd_path,
            test_size=test_size,
            is_timeseries_split=is_timeseries_split,
        )
        # obtain action choice probabilities and define evaluation policies
        policy_ur = Random(
            n_actions=dataset_ur.n_actions,
            len_list=dataset_ur.len_list,
            random_state=random_state,
        )
        action_dist_ur = policy_ur.compute_batch_action_dist(n_rounds=1000000)
        evaluation_policies = [(ground_truth_ur, action_dist_ur)]

    # regression models used in ope estimators
    hyperparams = dict(cfg.reg_model_hyperparams)[reg_model]
    regression_models = [reg_model_dict[reg_model](**hyperparams)]

    # define an evaluator class
    evaluator = InterpretableOPEEvaluator(
        random_states=np.arange(n_seeds),
        bandit_feedbacks=bandit_feedbacks,
        evaluation_policies=evaluation_policies,
        ope_estimators=ope_estimators,
        regression_models=regression_models,
    )

    # conduct an evaluation of OPE experiment
    logger.info("experiment started")
    _ = evaluator.estimate_policy_value(sample_size=sample_size,
                                        n_folds_=n_folds)
    # calculate statistics
    mean = evaluator.calculate_mean(root=True)
    mean_scaled = evaluator.calculate_mean(scale=True, root=True)

    # save results of the evaluation of off-policy estimators
    log_path = Path("./outputs/hypara")
    log_path.mkdir(exist_ok=True, parents=True)
    # save root mse
    root_mse_df = DataFrame()
    root_mse_df["estimator"] = list(mean.keys())
    root_mse_df["mean"] = list(mean.values())
    root_mse_df["mean(scaled)"] = list(mean_scaled.values())
    root_mse_df.to_csv(log_path / "root_mse.csv")
    # conduct pairwise t-tests
    se_df = DataFrame(evaluator.calculate_squared_error())
    se_df = DataFrame(se_df.stack()).reset_index(1)
    se_df.rename(columns={"level_1": "estimators", 0: "se"}, inplace=True)
    nonparam_ttests = (pg.pairwise_ttests(
        data=se_df,
        dv="se",
        parametric=False,
        between="estimators",
    ).round(4).drop(["Contrast", "Parametric", "Paired"], axis=1))
    nonparam_ttests.to_csv(log_path / "nonparam_ttests.csv")
    # print result
    print(root_mse_df)
    experiment = f"{campaign}-{behavior_policy}-{sample_size}"
    elapsed_time = np.round((time.time() - start_time) / 60, 2)
    logger.info(f"finish experiment {experiment} in {elapsed_time}min")
Example #36
model = vecm.select_order(train_ecm, maxlags=8)
print(model.summary())

# In[10]:

pd.options.display.float_format = "{:.2f}".format
"""definition of det_orderint:
-1 - no deterministic terms; 0 - constant term; 1 - linear trend"""
pd.options.display.float_format = "{:.2f}".format
model = coint_johansen(endog=train_ecm, det_order=1, k_ar_diff=3)
print('Eigen statistic:')
print(model.eig)
print()
print('Critical values:')
d = DataFrame(model.cvt)
d.rename(columns={0: '90%', 1: '95%', 2: '99%'}, inplace=True)
print(d)
print()
print('Trace statistic:')
print(DataFrame(model.lr1))

# Here, we see that trace statistics (76.86033757  44.90556245  24.43779121  11.11437692) are < critical values @95% (79.24, 55.24, 35.01, 18.39).
#
# Interpreting Johansen Cointegration Test Results
# - the output reports two statistics, the Trace statistic and the Max-Eigen statistic
# - the rejection criterion is the 0.05 level
# - reject the null hypothesis (H0) of no cointegrating equations when the p-value <= 0.05, i.e. when the statistic exceeds its critical value (see the sketch below)
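
# A minimal sketch (not from the original notebook) of that comparison, assuming
# `model` is the coint_johansen result fitted above: lr1 holds the trace
# statistics and column 1 of cvt the 95% critical values.
trace_check = DataFrame({'trace_stat': model.lr1, 'crit_95': model.cvt[:, 1]})
# H0 of "at most r cointegrating relations" is rejected when the trace
# statistic exceeds its critical value.
trace_check['reject_H0_at_95'] = trace_check['trace_stat'] > trace_check['crit_95']
print(trace_check)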

# ### Eigen and Trace statistic (Johansen co-integration)

# In[10]:
#Palabras=Palabras[(Palabras['IDIOMA']=='en')&(Palabras['TIPO']=='POSITIVO')][['OPERATION_NUMBER','WORDS']] #This version yields an incomplete word cloud
Palabras = Palabras[(Palabras['TIPO'] == 'POSITIVO') |
                    (Palabras['TIPO'] == 'NEUTRO POSITIVO')][[
                        'OPERATION_NUMBER', 'WORDS', 'TIPO'
                    ]]

Palabras["WORDS2"] = Palabras["WORDS"].apply(singular)
Palabras = Palabras[["OPERATION_NUMBER", "WORDS2", "TIPO"]]
Palabras.rename(columns={'WORDS2': 'WORDS'}, inplace=True)

#Palabras=DataFrame(Palabras["PALABRAS","WORDS"].groupby([Palabras['OPERATION_NUMBER']],Palabras['WORDS','PALABRAS']).count()) #This line does not run; kept as in the EDU_IADB_cartera_digital version, which does run
#Palabras=DataFrame(Palabras["WORDS"].groupby([Palabras['OPERATION_NUMBER'],Palabras['WORDS']]).count())
Palabras = DataFrame(Palabras['WORDS'].groupby(
    [Palabras['OPERATION_NUMBER'], Palabras['WORDS'],
     Palabras['TIPO']]).count())
Palabras.rename(columns={'WORDS': 'COUNT_WORDS'}, inplace=True)
Palabras.rename(columns={'PALABRAS': 'COUNT_WORDS'}, inplace=True)
Palabras.reset_index(inplace=True)

######### Join the pipeline file with the text review to see which ones came out as dig #######

Base_pipe = Metadatos_pipe[[
    'OPERATION_NUMBER', 'OPERATION_NAME', 'PIPE_YR', 'OPERATION_TYPE',
    'OPERATION_TYPE_NAME', 'OPERATION_MODALITY', 'TAXONOMY', 'STATUS',
    'REGION', 'COUNTRY', 'DEPARTMENT', 'DIVISION', 'TEAM_LEADER_NM',
    'APPROVAL_DATE', 'APPROVAL_AMOUNT', 'CURRENT_EXPIRATION_DATE'
]]

oper_proc = Bas[[
    'OPERATION_NUMBER', 'DUMMY_DIGITAL', 'DUMMY_OBJETIVO_DIG',
    'DUMMY_OUTPUT_DIG', 'DIG_OUTPUT_DESCRIPTION'
Example #38
    def test_rename_mi(self):
        df = DataFrame([11, 21, 31],
                       index=MultiIndex.from_tuples([
                           ("A", x) for x in ["a", "B", "c"]
                       ]))
        result = df.rename(str.lower)
Example #39
def flows(futures, start=None, end=None, var=None, roll=None):
    position = futures.position
    market = futures.market
    # market1 = futures.p
    market = DataFrame(
        list(market.find({
            'date': {
                '$gte': start
            },
            'variety': var
        })))
    position = DataFrame(
        list(position.find({
            'date': {
                '$gte': start
            },
            'variety': var
        }))).drop_duplicates(['date', 'variety', 'symbol', 'long_party_name'],
                             'last')
    # position = position[['date','varie']]
    # position = position[position['long_party_name'].notna()]

    # positions
    # all member brokers
    party_name = position[position['date'] == end]
    long_party_name = party_name['long_party_name']
    short_party_name = party_name['short_party_name']
    party_name = long_party_name.append(
        short_party_name).dropna().drop_duplicates()
    # sum the long and short open interest and their changes
    long = position.groupby(['date', 'variety', 'long_party_name'
                             ])[['long_openIntr', 'long_openIntr_chg']].sum()
    # print(long)
    short = position.groupby(['date', 'variety', 'short_party_name'
                              ])[['short_openIntr',
                                  'short_openIntr_chg']].sum()
    # # merge
    frames = [long, short]
    position = pd.concat(frames, axis=1, sort=True).fillna(0).reset_index()
    # rename the fields
    position = position.rename(columns={
        'level_0': 'date',
        'level_1': 'variety',
        'level_2': 'BrokerID'
    })
    #
    ## market data
    market = market.copy()
    # index close
    market['cv'] = market.apply(lambda x: x['close'] * x['open_interest'],
                                axis=1)
    closes = market.groupby(['date', 'variety'])[['cv', 'open_interest']].sum()
    closes['close_index'] = closes['cv'] / closes['open_interest']
    # # index open
    market['ov'] = market.apply(lambda x: x['open'] * x['open_interest'],
                                axis=1)
    opens = market.groupby(['date', 'variety'])[['ov', 'open_interest']].sum()
    closes['open_index'] = opens['ov'] / opens['open_interest']
    # price change
    closes['change_index'] = closes.apply(
        lambda x: x['close_index'] - x['open_index'], axis=1)
    closes = closes.reset_index()

    chg = closes[['date', 'variety', 'close_index', 'change_index']]

    # print(chg['change_index'])

    # print(merge)
    df = pd.DataFrame()

    for i in party_name:
        try:
            chg = chg.copy()
            # print(chg)
            chg['BrokerID'] = i
            position1 = position[position['BrokerID'] == i]
            # merge the two tables
            mem = pd.merge(chg,
                           position1,
                           on=['date', 'variety', 'BrokerID'],
                           how='left').fillna(0)
            # mem = merge[merge['BrokerID'] == i]
            # print(mem)

            mem = mem.copy()
            mem['today_net'] = mem.apply(
                lambda x: x['long_openIntr'] - x['short_openIntr'], axis=1)
            mem['yesterday_net'] = mem.groupby(['variety', 'BrokerID'
                                                ])['today_net'].shift(1)
            mem['tomorrow_chg'] = mem.groupby(['variety', 'BrokerID'
                                               ])['change_index'].shift(-1)
            mem['net_chg'] = mem.apply(
                lambda x: x['today_net'] - x['yesterday_net'], axis=1)
            #
            mem['count'] = mem['net_chg'].count()
            # mem = mem.rename(columns={'long_open_interest': 'long_openIntr', 'long_open_interest_chg': 'long_openIntr_chg', 'short_open_interest': 'short_openIntr','short_open_interest_chg': 'short_openIntr_chg'})
            # mem['change'] = mem.groupby(['variety', 'BrokerID'])['close_index'].shif(1)
            mem['change'] = mem['close_index'] - mem['close_index'].shift(1)

            # rolling-window correlation coefficients
            # mem['corr'] = mem['net_chg'].rolling(window=240).corr(mem['change_index'])
            # mem['corr2'] = mem['net_chg'].rolling(window=240).corr(mem['tomorrow_chg']).shift(1)
            # mem['corr3'] = mem['today_net'].rolling(window=240).corr(mem['change'])
            #
            mem['lot'] = 0
            # mem = mem.copy()
            mem['lot'] = mem.apply(lambda x: 0 if x['today_net'] == 0 else 1
                                   if x['today_net'] > 0 else -1,
                                   axis=1)
            mem['lot'] = mem['lot'].shift(1).fillna(0)
            mem['pnl'] = mem['change'] * mem['lot']
            # mem['fee']=0
            # mem['fee'][mem['lot'] != mem['lot'].shif(1)] = mem['close_index'] * 2*1
            mem['netpnl'] = mem['pnl']
            mem['cumpnl'] = mem['netpnl'].rolling(roll).sum()

            # mem['date'] = pd.to_datetime(mem['date'])

            # # plotting
            # mem = mem.set_index('date')
            # with pd.plotting.plot_params.use('x_compat', True):  # method 1
            #     mem[['cumpnl']].plot(color='r',title=mem[u'BrokerID'][0]+" "+var+' '+end)
            #     mem['today_net'].plot(secondary_y=['today_net'])
            #     plt.ylabel('净持仓')
            # plt.rcParams['font.sans-serif'] = ['SimHei']  # 用来正常显示中文标签
            # plt.rcParams['axes.unicode_minus'] = False  # 用来正常显示负号
            # plt.show()

            # plt.plot(mem['cumpnl'])
            # print(mem)

            # flows = mem[mem['cumpnl'] > 0]
            # flows.sort_values('cumpnl', inplace=False)
            # print(flows)
            # # flows = flows[['date', 'variety', 'BrokerID', 'corr', 'corr2', 'today_net', 'net_chg', 'corr3',
            #                'cumpnl']].sort_values('cumpnl',
            #                                       inplace=False)  # [['date','variety','BrokerID','corr','corr2','cumpnl']]
            # flows = flows.rename(columns={'today_net': 'net position', 'cumpnl': 'cumulative PnL (points)', 'net_chg': 'net position change', 'corr3': 'correlation'})
            # print(flows[['variety', 'BrokerID', 'net position', 'net position change', 'cumulative PnL (points)']])
            # print(flows)
            # print(flows.sort_values('cumulative PnL (points)'))
            # mem=mem.groupby()
            # print(mem)
            # print(flows['net position'].sum())

            # mem = mem[-1:]
            print(mem)

            df1 = pd.DataFrame(mem)
            df = pd.concat([df, df1])  # DataFrame.append was removed in pandas 2.0

            # print(df.tail(20))
        except Exception:
            # Skip brokers whose data is missing or malformed
            continue
    return df
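# --- Hedged demo (values illustrative, not part of the original snippet) -----
# The rename pattern used at the top of this snippet: after reset_index() on an
# unnamed 3-level MultiIndex, the levels come back as 'level_0'/'level_1'/
# 'level_2' and can be renamed in a single call.
import pandas as pd

demo_idx = pd.MultiIndex.from_tuples([('2021-01-04', 'cu', 'broker_a'),
                                      ('2021-01-04', 'cu', 'broker_b')])
demo = pd.DataFrame({'long_openIntr': [10, 5]}, index=demo_idx).reset_index()
demo = demo.rename(columns={'level_0': 'date',
                            'level_1': 'variety',
                            'level_2': 'BrokerID'})
print(demo.columns.tolist())  # ['date', 'variety', 'BrokerID', 'long_openIntr']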
Exemple #40
0
def clean_column_names(data: pd.DataFrame, hints: bool = True) -> pd.DataFrame:
    """ Cleans the column names of the provided Pandas Dataframe and optionally provides hints on duplicate \
    and long column names.

    Parameters
    ----------
    data : pd.DataFrame
        Original Dataframe with columns to be cleaned
    hints : bool, optional
        Print out hints on column name duplication and column name length, by default True

    Returns
    -------
    pd.DataFrame
        Pandas DataFrame with cleaned column names
    """

    _validate_input_bool(hints, "hints")

    for i, col in enumerate(data.columns):
        matches = re.findall(re.compile("[a-z][A-Z]"), col)
        column = col
        for match in matches:
            column = column.replace(match, match[0] + "_" + match[1])
            data.rename(columns={data.columns[i]: column}, inplace=True)

    data.columns = (
        data.columns.str.replace("\n", "_")
        .str.replace("(", "_")
        .str.replace(")", "_")
        .str.replace("'", "_")
        .str.replace('"', "_")
        .str.replace(".", "_")
        .str.replace("!", "_")
        .str.replace("?", "_")
        .str.replace(":", "_")
        .str.replace(";", "_")
        .str.replace("-", "_")
        .str.replace("/", "_")
        .str.replace("+", "_plus_")
        .str.replace("*", "_times_")
        .str.replace("ä", "ae")
        .str.replace("ö", "oe")
        .str.replace("ü", "ue")
        .str.replace("ß", "ss")
        .str.replace("%", "_percent_")
        .str.replace("$", "_dollar_")
        .str.replace("€", "_euro_")
        .str.replace("@", "_at_")
        .str.replace("#", "_number_")
        .str.replace("&", "_and_")
        .str.lower()
        .str.replace("   ", " ")
        .str.replace("  ", " ")
        .str.replace(" ", "_")
        .str.replace("___", "_")
        .str.replace("__", "_")
        .str.strip("_")
    )

    dupl_idx = [i for i, x in enumerate(data.columns.duplicated()) if x]
    if len(dupl_idx) > 0:
        dupl_before = data.columns[dupl_idx].tolist()
        data.columns = [
            col if col not in data.columns[:i] else col + "_" + str(i) for i, col in enumerate(data.columns)
        ]
        if hints:
            print(
                f"- Duplicate column names detected! Columns with index {dupl_idx} and names {dupl_before} "
                f"have been renamed to {data.columns[dupl_idx].tolist()}."
            )

    long_col_names = [x for x in data.columns if len(x) > 25]
    if len(long_col_names) > 0 and hints:
        print(
            f"- Long column names detected (>25 characters)! Consider renaming the following columns "
            f"{long_col_names}."
        )

    return data
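# --- Hedged usage sketch (illustrative frame; _validate_input_bool and the full
# character-replacement chain are assumed to live in the original module) ------
# Reproduces only the camelCase-splitting step of clean_column_names above.
import re
import pandas as pd

demo = pd.DataFrame(columns=["orderDate", "Customer Name", "unitPrice"])
for i, col in enumerate(demo.columns):
    new_col = col
    for match in re.findall("[a-z][A-Z]", col):
        new_col = new_col.replace(match, match[0] + "_" + match[1])
    demo.rename(columns={demo.columns[i]: new_col}, inplace=True)
demo.columns = demo.columns.str.lower().str.replace(" ", "_")
print(list(demo.columns))  # ['order_date', 'customer_name', 'unit_price']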
Exemple #41
0
def rename_columns(input: pd.DataFrame) -> pd.DataFrame:
    return input.rename(columns={
        "reported": "total_vaccinations",
    })
def rename_columns(input: pd.DataFrame) -> pd.DataFrame:
    return input.rename(columns={"data": "date"})
Exemple #43
0
            n = 0
            ranked = []
            for i in gotabla['Entry'].drop_duplicates():
                n+=1
                ranked.append([i, str(n)])
            rank = DataFrame(ranked, columns = ['Entry', 'label'])
            
            gotabla = gotabla.merge(rank, on = 'Entry', how = 'left')
            gotabla = gotabla.merge(no_anotadas_uniprot[0][['Entry', 'GO']], on = ['Entry', 'GO'], how = 'left')
            gotabla = gotabla.merge(uniprot_entry_go_term[['Entry', 'Gene']], on = 'Entry', how = 'left')
            gotabla = gotabla.merge(list_input[['Entry', 'values']], on = 'Entry', how = 'left')

            edges_frame_excel = gotabla[['GO','Entry', 'Gene', 'Term','values']]
            edges_frame_excel_uniprot[z] = edges_frame_excel
            if labelnode == 'Gene Name':
                gotabla = gotabla.rename({'Gene':'Entry', 'Entry':'Gene'}, axis='columns')
            if labelnode == 'UniProt ID':
                pass
            go_tablas_uniprot[z] = gotabla.drop_duplicates().reset_index(drop = True)
            del gotabla
            del edges_frame_excel
        else:
            if aprobados_uniprot[z].count().iloc[0] == 1:
                df = aprobados_uniprot[z]
                df['Short_Term'] = termino_corto(df = aprobados_uniprot[z])

                significativos = []
                for x in df.base.drop_duplicates():
                    dff = df[df.base == x]
                    for index, row in dff.iterrows():
                        for i in row.entry.split(';'):
def rename_total_to_votes(df: pd.DataFrame):
    if 'total' in df.columns:
        return df.rename(columns={'total': 'votes'})
    else:
        return df
Exemple #45
0
def clean_colnames(data: pd.DataFrame) -> pd.DataFrame:
    """
    Make sure that all column names are lowercase and don't contain spaces
    """
    clean_names = {x: x.lower().replace(" ", "_") for x in data.columns}
    return data.rename(columns=clean_names)
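# --- Hedged usage sketch (throwaway frame, illustrative only) -----------------
import pandas as pd

demo = pd.DataFrame(columns=["First Name", "ZIP Code"])
print(list(clean_colnames(demo).columns))  # ['first_name', 'zip_code']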
Exemple #46
0
def rename_columns(df: pd.DataFrame) -> pd.DataFrame:
    return df.rename(
        columns={
            "DATE": "date",
        }
    )
import numpy as np


df_words = pd.read_csv(r'data_ASL-LEX_DB.csv', sep=';', header='infer', on_bad_lines='skip')  # read_csv already returns a DataFrame; error_bad_lines was replaced by on_bad_lines in pandas 1.3+
df_words.head()

# Clean words: Remove probabilities from ASL-LEX data (x.x)
k=0
l_new_words = []
for i in range(len(df_words)):
    l_new_words.append([])


for index, row in df_words.iterrows():
    for words in [row.values]:
        for word in words:
            if isinstance(word, str):
                clean_word = re.sub(r'\ \(.*$', "", word)
                clean_word = clean_word.upper()
                # print(clean_word)
                # print(k)
                l_new_words[k].append(clean_word)
            else:
                l_new_words[k].append(np.nan)  # np.NaN alias was removed in NumPy 2.0
    k=k+1

df_ASLLEX_clean = DataFrame(l_new_words)
# print(df_new_words)
df_ASLLEX_clean.rename(columns={0: 'Gloss'}, inplace=True)
df_ASLLEX_clean.to_csv('ASL-LEX_clean.csv', sep=',', index = False)
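# --- Hedged check (illustrative gloss, not taken from the ASL-LEX file) -------
# The loop above strips a trailing " (x.x)" probability annotation and
# upper-cases the gloss; the same regex applied to a single word:
import re

print(re.sub(r'\ \(.*$', "", "book (0.93)").upper())  # BOOK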
Exemple #48
0
    # # # Renaming axis indexes
    # # # # Like values in a Series, axis labels can be transformed by a function or mapping to produce a new object; the axes can also be modified in place without creating a new data structure
    data = DataFrame(np.arange(12).reshape((3, 4)),
                     index=['Ohio', 'Colorado', 'New York'],
                     columns=['one', 'two', 'three', 'four'])
    # # # # Like Series, axis labels also have a map method
    # print(data.index.map(str.upper))
    data.index = data.index.map(str.upper)
    # print(data)
    # # # # To create a transformed version of the dataset (rather than modifying the original in place), the handier method is rename
    # print(data.rename(index=str.title, columns=str.upper))
    # # # # rename can be combined with a dict-like object to update a subset of axis labels
    # print(data.rename(index={'OHIO': 'INDIANA'}, columns={'three': 'peekboo'}))
    # # # # rename copies the DataFrame and assigns new index and column labels; to modify a dataset in place, pass inplace=True
    _ = data.rename(index={'OHIO': 'INDIANA'}, inplace=True)
    # print(data)

    # # # Discretization and binning
    # # # # For analysis, continuous data is often discretized or split into 'bins'
    age = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
    # # # # Divide these ages into the bins "18-25, 26-35, 35-60, 60+"; this is done with pandas' cut function
    bins = [18, 25, 35, 60, 100]
    cats = pd.cut(age, bins)
    # print(cats)
    # # # # pandas returns a special Categorical object, which can be viewed as a set of strings naming the bins; it also carries an array of the distinct category names (categories) and a codes attribute labelling each age value
    # print(cats.codes)
    # print(cats.categories)
    # print(pd.value_counts(cats))
    # # # # As in mathematical interval notation, a parenthesis marks an open end and a bracket a closed end; which side is closed can be changed with right=False
    # print(pd.cut(age, [18, 26, 36, 61, 100], right=False))
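    # --- Hedged addition (standard pandas.cut usage, not part of the original) ---
    # labels= names the bins; together with right=False shown above, these are the
    # two most common cut() options.
    group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
    labeled_cats = pd.cut(age, bins, labels=group_names)
    # print(pd.value_counts(labeled_cats))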
Exemple #49
0
class scene_graph(object):
    def __init__(self, args):
        #self.data = DataFrame({"node_feature":[]}, index=[])
        # [class, index, score, bounding_box, 3d_pose, mean, var, pt_number, color]

        self.data = DataFrame(
            {
                "class": [np.zeros(400)],
                "idx": 0,
                "score": [np.zeros(400)],  #check
                "bounding_box": [[0, 0, 0, 0]],
                "3d_pose": [[0, 0, 0]],
                "mean": [[0, 0, 0]],
                "var": [[0, 0, 0]],
                "pt_num": 0,
                "color_hist": [[[0, "red"], [0, "blue"]]],
                "detection_cnt": 0  # key fixed to match the columns list below
            },
            columns=[
                'class', 'idx', 'score', 'bounding_box', '3d_pose', 'mean',
                'var', 'pt_num', 'color_hist', 'detection_cnt'
            ])
        self.rel_data = DataFrame({"relation": []}, index=[])
        self.img_count = 0
        self.pt_num = 0
        self.mean = [0, 0, 0]
        self.var = [0, 0, 0]
        self.args = args
        self.detect_cnt_thres = args.detect_cnt_thres
        self.fig = plt.figure()
        scene_name = args.scannet_path.split('/')[-1]
        self.save_path = osp.join(self.args.vis_result_path, scene_name,
                                  'scene_graph')
        try:
            os.makedirs(self.save_path)
        except:
            pass
        try:
            os.makedirs(osp.join(self.save_path, 'json'))
        except:
            pass
        self.disable_samenode = self.args.disable_samenode
        if self.disable_samenode: self.detect_cnt_thres = 0

    def vis_scene_graph(self,
                        image_scene,
                        idx,
                        test_set,
                        obj_inds,
                        obj_boxes,
                        obj_scores,
                        subject_inds,
                        predicate_inds,
                        object_inds,
                        subject_IDs,
                        object_IDs,
                        triplet_scores,
                        relationships,
                        pix_depth=None,
                        inv_p_matrix=None,
                        inv_R=None,
                        Trans=None,
                        dataset='scannet'):
        updated_image_scene = image_scene.copy()
        sg = Digraph('structs',
                     node_attr={'shape':
                                'plaintext'})  # initialize scene graph tool
        if dataset == 'scannet':  #scannet
            print(
                '-ID--|----Object-----|Score|3D_position (x, y, z)|---var-------------|---color------'
            )
        else:
            print(
                '-ID--|----Object-----|---Score---------------------------------------------'
            )

        ax = self.fig.add_subplot(111, projection='3d')
        #print('sfdsfdfsd:',obj_boxes.shape)
        for i, obj_ind in enumerate(
                obj_inds):  # loop for bounding boxes on each images

            if dataset == 'scannet':
                '''1. Get Color Histogram'''
                # color_hist
                # ex: [[num_pixels1,color1],[num_pixels2,color2],...,[num_pixelsN,colorN]]
                #     [[362        ,'red' ],[2          ,'blue'],...,[3          ,'gray']]
                box_whole_img = image_scene[
                    int(obj_boxes[i][1]):int(obj_boxes[i][3]),
                    int(obj_boxes[i][0]):int(obj_boxes[i][2])]
                color_hist = get_color_hist2(box_whole_img)
                '''2. Get Center Patch '''
                # Define bounding box info
                width = int(obj_boxes[i][2]) - int(obj_boxes[i][0])
                height = int(obj_boxes[i][3]) - int(obj_boxes[i][1])
                box_center_x = int(obj_boxes[i][0]) + width / 2
                box_center_y = int(obj_boxes[i][1]) + height / 2
                # using belows to find mean and variance of each bounding boxes
                # pop 1/5 size window_box from object bounding boxes
                range_x_min, range_x_max, range_y_min, range_y_max = make_window_size(
                    width, height, obj_boxes[i])
                # Crop center patch
                box_center_img = image_scene[range_y_min:range_y_max,
                                             range_x_min:range_x_max]
                '''3. Get 3D positions of the Centor Patch'''
                window_3d_pts = []
                for pt_x in range(range_x_min, range_x_max):
                    for pt_y in range(range_y_min, range_y_max):
                        pose_2d_window = np.matrix([pt_x, pt_y, 1])
                        pose_3d_window = pix_depth[pt_x][pt_y] * np.matmul(
                            inv_p_matrix, pose_2d_window.transpose())
                        pose_3d_world_coord_window = np.matmul(
                            inv_R, pose_3d_window[0:3] - Trans.transpose())
                        if not isNoisyPoint(pose_3d_world_coord_window):
                            # save several points in window_box to calculate mean and variance
                            window_3d_pts.append([
                                pose_3d_world_coord_window.item(0),
                                pose_3d_world_coord_window.item(1),
                                pose_3d_world_coord_window.item(2)
                            ])
                # window_3d_pts
                # ex: [[X_1,Y_1,Z_1],[X_2,Y_2,Z_2],...,[X_N,Y_N,Z_N]]

                # window_3d_pts = []
                # for pt_x in range(int(obj_boxes[i][0]), int(obj_boxes[i][2])):
                #     for pt_y in range(int(obj_boxes[i][1]), int(obj_boxes[i][3])):
                #         pose_2d_window = np.matrix([pt_x, pt_y, 1])
                #         pose_3d_window = pix_depth[pt_x][pt_y] * np.matmul(inv_p_matrix, pose_2d_window.transpose())
                #         pose_3d_world_coord_window = np.matmul(inv_R, pose_3d_window[0:3] - Trans.transpose())
                #         if not isNoisyPoint(pose_3d_world_coord_window):
                #             # save several points in window_box to calculate mean and variance
                #             window_3d_pts.append([pose_3d_world_coord_window.item(0), pose_3d_world_coord_window.item(1), pose_3d_world_coord_window.item(2)])

                window_3d_pts = outlier_filter(window_3d_pts)

                #window_3d_pts = np.array(window_3d_pts,dtype=np.float32)
                #cloud = pcl.PointCloud()
                #cloud.from_array(window_3d_pts)
                #outlier_filter = cloud.make_statistical_outlier_filter()
                #outlier_filter.set_mean_k(min(len(window_3d_pts),10))
                #outlier_filter.set_std_dev_mul_thresh(1.0)
                #cloud_filtered = outlier_filter.filter()
                #window_3d_pts = cloud_filtered.to_array().tolist()

                # arr = np.array(window_3d_pts,dtype=np.float).reshape(-1,3)
                # if arr.size>0:
                #     ax.scatter(-arr[:,0],-arr[:,1],-arr[:,2],)
                #     #ax.set_xlim(-2000, 2000)
                #     #ax.set_ylim(-2000, 2000)
                #     #ax.set_zlim(-2000, 2000)
                #
                #     self.fig.show()
                #     plt.pause(0.01)
                #     plt.hold(True)
                # cv2.waitKey(0)
                '''4. Get a 3D position of the Center Patch's Center point'''
                # find 3D point of the bounding box(the center patch)'s center
                curr_pt_num, curr_mean, curr_var = Measure_new_Gaussian_distribution(
                    window_3d_pts)
                # ex: np.matrix([[X_1],[Y_1],[Z_1]])

                # get object class names as strings
                box_cls = [
                    test_set.object_classes[obj_ind[0]],
                    test_set.object_classes[obj_ind[1]],
                    test_set.object_classes[obj_ind[2]]
                ]
                # box_cls: ['pillow','bag','cat']
                box_score = obj_scores[i]
                # box_score: [0.2,0.1,0.01]
                cls_scores = np.zeros(400)
                for cls_idx, cls_score in zip(obj_ind, obj_scores[i]):
                    cls_scores[cls_idx] += cls_score  # check
                '''5. Save Object Recognition Results in DataFrame Format'''
                if (self.img_count == 0):
                    # first image -> make new node
                    box_id = i
                    self.pt_num, self.mean, self.var = Measure_new_Gaussian_distribution(
                        window_3d_pts)
                    # check
                    start_data = {
                        "class":
                        cls_scores,
                        "idx":
                        box_id,
                        "score":
                        box_score,
                        "bounding_box":
                        [box_center_x, box_center_y, width, height],
                        "3d_pose": [
                            int(self.mean[0]),
                            int(self.mean[1]),
                            int(self.mean[2])
                        ],
                        "mean":
                        self.mean,
                        "var":
                        self.var,
                        "pt_num":
                        self.pt_num,
                        "color_hist":
                        color_hist,
                        "detection_cnt":
                        1
                    }
                    obj_boxes[i][4] = box_id
                    self.data.loc[len(self.data)] = start_data
                    if (i == 0):
                        self.data.drop(self.data.index[0], inplace=True)
                        self.data.rename(index={1: 0}, inplace=True)
                    #print(self.data)

                else:
                    # get node similarity score
                    node_score, max_score_index = node_update(
                        window_3d_pts, self.data, curr_mean, curr_var, box_cls,
                        obj_scores[i], color_hist, test_set)
                    threshold = 0.8127

                    if node_score > threshold and not self.disable_samenode:
                        # change value of global_node
                        # change global_node[max_score_index]
                        print("node updated!!!")
                        for cls_idx, cls_score in zip(obj_ind, obj_scores[i]):
                            self.data.at[
                                max_score_index,
                                'class'][cls_idx] += cls_score  # check

                        #self.data.at[max_score_index, "class"] = box_cls
                        self.data.at[max_score_index, "score"] = node_score
                        # .ix was removed from pandas; use .loc for label-based access
                        self.pt_num, self.mean, self.var = Measure_added_Gaussian_distribution(
                            window_3d_pts,
                            self.data.loc[max_score_index, "mean"],
                            self.data.loc[max_score_index, "var"],
                            self.data.loc[max_score_index, "pt_num"],
                            len(window_3d_pts))
                        self.data.at[max_score_index, "mean"] = self.mean
                        self.data.at[max_score_index, "var"] = self.var
                        self.data.at[max_score_index, "pt_num"] = self.pt_num
                        self.data.at[max_score_index, "color_hist"] = color_hist
                        self.data.at[max_score_index, "detection_cnt"] = (
                            self.data.loc[max_score_index, "detection_cnt"] + 1)
                        box_id = self.data.loc[max_score_index, "idx"]
                        obj_boxes[i][4] = box_id
                    else:
                        # make new_node in global_node
                        # [class, index, score, bounding_box, 3d_pose, mean, var, pt_number, color_hist]
                        box_id = len(self.data) + 1
                        obj_boxes[i][4] = box_id
                        self.pt_num, self.mean, self.var = Measure_new_Gaussian_distribution(
                            window_3d_pts)
                        global_node_num = len(self.data)
                        add_node_list = [
                            cls_scores, box_id, box_score,
                            [box_center_x, box_center_y, width, height],
                            [self.mean[0], self.mean[1], self.mean[2]],
                            self.mean, self.var, self.pt_num, color_hist, 1
                        ]
                        self.data.loc[len(self.data)] = add_node_list

                # if object index was changed, update relation's object index also
                '''6. Print object info'''
                print(
                    '{obj_ID:5} {obj_cls:15}  {obj_score:4.2f} {object_3d_pose:20}    {obj_var:20} {obj_color:15}'
                    .format(obj_ID=box_id,
                            obj_cls=box_cls[0],
                            obj_score=box_score[0],
                            object_3d_pose=[
                                self.mean[0], self.mean[1], self.mean[2]
                            ],
                            obj_var=self.var,
                            obj_color=color_hist[0][1]))

            else:  # TODO: for visual_genome
                raise NotImplementedError
            '''7. Plot '''
            # updated object_detection
            cv2.rectangle(updated_image_scene,
                          (int(obj_boxes[i][0]), int(obj_boxes[i][1])),
                          (int(obj_boxes[i][2]), int(obj_boxes[i][3])),
                          colorlist[int(obj_boxes[i][4])], 2)
            font_scale = 0.5
            txt = str(box_id) + '. ' + str(box_cls[0]) + ' ' + str(
                round(box_score[0], 2))
            ((txt_w, txt_h), _) = cv2.getTextSize(txt,
                                                  cv2.FONT_HERSHEY_SIMPLEX,
                                                  font_scale, 1)
            # Place text background.
            x0, y0 = int(obj_boxes[i][0]), int(obj_boxes[i][3])
            back_tl = x0, y0 - int(1.3 * txt_h)
            back_br = x0 + txt_w, y0
            cv2.rectangle(updated_image_scene, back_tl, back_br,
                          colorlist[int(obj_boxes[i][4])], -1)
            cv2.putText(updated_image_scene, txt, (x0, y0 - 2),
                        cv2.FONT_HERSHEY_SIMPLEX, font_scale, (255, 255, 255),
                        1)

        # add ID per bbox

        rel_prev_num = len(self.rel_data)
        print(
            '-------Subject--------|-------Predicate-----|--------Object---------|--Score-'
        )
        for i, relation in enumerate(relationships):
            # update relation's class also

            # accumulate relation_list
            if str(int(obj_boxes[int(relation[0])][4])) != str(
                    int(obj_boxes[int(relation[1])][4])):
                # filter out triplets whose sbj == obj
                self.rel_data.loc[len(self.rel_data)] = [[
                    str(int(obj_boxes[int(relation[0])][4])),
                    int(relation[2]),
                    str(int(obj_boxes[int(relation[1])][4]))
                ]]

                print('{sbj_cls:9} {sbj_ID:4} {sbj_score:1.3f}  |  '
                      '{pred_cls:11} {pred_score:1.3f}  |  '
                      '{obj_cls:9} {obj_ID:4} {obj_score:1.3f}  |  '
                      '{triplet_score:1.3f}'.format(
                          sbj_cls=test_set.object_classes[obj_inds[:, 0][int(
                              relation[0])]],
                          sbj_score=obj_scores[:, 0][int(relation[0])],
                          sbj_ID=str(int(obj_boxes[int(relation[0])][4])),
                          pred_cls=test_set.predicate_classes[int(
                              relation[2])],
                          pred_score=relation[3] /
                          obj_scores[:, 0][int(relation[0])] /
                          obj_scores[:, 0][int(relation[1])],
                          obj_cls=test_set.object_classes[obj_inds[:, 0][int(
                              relation[1])]],
                          obj_score=obj_scores[:, 0][int(relation[1])],
                          obj_ID=str(int(obj_boxes[int(relation[1])][4])),
                          triplet_score=relation[3]))

        rel_new_num = len(self.rel_data)

        # Draw scene graph
        if (rel_prev_num != rel_new_num):
            Draw_connected_scene_graph(self.data, self.rel_data,
                                       self.img_count, test_set, sg, idx,
                                       self.detect_cnt_thres,
                                       self.args.plot_graph, self.save_path)
        #sg.view()

        # it's help to select starting point of first image manually
        self.img_count += 1

        return updated_image_scene
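# --- Hedged demo (toy frame, not part of the scene_graph class) ---------------
# The placeholder-row trick used in step 5 above, in isolation: drop the dummy
# row 0 the DataFrame was constructed with, then rename index 1 -> 0 so the
# first real node sits at label 0.
import pandas as pd

nodes = pd.DataFrame({"idx": [0], "pt_num": [0]})   # placeholder row
nodes.loc[len(nodes)] = [7, 42]                     # first real node
nodes.drop(nodes.index[0], inplace=True)
nodes.rename(index={1: 0}, inplace=True)
print(nodes.index.tolist())  # [0]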
Exemple #50
0
def estimate_time_series(
    data: pd.DataFrame,
    spline_options: Dict,
    n_knots: int,
    dep_var: str,
    dep_trans_in: Callable[[pd.Series], pd.Series] = lambda x: x,
    weight_data: pd.DataFrame = None,
    dep_var_se: str = None,
    dep_se_trans_in: Callable[[pd.Series], pd.Series] = lambda x: x,
    diff: bool = False,
    num_submodels: int = 25,
    single_random_knot: bool = False,
    min_interval_days: int = 7,
    dep_trans_out: Callable[[pd.Series], pd.Series] = lambda x: x,
    split_l_interval: bool = False,
    split_r_interval: bool = False,
    verbose: bool = False,
) -> Tuple[pd.DataFrame, pd.Series, MRBeRT]:
    if verbose: logger.info('Formatting data.')
    data = data.copy()
    data[dep_var] = dep_trans_in(data[dep_var])
    if diff:
        if verbose:
            logger.info(
                'For diff model, drop day1 (i.e., if day0 is > 0, day0->day1 diff would be hugely negative).'
            )
        data[dep_var] = data[dep_var].diff()
        data[dep_var] = data[dep_var][data[dep_var].diff().notnull()]
    if data[[dep_var]].shape[1] > 1:
        reshape = True
        data = reshape_data_long(data, dep_var)
        if weight_data is not None:
            weight_data = reshape_data_long(weight_data, dep_var_se)
    else:
        reshape = False
    if weight_data is not None:
        if (data['date'] != weight_data['date']).any():
            raise ValueError(
                'Dates in `data` and `weight_data` not identical.')
        data['se'] = dep_se_trans_in(weight_data[dep_var_se])
    else:
        data['se'] = 1.
    data = data.rename(columns={dep_var: 'y'})
    day0 = data['date'].min()
    keep_vars = ['date', 'y', 'se']
    data = data.loc[:, keep_vars]
    start_len = len(data)
    data = data.dropna()
    end_len = len(data)
    if start_len != end_len and not reshape:
        if verbose: logger.debug('NAs in data')
    data['t'] = (data['date'] - day0).dt.days

    col_args = {
        'col_obs': 'y',
        'col_obs_se': 'se',
        'col_covs': ['t'],
        #'col_study_id':'date',
    }
    if verbose: logger.info('Getting base knots.')
    min_interval = min_interval_days / data['t'].max()
    if num_submodels == 1 and single_random_knot:
        spline_knots = get_ensemble_knots(n_knots, min_interval, 1)[0]
    else:
        spline_knots = np.linspace(0., 1., n_knots)

    if split_l_interval or split_r_interval:
        if num_submodels > 1:
            raise ValueError(
                'Would need to set up functionality to split segments for ensemble.'
            )
        if split_l_interval:
            n_knots += 1
            spline_knots = np.insert(spline_knots, 0, spline_knots[:2].mean())
        if split_r_interval:
            n_knots += 1
            spline_knots = np.insert(spline_knots, -1,
                                     spline_knots[-2:].mean())

    if verbose: logger.info('Creating model data.')
    mr_data = MRData()
    mr_data.load_df(data, **col_args)
    spline_model = LinearCovModel('t',
                                  use_re=False,
                                  use_spline=True,
                                  use_spline_intercept=True,
                                  spline_knots=spline_knots,
                                  **spline_options)
    if num_submodels > 1:
        if verbose: logger.info('Sampling knots.')
        ensemble_knots = get_ensemble_knots(n_knots, min_interval,
                                            num_submodels)

        if verbose: logger.info('Initializing model.')
        mr_model = MRBeRT(mr_data, spline_model, ensemble_knots)
    else:
        if verbose: logger.info('Initializing model.')
        mr_model = MRBRT(mr_data, [spline_model])

    if verbose: logger.info('Fitting model.')
    mr_model.fit_model()

    if num_submodels > 1:
        if verbose: logger.info('Scoring submodels.')
        mr_model.score_model()

    data = data.set_index('date')[['y', 'se']]

    if verbose: logger.info('Making prediction.')
    smooth_data = predict_time_series(
        day0=day0,
        dep_var=dep_var,
        mr_model=mr_model,
        dep_trans_out=dep_trans_out,
        diff=diff,
    )

    return data, smooth_data, mr_model
Exemple #51
0
    def rename_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        if self.columns_rename:
            return df.rename(columns=self.columns_rename)
        return df
a = np.arange(1,17).reshape(4,4)
data = DataFrame(a)
data
# Percentage change down each column; to compute across rows, set axis=1
data.pct_change()      # percentage increase of each new value relative to the previous one

print(data.head())    # print the first rows (5 by default; set the n parameter to change how many)
print(data.tail())    # print the last five rows

# Compute the correlation between DataFrame columns
# Rename the index labels
row=['a','b','c','d']
col=['one','two','three','four']
data.index
data.columns
data.rename(index={2:'mm'})           # use rename to change a single index label
data.rename(columns={1:'xxx'})
data.index=row                        # use the index attribute to replace all index labels
data.columns=col
data

data_pc=data.pct_change()

# Correlation between two DataFrame columns
data_pc.one.corr(data_pc.three)

# Covariance between two DataFrame columns
data_pc.two.cov(data_pc.four)

# Return the full correlation and covariance matrices across the DataFrame's columns
data_pc.corr()
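# --- Hedged recap (labels illustrative): the pct_change / corr steps above in
# one self-contained block ------------------------------------------------------
import numpy as np
import pandas as pd
from pandas import DataFrame

demo = DataFrame(np.arange(1, 17).reshape(4, 4),
                 index=['a', 'b', 'c', 'd'],
                 columns=['one', 'two', 'three', 'four'])
demo_pc = demo.pct_change()
print(demo_pc['one'].corr(demo_pc['three']))  # column-to-column correlation
print(demo_pc.corr())                         # full correlation matrix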
def json_normalize(data,
                   record_path=None,
                   meta=None,
                   meta_prefix=None,
                   record_prefix=None,
                   errors='raise',
                   sep='.'):
    """
    Normalize semi-structured JSON data into a flat table.

    Parameters
    ----------
    data : dict or list of dicts
        Unserialized JSON objects
    record_path : string or list of strings, default None
        Path in each object to list of records. If not passed, data will be
        assumed to be an array of records
    meta : list of paths (string or list of strings), default None
        Fields to use as metadata for each record in resulting table
    meta_prefix : string, default None
        If not None, prefix the metadata column names with this string
    record_prefix : string, default None
        If not None, prefix the record column names with this string,
        e.g. foo.bar.field if the path to records is ['foo', 'bar'] and
        record_prefix='foo.bar.'
    errors : {'raise', 'ignore'}, default 'raise'

        * 'ignore' : will ignore KeyError if keys listed in meta are not
          always present
        * 'raise' : will raise KeyError if keys listed in meta are not
          always present

        .. versionadded:: 0.20.0

    sep : string, default '.'
        Nested records will generate names separated by sep,
        e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar

        .. versionadded:: 0.20.0

    Returns
    -------
    frame : DataFrame

    Examples
    --------

    >>> from pandas.io.json import json_normalize
    >>> data = [{'id': 1, 'name': {'first': 'Coleen', 'last': 'Volk'}},
    ...         {'name': {'given': 'Mose', 'family': 'Regner'}},
    ...         {'id': 2, 'name': 'Faye Raker'}]
    >>> json_normalize(data)
        id        name name.family name.first name.given name.last
    0  1.0         NaN         NaN     Coleen        NaN      Volk
    1  NaN         NaN      Regner        NaN       Mose       NaN
    2  2.0  Faye Raker         NaN        NaN        NaN       NaN

    >>> data = [{'state': 'Florida',
    ...          'shortname': 'FL',
    ...          'info': {
    ...               'governor': 'Rick Scott'
    ...          },
    ...          'counties': [{'name': 'Dade', 'population': 12345},
    ...                      {'name': 'Broward', 'population': 40000},
    ...                      {'name': 'Palm Beach', 'population': 60000}]},
    ...         {'state': 'Ohio',
    ...          'shortname': 'OH',
    ...          'info': {
    ...               'governor': 'John Kasich'
    ...          },
    ...          'counties': [{'name': 'Summit', 'population': 1234},
    ...                       {'name': 'Cuyahoga', 'population': 1337}]}]
    >>> result = json_normalize(data, 'counties', ['state', 'shortname',
    ...                                           ['info', 'governor']])
    >>> result
             name  population info.governor    state shortname
    0        Dade       12345    Rick Scott  Florida        FL
    1     Broward       40000    Rick Scott  Florida        FL
    2  Palm Beach       60000    Rick Scott  Florida        FL
    3      Summit        1234   John Kasich     Ohio        OH
    4    Cuyahoga        1337   John Kasich     Ohio        OH

    >>> data = {'A': [1, 2]}
    >>> json_normalize(data, 'A', record_prefix='Prefix.')
        Prefix.0
    0          1
    1          2
    """
    def _pull_field(js, spec):
        result = js
        if isinstance(spec, list):
            for field in spec:
                result = result[field]
        else:
            result = result[spec]

        return result

    if isinstance(data, list) and not data:
        return DataFrame()

    # A bit of a hackjob
    if isinstance(data, dict):
        data = [data]

    if record_path is None:
        if any([isinstance(x, dict) for x in compat.itervalues(y)]
               for y in data):
            # naive normalization, this is idempotent for flat records
            # and potentially will inflate the data considerably for
            # deeply nested structures:
            #  {VeryLong: {b: 1, c: 2}} -> {VeryLong.b: 1, VeryLong.c: 2}
            #
            # TODO: handle record value which are lists, at least error
            #       reasonably
            data = nested_to_record(data, sep=sep)
        return DataFrame(data)
    elif not isinstance(record_path, list):
        record_path = [record_path]

    if meta is None:
        meta = []
    elif not isinstance(meta, list):
        meta = [meta]

    meta = [m if isinstance(m, list) else [m] for m in meta]

    # Disastrously inefficient for now
    records = []
    lengths = []

    meta_vals = defaultdict(list)
    if not isinstance(sep, compat.string_types):
        sep = str(sep)
    meta_keys = [sep.join(val) for val in meta]

    def _recursive_extract(data, path, seen_meta, level=0):
        if isinstance(data, dict):
            data = [data]
        if len(path) > 1:
            for obj in data:
                for val, key in zip(meta, meta_keys):
                    if level + 1 == len(val):
                        seen_meta[key] = _pull_field(obj, val[-1])

                _recursive_extract(obj[path[0]],
                                   path[1:],
                                   seen_meta,
                                   level=level + 1)
        else:
            for obj in data:
                recs = _pull_field(obj, path[0])

                # For repeating the metadata later
                lengths.append(len(recs))

                for val, key in zip(meta, meta_keys):
                    if level + 1 > len(val):
                        meta_val = seen_meta[key]
                    else:
                        try:
                            meta_val = _pull_field(obj, val[level:])
                        except KeyError as e:
                            if errors == 'ignore':
                                meta_val = np.nan
                            else:
                                raise KeyError(
                                    "Try running with "
                                    "errors='ignore' as key "
                                    "{err} is not always present".format(
                                        err=e))
                    meta_vals[key].append(meta_val)

                records.extend(recs)

    _recursive_extract(data, record_path, {}, level=0)

    result = DataFrame(records)

    if record_prefix is not None:
        result = result.rename(
            columns=lambda x: "{p}{c}".format(p=record_prefix, c=x))

    # Data types, a problem
    for k, v in compat.iteritems(meta_vals):
        if meta_prefix is not None:
            k = meta_prefix + k

        if k in result:
            raise ValueError('Conflicting metadata name {name}, '
                             'need distinguishing prefix '.format(name=k))

        result[k] = np.array(v).repeat(lengths)

    return result
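# --- Hedged demo of record_prefix / meta_prefix (illustrative input; assumes the
# module-level imports of the pandas source above are available) ----------------
demo_data = [{'state': 'Ohio',
              'counties': [{'name': 'Summit', 'population': 1234}]}]
demo_cols = json_normalize(demo_data, 'counties', ['state'],
                           record_prefix='county.', meta_prefix='meta.').columns
print(demo_cols.tolist())  # ['county.name', 'county.population', 'meta.state']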
def prophet(  # pylint: disable=too-many-arguments
    df: DataFrame,
    time_grain: str,
    periods: int,
    confidence_interval: float,
    yearly_seasonality: Optional[Union[bool, int]] = None,
    weekly_seasonality: Optional[Union[bool, int]] = None,
    daily_seasonality: Optional[Union[bool, int]] = None,
) -> DataFrame:
    """
    Add forecasts to each series in a timeseries dataframe, along with confidence
    intervals for the prediction. For each series, the operation creates three
    new columns with the column name suffixed with the following values:

    - `__yhat`: the forecast for the given date
    - `__yhat_lower`: the lower bound of the forecast for the given date
    - `__yhat_upper`: the upper bound of the forecast for the given date


    :param df: DataFrame containing all-numeric data (temporal column ignored)
    :param time_grain: Time grain used to specify time period increments in prediction
    :param periods: Time periods (in units of `time_grain`) to predict into the future
    :param confidence_interval: Width of predicted confidence interval
    :param yearly_seasonality: Should yearly seasonality be applied.
           An integer value will specify Fourier order of seasonality.
    :param weekly_seasonality: Should weekly seasonality be applied.
           An integer value will specify Fourier order of seasonality, `None` will
           automatically detect seasonality.
    :param daily_seasonality: Should daily seasonality be applied.
           An integer value will specify Fourier order of seasonality, `None` will
           automatically detect seasonality.
    :return: DataFrame with forecasts, with temporal column at beginning if present
    """
    # validate inputs
    if not time_grain:
        raise QueryObjectValidationError(_("Time grain missing"))
    if time_grain not in PROPHET_TIME_GRAIN_MAP:
        raise QueryObjectValidationError(
            _(
                "Unsupported time grain: %(time_grain)s",
                time_grain=time_grain,
            ))
    freq = PROPHET_TIME_GRAIN_MAP[time_grain]
    # check type at runtime due to marshmallow schema not being able to handle
    # union types
    if not periods or periods < 0 or not isinstance(periods, int):
        raise QueryObjectValidationError(
            _("Periods must be a positive integer value"))
    if not confidence_interval or confidence_interval <= 0 or confidence_interval >= 1:
        raise QueryObjectValidationError(
            _("Confidence interval must be between 0 and 1 (exclusive)"))
    if DTTM_ALIAS not in df.columns:
        raise QueryObjectValidationError(
            _("DataFrame must include temporal column"))
    if len(df.columns) < 2:
        raise QueryObjectValidationError(
            _("DataFrame must include at least one series"))

    target_df = DataFrame()
    for column in [column for column in df.columns if column != DTTM_ALIAS]:
        fit_df = _prophet_fit_and_predict(
            df=df[[DTTM_ALIAS, column]].rename(columns={
                DTTM_ALIAS: "ds",
                column: "y"
            }),
            confidence_interval=confidence_interval,
            yearly_seasonality=_prophet_parse_seasonality(yearly_seasonality),
            weekly_seasonality=_prophet_parse_seasonality(weekly_seasonality),
            daily_seasonality=_prophet_parse_seasonality(daily_seasonality),
            periods=periods,
            freq=freq,
        )
        new_columns = [
            f"{column}__yhat",
            f"{column}__yhat_lower",
            f"{column}__yhat_upper",
            f"{column}",
        ]
        fit_df.columns = new_columns
        if target_df.empty:
            target_df = fit_df
        else:
            for new_column in new_columns:
                target_df = target_df.assign(
                    **{new_column: fit_df[new_column]})
    target_df.reset_index(level=0, inplace=True)
    return target_df.rename(columns={"ds": DTTM_ALIAS})
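# --- Hedged illustration (dummy values; no Prophet run, names are assumptions) ---
# The column-suffix convention described in the prophet() docstring, applied to a
# stand-in fit result:
import pandas as pd

fit_demo = pd.DataFrame({"yhat": [1.0], "yhat_lower": [0.5],
                         "yhat_upper": [1.5], "y": [0.9]})
column = "sales"
fit_demo.columns = [f"{column}__yhat", f"{column}__yhat_lower",
                    f"{column}__yhat_upper", f"{column}"]
print(list(fit_demo.columns))  # ['sales__yhat', 'sales__yhat_lower', 'sales__yhat_upper', 'sales']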
type(frame3['col2'])
frame3[['col2']]
type(frame3[['col2']])
frame3[['col2','col1']]
type(frame3[['col2','col1']])

frame3.loc[1,'col1']
frame3.loc[[1,0],['col2','col1']]

## Besides .loc, one can also use .iloc for positional indexing (.ix was removed from pandas)

# Changing names of columns in a DataFrame
frame1

# We can use the rename() method to rename a column; rename accepts a dictionary
frame1.rename(columns={'col1':'one','col2':'two'})
frame1

frame1.rename(columns={'col1':'one','col2':'two'},inplace=True)
frame1

frame1.columns=['Col1','Col2']
frame1

#One at a time/a few at a time
frame1.rename(columns={'Col1':'one'},inplace=True)
frame1
# Basic DataFrame methods
frame3.columns

frame3.head()
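# --- Hedged, self-contained version of the rename demo above (frame1 is not
# defined in this snippet, so a small stand-in frame is used) -------------------
import pandas as pd

frame_demo = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
frame_demo.rename(columns={'col1': 'one'})              # returns a renamed copy
print(frame_demo.columns.tolist())                      # ['col1', 'col2'] (unchanged)
frame_demo.rename(columns={'col1': 'one'}, inplace=True)
print(frame_demo.columns.tolist())                      # ['one', 'col2']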
Exemple #56
0
def prepare_visualization_group(df: DataFrame = None, **kwargs) -> List[Any]:
    """Creates plot, table and download link for data frame.

    Arguments:
        df: The Dataframe to plot
        content: Dict[str, str]
            Mapping for translating columns and index.
        max_y_axis:  int
            Maximal value on y-axis
        labels: List[str]
            Columns to display
        table_mod: int
            Displays only each `table_mod` row in table

    """
    result = [{}, None, None]
    if df is not None and isinstance(df, DataFrame):

        date_column = "date"
        day_column = "day"

        # Translate column and index if specified
        content = kwargs.get("content", None)
        if content:
            columns = {
                col: content[col]
                for col in df.columns if col in content
            }
            index = ({
                df.index.name: content[df.index.name]
            } if df.index.name and df.index.name in content else None)
            df = df.rename(columns=columns, index=index)
            date_column = content.get(date_column, date_column)
            day_column = content.get(day_column, day_column)

        plot_data = plot_dataframe(
            df.dropna().set_index(date_column).drop(columns=[day_column]),
            max_y_axis=kwargs.get("max_y_axis", None),
        )

        # translate back for backwards compatibility of build_table
        column_map = {day_column: "day", date_column: "date"}
        table = (
            df_to_html_table(
                build_table(
                    df=df.rename(columns=column_map),
                    labels=kwargs.get("labels", df.columns),
                    modulo=kwargs.get("table_mod", 7),
                ),
                formats={
                    float: int,
                    (date, datetime): lambda d: d.strftime(DATE_FORMAT),
                },
            )
            # if kwargs.get("show_tables", None)
            # else None
        )

        # Convert column names to lowercase
        column_map = {col: col.lower() for col in df.columns}
        csv = build_csv_download(df.rename(columns=column_map))
        result = [plot_data, table, csv]

    return result
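# --- Hedged sketch of the `content` translation step above (labels illustrative) ---
import pandas as pd

content = {"day": "Tag", "date": "Datum"}
demo = pd.DataFrame({"day": [0, 1], "date": ["2020-03-01", "2020-03-02"]})
columns = {col: content[col] for col in demo.columns if col in content}
print(demo.rename(columns=columns).columns.tolist())  # ['Tag', 'Datum']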
def fix_misnamed_cols(df: pd.DataFrame) -> pd.DataFrame:
    if 'attribute' in df.columns and 'value' in df.columns:
        return df.rename(columns={'attribute': 'candidate', 'value': 'votes'})
    else:
        return df
def translate_columns(input: pd.DataFrame) -> pd.DataFrame:
    return input.rename(columns={"datum": "date"})
class Device(object):
    ''' Main implementation of the device class '''
    def __init__(self, blueprint=None, descriptor={}):
        '''
        Creates an instance of device. Devices are objects that contain sensors readings, metrics 
        (calculations based on sensors readings), and metadata such as units, dates, frequency and source

        Parameters:
        -----------
        blueprint: String
            Default: 'sck_21'
            Defines the type of device. For instance: sck_21, sck_20, csic_station, muv_station
            parrot_soil, sc_20_station, sc_21_station. A list of all the blueprints is found in 
            config.blueprints_urls and accessible via the scdata.utils.load_blueprints(urls) function.
            The blueprint can also be defined from the postprocessing info in SCAPI. The manual
            parameter passed here is overriden by that of the API.

        descriptor: dict()
            Default: empty dict
            A dictionary containing information about the device itself. Depending on the blueprint, this descriptor
            needs to have different data. If not all the data is present, the corresponding blueprint's default will 
            be used

        Examples:
        ----------
        Device('sck_21', descriptor = {'source': 'api', 'id': '1919'})
            device with sck_21 blueprint with 1919 ID
        Device(descriptor = {'source': 'api', 'id': '1919'})
            device with sck_21 blueprint with 1919 ID

        Returns
        ----------
            Device object
        '''

        self.skip_blueprint = False

        if blueprint is not None:
            self.blueprint = blueprint
            self.skip_blueprint = True
        else:
            self.blueprint = 'sck_21'

        # Set attributes
        if self.blueprint not in config.blueprints:
            raise ValueError(
                f'Specified blueprint {self.blueprint} is not in config')

        self.set_blueprint_attrs(config.blueprints[self.blueprint])
        self.blueprint_loaded_from_url = False
        self.hardware_loaded_from_url = False

        self.description = descriptor
        self.set_descriptor_attrs()

        if self.id is not None: self.id = str(self.id)

        # Postprocessing and forwarding
        self.hardware_url = None
        self.blueprint_url = None
        self.forwarding_params = None
        self.forwarding_request = None
        self.meta = None
        self.latest_postprocessing = None
        self.processed = False
        self.hardware_description = None

        # Add API handler if needed
        if self.source == 'api':

            hmod = __import__('scdata.io.device_api',
                              fromlist=['io.device_api'])
            Hclass = getattr(hmod, self.sources[self.source]['handler'])

            # Create object
            self.api_device = Hclass(did=self.id)

            std_out(f'Checking postprocessing info from API device')

            if self.load_postprocessing() and (
                    self.hardware_url is
                    None):  # or self.blueprint_url is None):
                if config._strict:
                    raise ValueError(
                        'Postprocessing could not be loaded as it is incomplete and strict mode is enabled'
                    )
                std_out(
                    f'Postprocessing loaded but with problems (hardware_url: {self.hardware_url} // blueprint_url: {self.blueprint_url})',
                    'WARNING')

        if self.blueprint is None:
            raise ValueError(
                f'Device {self.id} cannot be init without blueprint. Need a blueprint to proceed'
            )
        else:
            std_out(f'Device {self.id} is using {self.blueprint} blueprint')

        self.readings = DataFrame()
        self.loaded = False
        self.options = dict()
        std_out(f'Device {self.id} initialised', 'SUCCESS')

    def set_blueprint_attrs(self, blueprintd):

        # Set attributes
        for bpitem in blueprintd:
            self.__setattr__(bpitem, blueprintd[bpitem])

    def set_descriptor_attrs(self):

        # Descriptor attributes
        for ditem in self.description.keys():
            if ditem not in vars(self):
                std_out(f'Ignoring {ditem} from input', 'WARNING')
                continue
            if type(self.__getattribute__(ditem)) == dict:
                self.__setattr__(
                    ditem,
                    dict_fmerge(self.__getattribute__(ditem),
                                self.description[ditem]))
            else:
                self.__setattr__(ditem, self.description[ditem])

    def check_overrides(self, options={}):

        if 'min_date' in options.keys():
            self.options['min_date'] = options['min_date']
        else:
            self.options['min_date'] = self.min_date

        if 'max_date' in options.keys():
            self.options['max_date'] = options['max_date']
        else:
            self.options['max_date'] = self.max_date

        if 'clean_na' in options.keys():
            self.options['clean_na'] = options['clean_na']
        else:
            self.options['clean_na'] = self.clean_na

        if 'frequency' in options.keys():
            self.options['frequency'] = options['frequency']
        elif self.frequency is not None:
            self.options['frequency'] = self.frequency
        else:
            self.options['frequency'] = '1Min'

    def load_postprocessing(self):

        if self.source != 'api': return None

        if self.sources[self.source]['handler'] != 'ScApiDevice': return None

        # Request to get postprocessing information
        if self.api_device.get_device_postprocessing() is None: return None

        # Put it where it goes
        try:
            self.hardware_url = self.api_device.postprocessing['hardware_url']
            self.blueprint_url = self.api_device.postprocessing[
                'blueprint_url']
            self.latest_postprocessing = self.api_device.postprocessing[
                'latest_postprocessing']
            self.forwarding_params = self.api_device.postprocessing[
                'forwarding_params']
            self.meta = self.api_device.postprocessing['meta']
            inc_postprocessing = False
        except KeyError:
            std_out('Ignoring postprocessing info as it is incomplete',
                    'WARNING')
            inc_postprocessing = True
            pass

        if inc_postprocessing: return None

        # Load postprocessing info from url
        if url_checker(
                self.hardware_url) and self.hardware_loaded_from_url == False:

            std_out(f'Loading hardware information from:\n{self.hardware_url}')
            hardware_description = get_json_from_url(self.hardware_url)

            # TODO
            # Add additional checks to hardware_description

            if hardware_description is not None:
                self.hardware_description = hardware_description
                std_out('Hardware described in url is valid', "SUCCESS")
                self.hardware_loaded_from_url = True
            else:
                std_out("Hardware in url is not valid", 'ERROR')
                self.hardware_description = None

        # Find forwarding request
        if self.hardware_description is not None:
            if 'forwarding' in self.hardware_description:
                if self.hardware_description[
                        'forwarding'] in config.connectors:
                    self.forwarding_request = self.hardware_description[
                        'forwarding']
                    std_out(
                        f"Requested a {self.hardware_description['forwarding']} connector for {self.id}"
                    )
                    if self.forwarding_params is None:
                        std_out(
                            'Assuming device has never been posted. Forwarding parameters are empty',
                            'WARNING')
                    else:
                        std_out(
                            f'Connector parameters are not empty: {self.forwarding_params}'
                        )
                else:
                    std_out(
                        f"Requested a {self.hardware_description['forwarding']} connector that is not available. Ignoring",
                        'WARNING')

        # Find postprocessing blueprint
        if self.skip_blueprint:
            std_out(
                'Skipping blueprint as it was defined in device constructor',
                'WARNING')
        if self.blueprint_loaded_from_url == False and not self.skip_blueprint:

            # Case when there is no info stored
            if url_checker(self.blueprint_url):
                std_out(
                    f'blueprint_url in platform is not empty. Loading postprocessing blueprint from:\n{self.blueprint_url}'
                )
                nblueprint = basename(urlparse(
                    self.blueprint_url).path).split('.')[0]
            else:
                std_out(f'blueprint_url in platform is not valid', 'WARNING')
                std_out(
                    f'Checking if there is a blueprint_url in hardware_description'
                )
                if self.hardware_description is None:
                    std_out("Hardware description is not useful for blueprint",
                            'ERROR')
                    return None
                if 'blueprint_url' in self.hardware_description:
                    std_out(
                        f"Trying postprocessing blueprint from:\n{self.hardware_description['blueprint_url']}"
                    )
                    nblueprint = basename(
                        urlparse(self.hardware_description['blueprint_url']).
                        path).split('.')[0]
                    tentative_urls = url_checker(
                        self.hardware_description['blueprint_url'])
                    if len(tentative_urls) > 0:
                        self.blueprint_url = tentative_urls[0]
                    else:
                        std_out('Invalid blueprint', 'ERROR')
                        return None
                else:
                    std_out('Postprocessing not possible without blueprint',
                            'ERROR')
                    return None

            std_out(f'Using hardware postprocessing blueprint: {nblueprint}')
            lblueprint = get_json_from_url(self.blueprint_url)

            if lblueprint is not None:
                self.blueprint = nblueprint
                self.blueprint_loaded_from_url = True
                self.set_blueprint_attrs(lblueprint)
                self.set_descriptor_attrs()
                std_out('Blueprint loaded from url', 'SUCCESS')
            else:
                std_out('Blueprint in url is not valid', 'ERROR')
                return None

        return self.api_device.postprocessing

    def validate(self):
        if self.hardware_description is not None: return True
        else: return False

    def load(self,
             options=None,
             path=None,
             convert_units=True,
             only_unprocessed=False,
             max_amount=None):
        '''
        Loads the device with some options

        Parameters:
        -----------
        options: dict()
            Default: None
            options['min_date'] = date to load data from
                Default to device min_date (from blueprint or test)
            options['max_date'] = date to load data to
                Default to device max_date (from blueprint or test)
            options['clean_na'] = clean na (drop_na, fill_na or None)
                Default to device clean_na (from blueprint or test)
            options['frequency'] = frequency to load data at in pandas format
                Default to device frequency (from blueprint or test) or '1Min'
        path: String
            Default: None
            Path where the csv file is, if any. Normally does not need to be provided; only for internal usage
        convert_units: bool
            Default: True
            Convert units for channels based on config._channel_lut
        only_unprocessed: bool
            Default: False
            Loads only unprocessed data
        max_amount: int
            Default: None
            Trim dataframe to this amount for processing and forwarding purposes
        Returns
        ----------
            True if loaded correctly
        '''
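        # Hedged usage sketch (not from the library's own docs): assuming a Device
        # instance `device` was built elsewhere from a blueprint or test, a call could
        # look like the following; all option values below are illustrative only.
        #
        #   device.load(options={'min_date': '2021-01-01',
        #                        'max_date': None,
        #                        'frequency': '1Min',
        #                        'clean_na': None},
        #               only_unprocessed=True,
        #               max_amount=500)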

        # Add test overrides if we have them, otherwise set device defaults
        if options is not None: self.check_overrides(options)
        else: self.check_overrides()

        try:
            if self.source == 'csv':
                self.readings = self.readings.combine_first(
                    read_csv_file(join(path, self.processed_data_file),
                                  self.location, self.options['frequency'],
                                  self.options['clean_na'],
                                  self.sources[self.source]['index']))
                if self.readings is not None:
                    self.__convert_names__()

            elif 'api' in self.source:

                # Get device location
                self.location = self.api_device.get_device_timezone()

                if path is None:
                    # Not cached case
                    if only_unprocessed:

                        # Override dates for post-processing
                        if self.latest_postprocessing is not None:
                            hw_latest_postprocess = localise_date(
                                self.latest_postprocessing,
                                'UTC').strftime('%Y-%m-%dT%H:%M:%S')
                            # Override min loading date
                            self.options['min_date'] = hw_latest_postprocess

                    df = self.api_device.get_device_data(
                        self.options['min_date'], self.options['max_date'],
                        self.options['frequency'], self.options['clean_na'])

                    # API Device is not aware of other csv index data, so make it here
                    if 'csv' in self.sources and df is not None:
                        df = df.reindex(
                            df.index.rename(self.sources['csv']['index']))

                    # Combine it with readings if possible
                    if df is not None:
                        self.readings = self.readings.combine_first(df)

                else:
                    # Cached case
                    self.readings = self.readings.combine_first(
                        read_csv_file(join(path,
                                           str(self.id) + '.csv'),
                                      self.location, self.options['frequency'],
                                      self.options['clean_na'],
                                      self.sources['csv']['index']))

        except FileNotFoundError:
            # Handle error
            if 'api' in self.source:
                std_out(
                    f'No cached data file found for device {self.id} in {path}. Moving on',
                    'WARNING')
            elif 'csv' in self.source:
                std_out(f'File not found for device {self.id} in {path}',
                        'ERROR')

            self.loaded = False
        except:
            print_exc()
            self.loaded = False
        else:
            if self.readings is not None:
                self.__check_sensors__()
                if max_amount is not None:
                    self.readings = self.readings.dropna(
                        axis=0, how='all').head(max_amount)
                if not self.readings.empty:
                    # Only add metrics if there is something that can be potentially processed
                    self.__fill_metrics__()
                    self.loaded = True
                    if convert_units: self.__convert_units__()
                else:
                    std_out('Empty dataframe in readings', 'WARNING')

        finally:
            self.processed = False
            return self.loaded

    def __fill_metrics__(self):
        std_out('Checking if metrics need to be added based on hardware info')

        if self.hardware_description is None:
            std_out(f'No hardware description in device {self.id}, ignoring')
            return None

        # Now go through sensor versions and add them to the metrics
        if 'versions' in self.hardware_description:
            for version in self.hardware_description['versions']:

                from_date = version["from"]
                to_date = version["to"]

                # Do not add any metric if the from_date of the calibration is after the last_reading_at
                # as there would be nothing to process
                if from_date > self.api_device.last_reading_at:
                    std_out(
                        'Postprocessing from_date is later than device last_reading_at',
                        'ERROR')
                    return None

                for slot in version["ids"]:

                    # Alphasense type - AAN 803-04
                    if slot.startswith('AS'):

                        sensor_id = version["ids"][slot]
                        as_type = config._as_sensor_codes[sensor_id[0:3]]
                        pollutant = as_type[as_type.index('_') + 1:]
                        if pollutant == 'OX': pollutant = 'O3'

                        # Get working and auxiliary electrode names
                        wen = f"ADC_{slot.strip('AS_')[:slot.index('_')]}_{slot.strip('AS_')[slot.index('_')+1]}"
                        aen = f"ADC_{slot.strip('AS_')[:slot.index('_')]}_{slot.strip('AS_')[slot.index('_')+2]}"

                        if pollutant not in self.metrics:
                            # Create Metric
                            std_out(
                                f'Metric {pollutant} not in blueprint, ignoring.',
                                'WARNING')
                        else:
                            # Simply fill it up
                            std_out(
                                f'{pollutant} found in blueprint metrics, filling up with hardware info'
                            )
                            self.metrics[pollutant]['kwargs']['we'] = wen
                            self.metrics[pollutant]['kwargs']['ae'] = aen
                            self.metrics[pollutant]['kwargs'][
                                'location'] = self.location
                            self.metrics[pollutant]['kwargs'][
                                'alphasense_id'] = str(sensor_id)
                            self.metrics[pollutant]['kwargs'][
                                'from_date'] = from_date
                            self.metrics[pollutant]['kwargs'][
                                'to_date'] = to_date

                    # Other metric types will go here
        else:
            std_out(
                'No hardware versions found, ignoring additional metrics',
                'WARNING')

    def __check_sensors__(self):
        remove_sensors = list()
        for sensor in self.sensors:
            if sensor not in self.readings.columns:
                remove_sensors.append(sensor)

        if remove_sensors != []:
            std_out(f'Removing sensors from device: {remove_sensors}',
                    'WARNING')
        for sensor_to_remove in remove_sensors:
            self.sensors.pop(sensor_to_remove, None)

        std_out(f'Device sensors after removal: {list(self.sensors.keys())}')

    def __convert_names__(self):
        rename = dict()
        for sensor in self.sensors:
            if 'id' in self.sensors[sensor] and sensor in self.readings.columns:
                rename[self.sensors[sensor]['id']] = sensor
        self.readings.rename(columns=rename, inplace=True)

    def __convert_units__(self):
        '''
            Convert the units based on the UNIT_LUT and blueprint
            NB: what is read from/written to the cache is not converted.
            Cached files keep the original units; conversion is applied only to the
            in-memory readings and is never written back to the cache.
        '''
        std_out('Checking if units need to be converted')
        for sensor in self.sensors:
            factor = get_units_convf(sensor,
                                     from_units=self.sensors[sensor]['units'])

            if factor != 1:
                self.readings.rename(columns={sensor: sensor + '_RAW'},
                                     inplace=True)
                self.readings.loc[:, sensor] = \
                    self.readings.loc[:, sensor + '_RAW'] * factor
        std_out('Units check done', 'SUCCESS')

    def process(self, only_new=False, lmetrics=None):
        '''
        Processes devices metrics, either added by the blueprint definition
        or the addition using Device.add_metric(). See help(Device.add_metric) for
        more information about the definition of the metrics to be added

        Parameters
        ----------
        only_new: boolean
            False
            If True, skips metrics whose channel already exists in Device.readings
            (metrics are defined in Device.metrics)
        lmetrics: list
            None
            List of metrics to process. If none, processes all
        Returns
        ----------
            boolean
            True if processed ok, False otherwise
        '''
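        # Hedged usage sketch: assuming `device.load()` returned True and the blueprint
        # defines an 'NO2' metric ('NO2' is illustrative, not taken from any blueprint):
        #
        #   if device.load():
        #       device.process(only_new=True, lmetrics=['NO2'])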

        process_ok = True

        if 'metrics' not in vars(self):
            std_out(f'Device {self.id} has nothing to process. Skipping',
                    'WARNING')
            return process_ok

        std_out('---------------------------')
        std_out(f'Processing device {self.id}')

        if lmetrics is None: metrics = self.metrics
        else: metrics = dict([(key, self.metrics[key]) for key in lmetrics])

        for metric in metrics:
            std_out(f'Processing {metric}')

            if only_new and metric in self.readings:
                std_out('Skipping. Already in device')
                continue

            # Check if the metric contains a custom from_list
            if 'from_list' in metrics[metric]:
                lazy_name = metrics[metric]['from_list']
            else:
                lazy_name = f"scdata.device.process.{metrics[metric]['process']}"

            try:
                funct = LazyCallable(lazy_name)
            except ModuleNotFoundError:
                print_exc()
                std_out('Problem adding lazy callable to metrics list',
                        'ERROR')
                return False

            args, kwargs = list(), dict()
            if 'args' in metrics[metric]: args = metrics[metric]['args']
            if 'kwargs' in metrics[metric]: kwargs = metrics[metric]['kwargs']

            try:
                self.readings[metric] = funct(self.readings, *args, **kwargs)
            except KeyError:
                # print_exc()
                std_out('Metric args not in dataframe', 'ERROR')
                process_ok = False

            if metric in self.readings: process_ok &= True

        if process_ok:
            # Latest postprocessing to latest readings
            if self.api_device.get_device_postprocessing() is not None:
                std_out('Updating postprocessing')
                # Add latest postprocessing rounded up with frequency so that we don't end up in
                # an endless loop processing only the latest data line (minute vs. second precision of the readings)
                latest_postprocessing = localise_date(
                    self.readings.index[-1] +
                    to_timedelta(self.options['frequency']),
                    'UTC').strftime('%Y-%m-%dT%H:%M:%S')
                self.api_device.postprocessing[
                    'latest_postprocessing'] = latest_postprocessing

                std_out(f"{self.api_device.postprocessing}")
                std_out(f"Device {self.id} processed", "SUCCESS")

        self.processed = process_ok

        return process_ok

    def forward(self, chunk_size=500, dry_run=False):
        '''
        Forwards data to another api
        Parameters
        ----------
            chunk_size: int
                500
                Chunk size to be sent to device.post_data_to_device in question
            dry_run: boolean
                False
                Post the payload to the API or just return it
        Returns
        ----------
            boolean
            True if posted ok, False otherwise
        '''
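        # Hedged usage sketch: assuming the device was created with a valid forwarding
        # connector and its readings were processed beforehand, a dry run could be:
        #
        #   payload_ok = device.forward(chunk_size=200, dry_run=True)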

        if self.forwarding_params is None:
            std_out('Empty forwarding information', 'ERROR')
            return False

        rd = dict()
        df = self.readings.copy().dropna(axis=0, how='all')

        df.rename(columns=rd, inplace=True)

        if df.empty:
            std_out('Empty dataframe, ignoring', 'WARNING')
            return False

        # Import requested handler
        hmod = __import__('scdata.io.device_api', fromlist=['io.device_api'])
        Hclass = getattr(hmod,
                         config.connectors[self.forwarding_request]['handler'])

        # Create object
        device = Hclass(did=self.forwarding_params)
        post_ok = device.post_data_to_device(df,
                                             chunk_size=chunk_size,
                                             dry_run=dry_run)
        if post_ok: std_out(f'Posted data for {self.id}', 'SUCCESS')
        else: std_out(f'Error posting data for {self.id}', 'ERROR')

        return post_ok

    def add_metric(self, metric=dict()):
        '''
        Add a metric to the device to be processed by a callable function
        Parameters
        ----------
            metric: dict
            Empty dict
            Description of the metric to be added. It only adds it to
            Device.metrics, but does not calculate anything yet. The metric dict needs 
            to follow the format:
                metric = {
                            'metric_name': {'process': <function_name>,
                                            'args': <iterable>,
                                            'kwargs': <**kwargs for @function_name>,
                                            'from_list': <module to load function from>
                            }
                }
            The 'from_list' parameter is optional, and only needed if the process is not
            already available in scdata.device.process.

            For a list of available processes call help(scdata.device.process)

            Example:
            --------
                metric = {'NO2_CLEAN': {'process': 'clean_ts',
                                        'kwargs': {'name': pollutant,
                                                   'limits': [0, 350],
                                                   'window_size': 5}
                        }}
        Returns
        ----------
        True if added metric
        '''
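        # Hedged usage sketch: adding a hypothetical cleaned-NO2 metric and processing it.
        # The metric name, pollutant and limits below are illustrative only:
        #
        #   device.add_metric({'NO2_CLEAN': {'process': 'clean_ts',
        #                                    'kwargs': {'name': 'NO2',
        #                                               'limits': [0, 350],
        #                                               'window_size': 5}}})
        #   device.process(lmetrics=['NO2_CLEAN'])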

        if 'metrics' not in vars(self):
            std_out(f'Device {self.id} has no metrics yet. Adding')
            self.metrics = dict()

        try:
            metricn = next(iter(metric.keys()))
            self.metrics[metricn] = metric[metricn]
        except:
            print_exc()
            return False

        std_out(f'Metric {metricn} added to metrics', 'SUCCESS')
        return True

    def del_metric(self, metricn=''):
        if 'metrics' not in vars(self): return
        if metricn in self.metrics: self.metrics.pop(metricn, None)
        if metricn in self.readings.columns: self.readings.__delitem__(metricn)

        if metricn not in self.readings and metricn not in self.metrics:
            std_out(f'Metric {metricn} removed from metrics', 'SUCCESS')
            return True
        return False

    def export(self, path, forced_overwrite=False, file_format='csv'):
        '''
        Exports Device.readings to file
        Parameters
        ----------
            path: String
                Path to export file to, does not include filename.
                The filename will be the Device.id property
            forced_overwrite: boolean
                False
                Force data export in case of already existing file
            file_format: String
                'csv'
                File format to export. Currently the only supported format is CSV
        Returns
        ---------
            True if exported ok, False otherwise
        '''
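        # Hedged usage sketch: export readings to a CSV named after the device id in an
        # existing folder (the path below is illustrative):
        #
        #   device.export('/tmp/scdata_export', forced_overwrite=True, file_format='csv')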
        # Export device
        if file_format == 'csv':
            return export_csv_file(path,
                                   str(self.id),
                                   self.readings,
                                   forced_overwrite=forced_overwrite)
        else:
            std_out('File format not supported', 'ERROR')
            return False

    def post_sensors(self, dry_run=False):
        '''
        Posts device sensors. Only available for parent of ScApiDevice
            Parameters
            ----------
            dry_run: boolean
                False
                Post the payload to the API or just return it
        Returns
        ----------
            boolean
            True if posted ok, False otherwise
        '''
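        # Hedged usage sketch: a dry run only builds the payload, a regular call posts it:
        #
        #   device.post_sensors(dry_run=True)   # build the payload without posting
        #   device.post_sensors()               # post to the SmartCitizen API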

        post_ok = True
        if self.sources[self.source]['handler'] != 'ScApiDevice':
            std_out('Posting is only supported to the SmartCitizen API',
                    'ERROR')
            return False

        rd = dict()
        df = self.readings.copy().dropna(axis=0, how='all')
        for col in self.readings:
            rd[col] = self.sensors[col]['id']

        df.rename(columns=rd, inplace=True)

        if df.empty:
            std_out('Empty dataframe, ignoring', 'WARNING')
            return False

        post_ok = self.api_device.post_data_to_device(df, dry_run=dry_run)
        if post_ok: std_out(f'Posted data for {self.id}', 'SUCCESS')
        else: std_out(f'Error posting data for {self.id}', 'ERROR')

        return post_ok

    def update_postprocessing(self, dry_run=False):
        '''
        Posts device postprocessing. Only available for parent of ScApiDevice
            Parameters
            ----------
            dry_run: boolean
                False
                Post the payload to the API or just return it
        Returns
        ----------
            boolean
            True if posted ok, False otherwise
        '''
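        # Hedged usage sketch: one possible flow is to patch the postprocessing info
        # right after a successful processing run:
        #
        #   if device.process():
        #       device.update_postprocessing(dry_run=False)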

        post_ok = self.api_device.patch_postprocessing(dry_run=dry_run)

        if post_ok:
            std_out(f"Postprocessing posted for device {self.id}", "SUCCESS")
        return post_ok

    def post_metrics(self, with_postprocessing=False, dry_run=False):
        '''
        Posts device metrics. Only available for parent of ScApiDevice
        Parameters
        ----------
            with_postprocessing: boolean
                False
                Post the postprocessing_attributes too
            dry_run: boolean
                False
                Post the payload to the API or just return it
        Returns
        ----------
            boolean
            True if posted ok, False otherwise
        '''
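        # Hedged usage sketch: post processed metrics and update the postprocessing
        # information in one go (assumes the metrics define 'post' and 'id' fields):
        #
        #   device.post_metrics(with_postprocessing=True, dry_run=False)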

        post_ok = True
        if self.sources[self.source]['handler'] != 'ScApiDevice':
            std_out('Posting is only supported to the SmartCitizen API',
                    'ERROR')
            return False

        rd = dict()
        std_out(f"Posting metrics for device {self.id}")
        # Make a copy of df
        df = self.readings.copy().dropna(axis=0, how='all')
        # Get metrics to post, only the ones that have True in 'post' field and a valid ID
        for metric in self.metrics:
            if self.metrics[metric].get('post', False):
                std_out(f"Adding {metric} for device {self.id}")
                rd[metric] = self.metrics[metric]['id']

        # Keep only metrics in df
        df = df[df.columns.intersection(list(rd.keys()))]
        df.rename(columns=rd, inplace=True)

        # If empty, avoid
        if df.empty:
            std_out('Empty dataframe, ignoring', 'WARNING')
            return False

        post_ok = self.api_device.post_data_to_device(df, dry_run=dry_run)
        if post_ok: std_out(f'Posted metrics for {self.id}', 'SUCCESS')
        else: std_out(f'Error posting metrics for {self.id}', 'ERROR')

        # Post info if requested. It should be updated elsewhere
        if with_postprocessing and post_ok:
            post_ok &= self.update_postprocessing(dry_run=dry_run)

        if post_ok: std_out(f"Metrics posted for device {self.id}", "SUCCESS")
        return post_ok
Exemple #60
0
    def get_all_econ_info(self, year):
        print("{} start!".format(year))

        # Added code
        # Set up so that crawling can resume where it left off if it was interrupted
        latest_index = self._get_latest_index(year)

        num = len(self.stock_item_all)  # variable used for progress printing
        count = latest_index  # variable used for progress printing
        for stock_code in self.stock_item_all[latest_index:]:
            count += 1
            code = stock_code[0]
            code_name = stock_code[1]
            print("{} 년도 ++++++++++++++ {} ++++++++++++++ {} / {}".format(year, code_name, count, num))

            # Skip if the data already exists
            if self.is_exist_data(year, code):
                print("Data for {} {} already exists!".format(year, code_name))
                continue

            print("insert {} {}".format(year, code_name))
            corp_class = self.crp_list.find_by_stock_code(code)

            # Check whether corp_class is None
            if corp_class:
                corp_code = corp_class.to_dict()['corp_code']
            else:
                print("{} 의 corp_class 데이터가 조회되지 않습니다.".format(code_name))
                continue
            try:
                """
                dart.api.finance.get_single_corp(corp_code: str, bsns_year: str, reprt_code: str)

                corp_code: corp_code(종목코드가 아님, 공시대상회사의 고유번호(8자리)),
                bsns_year: 연도를(사업연도(4자리))
                reprt_code:
                    1분기보고서 : 11013, 반기보고서 : 110123, 3분기보고서 : 11014, 사업보고서 : 11011
                """
                res = dart.api.finance.get_single_corp(corp_code, str(year), '11011')

            except NoDataReceived as e:
                print("{} 년도 {} ({}) 데이터는 조회되지 않습니다.".format(year, code_name, code))
                continue
            df = DataFrame(res['list'])

            # Remove thousands-separator commas
            # Current year
            df['thstrm_amount'] = df['thstrm_amount'].str.replace(',', '')
            # One year earlier
            df['frmtrm_amount'] = df['frmtrm_amount'].str.replace(',', '')
            # Two years earlier
            df['bfefrmtrm_amount'] = df['bfefrmtrm_amount'].str.replace(',', '')

            # Replace '-' values with None
            df.loc[df.thstrm_amount == '-', 'thstrm_amount'] = None
            df.loc[df.frmtrm_amount == '-', 'frmtrm_amount'] = None
            df.loc[df.bfefrmtrm_amount == '-', 'bfefrmtrm_amount'] = None

            # Rename the 'stock_code' column to 'code'
            df = df.rename(columns={'stock_code': 'code'})

            # The dataframe has no code_name column, so create it
            df['code_name'] = code_name
            df.to_sql('dart', self.db_engine, if_exists='append',
                      dtype={
                          'thstrm_amount': sqlalchemy.types.BIGINT,
                          'frmtrm_amount': sqlalchemy.types.BIGINT,
                          'bfefrmtrm_amount': sqlalchemy.types.BIGINT
                      }
                      )