def test_ambiguous_warns(self):
    df = DataFrame({"A": [1, 2]})
    with tm.assert_produces_warning(FutureWarning):
        df.rename(id, id)

    with tm.assert_produces_warning(FutureWarning):
        df.rename({0: 10}, {"A": "B"})
class Rename(object):

    def setup(self):
        N = 10**3
        self.df = DataFrame(np.random.randn(N * 10, N))
        self.idx = np.arange(4 * N, 7 * N)
        self.dict_idx = {k: k for k in self.idx}
        self.df2 = DataFrame(
            {c: {0: np.random.randint(0, 2, N).astype(np.bool_),
                 1: np.random.randint(0, N, N).astype(np.int16),
                 2: np.random.randint(0, N, N).astype(np.int32),
                 3: np.random.randint(0, N, N).astype(np.int64)}[np.random.randint(0, 4)]
             for c in range(N)})

    def time_rename_single(self):
        self.df.rename({0: 0})

    def time_rename_axis0(self):
        self.df.rename(self.dict_idx)

    def time_rename_axis1(self):
        self.df.rename(columns=self.dict_idx)

    def time_rename_both_axes(self):
        self.df.rename(index=self.dict_idx, columns=self.dict_idx)

    def time_dict_rename_both_axes(self):
        self.df.rename(index=self.dict_idx, columns=self.dict_idx)
def trialToOneRow(dfs, lonumberfields, oldnewsamsecnamepairs):
    collapsedFields = {field: [dfs[field].values] for field in lonumberfields}
    collapsedTrial = DataFrame(collapsedFields)
    collapsedTrial.rename(columns=oldnewsamsecnamepairs, inplace=True)
    return collapsedTrial
def returns_customer_product_time(pwunsale_tidy, pw_ytdcust, pw_cusattr):
    '''
    Meant to feed into a Pivot requested by Mitch Turner.

    Aggregates the same as above but includes time and product data.
    '''
    dat = pwunsale_tidy['Date'].tolist()
    pwunsale_tidy['Month'] = [d.strftime('%B') for d in dat]

    print('Aggregating custom pivot for Mitch.')
    len_unique = lambda x: len(pd.unique(x))
    agg_funcs_returns = {'ExtCost': {'DollarsReturned|avg': np.mean,
                                     'DollarsReturned|sum': np.sum},
                         'CasesReturned': {'CasesReturned|avg': np.mean,
                                           'CasesReturned|sum': np.sum},
                         'Invoice': len_unique}

    custom_cols = ['Month', 'CustomerId', 'Customer', 'ProductId', 'Product']
    customer_returns = DataFrame(pwunsale_tidy.groupby(custom_cols)[['ExtCost', 'CasesReturned']].agg(agg_funcs_returns)).reset_index(drop=False)
    customer_returns.rename(columns={'<lambda>': 'Returns|count'}, inplace=True)
    customer_returns.drop('Customer', inplace=True, axis=1)

    print('Merging in YTD sales by Customer')
    customer_returns = customer_returns.merge(pw_ytdcust, on='CustomerId', how='left')

    print('Deriving returns as a percent of sales for each Customer.')
    customer_returns['PercentSales'] = np.divide(customer_returns['DollarsReturned|sum'],
                                                 customer_returns['DollarSales|bycustomer'])

    print('Merge in customer attributes.')
    customer_returns = customer_returns.merge(pw_cusattr, on='CustomerId', how='left')

    print('Sorting in descending order on Dollars returned.')
    customer_returns.sort_values('DollarsReturned|sum', ascending=False, inplace=True)

    return customer_returns
def featureize(df, vectorizers, verbose=False): """ Featurize an enhanced http dataframe Parameters ---------- df : dataframe The enhanced HTTP log dataframe vectorizers : {String -> TfidfVectorizer} A map of feature -> vectorizer verbose: boolean, optional Controls Verbosity level Returns ------- featureMatrix : numeric dataframe A featurized dataframe """ if verbose: print('\nExtracting features') bow_features = [] #featurize using the vectorizers. for feature in ['user_agent','uri','referrer','host', 'subdomain', 'method','status_code','resp_p_str', 'URIparams', 'browser_string', 'tld']: if verbose: print('Featurizing %s' % feature) single_feature_matrix = vectorizers[feature].transform(df[feature].astype(str)) if verbose: print(' Dim of %s: %s' % (feature,single_feature_matrix.shape[1])) single_df = DataFrame(single_feature_matrix.toarray()) single_df.rename(columns=lambda x: feature+"."+vectorizers[feature].get_feature_names()[x], inplace=True) bow_features.append(single_df) featureMatrix = pd.concat(bow_features, axis=1) #add some other numeric features that are functions of columns featureMatrix['domainNameLength'] = df['host'].apply(len) featureMatrix['domainNameDots'] = df['host'].apply(lambda dn: dn.count('.')) featureMatrix['uriSlashes'] = df['uri'].apply(lambda dn: dn.count('/')) featureMatrix['userAgentLength'] = df['user_agent'].apply(len) featureMatrix['userAgentEntropy'] = df['user_agent'].apply(H) featureMatrix['subdomainEntropy'] = df['subdomain'].apply(H) featureMatrix['request_body_len'] = df['request_body_len'] featureMatrix['response_body_len'] = df['response_body_len'] featureMatrix['referrerPresent'] = df['referrer'].apply(lambda r: 0.0 if (r=='-') else 1.0) def countParams(uri): fullUri = 'http://bogus.com/'+uri parseResult = parse_qs(urlparse(fullUri).query) return len(parseResult) featureMatrix['numURIParams'] = df['uri'].apply(countParams) featureMatrix['URIParamsKeyEntropy'] = df['URIparams'].apply(H) featureMatrix['URIParamsTokensEntropy'] = df['URItokens'].apply(H) if verbose: print('Feature matrix generated with %s columns' % featureMatrix.shape[1]) return featureMatrix
def change_axis01():
    data = DataFrame(np.arange(12).reshape((3, 4)),
                     index=['Ohio', 'Colorado', 'New York'],
                     columns=['one', 'two', 'three', 'four'])
    print data
    print data.index.map(str.upper)
    data.index = data.index.map(str.upper)
    print data
    print data.rename(index=str.title, columns=str.upper)
    print data.rename(index={'OHIO': 'INDIANA'}, columns={'three': 'peekaboo'}, inplace=True)
    print data
def imputation_loyer(year):
    erf = create_comparable_erf_data_frame(year)
    erf = erf[['logt', 'hnph2', 'iaat', 'mdiplo', 'mtybd', 'tu99_recoded',
               'magtr', 'mcs8', 'deci', 'wprm', 'ident']]
    erf = erf.dropna(how = 'any')  # TODO: run a check before dropping the rows that contain NAs

    Logt = create_comparable_logement_data_frame(year)
    Logt = Logt.dropna(how = 'any')

    allvars = ['logt', 'hnph2', 'iaat', 'mdiplo', 'mtybd', 'tu99_recoded', 'magtr', 'mcs8', 'deci']
    classes = ['magtr', 'tu99_recoded']
    matchvars = list(set(allvars) - set(classes))

    for variable in allvars:
        count_NA(variable, Logt)
        count_NA(variable, erf)

    erf['mcs8'] = erf['mcs8'].astype(int)

    rpy2.robjects.pandas2ri.activate()  # lets rpy2 convert pandas dataframes; pandas2ri doesn't exist anymore in rpy2
    # com.convert_to_r_dataframe()  TODO: probably to be removed
    try:
        sm = importr("StatMatch")  # launches R; StatMatch needs to be installed in R
    except:
        sm = importr("StatMatch", lib_loc = STATMATCH_LIB_LOCATION)

    out_nnd = sm.NND_hotdeck(data_rec = erf,
                             data_don = Logt,
                             match_vars = vectors.StrVector(matchvars),
                             don_class = vectors.StrVector(classes),
                             dist_fun = "Gower",
                             )
    fill_erf_nnd = sm.create_fused(data_rec = erf,
                                   data_don = Logt,
                                   mtc_ids = out_nnd[0],
                                   z_vars = vectors.StrVector(["lmlm"]),
                                   )
    del allvars, matchvars, classes, out_nnd
    gc.collect()

    fill_erf_nnd = com.convert_robj(fill_erf_nnd)
    fill_erf_nnd = DataFrame(fill_erf_nnd)
    fill_erf_nnd.rename(columns={'lmlm': 'loym'}, inplace = True)

    loy_imput = fill_erf_nnd[['ident', 'loym']]

    erfmenm = load_temp(name = "menagem", year = year)
    for var in ["loym", "loym_x", "loym_y", "loym_z"]:
        if var in erfmenm:
            del erfmenm[var]
            log.info("{} have been deleted".format(var))

    erfmenm = erfmenm.merge(loy_imput, on='ident', how='left')
    assert 'loym' in erfmenm.columns, u"The loym variable is not present in erfmenm"
    save_temp(erfmenm, name = "menagem", year=year)
def test_rename_axis_style(self):
    # https://github.com/pandas-dev/pandas/issues/12392
    df = DataFrame({"A": [1, 2], "B": [1, 2]}, index=['X', 'Y'])
    expected = DataFrame({"a": [1, 2], "b": [1, 2]}, index=['X', 'Y'])

    result = df.rename(str.lower, axis=1)
    tm.assert_frame_equal(result, expected)

    result = df.rename(str.lower, axis='columns')
    tm.assert_frame_equal(result, expected)

    result = df.rename({"A": 'a', 'B': 'b'}, axis=1)
    tm.assert_frame_equal(result, expected)

    result = df.rename({"A": 'a', 'B': 'b'}, axis='columns')
    tm.assert_frame_equal(result, expected)

    # Index
    expected = DataFrame({"A": [1, 2], "B": [1, 2]}, index=['x', 'y'])
    result = df.rename(str.lower, axis=0)
    tm.assert_frame_equal(result, expected)

    result = df.rename(str.lower, axis='index')
    tm.assert_frame_equal(result, expected)

    result = df.rename({'X': 'x', 'Y': 'y'}, axis=0)
    tm.assert_frame_equal(result, expected)

    result = df.rename({'X': 'x', 'Y': 'y'}, axis='index')
    tm.assert_frame_equal(result, expected)

    result = df.rename(mapper=str.lower, axis='index')
    tm.assert_frame_equal(result, expected)
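# Equivalence sketch (reuses the same df as the test above; assumes pandas >= 0.21,
# where the axis keyword on rename was introduced): the axis-style call and the
# older explicit keyword form produce identical results.
tm.assert_frame_equal(df.rename(str.lower, axis='columns'),
                      df.rename(columns=str.lower))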
def slide_21():
    import json
    db = json.load(open(FOODJSONPATH))
    print len(db)
    print db[0].keys()
    print db[0]['nutrients'][0]

    nutrients = DataFrame(db[0]['nutrients'])
    print nutrients[:7]

    info_keys = ['description', 'group', 'id', 'manufacturer']
    info = DataFrame(db, columns=info_keys)
    print info[:5]
    print pd.value_counts(info.group)[:10]

    print "Now let's handle all the nutrients"
    nutrients = []
    for rec in db:
        fnuts = DataFrame(rec['nutrients'])
        fnuts['id'] = rec['id']
        nutrients.append(fnuts)
    nutrients = pd.concat(nutrients, ignore_index=True)

    print "There seem to be a lot of duplicates"
    print nutrients.duplicated().sum()
    nutrients = nutrients.drop_duplicates()

    print "Both info and nutrients have description and group, so rename them"
    col_mapping = {'description': 'food', 'group': 'fgroup'}
    info = info.rename(columns=col_mapping, copy=False)
    col_mapping = {'description': 'nutrient', 'group': 'nutgroup'}
    nutrients = nutrients.rename(columns=col_mapping, copy=False)

    ndata = pd.merge(nutrients, info, on='id', how='outer')
    print ndata.ix[30000]

    result = ndata.groupby(['nutrient', 'fgroup'])['value'].quantile(0.5)
    result['Zinc, Zn'].order().plot(kind='barh')
    plt.show()

    by_nutrient = ndata.groupby(['nutgroup', 'nutrient'])
    get_maximum = lambda x: x.xs(x.value.idxmax())
    get_minimum = lambda x: x.xs(x.value.idxmin())
    max_foods = by_nutrient.apply(get_maximum)[['value', 'food']]
    max_foods.food = max_foods.food.str[:50]
    print max_foods.ix['Amino Acids']['food']
def append_village_areas(divname):
    im_vil = pd.read_csv('../data/%s_village_images.csv' % divname.lower())
    shape_helper = ShapeHelper('../data/shapefiles/fixed_village_shapefiles/%s/%s.shp' % (divname.lower(), divname.lower()),
                               lat_offset, lon_offset)
    areas = shape_helper.get_shape_areas('village')
    areas_df = DataFrame(areas, index=['area'])
    areas_df = areas_df.transpose()
    areas_df.reset_index(inplace=True)
    areas_df.rename(columns={'index': 'village'}, inplace=True)
    im_vil_areas = pd.merge(im_vil, areas_df, how='left')
    im_vil_areas.set_index('image', inplace=True)
    im_vil_areas.to_csv('../data/%s_village_areas_images.csv' % divname.lower())
def obs_for_station_as_df(self, station_id, datatype, limit):
    obs_results = self.obs_for_station_q(station_id, datatype, limit)
    if not FbStationsApi.check_ok(obs_results, "obs_for_station_q"):
        return None
    obs_data = obs_results.json()['results']
    if bool(obs_data):
        obs_results_df = DataFrame(obs_data)
        obs_results_df.rename(columns={'value': datatype}, inplace=True)
        obs_results_df['date'] = obs_results_df['date'].astype('datetime64[ns]')
        return obs_results_df
    else:
        return None
def fix_tickets(
        self, ticket_frame: pd.DataFrame, path_fixes) -> pd.DataFrame:
    ticket_frame.rename(
        columns={'Total changed lines': 'ChangedLines'}, inplace=True)
    ticket_frame = ticket_frame[
        ticket_frame.ChangedLines < 100000]
    ticket_frame = ticket_frame.assign(
        ChangedFiles=ticket_frame['Changed files'].apply(
            partial(self.fix_path_prefixes, path_fixes)))
    fixed_frame = ticket_frame.drop(
        'Changed files', axis=1).sort_values(
            by='CommitDate').reset_index(drop=True)
    fixed_frame.fillna(value={'Found': ''}, axis=0, inplace=True)

    return fixed_frame
def forecast_as_df(self, lat, lon, datatype, limit):
    forecast_results = self.forecast_q(lat, lon, limit)
    if not FbForecastApi.check_ok(forecast_results, "forecast_q"):
        return None
    forecast_data = forecast_results.json()['results']
    if bool(forecast_data):
        forecast_results_df = DataFrame(forecast_data)
        forecast_results_df.rename(columns={'value': datatype}, inplace=True)
        forecast_results_df['forecast_date'] = forecast_results_df['forecast_date'].astype('datetime64[ns]')
        forecast_results_df['model_date'] = forecast_results_df['model_date'].astype('datetime64[ns]')
        return forecast_results_df
    else:
        return None
def get_cpu_sw_map(dfds, cap_time_usec, task_re): df_list = [] dfsw_list = [] for dfd in dfds: df = filter_df_core(dfd.df, task_re, True) # at this point we have a set of df that look like this: # task_name duration # 0 ASA.1.vcpu0 7954 # 1 ASA.1.vcpu0 5475 # 2 ASA.1.vcpu0 4151 if df.empty: continue gb = df.groupby("task_name", as_index=False) # sum all duration for each task df = gb.aggregate(np.sum) if dfd.multiplier > 1.0: df["duration"] = (df["duration"] * dfd.multiplier).astype(int) df["percent"] = ((df["duration"] * 100 * 10) // cap_time_usec) / 10 if len(dfds) > 1: df["task_name"] = df["task_name"] + "." + dfd.short_name df_list.append(df) # count number of rows with same task and cpu dfsw = DataFrame(gb.size()) dfsw.reset_index(inplace=True) dfsw.rename(columns={0: "count"}, inplace=True) if dfd.multiplier > 1.0: dfsw["count"] = (dfsw["count"] * dfd.multiplier).astype(int) else: dfsw["count"] = dfsw["count"].astype(int) dfsw_list.append(dfsw) if not df_list: return None df = pandas.concat(df_list) df = df.drop("duration", axis=1) dfsw = pandas.concat(dfsw_list) df = pandas.merge(df, dfsw, on="task_name") # Result: # task_name percent count # 0 ASA.01.vcpu0.1x218 72.0 1998 # 1 ASA.01.vcpu0.2x208 61.8 2128 # 2 ASA.02.vcpu0.2x208 58.9 2177 # transform this into a dict where the key is the task_name and the value # is a list [percent, count] return df.set_index("task_name").T.to_dict("list")
def test_rename(self):
    mapping = {
        'A': 'a',
        'B': 'b',
        'C': 'c',
        'D': 'd'
    }

    renamed = self.frame.rename(columns=mapping)
    renamed2 = self.frame.rename(columns=str.lower)

    assert_frame_equal(renamed, renamed2)
    assert_frame_equal(renamed2.rename(columns=str.upper),
                       self.frame, check_names=False)

    # index
    data = {
        'A': {'foo': 0, 'bar': 1}
    }

    # gets sorted alphabetical
    df = DataFrame(data)
    renamed = df.rename(index={'foo': 'bar', 'bar': 'foo'})
    tm.assert_index_equal(renamed.index, pd.Index(['foo', 'bar']))

    renamed = df.rename(index=str.upper)
    tm.assert_index_equal(renamed.index, pd.Index(['BAR', 'FOO']))

    # have to pass something
    pytest.raises(TypeError, self.frame.rename)

    # partial columns
    renamed = self.frame.rename(columns={'C': 'foo', 'D': 'bar'})
    tm.assert_index_equal(renamed.columns,
                          pd.Index(['A', 'B', 'foo', 'bar']))

    # other axis
    renamed = self.frame.T.rename(index={'C': 'foo', 'D': 'bar'})
    tm.assert_index_equal(renamed.index,
                          pd.Index(['A', 'B', 'foo', 'bar']))

    # index with name
    index = Index(['foo', 'bar'], name='name')
    renamer = DataFrame(data, index=index)
    renamed = renamer.rename(index={'foo': 'bar', 'bar': 'foo'})
    tm.assert_index_equal(renamed.index,
                          pd.Index(['bar', 'foo'], name='name'))
    assert renamed.index.name == renamer.index.name
def test_convert_dummies(self):
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
            "C": np.random.randn(8),
            "D": np.random.randn(8),
        }
    )

    with tm.assert_produces_warning(FutureWarning):
        result = convert_dummies(df, ["A", "B"])
        result2 = convert_dummies(df, ["A", "B"], prefix_sep=".")

    expected = DataFrame(
        {
            "A_foo": [1, 0, 1, 0, 1, 0, 1, 1],
            "A_bar": [0, 1, 0, 1, 0, 1, 0, 0],
            "B_one": [1, 1, 0, 0, 0, 0, 1, 0],
            "B_two": [0, 0, 1, 0, 1, 1, 0, 0],
            "B_three": [0, 0, 0, 1, 0, 0, 0, 1],
            "C": df["C"].values,
            "D": df["D"].values,
        },
        columns=result.columns,
        dtype=float,
    )
    expected2 = expected.rename(columns=lambda x: x.replace("_", "."))

    tm.assert_frame_equal(result, expected)
    tm.assert_frame_equal(result2, expected2)
def merge_by_year(year):
    "Merge the two dataframes on Country"
    data1 = Df(transIncome.ix[year])
    data1 = data1.rename(columns={year: 'Income'})
    data1['Country'] = transIncome.columns
    mergedData = pd.merge(data1, countries, on=['Country'])
    return mergedData
def group_by_time_transform(self): dp_df_columns = ['data_date','indicator_id','location_id','value'] time_grouping = self.parsed_params['group_by_time'] # HACKK if self.parsed_params['chart_uuid'] ==\ '5599c516-d2be-4ed0-ab2c-d9e7e5fe33be': self.parsed_params['show_missing_data'] = 1 return self.handle_polio_case_table(dp_df_columns) cols = ['data_date','indicator_id','location_id','value'] dp_df = DataFrame(list(DataPoint.objects.filter( location_id__in = self.location_ids, indicator_id__in = self.parsed_params['indicator__in'] ).values(*cols)),columns=cols) if not dp_df.empty: dp_df = self.get_time_group_series(dp_df) gb_df = DataFrame(dp_df\ .groupby(['indicator_id','time_grouping','location_id'])['value']\ .sum())\ .reset_index() return gb_df # need to look at sublocations if the data isn't available at the current level else: depth_level, max_depth, sub_location_ids = 0, 3, self.location_ids while dp_df.empty and depth_level < max_depth: sub_location_ids = Location.objects\ .filter(parent_location_id__in=sub_location_ids)\ .values_list('id', flat=True) dp_df = DataFrame(list(DataPoint.objects.filter( location_id__in = sub_location_ids, indicator_id__in = self.parsed_params['indicator__in'] ).values(*cols)),columns=cols) depth_level += 1 dp_df = self.get_time_group_series(dp_df) if dp_df.empty: return [] location_tree_df = DataFrame(list(LocationTree.objects\ .filter(location_id__in = sub_location_ids)\ .values_list('location_id','parent_location_id')),\ columns=['location_id','parent_location_id']) merged_df = dp_df.merge(location_tree_df) filtered_df = merged_df[merged_df['parent_location_id']\ .isin(self.location_ids)] gb_df = DataFrame(filtered_df\ .groupby(['indicator_id','time_grouping','parent_location_id'])['value']\ .sum())\ .reset_index() gb_df = gb_df.rename(columns={'parent_location_id' : 'location_id'}) return gb_df
def customer_return_summary(pw_cusattr, pwunsale_tidy, pw_ytdcust): ''' Derives intelligence out of MTC1 data on customer returns. ''' print('*'*100) print('Creating summary of returns.') print('*'*100) len_unique = lambda x: len(pd.unique(x)) agg_funcs_returns = {'ExtCost': {'DollarsReturned|avg':np.mean, 'DollarsReturned|sum':np.sum}, 'CasesReturned': {'CasesReturned|avg':np.mean, 'CasesReturned|sum':np.sum}, 'Invoice':len_unique } print('\n\n\nAggregating tidy dataset.') customer_returns = DataFrame(pwunsale_tidy.groupby(['CustomerId','Customer'])[['ExtCost','CasesReturned']].agg(agg_funcs_returns)).reset_index(drop=False) customer_returns.rename(columns={'<lambda>':'Returns|count'}, inplace=True) customer_returns.drop('Customer', inplace=True, axis=1) print('Merging in YTD sales by Customer') customer_returns = customer_returns.merge(pw_ytdcust, on='CustomerId', how='left') print('Deriving returns as a percent of sales for each Customer.') customer_returns['PercentSales'] = np.divide(customer_returns['DollarsReturned|sum'], customer_returns['DollarSales|bycustomer']) print('Merge in customer attributes.') customer_returns = customer_returns.merge(pw_cusattr, on='CustomerId', how='left') print('Sorting in descending order on Dollars returned.') customer_returns.sort_values('DollarsReturned|sum', ascending=False, inplace=True) print('Reorder columns for readability.\n\n\n') reorder_cols = ['CustomerId','Customer','Returns|count', 'PercentSales','DollarSales|bycustomer', 'DollarsReturned|sum','DollarsReturned|avg', 'CasesReturned|sum','CasesReturned|avg', 'OnPremise','Latitude','Longitude'] customer_returns = customer_returns[reorder_cols] print('*'*100) print('Finished summarizing returns.') print('*'*100) return customer_returns
def test_frame_describe_tupleindex(self):
    # GH 14848 - regression from 0.19.0 to 0.19.1
    df1 = DataFrame({'x': [1, 2, 3, 4, 5] * 3,
                     'y': [10, 20, 30, 40, 50] * 3,
                     'z': [100, 200, 300, 400, 500] * 3})
    df1['k'] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5
    df2 = df1.rename(columns={'k': 'key'})
    pytest.raises(ValueError, lambda: df1.groupby('k').describe())
    pytest.raises(ValueError, lambda: df2.groupby('key').describe())
def test_rename_positional(self):
    df = DataFrame(columns=['A', 'B'])
    with tm.assert_produces_warning(FutureWarning) as rec:
        result = df.rename(None, str.lower)
    expected = DataFrame(columns=['a', 'b'])
    tm.assert_frame_equal(result, expected)
    assert len(rec) == 1
    message = str(rec[0].message)
    assert 'rename' in message
    assert 'Use named arguments' in message
def test_insert_column_bug_4032(self):
    # GH4032, inserting a column and renaming causing errors
    df = DataFrame({'b': [1.1, 2.2]})
    df = df.rename(columns={})
    df.insert(0, 'a', [1, 2])

    result = df.rename(columns={})
    str(result)
    expected = DataFrame([[1, 1.1], [2, 2.2]], columns=['a', 'b'])
    assert_frame_equal(result, expected)
    df.insert(0, 'c', [1.3, 2.3])

    result = df.rename(columns={})
    str(result)
    expected = DataFrame([[1.3, 1, 1.1], [2.3, 2, 2.2]],
                         columns=['c', 'a', 'b'])
    assert_frame_equal(result, expected)
def test_rename_bug2(self):
    # GH 19497
    # rename was changing Index to MultiIndex if Index contained tuples
    df = DataFrame(data=np.arange(3), index=[(0, 0), (1, 1), (2, 2)],
                   columns=["a"])
    df = df.rename({(1, 1): (5, 4)}, axis="index")
    expected = DataFrame(data=np.arange(3), index=[(0, 0), (5, 4), (2, 2)],
                         columns=["a"])
    assert_frame_equal(df, expected)
def nearestNeighborsSetup(filename, stateList): df_specimens = formatChecker(filename) print 'Getting the weather stations' with open('input/acis_station_ID.pickle') as f: weatherStationsMetaData = cPickle.load(f) # weatherStationsMetaData = weatherStations(stateList) # weatherStationsMetaData = read_csv('weatherStation/acis_station_ID.csv') df_stations = DataFrame.from_dict(weatherStationsMetaData, orient='index', dtype=None) '''Loads the lat/long coordinates of the specimens and weather stations into numpy arrays. NearestNeighborsResults() will return he number of K (nearest stations) with the index value. Then index will be replaced by the UID to match the ASIC data serve.''' #Number of points np1 = np.array(df_specimens['longitude']).size np2 = np.array(df_stations['longitude']).size #Search radius r = .25 #Number of nearest stations returned k = 10 d1 = np.empty((np1, 2)) d2 = np.empty((np2, 2)) d1[:, 0] = np.array(df_specimens['latitude']) d1[:, 1] = np.array(df_specimens['longitude']) d2[:, 0] = np.array(df_stations['latitude']) d2[:, 1] = np.array(df_stations['longitude']) result, distance = nearestNeighborsResults(d1.copy(), d2.copy(), r, k) columnindex = [] closestStationList = [nearestNeighborsColumnString(x) for x in range(k)] for f in closestStationList: columnindex.append(f()), #temp variable for 0-N array t1 = np.arange(np2) #temp variable for 'uid' ID t2 = np.array(df_stations['uid']) df_results = DataFrame(result, columns=columnindex) #Creates a Pandas DataFrame uid_index = DataFrame({'0_closest_weather_station': t1, 'uid': t2}) for index, column_name in enumerate(columnindex): temp = uid_index.rename(columns={'0_closest_weather_station': column_name, 'uid': column_name + "s"}) df_results = df_results.reset_index().merge(temp, how='left', on= column_name, sort=False).sort('index') if index != 0: del df_results['level_0'] del df_results[column_name] del df_results['index'] df_results = df_results.reset_index() return concat([df_specimens, df_results], axis=1), distance, weatherStationsMetaData
def deal_string02():
    import json
    db = json.load(open(u'D:\study\书籍\python\pydata-book-master\pydata-book-master\ch07\\foods-2011-10-03.json'))
    print len(db)
    print db[0]
    print db[0].keys()
    print db[0]['nutrients'][0]
    nutrients = DataFrame(db[0]['nutrients'])
    print nutrients[:7]
    info_keys = ['description', 'group', 'id', 'manufacturer']
    info = DataFrame(db, columns=info_keys)
    print info[:5]
    print pd.value_counts(info.group)[:10]

    nutrients = []
    for rec in db:
        fnuts = DataFrame(rec['nutrients'])
        fnuts['id'] = rec['id']
        nutrients.append(fnuts)
    nutrients = pd.concat(nutrients, ignore_index=True)
    print nutrients
    print nutrients.duplicated().sum()
    nutrients = nutrients.drop_duplicates()

    col_mapping = {'description': 'food', 'group': 'fgroup'}
    info = info.rename(columns=col_mapping, copy=False)
    print info
    col_mapping = {'description': 'nutrient', 'group': 'nutgroup'}
    nutrients = nutrients.rename(columns=col_mapping, copy=False)
    print nutrients

    ndata = pd.merge(nutrients, info, on='id', how='outer')
    print ndata
    print ndata.ix[3000]

    result = ndata.groupby(['nutrient', 'fgroup'])['value'].quantile(0.5)
    # print result
    result['Zinc, Zn'].sort_values().plot(kind='barh')

    by_nutrient = ndata.groupby(['nutgroup', 'nutrient'])
    get_maximum = lambda x: x.xs(x.value.idxmax())
    get_minimum = lambda x: x.xs(x.value.idxmin())  # was x.value.idmin(), which is not a Series method
    max_foods = by_nutrient.apply(get_maximum)[['value', 'food']]
    max_foods.food = max_foods.food.str[:50]
    print max_foods.ix['Amino Acids']['food']
def get_plate_data(path, c):
    """
    Get plate data, drop empty columns, drop selected columns,
    rename columns, add normalized columns.
    """
    return thread_first(path,
                        from_file,
                        (str.replace, '\r', ''),
                        StringIO,
                        pd.read_csv(delimiter=c['delimiter'], skiprows=c['skiprows']),
                        df.dropna(axis=1, how='all'),
                        (drop_matching_columns, c['dropcols']),
                        df.rename(columns=c['colrename']),
                        (add_normalized_columns, c['normcols']))
def parse_data(self, articles):
    """
    Responsible for parsing articles in order to extract data.
    Data is extracted as a DataFrame containing the following columns:
        - Article metadata: only the metadata defined in self.metadata_columns are extracted
        - Article tags: all tags are extracted; the names defined in self.tag_columns are used to rename columns
    Data is indexed by a generated ID (integer).

    :param articles: The articles to parse.
    """
    tags = []
    metadata = []

    # TODO not the most efficient way to do that, I think.
    for article in articles:
        if hasattr(article, "tags"):
            # Extracting all tag names from an article and putting them in a Series
            tags.append(
                Series([tag.name for tag in article.tags],
                       ["tag_" + str(x) for x in range(len(article.tags))])
            )

        # Selecting metadata, only the ones specified in the columns
        metadata.append(
            Series(
                dict([(i, article.metadata[i]) for i in self.metadata_columns if i in article.metadata]),
                self.metadata_columns,
            )
        )

    # Creating the tags DataFrame
    tags_data_frame = DataFrame(tags)

    # Renaming columns, leaving the remaining ones with the generated name "tag_"
    # Mapping current column names to the new ones in order to make a replacement
    if self.tag_columns is not None:
        replacement = dict(zip(tags_data_frame.columns.get_values()[: len(self.tag_columns)], self.tag_columns))
        # Inplace means no copy
        tags_data_frame.rename(columns=replacement, inplace=True)

    # Creating the metadata DataFrame
    metadata_data_frame = DataFrame(metadata)

    # Replacing data in column category by its string value
    # TODO maybe a better way to do that, it seems a bit ugly
    metadata_data_frame["category"] = metadata_data_frame["category"].apply(lambda x: str(x))

    # Merging the two DataFrames together
    self.data = metadata_data_frame.join(tags_data_frame)
def test_pivot_integer_columns(self):
    # caused by upstream bug in unstack
    d = datetime.date.min
    data = list(product(['foo', 'bar'], ['A', 'B', 'C'], ['x1', 'x2'],
                        [d + datetime.timedelta(i) for i in range(20)], [1.0]))
    df = DataFrame(data)
    table = df.pivot_table(values=4, index=[0, 1, 3], columns=[2])

    df2 = df.rename(columns=str)
    table2 = df2.pivot_table(values='4', index=['0', '1', '3'], columns=['2'])

    tm.assert_frame_equal(table, table2, check_names=False)
def test_frame_describe_tupleindex():
    # GH 14848 - regression from 0.19.0 to 0.19.1
    df1 = DataFrame({'x': [1, 2, 3, 4, 5] * 3,
                     'y': [10, 20, 30, 40, 50] * 3,
                     'z': [100, 200, 300, 400, 500] * 3})
    df1['k'] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5
    df2 = df1.rename(columns={'k': 'key'})
    msg = "Names should be list-like for a MultiIndex"
    with pytest.raises(ValueError, match=msg):
        df1.groupby('k').describe()
    with pytest.raises(ValueError, match=msg):
        df2.groupby('key').describe()
def get_results_dataframe(self, default = False, index_by_code = False): ''' Formats data into a dataframe ''' datas = self._compute() self._compute_uc() uc = self.uc dfs = dict() for scenario, dico in datas.iteritems(): data = dico['data'] data_default = dico['default'] data_dict = dict() index = [] if default is True: data = data_default for row in data: if not row.desc in ('root'): if row.code == 'revdisp': revdisp = row.vals if index_by_code is True: index.append(row.code) data_dict[row.code] = row.vals else: index.append(row.desc) data_dict[row.desc] = row.vals df = DataFrame(data_dict).T df = df.reindex(index) df = df.rename(columns = {0: scenario}) nivvie = revdisp/uc[scenario] # TODO: include savings !! df = concat([df, DataFrame({scenario: nivvie}, index=['nivvie']) ]) dfs[scenario] = df first = True for df in dfs.itervalues(): if first: df_final = df first = False else: df_final = concat([df_final, df], axis=1, join ="inner") return df_final
print(df)
# max([1, 2, 10])
# df['score2'] = df['C'].map(max)  # running this raises an error!
# transform() and map() are used in the same way --- just be aware of it; the key part is above
df['score2'] = df['C'].transform(mp)
print(df)

df['C'] = df['C'].map(lambda x: x * 2)  # update column 'C' of the dataframe; the mapping rule is an anonymous function
print(df)

# 2.3 The rename() function: replacing the row index
# 2.3.1 First change
inds = {'张三': 'Zhang Sir', '木兰': 'MissLan'}  # key-value pairs used to replace the index
df.rename(index=inds)


# 2.3.2 Replacing the column index
def cols(x):
    if x == 'PHP':
        return 'php'
    if x == 'Python':
        return '大蟒蛇'
    else:
        return x


inds = {'张三': 'Zhang Sir', '木兰': 'MissLan'}
df.rename(index=inds, columns=cols, inplace=True)  # index=inds changes the index names, columns=cols changes the column names
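# Hedged aside (illustrative data, not taken from the snippet above): Series.map
# applies its callable to each scalar element, which is why mapping the built-in
# max over a numeric column fails while an element-wise lambda works.
import pandas as pd

s = pd.Series([1, 2, 10])
# s.map(max)                    # would raise TypeError: 'int' object is not iterable
print(s.map(lambda x: x * 2))   # element-wise: 2, 4, 20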
def _read_one_data(self, url, params): """ read one data from specified symbol """ symbol = params["symbol"] del params["symbol"] url = url.format(symbol) resp = self._get_response(url, params=params) ptrn = r"root\.App\.main = (.*?);\n}\(this\)\);" try: j = json.loads(re.search(ptrn, resp.text, re.DOTALL).group(1)) data = j["context"]["dispatcher"]["stores"]["HistoricalPriceStore"] except KeyError: msg = "No data fetched for symbol {} using {}" raise RemoteDataError(msg.format(symbol, self.__class__.__name__)) # price data prices = DataFrame(data["prices"]) prices.columns = [col.capitalize() for col in prices.columns] prices["Date"] = to_datetime(to_datetime(prices["Date"], unit="s").dt.date) if "Data" in prices.columns: prices = prices[prices["Data"].isnull()] prices = prices[["Date", "High", "Low", "Open", "Close", "Volume", "Adjclose"]] prices = prices.rename(columns={"Adjclose": "Adj Close"}) prices = prices.set_index("Date") prices = prices.sort_index().dropna(how="all") if self.ret_index: prices["Ret_Index"] = _calc_return_index(prices["Adj Close"]) if self.adjust_price: prices = _adjust_prices(prices) # dividends & splits data if self.get_actions and data["eventsData"]: actions = DataFrame(data["eventsData"]) actions.columns = [col.capitalize() for col in actions.columns] actions["Date"] = to_datetime( to_datetime(actions["Date"], unit="s").dt.date ) types = actions["Type"].unique() if "DIVIDEND" in types: divs = actions[actions.Type == "DIVIDEND"].copy() divs = divs[["Date", "Amount"]].reset_index(drop=True) divs = divs.set_index("Date") divs = divs.rename(columns={"Amount": "Dividends"}) prices = prices.join(divs, how="outer") if "SPLIT" in types: def split_ratio(row): if float(row["Numerator"]) > 0: return eval(row["Splitratio"]) else: return 1 splits = actions[actions.Type == "SPLIT"].copy() splits["SplitRatio"] = splits.apply(split_ratio, axis=1) splits = splits.reset_index(drop=True) splits = splits.set_index("Date") splits["Splits"] = splits["SplitRatio"] prices = prices.join(splits["Splits"], how="outer") if "DIVIDEND" in types and not self.adjust_dividends: # dividends are adjusted automatically by Yahoo adj = ( prices["Splits"].sort_index(ascending=False).fillna(1).cumprod() ) prices["Dividends"] = prices["Dividends"] / adj return prices
def test_rename_multiindex(self): tuples_index = [('foo1', 'bar1'), ('foo2', 'bar2')] tuples_columns = [('fizz1', 'buzz1'), ('fizz2', 'buzz2')] index = MultiIndex.from_tuples(tuples_index, names=['foo', 'bar']) columns = MultiIndex.from_tuples(tuples_columns, names=['fizz', 'buzz']) df = DataFrame([(0, 0), (1, 1)], index=index, columns=columns) # # without specifying level -> accross all levels renamed = df.rename(index={ 'foo1': 'foo3', 'bar2': 'bar3' }, columns={ 'fizz1': 'fizz3', 'buzz2': 'buzz3' }) new_index = MultiIndex.from_tuples([('foo3', 'bar1'), ('foo2', 'bar3')], names=['foo', 'bar']) new_columns = MultiIndex.from_tuples([('fizz3', 'buzz1'), ('fizz2', 'buzz3')], names=['fizz', 'buzz']) tm.assert_index_equal(renamed.index, new_index) tm.assert_index_equal(renamed.columns, new_columns) assert renamed.index.names == df.index.names assert renamed.columns.names == df.columns.names # # with specifying a level (GH13766) # dict new_columns = MultiIndex.from_tuples([('fizz3', 'buzz1'), ('fizz2', 'buzz2')], names=['fizz', 'buzz']) renamed = df.rename(columns={ 'fizz1': 'fizz3', 'buzz2': 'buzz3' }, level=0) tm.assert_index_equal(renamed.columns, new_columns) renamed = df.rename(columns={ 'fizz1': 'fizz3', 'buzz2': 'buzz3' }, level='fizz') tm.assert_index_equal(renamed.columns, new_columns) new_columns = MultiIndex.from_tuples([('fizz1', 'buzz1'), ('fizz2', 'buzz3')], names=['fizz', 'buzz']) renamed = df.rename(columns={ 'fizz1': 'fizz3', 'buzz2': 'buzz3' }, level=1) tm.assert_index_equal(renamed.columns, new_columns) renamed = df.rename(columns={ 'fizz1': 'fizz3', 'buzz2': 'buzz3' }, level='buzz') tm.assert_index_equal(renamed.columns, new_columns) # function func = str.upper new_columns = MultiIndex.from_tuples([('FIZZ1', 'buzz1'), ('FIZZ2', 'buzz2')], names=['fizz', 'buzz']) renamed = df.rename(columns=func, level=0) tm.assert_index_equal(renamed.columns, new_columns) renamed = df.rename(columns=func, level='fizz') tm.assert_index_equal(renamed.columns, new_columns) new_columns = MultiIndex.from_tuples([('fizz1', 'BUZZ1'), ('fizz2', 'BUZZ2')], names=['fizz', 'buzz']) renamed = df.rename(columns=func, level=1) tm.assert_index_equal(renamed.columns, new_columns) renamed = df.rename(columns=func, level='buzz') tm.assert_index_equal(renamed.columns, new_columns) # index new_index = MultiIndex.from_tuples([('foo3', 'bar1'), ('foo2', 'bar2')], names=['foo', 'bar']) renamed = df.rename(index={'foo1': 'foo3', 'bar2': 'bar3'}, level=0) tm.assert_index_equal(renamed.index, new_index)
# Data preprocessing
# Extract only the 'year' column as a list
year = list(df['year'])
# print_df(year)

# Create an empty dictionary
new_name = {}

# Iterate over the 'year' list
for i, v in enumerate(year):
    new_name[i] = v
# print(new_name)

# Change the DataFrame's index
df.rename(index=new_name, inplace=True)

# Drop the original 'year' column
df.drop('year', axis=1, inplace=True)

# Rename the columns
df.rename(columns={
    'car_vs_people': '차 대 사람',
    'car_vs_car': '차 대 차',
    'car_only': '차량 단독',
}, inplace=True)

print_df(df)

# Global settings
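# Hedged alternative (assumes the same df with a 'year' column as above):
# set_index promotes the column to the index directly, replacing the
# enumerate/rename/drop sequence in a single step.
df = df.set_index('year')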
def main(cfg: DictConfig) -> None: print(cfg) logger.info(f"The current working directory is {Path().cwd()}") start_time = time.time() logger.info("initializing experimental condition..") # compared ope estimators lambdas = list(dict(cfg.estimator_hyperparams)["lambdas"]) ope_estimators = [ DoublyRobustWithShrinkage(lambda_=lam_, estimator_name=f"DRos ({lam_})") for lam_ in lambdas ] + [ DoublyRobustWithShrinkageTuning(lambdas=lambdas, estimator_name="DRos (tuning)"), ] # configurations n_seeds = cfg.setting.n_seeds sample_size = cfg.setting.sample_size reg_model = cfg.setting.reg_model campaign = cfg.setting.campaign behavior_policy = cfg.setting.behavior_policy test_size = cfg.setting.test_size is_timeseries_split = cfg.setting.is_timeseries_split n_folds = cfg.setting.n_folds obd_path = (Path().cwd().parents[5] / "open_bandit_dataset" if cfg.setting.is_full_obd else None) random_state = cfg.setting.random_state np.random.seed(random_state) # define dataset dataset_ts = OpenBanditDataset(behavior_policy="bts", campaign=campaign, data_path=obd_path) dataset_ur = OpenBanditDataset(behavior_policy="random", campaign=campaign, data_path=obd_path) # prepare logged bandit feedback and evaluation policies if behavior_policy == "random": if is_timeseries_split: bandit_feedback_ur = dataset_ur.obtain_batch_bandit_feedback( test_size=test_size, is_timeseries_split=True, )[0] else: bandit_feedback_ur = dataset_ur.obtain_batch_bandit_feedback() bandit_feedbacks = [bandit_feedback_ur] # obtain the ground-truth policy value ground_truth_ts = OpenBanditDataset.calc_on_policy_policy_value_estimate( behavior_policy="bts", campaign=campaign, data_path=obd_path, test_size=test_size, is_timeseries_split=is_timeseries_split, ) # obtain action choice probabilities and define evaluation policies policy_ts = BernoulliTS( n_actions=dataset_ts.n_actions, len_list=dataset_ts.len_list, random_state=random_state, is_zozotown_prior=True, campaign=campaign, ) action_dist_ts = policy_ts.compute_batch_action_dist(n_rounds=1000000) evaluation_policies = [(ground_truth_ts, action_dist_ts)] else: if is_timeseries_split: bandit_feedback_ts = dataset_ts.obtain_batch_bandit_feedback( test_size=test_size, is_timeseries_split=True, )[0] else: bandit_feedback_ts = dataset_ts.obtain_batch_bandit_feedback() bandit_feedbacks = [bandit_feedback_ts] # obtain the ground-truth policy value ground_truth_ur = OpenBanditDataset.calc_on_policy_policy_value_estimate( behavior_policy="random", campaign=campaign, data_path=obd_path, test_size=test_size, is_timeseries_split=is_timeseries_split, ) # obtain action choice probabilities and define evaluation policies policy_ur = Random( n_actions=dataset_ur.n_actions, len_list=dataset_ur.len_list, random_state=random_state, ) action_dist_ur = policy_ur.compute_batch_action_dist(n_rounds=1000000) evaluation_policies = [(ground_truth_ur, action_dist_ur)] # regression models used in ope estimators hyperparams = dict(cfg.reg_model_hyperparams)[reg_model] regression_models = [reg_model_dict[reg_model](**hyperparams)] # define an evaluator class evaluator = InterpretableOPEEvaluator( random_states=np.arange(n_seeds), bandit_feedbacks=bandit_feedbacks, evaluation_policies=evaluation_policies, ope_estimators=ope_estimators, regression_models=regression_models, ) # conduct an evaluation of OPE experiment logger.info("experiment started") _ = evaluator.estimate_policy_value(sample_size=sample_size, n_folds_=n_folds) # calculate statistics mean = evaluator.calculate_mean(root=True) mean_scaled = 
evaluator.calculate_mean(scale=True, root=True) # save results of the evaluation of off-policy estimators log_path = Path("./outputs/hypara") log_path.mkdir(exist_ok=True, parents=True) # save root mse root_mse_df = DataFrame() root_mse_df["estimator"] = list(mean.keys()) root_mse_df["mean"] = list(mean.values()) root_mse_df["mean(scaled)"] = list(mean_scaled.values()) root_mse_df.to_csv(log_path / "root_mse.csv") # conduct pairwise t-tests se_df = DataFrame(evaluator.calculate_squared_error()) se_df = DataFrame(se_df.stack()).reset_index(1) se_df.rename(columns={"level_1": "estimators", 0: "se"}, inplace=True) nonparam_ttests = (pg.pairwise_ttests( data=se_df, dv="se", parametric=False, between="estimators", ).round(4).drop(["Contrast", "Parametric", "Paired"], axis=1)) nonparam_ttests.to_csv(log_path / "nonparam_ttests.csv") # print result print(root_mse_df) experiment = f"{campaign}-{behavior_policy}-{sample_size}" elapsed_time = np.round((time.time() - start_time) / 60, 2) logger.info(f"finish experiment {experiment} in {elapsed_time}min")
model = vecm.select_order(train_ecm, maxlags=8)
print(model.summary())


# In[10]:


pd.options.display.float_format = "{:.2f}".format

"""definition of det_order (int):
-1 - no deterministic terms; 0 - constant term; 1 - linear trend"""

pd.options.display.float_format = "{:.2f}".format
model = coint_johansen(endog=train_ecm, det_order=1, k_ar_diff=3)

print('Eigen statistic:')
print(model.eig)
print()
print('Critical values:')
d = DataFrame(model.cvt)
d.rename(columns={0: '90%', 1: '95%', 2: '99%'}, inplace=True)
print(d)
print()
print('Trace statistic:')
print(DataFrame(model.lr1))


# Here, we see that the trace statistics (76.86033757, 44.90556245, 24.43779121, 11.11437692) are < the critical values @95% (79.24, 55.24, 35.01, 18.39).
#
# Interpreting Johansen Cointegration Test Results
# - the output reports two statistics, the Trace Statistic and the Max-Eigen Statistic
# - the rejection criterion is at the 0.05 level
# - reject the null hypothesis (H0) that there are no cointegrating equations if the p-value <= 0.05

# ### Eigen and Trace statistic (Johansen co-integration)

# In[10]:
# Palabras = Palabras[(Palabras['IDIOMA'] == 'en') & (Palabras['TIPO'] == 'POSITIVO')][['OPERATION_NUMBER', 'WORDS']]  # this version yields an incomplete word cloud
Palabras = Palabras[(Palabras['TIPO'] == 'POSITIVO') |
                    (Palabras['TIPO'] == 'NEUTRO POSITIVO')][[
                        'OPERATION_NUMBER', 'WORDS', 'TIPO'
                    ]]

Palabras["WORDS2"] = Palabras["WORDS"].apply(singular)
Palabras = Palabras[["OPERATION_NUMBER", "WORDS2", "TIPO"]]
Palabras.rename(columns={'WORDS2': 'WORDS'}, inplace=True)

# Palabras = DataFrame(Palabras["PALABRAS", "WORDS"].groupby([Palabras['OPERATION_NUMBER']], Palabras['WORDS', 'PALABRAS']).count())  # this line doesn't run; kept as it is in the EDU_IADB_cartera_digital version, which does run
# Palabras = DataFrame(Palabras["WORDS"].groupby([Palabras['OPERATION_NUMBER'], Palabras['WORDS']]).count())
Palabras = DataFrame(Palabras['WORDS'].groupby(
    [Palabras['OPERATION_NUMBER'], Palabras['WORDS'], Palabras['TIPO']]).count())
Palabras.rename(columns={'WORDS': 'COUNT_WORDS'}, inplace=True)
Palabras.rename(columns={'PALABRAS': 'COUNT_WORDS'}, inplace=True)
Palabras.reset_index(inplace=True)

# ######## Join the pipeline file with the text review to see which operations came out as digital ########
Base_pipe = Metadatos_pipe[[
    'OPERATION_NUMBER', 'OPERATION_NAME', 'PIPE_YR', 'OPERATION_TYPE',
    'OPERATION_TYPE_NAME', 'OPERATION_MODALITY', 'TAXONOMY', 'STATUS',
    'REGION', 'COUNTRY', 'DEPARTMENT', 'DIVISION', 'TEAM_LEADER_NM',
    'APPROVAL_DATE', 'APPROVAL_AMOUNT', 'CURRENT_EXPIRATION_DATE'
]]

oper_proc = Bas[[
    'OPERATION_NUMBER', 'DUMMY_DIGITAL', 'DUMMY_OBJETIVO_DIG',
    'DUMMY_OUTPUT_DIG', 'DIG_OUTPUT_DESCRIPTION'
def test_rename_mi(self):
    df = DataFrame([11, 21, 31],
                   index=MultiIndex.from_tuples([("A", x) for x in ["a", "B", "c"]]))
    result = df.rename(str.lower)
def flows(futures, start=None, end=None, var=None, roll=None): position = futures.position market = futures.market # market1 = futures.p market = DataFrame( list(market.find({ 'date': { '$gte': start }, 'variety': var }))) position = DataFrame( list(position.find({ 'date': { '$gte': start }, 'variety': var }))).drop_duplicates(['date', 'variety', 'symbol', 'long_party_name'], 'last') # position = position[['date','varie']] # position = position[position['long_party_name'].notna()] # 持仓 # 所有会员 party_name = position[position['date'] == end] long_party_name = party_name['long_party_name'] short_party_name = party_name['short_party_name'] party_name = long_party_name.append( short_party_name).dropna().drop_duplicates() # 多空变化量求和 long = position.groupby(['date', 'variety', 'long_party_name' ])[['long_openIntr', 'long_openIntr_chg']].sum() # print(long) short = position.groupby(['date', 'variety', 'short_party_name' ])[['short_openIntr', 'short_openIntr_chg']].sum() # # 合并 frames = [long, short] position = pd.concat(frames, axis=1, sort=True).fillna(0).reset_index() # 字段更名 position = position.rename(columns={ 'level_0': 'date', 'level_1': 'variety', 'level_2': 'BrokerID' }) # ##行情 market = market.copy() # 指数收盘 market['cv'] = market.apply(lambda x: x['close'] * x['open_interest'], axis=1) closes = market.groupby(['date', 'variety'])[['cv', 'open_interest']].sum() closes['close_index'] = closes['cv'] / closes['open_interest'] # #指数开盘 market['ov'] = market.apply(lambda x: x['open'] * x['open_interest'], axis=1) opens = market.groupby(['date', 'variety'])[['ov', 'open_interest']].sum() closes['open_index'] = opens['ov'] / opens['open_interest'] # 价格变化量 closes['change_index'] = closes.apply( lambda x: x['close_index'] - x['open_index'], axis=1) closes = closes.reset_index() chg = closes[['date', 'variety', 'close_index', 'change_index']] # print(chg['change_index']) # print(merge) df = pd.DataFrame() for i in party_name: try: chg = chg.copy() # print(chg) chg['BrokerID'] = i position1 = position[position['BrokerID'] == i] # 两表合并 mem = pd.merge(chg, position1, on=['date', 'variety', 'BrokerID'], how='left').fillna(0) # mem = merge[merge['BrokerID'] == i] # print(mem) mem = mem.copy() mem['today_net'] = mem.apply( lambda x: x['long_openIntr'] - x['short_openIntr'], axis=1) mem['yesterday_net'] = mem.groupby(['variety', 'BrokerID' ])['today_net'].shift(1) mem['tomorrow_chg'] = mem.groupby(['variety', 'BrokerID' ])['change_index'].shift(-1) mem['net_chg'] = mem.apply( lambda x: x['today_net'] - x['yesterday_net'], axis=1) # mem['count'] = mem['net_chg'].count() # mem = mem.rename(columns={'long_open_interest': 'long_openIntr', 'long_open_interest_chg': 'long_openIntr_chg', 'short_open_interest': 'short_openIntr','short_open_interest_chg': 'short_openIntr_chg'}) # mem['change'] = mem.groupby(['variety', 'BrokerID'])['close_index'].shif(1) mem['change'] = mem['close_index'] - mem['close_index'].shift(1) # 时间窗口相关系数 # mem['corr'] = mem['net_chg'].rolling(window=240).corr(mem['change_index']) # mem['corr2'] = mem['net_chg'].rolling(window=240).corr(mem['tomorrow_chg']).shift(1) # mem['corr3'] = mem['today_net'].rolling(window=240).corr(mem['change']) # mem['lot'] = 0 # mem = mem.copy() mem['lot'] = mem.apply(lambda x: 0 if x['today_net'] == 0 else 1 if x['today_net'] > 0 else -1, axis=1) mem['lot'] = mem['lot'].shift(1).fillna(0) mem['pnl'] = mem['change'] * mem['lot'] # mem['fee']=0 # mem['fee'][mem['lot'] != mem['lot'].shif(1)] = mem['close_index'] * 2*1 mem['netpnl'] = mem['pnl'] mem['cumpnl'] = 
mem['netpnl'].rolling(roll).sum() # mem['date'] = pd.to_datetime(mem['date']) # #画图 # mem = mem.set_index('date') # with pd.plotting.plot_params.use('x_compat', True): # 方法一 # mem[['cumpnl']].plot(color='r',title=mem[u'BrokerID'][0]+" "+var+' '+end) # mem['today_net'].plot(secondary_y=['today_net']) # plt.ylabel('净持仓') # plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签 # plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号 # plt.show() # plt.plot(mem['cumpnl']) # print(mem) # flows = mem[mem['cumpnl'] > 0] # flows.sort_values('cumpnl', inplace=False) # print(flows) # # flows = flows[['date', 'variety', 'BrokerID', 'corr', 'corr2', 'today_net', 'net_chg', 'corr3', # 'cumpnl']].sort_values('cumpnl', # inplace=False) # [['date','variety','BrokerID','corr','corr2','cumpnl']] # flows = flows.rename(columns={'today_net': '净持仓', 'cumpnl': '累计盈亏点数', 'net_chg': '净持仓变化量', 'corr3': '相关系数'}) # print(flows[['variety','BrokerID','净持仓','净持仓变化量','累计盈亏点数']]) # print(flows) # print(flows.sort_values('累计盈亏点数')) # mem=mem.groupby() # print(mem) # print(flows['净持仓'].sum()) # mem = mem[-1:] print(mem) df1 = pd.DataFrame(mem) df = df.append(df1) # print(df.tail(20)) except: continue return df
def clean_column_names(data: pd.DataFrame, hints: bool = True) -> pd.DataFrame: """ Cleans the column names of the provided Pandas Dataframe and optionally provides hints on duplicate \ and long column names. Parameters ---------- data : pd.DataFrame Original Dataframe with columns to be cleaned hints : bool, optional Print out hints on column name duplication and colum name length, by default True Returns ------- pd.DataFrame Pandas DataFrame with cleaned column names """ _validate_input_bool(hints, "hints") for i, col in enumerate(data.columns): matches = re.findall(re.compile("[a-z][A-Z]"), col) column = col for match in matches: column = column.replace(match, match[0] + "_" + match[1]) data.rename(columns={data.columns[i]: column}, inplace=True) data.columns = ( data.columns.str.replace("\n", "_") .str.replace("(", "_") .str.replace(")", "_") .str.replace("'", "_") .str.replace('"', "_") .str.replace(".", "_") .str.replace("!", "_") .str.replace("?", "_") .str.replace(":", "_") .str.replace(";", "_") .str.replace("-", "_") .str.replace("/", "_") .str.replace("+", "_plus_") .str.replace("*", "_times_") .str.replace("ä", "ae") .str.replace("ö", "oe") .str.replace("ü", "ue") .str.replace("ß", "ss") .str.replace("%", "_percent_") .str.replace("$", "_dollar_") .str.replace("€", "_euro_") .str.replace("@", "_at_") .str.replace("#", "_number_") .str.replace("&", "_and_") .str.lower() .str.replace(" ", " ") .str.replace(" ", " ") .str.replace(" ", "_") .str.replace("___", "_") .str.replace("__", "_") .str.strip("_") ) dupl_idx = [i for i, x in enumerate(data.columns.duplicated()) if x] if len(dupl_idx) > 0: dupl_before = data.columns[dupl_idx].tolist() data.columns = [ col if col not in data.columns[:i] else col + "_" + str(i) for i, col in enumerate(data.columns) ] if hints: print( f"- Duplicate column names detected! Columns with index {dupl_idx} and names {dupl_before}) " f"have been renamed to {data.columns[dupl_idx].tolist()}." ) long_col_names = [x for x in data.columns if len(x) > 25] if len(long_col_names) > 0 and hints: print( f"- Long column names detected (>25 characters)! Consider renaming the following columns " f"{long_col_names}." ) return data
def rename_columns(input: pd.DataFrame) -> pd.DataFrame:
    return input.rename(columns={
        "reported": "total_vaccinations",
    })
def rename_columns(input: pd.DataFrame) -> pd.DataFrame:
    return input.rename(columns={"data": "date"})
n = 0
ranked = []
for i in gotabla['Entry'].drop_duplicates():
    n += 1
    ranked.append([i, str(n)])
rank = DataFrame(ranked, columns=['Entry', 'label'])
gotabla = gotabla.merge(rank, on='Entry', how='left')
gotabla = gotabla.merge(no_anotadas_uniprot[0][['Entry', 'GO']], on=['Entry', 'GO'], how='left')
gotabla = gotabla.merge(uniprot_entry_go_term[['Entry', 'Gene']], on='Entry', how='left')
gotabla = gotabla.merge(list_input[['Entry', 'values']], on='Entry', how='left')
edges_frame_excel = gotabla[['GO', 'Entry', 'Gene', 'Term', 'values']]
edges_frame_excel_uniprot[z] = edges_frame_excel
if labelnode == 'Gene Name':
    gotabla = gotabla.rename({'Gene': 'Entry', 'Entry': 'Gene'}, axis='columns')
if labelnode == 'UniProt ID':
    pass
go_tablas_uniprot[z] = gotabla.drop_duplicates().reset_index(drop=True)
del gotabla
del edges_frame_excel
else:
    if aprobados_uniprot[z].count().iloc[0] == 1:
        df = aprobados_uniprot[z]
        df['Short_Term'] = termino_corto(df=aprobados_uniprot[z])
        significativos = []
        for x in df.base.drop_duplicates():
            dff = df[df.base == x]
            for index, row in dff.iterrows():
                for i in row.entry.split(';'):
def rename_total_to_votes(df: pd.DataFrame):
    if 'total' in df.columns:
        return df.rename(columns={'total': 'votes'})
    else:
        return df
def clean_colnames(data: pd.DataFrame) -> pd.DataFrame:
    """ Make sure that all column names are lowercase and don't contain spaces """
    clean_names = {x: x.lower().replace(" ", "_") for x in data.columns}
    return data.rename(columns=clean_names)
def rename_columns(df: pd.DataFrame) -> pd.DataFrame:
    return df.rename(
        columns={
            "DATE": "date",
        }
    )
import numpy as np

df_words = DataFrame(pd.read_csv(r'data_ASL-LEX_DB.csv', sep=';', header='infer', error_bad_lines=False))
df_words.head()

# Clean words: Remove probabilities from ASL-LEX data (x.x)
k = 0
l_new_words = []
for i in range(len(df_words)):
    l_new_words.append([])

for index, row in df_words.iterrows():
    for words in [row.values]:
        for word in words:
            if isinstance(word, str):
                clean_word = re.sub(r'\ \(.*$', "", word)
                clean_word = clean_word.upper()
                # print(clean_word)
                # print(k)
                l_new_words[k].append(clean_word)
            else:
                l_new_words[k].append(np.NaN)
    k = k + 1

df_ASLLEX_clean = DataFrame(l_new_words)
# print(df_new_words)
df_ASLLEX_clean.rename(columns={0: 'Gloss'}, inplace=True)
df_ASLLEX_clean.to_csv('ASL-LEX_clean.csv', sep=',', index=False)
# # # Renaming axis indexes
# # # # Like values in a Series, axis labels can be transformed by a function or a mapping to produce a new
# # # # object; an axis can also be modified in place without building a new data structure
data = DataFrame(np.arange(12).reshape((3, 4)),
                 index=['Ohio', 'Colorado', 'New York'],
                 columns=['one', 'two', 'three', 'four'])
# # # # Like a Series, the axis labels also have a map method
# print(data.index.map(str.upper))
data.index = data.index.map(str.upper)
# print(data)
# # # # To create a transformed version of the dataset (instead of modifying the original data), the more practical method is rename
# print(data.rename(index=str.title, columns=str.upper))
# # # # rename can be combined with a dict-like object to update a subset of the axis labels
# print(data.rename(index={'OHIO': 'INDIANA'}, columns={'three': 'peekboo'}))
# # # # rename does the copying of the DataFrame and the relabelling of its index and columns for us.
# # # # To modify a dataset in place, just pass inplace=True
_ = data.rename(index={'OHIO': 'INDIANA'}, inplace=True)
# print(data)

# # # Discretization and binning
# # # # To make analysis easier, continuous data is often discretized or otherwise split into "bins"
age = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
# # # # To split these ages into the bins "18-25, 26-35, 35-60, 60+", use pandas' cut function
bins = [18, 25, 35, 60, 100]
cats = pd.cut(age, bins)
# print(cats)
# # # # pandas returns a special Categorical object; think of it as a set of strings naming the bins.
# # # # It also carries a levels array with the distinct category names and a labels attribute giving the bin number for each age value
# print(cats.codes)
# print(cats.categories)
# print(pd.value_counts(cats))
# # # # As in mathematical interval notation, a parenthesis marks an open end and a square bracket a closed end;
# # # # which side is closed can be changed with right=False
# print(pd.cut(age, [18, 26, 36, 61, 100], right=False))
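# Illustrative extension (the bin names are assumptions, not from the snippet above):
# pd.cut also accepts a labels= argument so the bins carry readable names
# instead of interval notation.
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
print(pd.cut(age, bins, labels=group_names))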
class scene_graph(object): def __init__(self, args): #self.data = DataFrame({"node_feature":[]}, index=[]) # [class, index, score, bounding_box, 3d_pose, mean, var, pt_number, color] self.data = DataFrame( { "class": [np.zeros(400)], "idx": 0, "score": [np.zeros(400)], #check "bounding_box": [[0, 0, 0, 0]], "3d_pose": [[0, 0, 0]], "mean": [[0, 0, 0]], "var": [[0, 0, 0]], "pt_num": 0, "color_hist": [[[0, "red"], [0, "blue"]]], "detection_cnt:": 0 }, columns=[ 'class', 'idx', 'score', 'bounding_box', '3d_pose', 'mean', 'var', 'pt_num', 'color_hist', 'detection_cnt' ]) self.rel_data = DataFrame({"relation": []}, index=[]) self.img_count = 0 self.pt_num = 0 self.mean = [0, 0, 0] self.var = [0, 0, 0] self.args = args self.detect_cnt_thres = args.detect_cnt_thres self.fig = plt.figure() scene_name = args.scannet_path.split('/')[-1] self.save_path = osp.join(self.args.vis_result_path, scene_name, 'scene_graph') try: os.makedirs(self.save_path) except: pass try: os.makedirs(osp.join(self.save_path, 'json')) except: pass self.disable_samenode = self.args.disable_samenode if self.disable_samenode: self.detect_cnt_thres = 0 def vis_scene_graph(self, image_scene, idx, test_set, obj_inds, obj_boxes, obj_scores, subject_inds, predicate_inds, object_inds, subject_IDs, object_IDs, triplet_scores, relationships, pix_depth=None, inv_p_matrix=None, inv_R=None, Trans=None, dataset='scannet'): updated_image_scene = image_scene.copy() sg = Digraph('structs', node_attr={'shape': 'plaintext'}) # initialize scene graph tool if dataset == 'scannet': #scannet print( '-ID--|----Object-----|Score|3D_position (x, y, z)|---var-------------|---color------' ) else: print( '-ID--|----Object-----|---Score---------------------------------------------' ) ax = self.fig.add_subplot(111, projection='3d') #print('sfdsfdfsd:',obj_boxes.shape) for i, obj_ind in enumerate( obj_inds): # loop for bounding boxes on each images if dataset == 'scannet': '''1. Get Color Histogram''' # color_hist # ex: [[num_pixels1,color1],[num_pixels2,color2],...,[num_pixelsN,colorN]] # [[362 ,'red' ],[2 ,'blue'],...,[3 ,'gray']] box_whole_img = image_scene[ int(obj_boxes[i][1]):int(obj_boxes[i][3]), int(obj_boxes[i][0]):int(obj_boxes[i][2])] color_hist = get_color_hist2(box_whole_img) '''2. Get Center Patch ''' # Define bounding box info width = int(obj_boxes[i][2]) - int(obj_boxes[i][0]) height = int(obj_boxes[i][3]) - int(obj_boxes[i][1]) box_center_x = int(obj_boxes[i][0]) + width / 2 box_center_y = int(obj_boxes[i][1]) + height / 2 # using belows to find mean and variance of each bounding boxes # pop 1/5 size window_box from object bounding boxes range_x_min, range_x_max, range_y_min, range_y_max = make_window_size( width, height, obj_boxes[i]) # Crop center patch box_center_img = image_scene[range_y_min:range_y_max, range_x_min:range_x_max] '''3. 
Get 3D positions of the Centor Patch''' window_3d_pts = [] for pt_x in range(range_x_min, range_x_max): for pt_y in range(range_y_min, range_y_max): pose_2d_window = np.matrix([pt_x, pt_y, 1]) pose_3d_window = pix_depth[pt_x][pt_y] * np.matmul( inv_p_matrix, pose_2d_window.transpose()) pose_3d_world_coord_window = np.matmul( inv_R, pose_3d_window[0:3] - Trans.transpose()) if not isNoisyPoint(pose_3d_world_coord_window): # save several points in window_box to calculate mean and variance window_3d_pts.append([ pose_3d_world_coord_window.item(0), pose_3d_world_coord_window.item(1), pose_3d_world_coord_window.item(2) ]) # window_3d_pts # ex: [[X_1,Y_1,Z_1],[X_2,Y_2,Z_2],...,[X_N,Y_N,Z_N]] # window_3d_pts = [] # for pt_x in range(int(obj_boxes[i][0]), int(obj_boxes[i][2])): # for pt_y in range(int(obj_boxes[i][1]), int(obj_boxes[i][3])): # pose_2d_window = np.matrix([pt_x, pt_y, 1]) # pose_3d_window = pix_depth[pt_x][pt_y] * np.matmul(inv_p_matrix, pose_2d_window.transpose()) # pose_3d_world_coord_window = np.matmul(inv_R, pose_3d_window[0:3] - Trans.transpose()) # if not isNoisyPoint(pose_3d_world_coord_window): # # save several points in window_box to calculate mean and variance # window_3d_pts.append([pose_3d_world_coord_window.item(0), pose_3d_world_coord_window.item(1), pose_3d_world_coord_window.item(2)]) window_3d_pts = outlier_filter(window_3d_pts) #window_3d_pts = np.array(window_3d_pts,dtype=np.float32) #cloud = pcl.PointCloud() #cloud.from_array(window_3d_pts) #outlier_filter = cloud.make_statistical_outlier_filter() #outlier_filter.set_mean_k(min(len(window_3d_pts),10)) #outlier_filter.set_std_dev_mul_thresh(1.0) #cloud_filtered = outlier_filter.filter() #window_3d_pts = cloud_filtered.to_array().tolist() # arr = np.array(window_3d_pts,dtype=np.float).reshape(-1,3) # if arr.size>0: # ax.scatter(-arr[:,0],-arr[:,1],-arr[:,2],) # #ax.set_xlim(-2000, 2000) # #ax.set_ylim(-2000, 2000) # #ax.set_zlim(-2000, 2000) # # self.fig.show() # plt.pause(0.01) # plt.hold(True) # cv2.waitKey(0) '''4. Get a 3D position of the Center Patch's Center point''' # find 3D point of the bounding box(the center patch)'s center curr_pt_num, curr_mean, curr_var = Measure_new_Gaussian_distribution( window_3d_pts) # ex: np.matrix([[X_1],[Y_1],[Z_1]]) # get object class names as strings box_cls = [ test_set.object_classes[obj_ind[0]], test_set.object_classes[obj_ind[1]], test_set.object_classes[obj_ind[2]] ] # box_cls: ['pillow','bag','cat'] box_score = obj_scores[i] # box_score: [0.2,0.1,0.01] cls_scores = np.zeros(400) for cls_idx, cls_score in zip(obj_ind, obj_scores[i]): cls_scores[cls_idx] += cls_score # check '''5. 
Save Object Recognition Results in DataFrame Format''' if (self.img_count == 0): # first image -> make new node box_id = i self.pt_num, self.mean, self.var = Measure_new_Gaussian_distribution( window_3d_pts) # check start_data = { "class": cls_scores, "idx": box_id, "score": box_score, "bounding_box": [box_center_x, box_center_y, width, height], "3d_pose": [ int(self.mean[0]), int(self.mean[1]), int(self.mean[2]) ], "mean": self.mean, "var": self.var, "pt_num": self.pt_num, "color_hist": color_hist, "detection_cnt": 1 } obj_boxes[i][4] = box_id self.data.loc[len(self.data)] = start_data if (i == 0): self.data.drop(self.data.index[0], inplace=True) self.data.rename(index={1: 0}, inplace=True) #print(self.data) else: # get node similarity score node_score, max_score_index = node_update( window_3d_pts, self.data, curr_mean, curr_var, box_cls, obj_scores[i], color_hist, test_set) threshold = 0.8127 if node_score > threshold and not self.disable_samenode: # change value of global_node # change global_node[max_score_index] print("node updated!!!") for cls_idx, cls_score in zip(obj_ind, obj_scores[i]): self.data.at[ max_score_index, 'class'][cls_idx] += cls_score # check #self.data.at[max_score_index, "class"] = box_cls self.data.at[max_score_index, "score"] = node_score self.pt_num, self.mean, self.var = Measure_added_Gaussian_distribution( window_3d_pts, self.data.ix[max_score_index]["mean"], self.data.ix[max_score_index]["var"], self.data.ix[max_score_index]["pt_num"], len(window_3d_pts)) self.data.at[max_score_index, "mean"] = self.mean self.data.at[max_score_index, "var"] = self.var self.data.at[max_score_index, "pt_num"] = self.pt_num self.data.at[max_score_index, "color_hist"] = color_hist self.data.at[max_score_index, "detection_cnt"] = self.data.ix[ max_score_index]["detection_cnt"] + 1 box_id = self.data.ix[max_score_index]["idx"] obj_boxes[i][4] = box_id else: # make new_node in global_node # [class, index, score, bounding_box, 3d_pose, mean, var, pt_number, color_hist] box_id = len(self.data) + 1 obj_boxes[i][4] = box_id self.pt_num, self.mean, self.var = Measure_new_Gaussian_distribution( window_3d_pts) global_node_num = len(self.data) add_node_list = [ cls_scores, box_id, box_score, [box_center_x, box_center_y, width, height], [self.mean[0], self.mean[1], self.mean[2]], self.mean, self.var, self.pt_num, color_hist, 1 ] self.data.loc[len(self.data)] = add_node_list # if object index was changed, update relation's object index also '''6. Print object info''' print( '{obj_ID:5} {obj_cls:15} {obj_score:4.2f} {object_3d_pose:20} {obj_var:20} {obj_color:15}' .format(obj_ID=box_id, obj_cls=box_cls[0], obj_score=box_score[0], object_3d_pose=[ self.mean[0], self.mean[1], self.mean[2] ], obj_var=self.var, obj_color=color_hist[0][1])) else: # TODO: for visual_genome raise NotImplementedError '''7. Plot ''' # updated object_detection cv2.rectangle(updated_image_scene, (int(obj_boxes[i][0]), int(obj_boxes[i][1])), (int(obj_boxes[i][2]), int(obj_boxes[i][3])), colorlist[int(obj_boxes[i][4])], 2) font_scale = 0.5 txt = str(box_id) + '. ' + str(box_cls[0]) + ' ' + str( round(box_score[0], 2)) ((txt_w, txt_h), _) = cv2.getTextSize(txt, cv2.FONT_HERSHEY_SIMPLEX, font_scale, 1) # Place text background. 
x0, y0 = int(obj_boxes[i][0]), int(obj_boxes[i][3]) back_tl = x0, y0 - int(1.3 * txt_h) back_br = x0 + txt_w, y0 cv2.rectangle(updated_image_scene, back_tl, back_br, colorlist[int(obj_boxes[i][4])], -1) cv2.putText(updated_image_scene, txt, (x0, y0 - 2), cv2.FONT_HERSHEY_SIMPLEX, font_scale, (255, 255, 255), 1) # add ID per bbox rel_prev_num = len(self.rel_data) print( '-------Subject--------|-------Predicate-----|--------Object---------|--Score-' ) for i, relation in enumerate(relationships): # update relation's class also # accumulate relation_list if str(int(obj_boxes[int(relation[0])][4])) != str( int(obj_boxes[int(relation[1])][4])): # filter out triplets whose sbj == obj self.rel_data.loc[len(self.rel_data)] = [[ str(int(obj_boxes[int(relation[0])][4])), int(relation[2]), str(int(obj_boxes[int(relation[1])][4])) ]] print('{sbj_cls:9} {sbj_ID:4} {sbj_score:1.3f} | ' '{pred_cls:11} {pred_score:1.3f} | ' '{obj_cls:9} {obj_ID:4} {obj_score:1.3f} | ' '{triplet_score:1.3f}'.format( sbj_cls=test_set.object_classes[obj_inds[:, 0][int( relation[0])]], sbj_score=obj_scores[:, 0][int(relation[0])], sbj_ID=str(int(obj_boxes[int(relation[0])][4])), pred_cls=test_set.predicate_classes[int( relation[2])], pred_score=relation[3] / obj_scores[:, 0][int(relation[0])] / obj_scores[:, 0][int(relation[1])], obj_cls=test_set.object_classes[obj_inds[:, 0][int( relation[1])]], obj_score=obj_scores[:, 0][int(relation[1])], obj_ID=str(int(obj_boxes[int(relation[1])][4])), triplet_score=relation[3])) rel_new_num = len(self.rel_data) # Draw scene graph if (rel_prev_num != rel_new_num): Draw_connected_scene_graph(self.data, self.rel_data, self.img_count, test_set, sg, idx, self.detect_cnt_thres, self.args.plot_graph, self.save_path) #sg.view() # it's help to select starting point of first image manually self.img_count += 1 return updated_image_scene
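# A minimal, self-contained sketch (not part of scene_graph above) of the placeholder-row
# trick used in vis_scene_graph: the node table is seeded with a dummy row so that
# `.loc[len(df)]` appends work, and once the first real node has been appended the dummy
# is dropped and the surviving row is renamed back to index 0. Column names here are
# illustrative only.
import pandas as pd

nodes = pd.DataFrame({"idx": [0], "pt_num": [0]})   # placeholder row at index 0
nodes.loc[len(nodes)] = [7, 123]                    # first detected node lands at index 1
nodes.drop(nodes.index[0], inplace=True)            # remove the placeholder
nodes.rename(index={1: 0}, inplace=True)            # shift the real node back to index 0
print(nodes)                                        # single row at index 0: idx=7, pt_num=123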
def estimate_time_series( data: pd.DataFrame, spline_options: Dict, n_knots: int, dep_var: str, dep_trans_in: Callable[[pd.Series], pd.Series] = lambda x: x, weight_data: pd.DataFrame = None, dep_var_se: str = None, dep_se_trans_in: Callable[[pd.Series], pd.Series] = lambda x: x, diff: bool = False, num_submodels: int = 25, single_random_knot: bool = False, min_interval_days: int = 7, dep_trans_out: Callable[[pd.Series], pd.Series] = lambda x: x, split_l_interval: bool = False, split_r_interval: bool = False, verbose: bool = False, ) -> Tuple[pd.DataFrame, pd.Series, MRBeRT]: if verbose: logger.info('Formatting data.') data = data.copy() data[dep_var] = dep_trans_in(data[dep_var]) if diff: if verbose: logger.info( 'For diff model, drop day1 (i.e., if day0 is > 0, day0->day1 diff would be hugely negative).' ) data[dep_var] = data[dep_var].diff() data[dep_var] = data[dep_var][data[dep_var].diff().notnull()] if data[[dep_var]].shape[1] > 1: reshape = True data = reshape_data_long(data, dep_var) if weight_data is not None: weight_data = reshape_data_long(weight_data, dep_var_se) else: reshape = False if weight_data is not None: if (data['date'] != weight_data['date']).any(): raise ValueError( 'Dates in `data` and `weight_data` not identical.') data['se'] = dep_se_trans_in(weight_data[dep_var_se]) else: data['se'] = 1. data = data.rename(columns={dep_var: 'y'}) day0 = data['date'].min() keep_vars = ['date', 'y', 'se'] data = data.loc[:, keep_vars] start_len = len(data) data = data.dropna() end_len = len(data) if start_len != end_len and not reshape: if verbose: logger.debug('NAs in data') data['t'] = (data['date'] - day0).dt.days col_args = { 'col_obs': 'y', 'col_obs_se': 'se', 'col_covs': ['t'], #'col_study_id':'date', } if verbose: logger.info('Getting base knots.') min_interval = min_interval_days / data['t'].max() if num_submodels == 1 and single_random_knot: spline_knots = get_ensemble_knots(n_knots, min_interval, 1)[0] else: spline_knots = np.linspace(0., 1., n_knots) if split_l_interval or split_r_interval: if num_submodels > 1: raise ValueError( 'Would need to set up functionality to split segments for ensemble.' ) if split_l_interval: n_knots += 1 spline_knots = np.insert(spline_knots, 0, spline_knots[:2].mean()) if split_r_interval: n_knots += 1 spline_knots = np.insert(spline_knots, -1, spline_knots[-2:].mean()) if verbose: logger.info('Creating model data.') mr_data = MRData() mr_data.load_df(data, **col_args) spline_model = LinearCovModel('t', use_re=False, use_spline=True, use_spline_intercept=True, spline_knots=spline_knots, **spline_options) if num_submodels > 1: if verbose: logger.info('Sampling knots.') ensemble_knots = get_ensemble_knots(n_knots, min_interval, num_submodels) if verbose: logger.info('Initializing model.') mr_model = MRBeRT(mr_data, spline_model, ensemble_knots) else: if verbose: logger.info('Initializing model.') mr_model = MRBRT(mr_data, [spline_model]) if verbose: logger.info('Fitting model.') mr_model.fit_model() if num_submodels > 1: if verbose: logger.info('Scoring submodels.') mr_model.score_model() data = data.set_index('date')[['y', 'se']] if verbose: logger.info('Making prediction.') smooth_data = predict_time_series( day0=day0, dep_var=dep_var, mr_model=mr_model, dep_trans_out=dep_trans_out, diff=diff, ) return data, smooth_data, mr_model
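# Minimal sketch (plain numpy, no MRBRT/MRBeRT dependency) of the knot handling above:
# knots are evenly spaced on [0, 1], and split_r_interval adds an extra knot at the
# midpoint of the last interval via np.insert; split_l_interval mirrors this at index 0.
import numpy as np

n_knots = 5
spline_knots = np.linspace(0., 1., n_knots)   # [0., 0.25, 0.5, 0.75, 1.]

# split the rightmost interval: insert the midpoint of the last two knots before the end
spline_knots = np.insert(spline_knots, -1, spline_knots[-2:].mean())
print(spline_knots)                           # [0., 0.25, 0.5, 0.75, 0.875, 1.]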
def rename_columns(self, df: pd.DataFrame) -> pd.DataFrame: if self.columns_rename: return df.rename(columns=self.columns_rename) return df
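# Hypothetical usage of the rename_columns helper above, assuming `columns_rename` is a
# plain old-name -> new-name dict; an empty or None mapping leaves the frame untouched.
import pandas as pd

df = pd.DataFrame({"datum": [1, 2], "wert": [3, 4]})
columns_rename = {"datum": "date", "wert": "value"}
renamed = df.rename(columns=columns_rename) if columns_rename else df
print(renamed.columns.tolist())   # ['date', 'value']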
a = np.arange(1,17).reshape(4,4) data = DataFrame(a) data #compute the percent change of each column; set axis=1 to compute it across rows data.pct_change() #percentage increase of the new value relative to the old value print(data.head()) #print the first five rows; the default is 5 and can be changed via the n parameter print(data.tail()) #print the last five rows #compute correlation coefficients between the dataframe's columns #rename the index labels row=['a','b','c','d'] col=['one','two','three','four'] data.index data.columns data.rename(index={2:'mm'}) #use rename to change a single index label data.rename(columns={1:'xxx'}) data.index=row #replace the whole index by assigning to the index attribute data.columns=col data data_pc=data.pct_change() #correlation coefficient between two dataframe columns data_pc.one.corr(data_pc.three) #covariance between two dataframe columns data_pc.two.cov(data_pc.four) #return the pairwise correlations between all dataframe columns as a matrix data_pc.corr()
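# Note (sketch): rename() returns a new DataFrame by default, so calls like
# data.rename(index={2:'mm'}) above do not modify `data` unless the result is assigned
# back or inplace=True is passed.
import numpy as np
import pandas as pd

data = pd.DataFrame(np.arange(1, 17).reshape(4, 4))
data = data.rename(index={2: 'mm'}, columns={1: 'xxx'})   # keep the renamed frame
print(data.index.tolist())                                # [0, 1, 'mm', 3]
print(data.columns.tolist())                              # [0, 'xxx', 2, 3]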
def json_normalize(data, record_path=None, meta=None, meta_prefix=None, record_prefix=None, errors='raise', sep='.'): """ Normalize semi-structured JSON data into a flat table. Parameters ---------- data : dict or list of dicts Unserialized JSON objects record_path : string or list of strings, default None Path in each object to list of records. If not passed, data will be assumed to be an array of records meta : list of paths (string or list of strings), default None Fields to use as metadata for each record in resulting table meta_prefix : string, default None record_prefix : string, default None If True, prefix records with dotted (?) path, e.g. foo.bar.field if path to records is ['foo', 'bar'] errors : {'raise', 'ignore'}, default 'raise' * 'ignore' : will ignore KeyError if keys listed in meta are not always present * 'raise' : will raise KeyError if keys listed in meta are not always present .. versionadded:: 0.20.0 sep : string, default '.' Nested records will generate names separated by sep, e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar .. versionadded:: 0.20.0 Returns ------- frame : DataFrame Examples -------- >>> from pandas.io.json import json_normalize >>> data = [{'id': 1, 'name': {'first': 'Coleen', 'last': 'Volk'}}, ... {'name': {'given': 'Mose', 'family': 'Regner'}}, ... {'id': 2, 'name': 'Faye Raker'}] >>> json_normalize(data) id name name.family name.first name.given name.last 0 1.0 NaN NaN Coleen NaN Volk 1 NaN NaN Regner NaN Mose NaN 2 2.0 Faye Raker NaN NaN NaN NaN >>> data = [{'state': 'Florida', ... 'shortname': 'FL', ... 'info': { ... 'governor': 'Rick Scott' ... }, ... 'counties': [{'name': 'Dade', 'population': 12345}, ... {'name': 'Broward', 'population': 40000}, ... {'name': 'Palm Beach', 'population': 60000}]}, ... {'state': 'Ohio', ... 'shortname': 'OH', ... 'info': { ... 'governor': 'John Kasich' ... }, ... 'counties': [{'name': 'Summit', 'population': 1234}, ... {'name': 'Cuyahoga', 'population': 1337}]}] >>> result = json_normalize(data, 'counties', ['state', 'shortname', ... 
['info', 'governor']]) >>> result name population info.governor state shortname 0 Dade 12345 Rick Scott Florida FL 1 Broward 40000 Rick Scott Florida FL 2 Palm Beach 60000 Rick Scott Florida FL 3 Summit 1234 John Kasich Ohio OH 4 Cuyahoga 1337 John Kasich Ohio OH >>> data = {'A': [1, 2]} >>> json_normalize(data, 'A', record_prefix='Prefix.') Prefix.0 0 1 1 2 """ def _pull_field(js, spec): result = js if isinstance(spec, list): for field in spec: result = result[field] else: result = result[spec] return result if isinstance(data, list) and not data: return DataFrame() # A bit of a hackjob if isinstance(data, dict): data = [data] if record_path is None: if any([isinstance(x, dict) for x in compat.itervalues(y)] for y in data): # naive normalization, this is idempotent for flat records # and potentially will inflate the data considerably for # deeply nested structures: # {VeryLong: { b: 1,c:2}} -> {VeryLong.b:1 ,VeryLong.c:@} # # TODO: handle record value which are lists, at least error # reasonably data = nested_to_record(data, sep=sep) return DataFrame(data) elif not isinstance(record_path, list): record_path = [record_path] if meta is None: meta = [] elif not isinstance(meta, list): meta = [meta] meta = [m if isinstance(m, list) else [m] for m in meta] # Disastrously inefficient for now records = [] lengths = [] meta_vals = defaultdict(list) if not isinstance(sep, compat.string_types): sep = str(sep) meta_keys = [sep.join(val) for val in meta] def _recursive_extract(data, path, seen_meta, level=0): if isinstance(data, dict): data = [data] if len(path) > 1: for obj in data: for val, key in zip(meta, meta_keys): if level + 1 == len(val): seen_meta[key] = _pull_field(obj, val[-1]) _recursive_extract(obj[path[0]], path[1:], seen_meta, level=level + 1) else: for obj in data: recs = _pull_field(obj, path[0]) # For repeating the metadata later lengths.append(len(recs)) for val, key in zip(meta, meta_keys): if level + 1 > len(val): meta_val = seen_meta[key] else: try: meta_val = _pull_field(obj, val[level:]) except KeyError as e: if errors == 'ignore': meta_val = np.nan else: raise KeyError( "Try running with " "errors='ignore' as key " "{err} is not always present".format( err=e)) meta_vals[key].append(meta_val) records.extend(recs) _recursive_extract(data, record_path, {}, level=0) result = DataFrame(records) if record_prefix is not None: result = result.rename( columns=lambda x: "{p}{c}".format(p=record_prefix, c=x)) # Data types, a problem for k, v in compat.iteritems(meta_vals): if meta_prefix is not None: k = meta_prefix + k if k in result: raise ValueError('Conflicting metadata name {name}, ' 'need distinguishing prefix '.format(name=k)) result[k] = np.array(v).repeat(lengths) return result
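# Sketch of the `errors` parameter documented above: with errors='ignore', a meta key
# missing from some records becomes NaN instead of raising KeyError. The import follows
# the docstring; newer pandas exposes the same function as pandas.json_normalize.
from pandas.io.json import json_normalize

data = [
    {'state': 'Florida', 'info': {'governor': 'Rick Scott'},
     'counties': [{'name': 'Dade', 'population': 12345}]},
    {'state': 'Ohio',   # no 'info' key in this record
     'counties': [{'name': 'Summit', 'population': 1234}]},
]
result = json_normalize(data, 'counties', ['state', ['info', 'governor']], errors='ignore')
print(result)
#      name  population    state info.governor
# 0    Dade       12345  Florida    Rick Scott
# 1  Summit        1234     Ohio           NaN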
def prophet( # pylint: disable=too-many-arguments df: DataFrame, time_grain: str, periods: int, confidence_interval: float, yearly_seasonality: Optional[Union[bool, int]] = None, weekly_seasonality: Optional[Union[bool, int]] = None, daily_seasonality: Optional[Union[bool, int]] = None, ) -> DataFrame: """ Add forecasts to each series in a timeseries dataframe, along with confidence intervals for the prediction. For each series, the operation creates three new columns with the column name suffixed with the following values: - `__yhat`: the forecast for the given date - `__yhat_lower`: the lower bound of the forecast for the given date - `__yhat_upper`: the upper bound of the forecast for the given date - `__yhat_upper`: the upper bound of the forecast for the given date :param df: DataFrame containing all-numeric data (temporal column ignored) :param time_grain: Time grain used to specify time period increments in prediction :param periods: Time periods (in units of `time_grain`) to predict into the future :param confidence_interval: Width of predicted confidence interval :param yearly_seasonality: Should yearly seasonality be applied. An integer value will specify Fourier order of seasonality. :param weekly_seasonality: Should weekly seasonality be applied. An integer value will specify Fourier order of seasonality, `None` will automatically detect seasonality. :param daily_seasonality: Should daily seasonality be applied. An integer value will specify Fourier order of seasonality, `None` will automatically detect seasonality. :return: DataFrame with contributions, with temporal column at beginning if present """ # validate inputs if not time_grain: raise QueryObjectValidationError(_("Time grain missing")) if time_grain not in PROPHET_TIME_GRAIN_MAP: raise QueryObjectValidationError( _( "Unsupported time grain: %(time_grain)s", time_grain=time_grain, )) freq = PROPHET_TIME_GRAIN_MAP[time_grain] # check type at runtime due to marhsmallow schema not being able to handle # union types if not periods or periods < 0 or not isinstance(periods, int): raise QueryObjectValidationError( _("Periods must be a positive integer value")) if not confidence_interval or confidence_interval <= 0 or confidence_interval >= 1: raise QueryObjectValidationError( _("Confidence interval must be between 0 and 1 (exclusive)")) if DTTM_ALIAS not in df.columns: raise QueryObjectValidationError( _("DataFrame must include temporal column")) if len(df.columns) < 2: raise QueryObjectValidationError( _("DataFrame include at least one series")) target_df = DataFrame() for column in [column for column in df.columns if column != DTTM_ALIAS]: fit_df = _prophet_fit_and_predict( df=df[[DTTM_ALIAS, column]].rename(columns={ DTTM_ALIAS: "ds", column: "y" }), confidence_interval=confidence_interval, yearly_seasonality=_prophet_parse_seasonality(yearly_seasonality), weekly_seasonality=_prophet_parse_seasonality(weekly_seasonality), daily_seasonality=_prophet_parse_seasonality(daily_seasonality), periods=periods, freq=freq, ) new_columns = [ f"{column}__yhat", f"{column}__yhat_lower", f"{column}__yhat_upper", f"{column}", ] fit_df.columns = new_columns if target_df.empty: target_df = fit_df else: for new_column in new_columns: target_df = target_df.assign( **{new_column: fit_df[new_column]}) target_df.reset_index(level=0, inplace=True) return target_df.rename(columns={"ds": DTTM_ALIAS})
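# Minimal sketch (plain pandas, no Prophet) of the per-series reshaping performed above:
# each metric column is paired with the temporal column and renamed to the 'ds'/'y' names
# Prophet expects. "__timestamp" stands in here for the DTTM_ALIAS constant.
import pandas as pd

DTTM_ALIAS = "__timestamp"   # placeholder for the real constant used above
df = pd.DataFrame({
    DTTM_ALIAS: pd.date_range("2021-01-01", periods=3, freq="D"),
    "sales": [10, 12, 9],
    "visits": [100, 110, 95],
})
for column in [c for c in df.columns if c != DTTM_ALIAS]:
    fit_df = df[[DTTM_ALIAS, column]].rename(columns={DTTM_ALIAS: "ds", column: "y"})
    print(column, fit_df.columns.tolist())   # sales ['ds', 'y'] then visits ['ds', 'y']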
type(frame3['col2']) frame3[['col2']] type(frame3[['col2']]) frame3[['col2','col1']] type(frame3[['col2','col1']]) frame3.ix[1,'col1'] frame3.ix[[1,0],['col2','col1']] ##Besides .ix, one can also use .loc etc # Changing names of columns in a DataFrame frame1 #We can use rename() method to rename a column, rename accepts a dictionary frame1.rename(columns={'col1':'one','col2':'two'}) frame1 frame1.rename(columns={'col1':'one','col2':'two'},inplace=True) frame1 frame1.columns=['Col1','Col2'] frame1 #One at a time/a few at a time frame1.rename(columns={'Col1':'one'},inplace=True) frame1 # Basic DataFrame methods frame3.columns frame3.head()
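# Sketch of the distinction shown above: rename() accepts a partial mapping and leaves
# other labels untouched, while assigning to .columns requires a value for every column.
import pandas as pd

frame = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})
frame.rename(columns={"col1": "one"}, inplace=True)   # only 'col1' changes
print(frame.columns.tolist())                         # ['one', 'col2']
frame.columns = ["Col1", "Col2"]                      # must list all columns
print(frame.columns.tolist())                         # ['Col1', 'Col2']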
def prepare_visualization_group(df: DataFrame = None, **kwargs) -> List[Any]: """Creates plot, table and download link for data frame. Arguments: df: The Dataframe to plot content: Dict[str, str] Mapping for translating columns and index. max_y_axis: int Maximal value on y-axis labels: List[str] Columns to display table_mod: int Displays only each `table_mod` row in table """ result = [{}, None, None] if df is not None and isinstance(df, DataFrame): date_column = "date" day_column = "day" # Translate column and index if specified content = kwargs.get("content", None) if content: columns = { col: content[col] for col in df.columns if col in content } index = ({ df.index.name: content[df.index.name] } if df.index.name and df.index.name in content else None) df = df.rename(columns=columns, index=index) date_column = content.get(date_column, date_column) day_column = content.get(day_column, day_column) plot_data = plot_dataframe( df.dropna().set_index(date_column).drop(columns=[day_column]), max_y_axis=kwargs.get("max_y_axis", None), ) # translate back for backwards compability of build_table column_map = {day_column: "day", date_column: "date"} table = ( df_to_html_table( build_table( df=df.rename(columns=column_map), labels=kwargs.get("labels", df.columns), modulo=kwargs.get("table_mod", 7), ), formats={ float: int, (date, datetime): lambda d: d.strftime(DATE_FORMAT), }, ) # if kwargs.get("show_tables", None) # else None ) # Convert columnnames to lowercase column_map = {col: col.lower() for col in df.columns} csv = build_csv_download(df.rename(columns=column_map)) result = [plot_data, table, csv] return result
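# Self-contained sketch of the translate / translate-back pattern above: columns are
# renamed via a `content` mapping for display, and a reverse map restores the canonical
# 'day'/'date' names before the table helpers run. The mapping values are illustrative.
import pandas as pd

content = {"day": "Tag", "date": "Datum", "admitted": "Aufgenommen"}
df = pd.DataFrame({"day": [0, 1], "date": ["2020-03-01", "2020-03-02"], "admitted": [5, 7]})

display_df = df.rename(columns={col: content[col] for col in df.columns if col in content})
day_column, date_column = content.get("day", "day"), content.get("date", "date")

# translate back for helpers that expect the original column names
column_map = {day_column: "day", date_column: "date"}
print(display_df.rename(columns=column_map).columns.tolist())   # ['day', 'date', 'Aufgenommen']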
def fix_misnamed_cols(df: pd.DataFrame) -> pd.DataFrame: if 'attribute' in df.columns and 'value' in df.columns: return df.rename(columns={'attribute': 'candidate', 'value': 'votes'}) else: return df
def translate_columns(input: pd.DataFrame) -> pd.DataFrame: return input.rename(columns={"datum": "date"})
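# Sketch mirroring the two small helpers above (fix_misnamed_cols, translate_columns):
# rename() returns a renamed copy and leaves the original frame untouched.
import pandas as pd

raw = pd.DataFrame({"datum": ["2021-05-01"], "attribute": ["Alice"], "value": [120]})
print(raw.rename(columns={"datum": "date"}).columns.tolist())                              # ['date', 'attribute', 'value']
print(raw.rename(columns={"attribute": "candidate", "value": "votes"}).columns.tolist())   # ['datum', 'candidate', 'votes']
print(raw.columns.tolist())                                                                # unchanged: ['datum', 'attribute', 'value']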
class Device(object): ''' Main implementation of the device class ''' def __init__(self, blueprint=None, descriptor={}): ''' Creates an instance of device. Devices are objects that contain sensors readings, metrics (calculations based on sensors readings), and metadata such as units, dates, frequency and source Parameters: ----------- blueprint: String Default: 'sck_21' Defines the type of device. For instance: sck_21, sck_20, csic_station, muv_station parrot_soil, sc_20_station, sc_21_station. A list of all the blueprints is found in config.blueprints_urls and accessible via the scdata.utils.load_blueprints(urls) function. The blueprint can also be defined from the postprocessing info in SCAPI. The manual parameter passed here is overriden by that of the API. descriptor: dict() Default: empty: std_out('Empty dataframe, ignoring', 'WARNING') dict A dictionary containing information about the device itself. Depending on the blueprint, this descriptor needs to have different data. If not all the data is present, the corresponding blueprint's default will be used Examples: ---------- Device('sck_21', descriptor = {'source': 'api', 'id': '1919'}) device with sck_21 blueprint with 1919 ID Device(descriptor = {'source': 'api', 'id': '1919'}) device with sck_21 blueprint with 1919 ID Returns ---------- Device object ''' self.skip_blueprint = False if blueprint is not None: self.blueprint = blueprint self.skip_blueprint = True else: self.blueprint = 'sck_21' # Set attributes if self.blueprint not in config.blueprints: raise ValueError( f'Specified blueprint {self.blueprint} is not in config') self.set_blueprint_attrs(config.blueprints[self.blueprint]) self.blueprint_loaded_from_url = False self.hardware_loaded_from_url = False self.description = descriptor self.set_descriptor_attrs() if self.id is not None: self.id = str(self.id) # Postprocessing and forwarding self.hardware_url = None self.blueprint_url = None self.forwarding_params = None self.forwarding_request = None self.meta = None self.latest_postprocessing = None self.processed = False self.hardware_description = None # Add API handler if needed if self.source == 'api': hmod = __import__('scdata.io.device_api', fromlist=['io.device_api']) Hclass = getattr(hmod, self.sources[self.source]['handler']) # Create object self.api_device = Hclass(did=self.id) std_out(f'Checking postprocessing info from API device') if self.load_postprocessing() and ( self.hardware_url is None): # or self.blueprint_url is None): if config._strict: raise ValueError( 'Postprocessing could not be loaded as is incomplete and strict mode is enabled' ) std_out( f'Postprocessing loaded but with problems (hardware_url: {self.hardware_url} // blueprint_url: {self.blueprint_url}', 'WARNING') if self.blueprint is None: raise ValueError( f'Device {self.id} cannot be init without blueprint. 
Need a blueprint to proceed' ) else: std_out(f'Device {self.id} is using {self.blueprint} blueprint') self.readings = DataFrame() self.loaded = False self.options = dict() std_out(f'Device {self.id} initialised', 'SUCCESS') def set_blueprint_attrs(self, blueprintd): # Set attributes for bpitem in blueprintd: self.__setattr__(bpitem, blueprintd[bpitem]) def set_descriptor_attrs(self): # Descriptor attributes for ditem in self.description.keys(): if ditem not in vars(self): std_out(f'Ignoring {ditem} from input', 'WARNING') continue if type(self.__getattribute__(ditem)) == dict: self.__setattr__( ditem, dict_fmerge(self.__getattribute__(ditem), self.description[ditem])) else: self.__setattr__(ditem, self.description[ditem]) def check_overrides(self, options={}): if 'min_date' in options.keys(): self.options['min_date'] = options['min_date'] else: self.options['min_date'] = self.min_date if 'max_date' in options.keys(): self.options['max_date'] = options['max_date'] else: self.options['max_date'] = self.max_date if 'clean_na' in options.keys(): self.options['clean_na'] = options['clean_na'] else: self.options['clean_na'] = self.clean_na if 'frequency' in options.keys(): self.options['frequency'] = options['frequency'] elif self.frequency is not None: self.options['frequency'] = self.frequency else: self.options['frequency'] = '1Min' def load_postprocessing(self): if self.source != 'api': return None if self.sources[self.source]['handler'] != 'ScApiDevice': return None # Request to get postprocessing information if self.api_device.get_device_postprocessing() is None: return None # Put it where it goes try: self.hardware_url = self.api_device.postprocessing['hardware_url'] self.blueprint_url = self.api_device.postprocessing[ 'blueprint_url'] self.latest_postprocessing = self.api_device.postprocessing[ 'latest_postprocessing'] self.forwarding_params = self.api_device.postprocessing[ 'forwarding_params'] self.meta = self.api_device.postprocessing['meta'] inc_postprocessing = False except KeyError: std_out('Ignoring postprocessing info as its incomplete', 'WARNING') inc_postprocessing = True pass if inc_postprocessing: return None # Load postprocessing info from url if url_checker( self.hardware_url) and self.hardware_loaded_from_url == False: std_out(f'Loading hardware information from:\n{self.hardware_url}') hardware_description = get_json_from_url(self.hardware_url) # TODO # Add additional checks to hardware_description if hardware_description is not None: self.hardware_description = hardware_description std_out('Hardware described in url is valid', "SUCCESS") self.hardware_loaded_from_url = True else: std_out("Hardware in url is not valid", 'ERROR') self.hardware_description = None # Find forwarding request if self.hardware_description is not None: if 'forwarding' in self.hardware_description: if self.hardware_description[ 'forwarding'] in config.connectors: self.forwarding_request = self.hardware_description[ 'forwarding'] std_out( f"Requested a {self.hardware_description['forwarding']} connector for {self.id}" ) if self.forwarding_params is None: std_out( 'Assuming device has never been posted. Forwarding parameters are empty', 'WARNING') else: std_out( f'Connector parameters are not empty: {self.forwarding_params}' ) else: std_out( f"Requested a {self.hardware_description['forwarding']} connector that is not available. 
Ignoring", 'WARNING') # Find postprocessing blueprint if self.skip_blueprint: std_out( 'Skipping blueprint as it was defined in device constructor', 'WARNING') if self.blueprint_loaded_from_url == False and not self.skip_blueprint: # Case when there is no info stored if url_checker(self.blueprint_url): std_out( f'blueprint_url in platform is not empty. Loading postprocessing blueprint from:\n{self.blueprint_url}' ) nblueprint = basename(urlparse( self.blueprint_url).path).split('.')[0] else: std_out(f'blueprint_url in platform is not valid', 'WARNING') std_out( f'Checking if there is a blueprint_url in hardware_description' ) if self.hardware_description is None: std_out("Hardware description is not useful for blueprint", 'ERROR') return None if 'blueprint_url' in self.hardware_description: std_out( f"Trying postprocessing blueprint from:\n{self.hardware_description['blueprint_url']}" ) nblueprint = basename( urlparse(self.hardware_description['blueprint_url']). path).split('.')[0] tentative_urls = url_checker( self.hardware_description['blueprint_url']) if len(tentative_urls) > 0: self.blueprint_url = tentative_urls[0] else: std_out('Invalid blueprint', 'ERROR') return None else: std_out('Postprocessing not possible without blueprint', 'ERROR') return None std_out(f'Using hardware postprocessing blueprint: {nblueprint}') lblueprint = get_json_from_url(self.blueprint_url) if lblueprint is not None: self.blueprint = nblueprint self.blueprint_loaded_from_url = True self.set_blueprint_attrs(lblueprint) self.set_descriptor_attrs() std_out('Blueprint loaded from url', 'SUCCESS') else: std_out('Blueprint in url is not valid', 'ERROR') return None return self.api_device.postprocessing def validate(self): if self.hardware_description is not None: return True else: return False def load(self, options=None, path=None, convert_units=True, only_unprocessed=False, max_amount=None): ''' Loads the device with some options Parameters: ----------- options: dict() Default: None options['min_date'] = date to load data from Default to device min_date (from blueprint or test) options['max_date'] = date to load data to Default to device max_date (from blueprint or test) options['clean_na'] = clean na (drop_na, fill_na or None) Default to device clean_na (from blueprint or test) options['frequency'] = frequency to load data at in pandas format Default to device frequency (from blueprint or test) or '1Min' path: String Default: None Path were the csv file is, if any. 
Normally not needed to be provided, only for internal usage convert_units: bool Default: True Convert units for channels based on config._channel_lut only_unprocessed: bool Default: False Loads only unprocessed data max_amount: int Default: None Trim dataframe to this amount for processing and forwarding purposes Returns ---------- True if loaded correctly ''' # Add test overrides if we have them, otherwise set device defaults if options is not None: self.check_overrides(options) else: self.check_overrides() try: if self.source == 'csv': self.readings = self.readings.combine_first( read_csv_file(join(path, self.processed_data_file), self.location, self.options['frequency'], self.options['clean_na'], self.sources[self.source]['index'])) if self.readings is not None: self.__convert_names__() elif 'api' in self.source: # Get device location self.location = self.api_device.get_device_timezone() if path is None: # Not chached case if only_unprocessed: # Override dates for post-processing if self.latest_postprocessing is not None: hw_latest_postprocess = localise_date( self.latest_postprocessing, 'UTC').strftime('%Y-%m-%dT%H:%M:%S') # Override min loading date self.options['min_date'] = hw_latest_postprocess df = self.api_device.get_device_data( self.options['min_date'], self.options['max_date'], self.options['frequency'], self.options['clean_na']) # API Device is not aware of other csv index data, so make it here if 'csv' in self.sources and df is not None: df = df.reindex( df.index.rename(self.sources['csv']['index'])) # Combine it with readings if possible if df is not None: self.readings = self.readings.combine_first(df) else: # Cached case self.readings = self.readings.combine_first( read_csv_file(join(path, str(self.id) + '.csv'), self.location, self.options['frequency'], self.options['clean_na'], self.sources['csv']['index'])) except FileNotFoundError: # Handle error if 'api' in self.source: std_out( f'No cached data file found for device {self.id} in {path}. 
Moving on', 'WARNING') elif 'csv' in self.source: std_out(f'File not found for device {self.id} in {path}', 'ERROR') self.loaded = False except: print_exc() self.loaded = False else: if self.readings is not None: self.__check_sensors__() if max_amount is not None: self.readings = self.readings.dropna( axis=0, how='all').head(max_amount) if not self.readings.empty: # Only add metrics if there is something that can be potentially processed self.__fill_metrics__() self.loaded = True if convert_units: self.__convert_units__() else: std_out('Empty dataframe in readings', 'WARNING') finally: self.processed = False return self.loaded def __fill_metrics__(self): std_out('Checking if metrics need to be added based on hardware info') if self.hardware_description is None: std_out(f'No hardware url in device {self.id}, ignoring') return None # Now go through sensor versions and add them to the metrics if 'versions' in self.hardware_description: for version in self.hardware_description['versions']: from_date = version["from"] to_date = version["to"] # Do not add any metric if the from_date of the calibration is after the last_reading_at # as there would be nothing to process if from_date > self.api_device.last_reading_at: std_out( 'Postprocessing from_date is later than device last_reading_at', 'ERROR') return None for slot in version["ids"]: # Alphasense type - AAN 803-04 if slot.startswith('AS'): sensor_id = version["ids"][slot] as_type = config._as_sensor_codes[sensor_id[0:3]] pollutant = as_type[as_type.index('_') + 1:] if pollutant == 'OX': pollutant = 'O3' # Get working and auxiliary electrode names wen = f"ADC_{slot.strip('AS_')[:slot.index('_')]}_{slot.strip('AS_')[slot.index('_')+1]}" aen = f"ADC_{slot.strip('AS_')[:slot.index('_')]}_{slot.strip('AS_')[slot.index('_')+2]}" if pollutant not in self.metrics: # Create Metric std_out( f'Metric {pollutant} not in blueprint, ignoring.', 'WARNING') else: # Simply fill it up std_out( f'{pollutant} found in blueprint metrics, filling up with hardware info' ) self.metrics[pollutant]['kwargs']['we'] = wen self.metrics[pollutant]['kwargs']['ae'] = aen self.metrics[pollutant]['kwargs'][ 'location'] = self.location self.metrics[pollutant]['kwargs'][ 'alphasense_id'] = str(sensor_id) self.metrics[pollutant]['kwargs'][ 'from_date'] = from_date self.metrics[pollutant]['kwargs'][ 'to_date'] = to_date # Other metric types will go here else: std_out( 'No hardware versions found, ignoring additional metrics', 'WARNING') def __check_sensors__(self): remove_sensors = list() for sensor in self.sensors: if sensor not in self.readings.columns: remove_sensors.append(sensor) if remove_sensors != []: std_out(f'Removing sensors from device: {remove_sensors}', 'WARNING') for sensor_to_remove in remove_sensors: self.sensors.pop(sensor_to_remove, None) std_out(f'Device sensors after removal: {list(self.sensors.keys())}') def __convert_names__(self): rename = dict() for sensor in self.sensors: if 'id' in self.sensors[sensor] and sensor in self.readings.columns: rename[self.sensors[sensor]['id']] = sensor self.readings.rename(columns=rename, inplace=True) def __convert_units__(self): ''' Convert the units based on the UNIT_LUT and blueprint NB: what is read/written from/to the cache is not converted. The files are with original units, and then converted in the device only for the readings but never chached like so. 
''' std_out('Checking if units need to be converted') for sensor in self.sensors: factor = get_units_convf(sensor, from_units=self.sensors[sensor]['units']) if factor != 1: self.readings.rename(columns={sensor: sensor + '_RAW'}, inplace=True) self.readings.loc[:, sensor] = self.readings.loc[:, sensor + '_RAW'] * factor std_out('Units check done', 'SUCCESS') def process(self, only_new=False, lmetrics=None): ''' Processes devices metrics, either added by the blueprint definition or the addition using Device.add_metric(). See help(Device.add_metric) for more information about the definition of the metrics to be added Parameters ---------- only_new: boolean False To process or not the existing channels in the Device.readings that are defined in Device.metrics lmetrics: list None List of metrics to process. If none, processes all Returns ---------- boolean True if processed ok, False otherwise ''' process_ok = True if 'metrics' not in vars(self): std_out(f'Device {self.id} has nothing to process. Skipping', 'WARNING') return process_ok std_out('---------------------------') std_out(f'Processing device {self.id}') if lmetrics is None: metrics = self.metrics else: metrics = dict([(key, self.metrics[key]) for key in lmetrics]) for metric in metrics: std_out(f'Processing {metric}') if only_new and metric in self.readings: std_out(f'Skipping. Already in device') continue # Check if the metric contains a custom from_list if 'from_list' in metrics[metric]: lazy_name = metrics[metric]['from_list'] else: lazy_name = f"scdata.device.process.{metrics[metric]['process']}" try: funct = LazyCallable(lazy_name) except ModuleNotFoundError: print_exc() process_ok &= False std_out('Problem adding lazy callable to metrics list', 'ERROR') pass return False args, kwargs = list(), dict() if 'args' in metrics[metric]: args = metrics[metric]['args'] if 'kwargs' in metrics[metric]: kwargs = metrics[metric]['kwargs'] try: self.readings[metric] = funct(self.readings, *args, **kwargs) except KeyError: # print_exc() std_out('Metric args not in dataframe', 'ERROR') process_ok = False pass if metric in self.readings: process_ok &= True if process_ok: # Latest postprocessing to latest readings if self.api_device.get_device_postprocessing() is not None: std_out('Updating postprocessing') # Add latest postprocessing rounded up with frequency so that we don't end up in # and endless loop processing only the latest data line (minute vs. 
second precission of the readings) latest_postprocessing = localise_date( self.readings.index[-1] + to_timedelta(self.options['frequency']), 'UTC').strftime('%Y-%m-%dT%H:%M:%S') self.api_device.postprocessing[ 'latest_postprocessing'] = latest_postprocessing std_out(f"{self.api_device.postprocessing}") std_out(f"Device {self.id} processed", "SUCCESS") self.processed = process_ok return process_ok def forward(self, chunk_size=500, dry_run=False): ''' Forwards data to another api Parameters ---------- chunk_size: int 500 Chunk size to be sent to device.post_data_to_device in question dry_run: boolean False Post the payload to the API or just return it Returns ---------- boolean True if posted ok, False otherwise ''' if self.forwarding_params is None: std_out('Empty forwarding information', 'ERROR') return False rd = dict() df = self.readings.copy().dropna(axis=0, how='all') df.rename(columns=rd, inplace=True) if df.empty: std_out('Empty dataframe, ignoring', 'WARNING') return False # Import requested handler hmod = __import__('scdata.io.device_api', fromlist=['io.device_api']) Hclass = getattr(hmod, config.connectors[self.forwarding_request]['handler']) # Create object device = Hclass(did=self.forwarding_params) post_ok = device.post_data_to_device(df, chunk_size=chunk_size, dry_run=dry_run) if post_ok: std_out(f'Posted data for {self.id}', 'SUCCESS') else: std_out(f'Error posting data for {self.id}', 'ERROR') return post_ok def add_metric(self, metric=dict()): ''' Add a metric to the device to be processed by a callable function Parameters ---------- metric: dict Empty dict Description of the metric to be added. It only adds it to Device.metrics, but does not calculate anything yet. The metric dict needs to follow the format: metric = { 'metric_name': {'process': <function_name> 'args': <iterable> 'kwargs': <**kwargs for @function_name> 'from_list': <module to load function from> } } The 'from_list' parameter is optional, and onle needed if the process is not already available in scdata.device.process. For a list of available processes call help(scdata.device.process) Example: -------- metric = {'NO2_CLEAN': {'process': 'clean_ts', 'kwargs': {'name': pollutant, 'limits': [0, 350], 'window_size': 5} }} Returns ---------- True if added metric ''' if 'metrics' not in vars(self): std_out(f'Device {self.id} has no metrics yet. Adding') self.metrics = dict() try: metricn = next(iter(metric.keys())) self.metrics[metricn] = metric[metricn] except: print_exc() return False std_out(f'Metric {metric} added to metrics', 'SUCCESS') return True def del_metric(self, metricn=''): if 'metrics' not in vars(self): return if metricn in self.metrics: self.metrics.pop(metricn, None) if metricn in self.readings.columns: self.readings.__delitem__(metricn) if metricn not in self.readings and metricn not in self.metrics: std_out(f'Metric {metricn} removed from metrics', 'SUCCESS') return True return False def export(self, path, forced_overwrite=False, file_format='csv'): ''' Exports Device.readings to file Parameters ---------- path: String Path to export file to, does not include filename. The filename will be the Device.id property forced_overwrite: boolean False Force data export in case of already existing file file_format: String 'csv' File format to export. 
Current supported format CSV Returns --------- True if exported ok, False otherwise ''' # Export device if file_format == 'csv': return export_csv_file(path, str(self.id), self.readings, forced_overwrite=forced_overwrite) else: std_out('Not supported format', 'ERROR') return False def post_sensors(self, dry_run=False): ''' Posts devices sensors. Only available for parent of ScApiDevice Parameters ---------- dry_run: boolean False Post the payload to the API or just return it Returns ---------- boolean True if posted ok, False otherwise ''' post_ok = True if self.sources[self.source]['handler'] != 'ScApiDevice': std_out('Only supported processing post is to SmartCitizen API', 'ERROR') return False rd = dict() df = self.readings.copy().dropna(axis=0, how='all') for col in self.readings: rd[col] = self.sensors[col]['id'] df.rename(columns=rd, inplace=True) if df.empty: std_out('Empty dataframe, ignoring', 'WARNING') return False post_ok = self.api_device.post_data_to_device(df, dry_run=dry_run) if post_ok: std_out(f'Posted data for {self.id}', 'SUCCESS') else: std_out(f'Error posting data for {self.id}', 'ERROR') return post_ok def update_postprocessing(self, dry_run=False): ''' Posts device postprocessing. Only available for parent of ScApiDevice Parameters ---------- dry_run: boolean False Post the payload to the API or just return it Returns ---------- boolean True if posted ok, False otherwise ''' post_ok = self.api_device.patch_postprocessing(dry_run=dry_run) if post_ok: std_out(f"Postprocessing posted for device {self.id}", "SUCCESS") return post_ok def post_metrics(self, with_postprocessing=False, dry_run=False): ''' Posts devices metrics. Only available for parent of ScApiDevice Parameters ---------- with_postprocessing: boolean False Post the postprocessing_attributes too dry_run: boolean False Post the payload to the API or just return it Returns ---------- boolean True if posted ok, False otherwise ''' post_ok = True if self.sources[self.source]['handler'] != 'ScApiDevice': std_out('Only supported processing post is to SmartCitizen API', 'ERROR') return False rd = dict() std_out(f"Posting metrics for device {self.id}") # Make a copy of df df = self.readings.copy().dropna(axis=0, how='all') # Get metrics to post, only the ones that have True in 'post' field and a valid ID for metric in self.metrics: if self.metrics[metric]['post'] == True: std_out(f"Adding {metric} for device {self.id}") rd[col] = self.metrics[metric]['id'] # Keep only metrics in df df = df[df.columns.intersection(list(rd.keys()))] df.rename(columns=rd, inplace=True) # If empty, avoid if df.empty: std_out('Empty dataframe, ignoring', 'WARNING') return False post_ok = self.api_device.post_data_to_device(df, dry_run=dry_run) if post_ok: std_out(f'Posted metrics for {self.id}', 'SUCCESS') else: std_out(f'Error posting metrics for {self.id}', 'ERROR') # Post info if requested. It should be updated elsewhere if with_postprocessing and post_ok: post_ok &= self.update_postprocessing(dry_run=dry_run) if post_ok: std_out(f"Metrics posted for device {self.id}", "SUCCESS") return post_ok
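# Self-contained sketch (plain pandas, outside the Device class) of the rename pattern in
# __convert_units__ above: when a conversion factor applies, the raw channel is kept as
# '<sensor>_RAW' and the converted values are written back under the original sensor name.
# The channel name and factor below are hypothetical.
import pandas as pd

readings = pd.DataFrame({"CO": [0.5, 0.7]})   # hypothetical channel, e.g. in ppm
factor = 1000.0                               # hypothetical ppm -> ppb factor
if factor != 1:
    readings.rename(columns={"CO": "CO_RAW"}, inplace=True)
    readings.loc[:, "CO"] = readings.loc[:, "CO_RAW"] * factor
print(readings)
#    CO_RAW     CO
# 0     0.5  500.0
# 1     0.7  700.0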
def get_all_econ_info(self, year): print("{} start!".format(year)) # Added code # set up so that crawling can resume if it was interrupted midway latest_index = self._get_latest_index(year) num = len(self.stock_item_all) # variable used for printing progress count = latest_index # variable used for printing progress for stock_code in self.stock_item_all[latest_index:]: count += 1 code = stock_code[0] code_name = stock_code[1] print("year {} ++++++++++++++ {} ++++++++++++++ {} / {}".format(year, code_name, count, num)) # skip if the data already exists if self.is_exist_data(year, code): print("Data for {} {} already exists!".format(year, code_name)) continue print("insert {} {}".format(year, code_name)) corp_class = self.crp_list.find_by_stock_code(code) # check whether corp_class is None if corp_class: corp_code = corp_class.to_dict()['corp_code'] else: print("No corp_class data found for {}.".format(code_name)) continue try: """ dart.api.finance.get_single_corp(corp_code: str, bsns_year: str, reprt_code: str) corp_code: not the stock ticker, but the 8-digit unique code of the disclosing company, bsns_year: 4-digit business year reprt_code: Q1 report: 11013, half-year report: 11012, Q3 report: 11014, annual report: 11011 """ res = dart.api.finance.get_single_corp(corp_code, str(year), '11011') except NoDataReceived as e: print("No data available for {} {} ({}).".format(year, code_name, code)) continue df = DataFrame(res['list']) # remove thousands-separator commas # current year df['thstrm_amount'] = df['thstrm_amount'].str.replace(',', '') # one year earlier df['frmtrm_amount'] = df['frmtrm_amount'].str.replace(',', '') # two years earlier df['bfefrmtrm_amount'] = df['bfefrmtrm_amount'].str.replace(',', '') # replace '-' entries with None df.loc[df.thstrm_amount == '-', 'thstrm_amount'] = None df.loc[df.frmtrm_amount == '-', 'frmtrm_amount'] = None df.loc[df.bfefrmtrm_amount == '-', 'bfefrmtrm_amount'] = None # rename the stock_code column to code df = df.rename(columns={'stock_code': 'code'}) # the dataframe has no code name column, so create it df['code_name'] = code_name df.to_sql('dart', self.db_engine, if_exists='append', dtype={ 'thstrm_amount': sqlalchemy.types.BIGINT, 'frmtrm_amount': sqlalchemy.types.BIGINT, 'bfefrmtrm_amount': sqlalchemy.types.BIGINT } )
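# Self-contained sketch of the cleaning steps above on a toy frame: strip thousands
# separators, turn '-' placeholders into None, and rename stock_code to code.
import pandas as pd

df = pd.DataFrame({"stock_code": ["005930", "000660"],
                   "thstrm_amount": ["1,234,567", "-"]})
df["thstrm_amount"] = df["thstrm_amount"].str.replace(",", "")
df.loc[df.thstrm_amount == "-", "thstrm_amount"] = None
df = df.rename(columns={"stock_code": "code"})
print(df)
#      code thstrm_amount
# 0  005930       1234567
# 1  000660          None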