def returns_customer_product_time(pwunsale_tidy, pw_ytdcust, pw_cusattr):
    '''
    Meant to feed into a Pivot requested by Mitch Turner.
    Aggregates the same as above but includes time and product data.
    '''
    dat = pwunsale_tidy['Date'].tolist()
    pwunsale_tidy['Month'] = [d.strftime('%B') for d in dat]

    print('Aggregating custom pivot for Mitch.')
    len_unique = lambda x: len(pd.unique(x))
    agg_funcs_returns = {'ExtCost': {'DollarsReturned|avg': np.mean,
                                     'DollarsReturned|sum': np.sum},
                         'CasesReturned': {'CasesReturned|avg': np.mean,
                                           'CasesReturned|sum': np.sum},
                         'Invoice': len_unique}

    custom_cols = ['Month', 'CustomerId', 'Customer', 'ProductId', 'Product']

    customer_returns = DataFrame(pwunsale_tidy.groupby(custom_cols)[['ExtCost', 'CasesReturned']].agg(agg_funcs_returns)).reset_index(drop=False)
    customer_returns.rename(columns={'<lambda>': 'Returns|count'}, inplace=True)
    customer_returns.drop('Customer', inplace=True, axis=1)

    print('Merging in YTD sales by Customer')
    customer_returns = customer_returns.merge(pw_ytdcust, on='CustomerId', how='left')

    print('Deriving returns as a percent of sales for each Customer.')
    customer_returns['PercentSales'] = np.divide(customer_returns['DollarsReturned|sum'],
                                                 customer_returns['DollarSales|bycustomer'])

    print('Merge in customer attributes.')
    customer_returns = customer_returns.merge(pw_cusattr, on='CustomerId', how='left')

    print('Sorting in descending order on Dollars returned.')
    customer_returns.sort_values('DollarsReturned|sum', ascending=False, inplace=True)

    return customer_returns
def test_merge_datetime_index(self, box):
    # see gh-19038
    df = DataFrame([1, 2, 3],
                   ["2016-01-01", "2017-01-01", "2018-01-01"],
                   columns=["a"])
    df.index = pd.to_datetime(df.index)
    on_vector = df.index.year

    if box is not None:
        on_vector = box(on_vector)

    expected = DataFrame(
        OrderedDict([
            ("a", [1, 2, 3]),
            ("key_1", [2016, 2017, 2018]),
        ])
    )

    result = df.merge(df, on=["a", on_vector], how="inner")
    tm.assert_frame_equal(result, expected)

    expected = DataFrame(
        OrderedDict([
            ("key_0", [2016, 2017, 2018]),
            ("a_x", [1, 2, 3]),
            ("a_y", [1, 2, 3]),
        ])
    )

    result = df.merge(df, on=[df.index.year], how="inner")
    tm.assert_frame_equal(result, expected)
def test_merge_na_keys(self):
    data = [[1950, "A", 1.5],
            [1950, "B", 1.5],
            [1955, "B", 1.5],
            [1960, "B", np.nan],
            [1970, "B", 4.],
            [1950, "C", 4.],
            [1960, "C", np.nan],
            [1965, "C", 3.],
            [1970, "C", 4.]]

    frame = DataFrame(data, columns=["year", "panel", "data"])

    other_data = [[1960, 'A', np.nan],
                  [1970, 'A', np.nan],
                  [1955, 'A', np.nan],
                  [1965, 'A', np.nan],
                  [1965, 'B', np.nan],
                  [1955, 'C', np.nan]]
    other = DataFrame(other_data, columns=['year', 'panel', 'data'])

    result = frame.merge(other, how='outer')

    expected = frame.fillna(-999).merge(other.fillna(-999), how='outer')
    expected = expected.replace(-999, np.nan)

    tm.assert_frame_equal(result, expected)
def calculate_weights(self, services: pd.DataFrame) -> Dict:
    """
    Calculate weights for the load balancer.

    For each service, weights are proportional to the precision score but not
    less than MIN_WEIGHT; the sum of weights for all services is approximately
    equal to WEIGHT_SCALE. If a service uses replication, its weight is divided
    by the number of replicas.

    :param services: dataframe with active services (block, model, n_replicas, service)
    :return: dict with service as a key and weight as value
    """
    start_time = int(time.time() * 1000) - TIME_WINDOW
    for block in services['block'].unique():
        if block:
            score = self.r.xrange(block, min=start_time, max='+')
            score = pd.DataFrame([x for _, x in score])
            score.columns = [c.decode() for c in score.columns]
            score['value'] = score['value'].astype(int)
            score['model'] = score['model'].apply(lambda x: x.decode())
            score = score.groupby('model', as_index=False).mean()
            score = services.merge(score, on='model')
            score['weight'] = (score['value'] / score['value'].sum()
                               * WEIGHT_SCALE / score['replicas']).fillna(0)
            score['weight'] = score['weight'].astype(int)
            score.loc[score['weight'] == 0, 'weight'] = MIN_WEIGHT
            return score.set_index('service').to_dict()['weight']
def alignTime(self, instruments):
    i = 0
    mergedSeries = None
    for instrument in instruments:
        if i == 0:
            # first instrument, nothing to merge yet
            i += 1
            # change col name in order not to overlap
            mergedSeries = instruments[instrument][["Date", "AdjClose"]]
            newColName = str("%s_Adjclose" % instrument)
            mergedSeries = mergedSeries.rename(columns={'AdjClose': newColName})
        else:
            newSeries = instruments[instrument][["Date", "AdjClose"]]
            newColName = str("%s_Adjclose" % instrument)
            newSeries = newSeries.rename(columns={'AdjClose': newColName})
            mergedSeries = pd.merge(mergedSeries, newSeries, on="Date", how="inner")

    # dropna returns a new frame, so the result must be assigned back
    mergedSeries = mergedSeries.dropna()

    # put result into dict, and recover name
    alignedSeries = dict()
    for instrument in instruments:
        colName = str("%s_Adjclose" % instrument)
        alignedSeries[instrument] = mergedSeries[["Date", colName]].rename(columns={colName: 'AdjClose'})

    mergedSeries = mergedSeries.set_index(mergedSeries["Date"].values)
    # mergedSeries.plot()
    # plt.show()
    return alignedSeries
def main(self):
    '''
    Find the office id for each indicator.
    '''
    ## find the data for the indicators requested ##
    df = DataFrame(list(DataPointComputed.objects.filter(indicator_id__in=\
        self.indicator_id_list).values_list('indicator_id', 'campaign_id')\
        .distinct()), columns=['indicator_id', 'campaign_id'])

    ## find all campaigns + office combinations
    office_lookup_df = DataFrame(list(Campaign.objects.all()\
        .values_list('id', 'office_id')), columns=['campaign_id', 'office_id'])

    ## Join the two dataframes and take the distinct office, indicator ##
    joined_df = df.merge(office_lookup_df)
    unique_df = joined_df[['indicator_id', 'office_id']].drop_duplicates()

    ## iterate through the DF, create objects and prep for bulk_create
    ind_to_office_batch = []
    for ix, data in unique_df.iterrows():
        ind_to_office_batch.append(IndicatorToOffice(**data.to_dict()))

    ## delete then re-insert ##
    IndicatorToOffice.objects.filter(indicator_id__in=\
        self.indicator_id_list).delete()
    IndicatorToOffice.objects.bulk_create(ind_to_office_batch)
def process_location_tree_lvl(self, location_type_id):
    '''
    Get and process data for a particular location type ( admin level ).
    '''
    lt_batch = []
    df_columns = ['location_id', 'parent_location_id']

    location_df = DataFrame(list(Location.objects
        .filter(location_type_id=location_type_id)
        .values_list('id', 'parent_location_id')), columns=df_columns)

    location_df['lvl'] = 1  # since this is a direct parent child relation

    merged_df = location_df.merge(self.location_tree_df,
                                  left_on='location_id',
                                  right_on='parent_location_id')

    cleaned_merge_df = merged_df[['location_id_y', 'parent_location_id_x', 'lvl_y']]
    cleaned_merge_df['lvl_y'] = cleaned_merge_df['lvl_y'] + 1
    cleaned_merge_df.columns = self.location_tree_columns

    self.location_tree_df = concat([self.location_tree_df, location_df,
                                    cleaned_merge_df])
    self.location_tree_df.drop_duplicates()
def test_join_dups(self): # joining dups df = concat([DataFrame(np.random.randn(10, 4), columns=['A', 'A', 'B', 'B']), DataFrame(np.random.randint(0, 10, size=20) .reshape(10, 2), columns=['A', 'C'])], axis=1) expected = concat([df, df], axis=1) result = df.join(df, rsuffix='_2') result.columns = expected.columns assert_frame_equal(result, expected) # GH 4975, invalid join on dups w = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) x = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) y = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) z = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) dta = x.merge(y, left_index=True, right_index=True).merge( z, left_index=True, right_index=True, how="outer") dta = dta.merge(w, left_index=True, right_index=True) expected = concat([x, y, z, w], axis=1) expected.columns = ['x_x', 'y_x', 'x_y', 'y_y', 'x_x', 'y_x', 'x_y', 'y_y'] assert_frame_equal(dta, expected)
def create_summaries(unsaleables_by_product, pw_ytdsupp):
    '''
    Creates useful one-look summaries for management.
    '''
    print('*'*100)
    print('Creating summaries.')
    print('*'*100)

    summary_cols = ['DollarsUnsaleable|sum', 'DollarsReturned|sum',
                    'CasesUnsaleable|sum', 'CasesReturned|sum']

    print('\n\n\nSummarizing Directors.')
    by_director = DataFrame(unsaleables_by_product.groupby('Director')[summary_cols].sum()).sort_values('DollarsUnsaleable|sum', ascending=False)

    print('Summarizing Suppliers.')
    by_supplier = DataFrame(unsaleables_by_product.groupby(['Director', 'SupplierId', 'Supplier'])[summary_cols].sum()).sort_values('DollarsUnsaleable|sum', ascending=False).reset_index(level=['Director', 'SupplierId', 'Supplier'], drop=False)

    print('Merging in YTD sales by supplier and deriving percent of sales.')
    by_supplier = by_supplier.merge(pw_ytdsupp, on='SupplierId', how='left')
    by_supplier['PercentSales'] = np.divide(by_supplier['DollarsUnsaleable|sum'],
                                            by_supplier['DollarSales|bysupplier'])

    print('Summarizing by Class.\n\n\n')
    by_class = DataFrame(unsaleables_by_product.groupby(['Class'])[summary_cols].sum()).sort_values('DollarsUnsaleable|sum', ascending=False)

    print('*'*100)
    print('Finished creating summaries.')
    print('*'*100)

    return by_supplier, by_director, by_class
def group_by_time_transform(self): dp_df_columns = ['data_date','indicator_id','location_id','value'] time_grouping = self.parsed_params['group_by_time'] # HACKK if self.parsed_params['chart_uuid'] ==\ '5599c516-d2be-4ed0-ab2c-d9e7e5fe33be': self.parsed_params['show_missing_data'] = 1 return self.handle_polio_case_table(dp_df_columns) cols = ['data_date','indicator_id','location_id','value'] dp_df = DataFrame(list(DataPoint.objects.filter( location_id__in = self.location_ids, indicator_id__in = self.parsed_params['indicator__in'] ).values(*cols)),columns=cols) if not dp_df.empty: dp_df = self.get_time_group_series(dp_df) gb_df = DataFrame(dp_df\ .groupby(['indicator_id','time_grouping','location_id'])['value']\ .sum())\ .reset_index() return gb_df # need to look at sublocations if the data isn't available at the current level else: depth_level, max_depth, sub_location_ids = 0, 3, self.location_ids while dp_df.empty and depth_level < max_depth: sub_location_ids = Location.objects\ .filter(parent_location_id__in=sub_location_ids)\ .values_list('id', flat=True) dp_df = DataFrame(list(DataPoint.objects.filter( location_id__in = sub_location_ids, indicator_id__in = self.parsed_params['indicator__in'] ).values(*cols)),columns=cols) depth_level += 1 dp_df = self.get_time_group_series(dp_df) if dp_df.empty: return [] location_tree_df = DataFrame(list(LocationTree.objects\ .filter(location_id__in = sub_location_ids)\ .values_list('location_id','parent_location_id')),\ columns=['location_id','parent_location_id']) merged_df = dp_df.merge(location_tree_df) filtered_df = merged_df[merged_df['parent_location_id']\ .isin(self.location_ids)] gb_df = DataFrame(filtered_df\ .groupby(['indicator_id','time_grouping','parent_location_id'])['value']\ .sum())\ .reset_index() gb_df = gb_df.rename(columns={'parent_location_id' : 'location_id'}) return gb_df
def customer_return_summary(pw_cusattr, pwunsale_tidy, pw_ytdcust):
    '''
    Derives intelligence out of MTC1 data on customer returns.
    '''
    print('*'*100)
    print('Creating summary of returns.')
    print('*'*100)

    len_unique = lambda x: len(pd.unique(x))
    agg_funcs_returns = {'ExtCost': {'DollarsReturned|avg': np.mean,
                                     'DollarsReturned|sum': np.sum},
                         'CasesReturned': {'CasesReturned|avg': np.mean,
                                           'CasesReturned|sum': np.sum},
                         'Invoice': len_unique}

    print('\n\n\nAggregating tidy dataset.')
    customer_returns = DataFrame(pwunsale_tidy.groupby(['CustomerId', 'Customer'])[['ExtCost', 'CasesReturned']].agg(agg_funcs_returns)).reset_index(drop=False)
    customer_returns.rename(columns={'<lambda>': 'Returns|count'}, inplace=True)
    customer_returns.drop('Customer', inplace=True, axis=1)

    print('Merging in YTD sales by Customer')
    customer_returns = customer_returns.merge(pw_ytdcust, on='CustomerId', how='left')

    print('Deriving returns as a percent of sales for each Customer.')
    customer_returns['PercentSales'] = np.divide(customer_returns['DollarsReturned|sum'],
                                                 customer_returns['DollarSales|bycustomer'])

    print('Merge in customer attributes.')
    customer_returns = customer_returns.merge(pw_cusattr, on='CustomerId', how='left')

    print('Sorting in descending order on Dollars returned.')
    customer_returns.sort_values('DollarsReturned|sum', ascending=False, inplace=True)

    print('Reorder columns for readability.\n\n\n')
    reorder_cols = ['CustomerId', 'Customer', 'Returns|count',
                    'PercentSales', 'DollarSales|bycustomer',
                    'DollarsReturned|sum', 'DollarsReturned|avg',
                    'CasesReturned|sum', 'CasesReturned|avg',
                    'OnPremise', 'Latitude', 'Longitude']
    customer_returns = customer_returns[reorder_cols]

    print('*'*100)
    print('Finished summarizing returns.')
    print('*'*100)

    return customer_returns
def get_merged_data():
    with open("resources/data/ner.json") as f:
        ner = DataFrame(json.load(f))
    with open("resources/data/publications.json") as f:
        publications = DataFrame(json.load(f))
    with open("resources/data/sweet.json") as f:
        sweet_features = DataFrame(json.load(f))
    return (ner.merge(publications, on=["id"])).merge(sweet_features, on=["id"]).T.to_dict().values()
def get_merged_data(solrdata):
    with open("ner.json") as f:
        ner = DataFrame(json.load(f))
    with open("publications.json") as f:
        publications = DataFrame(json.load(f))
    with open("geotopic.json") as f:
        sweet_features = DataFrame(json.load(f))
    return (ner.merge(publications)).merge(sweet_features).T.to_dict().values()
def assemble_episodic_data(stays, diagnoses):
    data = {
        'Icustay': stays.ICUSTAY_ID,
        'Age': stays.AGE,
        'Length of Stay': stays.LOS,
        'Mortality': stays.MORTALITY
    }
    data.update(transform_gender(stays.GENDER))
    data.update(transform_ethnicity(stays.ETHNICITY))
    data['Height'] = np.nan
    data['Weight'] = np.nan
    data = DataFrame(data).set_index('Icustay')
    data = data[['Ethnicity', 'Gender', 'Age', 'Height', 'Weight',
                 'Length of Stay', 'Mortality']]
    return data.merge(extract_diagnosis_labels(diagnoses),
                      left_index=True, right_index=True)
def merge(self, *args, **kwargs):
    result = DataFrame.merge(self, *args, **kwargs)
    geo_col = self._geometry_column_name
    if isinstance(result, DataFrame) and geo_col in result:
        result.__class__ = GeoDataFrame
        result.crs = self.crs
        result._geometry_column_name = geo_col
        result._invalidate_sindex()
    elif isinstance(result, DataFrame) and geo_col not in result:
        result.__class__ = DataFrame
    return result
def get_result(self):
    """ Get the result after processing the work log. """
    if (self._data is None) or (len(self._data) == 0) or\
       (self._proj is None) or (len(self._proj) == 0):
        return (None, None, None)

    task_frame = DataFrame(self._data,
                           columns=['line', 'date', 'hours', 'task'])
    proj_frame = DataFrame(self._proj,
                           columns=['line', 'date', 'task', 'project'])

    # Group projects assigned to tasks
    task_projs = (task_frame[['task']])\
        .merge(proj_frame[['task', 'project']], how='outer', on='task')\
        .drop_duplicates(['task', 'project'])\
        .groupby('task').project

    # Filter tasks without projects
    task_frame = task_frame.merge(proj_frame[['task']]\
        .drop_duplicates('task'))

    # Construct task table
    task_table = (task_frame[['task', 'date', 'hours']])\
        .groupby(['task', 'date']).sum()\
        .unstack()['hours'].fillna(0.0)

    # Assign tasks to projects
    # (sort_values replaces the long-removed DataFrame.sort(columns=...))
    join_frame = task_frame.merge(proj_frame, how='inner', on='task',
                                  suffixes=['_task', '_proj'])
    join_frame = \
        (join_frame[join_frame.line_task <= join_frame.line_proj])\
        .sort_values(by=['line_task', 'line_proj'])\
        .drop_duplicates('line_task')

    # Construct project table
    proj_table = (join_frame[['date_task', 'project', 'hours']])\
        .groupby(['project', 'date_task']).sum()\
        .unstack()['hours'].fillna(0.0)
    proj_table.columns.name = 'date'

    # Set totals (.loc replaces the removed .ix indexer)
    task_table['TOTAL'] = task_table.sum(axis=1)
    proj_table['TOTAL'] = proj_table.sum(axis=1)
    task_table.loc['TOTAL'] = task_table.sum()
    proj_table.loc['TOTAL'] = proj_table.sum()

    return (task_table, proj_table, task_projs)
class CubeJoin(object):
    def __init__(self, cube):
        self.cube = cube
        self.data = DataFrame({})
        method = getattr(self, cube.get('cube_join_type', 'none'))
        method()

    def inner(self):
        fields = [rel['field'] for rel in self.cube.get('relationship')]
        DW = DataWarehouse()
        for i, rel in enumerate(self.cube.get('relationship')):
            data = DW.get(rel['cube']).get('data')
            df = DataFrame(data)
            if i == 0:
                self.data = df
            else:
                self.data = self.data.merge(df, how='inner', on=fields[0])
        return self.data

    def left(self):
        fields = [rel['field'] for rel in self.cube.get('relationship')]
        self.data = DataFrame({fields[0]: []})
        DW = DataWarehouse()
        for rel in self.cube.get('relationship'):
            data = DW.get(rel['cube'])
            self.data = self.data.merge(DataFrame(data.get('data')),
                                        how='outer', on=fields[0])
        return self.data

    def append(self):
        DW = DataWarehouse()
        # DataFrame.append returns a new frame, so assign the result back
        self.data = DataFrame({}).append(
            [DataFrame(DW.get(rel['cube']).get('data'))
             for rel in self.cube.get('relationship')],
            ignore_index=True)
        return self.data

    def none(self):
        return self.data
def create_indirect_links_once(df: pd.DataFrame) -> pd.DataFrame:
    """ This function gets a Dataframe as input.
    The function then merges the Dataframe with itself on given keys.
    The function returns the Dataframe with newly added lines that result from indirect links.
    """
    # merge the Dataframe with itself based on keys of input study etc. and output study.
    # two rows match if the contents of the left side match the contents of the right side.
    # row 1
    # input_study, input_dataset, input_version, input_variable
    # 1, 1, 1, 1
    # matches row 2
    # output_study, output_dataset, output_version, output_variable
    # 1, 1, 1, 1
    temp = df.merge(
        df,
        right_on=["input_study", "input_dataset", "input_version", "input_variable"],
        left_on=["output_study", "output_dataset", "output_version", "output_variable"],
    )
    WANTED_COLUMNS = [
        "input_study_x",
        "input_dataset_x",
        "input_version_x",
        "input_variable_x",
        "output_study_y",
        "output_dataset_y",
        "output_version_y",
        "output_variable_y",
    ]
    # select only the columns for input study etc. from the left Dataframe
    # and the output study etc. from the right Dataframe
    temp = temp[WANTED_COLUMNS]
    # Rename the rows to be of the original format
    RENAME_COLUMNS = {
        "input_study_x": "input_study",
        "input_dataset_x": "input_dataset",
        "input_version_x": "input_version",
        "input_variable_x": "input_variable",
        "output_study_y": "output_study",
        "output_dataset_y": "output_dataset",
        "output_version_y": "output_version",
        "output_variable_y": "output_variable",
    }
    temp.rename(columns=RENAME_COLUMNS, inplace=True)
    # add new rows to the original Dataframe, dropping duplicates
    return df.append(temp).drop_duplicates().reset_index(drop=True)
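# Hedged usage sketch for create_indirect_links_once (not from the original
# project): the frame, its values, and the variable name `links` below are
# illustrative assumptions, and it assumes a pandas version that still
# provides DataFrame.append. It shows one pass resolving A -> B and B -> C
# into an added A -> C row.
import pandas as pd

links = pd.DataFrame(
    [
        ("s1", "d1", "v1", "x", "s2", "d2", "v2", "y"),  # A -> B
        ("s2", "d2", "v2", "y", "s3", "d3", "v3", "z"),  # B -> C
    ],
    columns=[
        "input_study", "input_dataset", "input_version", "input_variable",
        "output_study", "output_dataset", "output_version", "output_variable",
    ],
)
expanded = create_indirect_links_once(links)
print(len(expanded))  # 3 rows: the two direct links plus the indirect A -> C link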
def test_join_float64_float32(self):

    a = DataFrame(randn(10, 2), columns=['a', 'b'], dtype=np.float64)
    b = DataFrame(randn(10, 1), columns=['c'], dtype=np.float32)
    joined = a.join(b)
    assert joined.dtypes['a'] == 'float64'
    assert joined.dtypes['b'] == 'float64'
    assert joined.dtypes['c'] == 'float32'

    a = np.random.randint(0, 5, 100).astype('int64')
    b = np.random.random(100).astype('float64')
    c = np.random.random(100).astype('float32')
    df = DataFrame({'a': a, 'b': b, 'c': c})
    xpdf = DataFrame({'a': a, 'b': b, 'c': c})
    s = DataFrame(np.random.random(5).astype('float32'), columns=['md'])
    rs = df.merge(s, left_on='a', right_index=True)
    assert rs.dtypes['a'] == 'int64'
    assert rs.dtypes['b'] == 'float64'
    assert rs.dtypes['c'] == 'float32'
    assert rs.dtypes['md'] == 'float32'

    xp = xpdf.merge(s, left_on='a', right_index=True)
    assert_frame_equal(rs, xp)
def setUp(self): super(GeoResourceTest, self).setUp() self.ts = TestSetupHelpers() self.lt = LocationType.objects.create(name='Region',admin_level=1) self.distr, created = \ LocationType.objects.get_or_create(name='District',admin_level = 2) self.planet_location_type = LocationType.objects\ .create(name = 'Planet', admin_level = 0) self.ultimate_parent = Location.objects.create( id = 1, name = 'Earth', location_code = 'Earth', location_type_id = self.planet_location_type.id ) location_df_from_csv= read_csv('rhizome/tests/_data/locations_nimroz.csv') self.ts.model_df_to_data(location_df_from_csv,Location) # make sure that the proper level is set for the locs = Location.objects.filter(parent_location_id=6) for loc in locs: loc.location_type_id = self.distr.id loc.save() parent = Location.objects.get(id=6) parent.location_type_id = self.lt.id parent.save() geo_json_df = read_csv('rhizome/tests/_data/geo_json_small.txt',delimiter = "|") location_df = DataFrame(list(Location.objects.all()\ .values_list('id','location_code')),columns=['location_id','location_code']) location_tree_df = DataFrame(list(Location.objects.all()\ .values_list('id','parent_location_id'))\ ,columns=['location_id','parent_location_id']) location_tree_df['parent_location_id'].fillna(self.ultimate_parent.id,\ inplace=True) location_tree_df['lvl'] = Series(1, index=location_tree_df.index) self.ts.model_df_to_data(location_tree_df, LocationTree) merged_df = location_df.merge(geo_json_df)[['location_id','geo_json']] self.ts.model_df_to_data(merged_df, LocationPolygon) minify_geo_json() LocationPermission.objects.create(user_id = self.ts.user.id, top_lvl_location_id = 1)
def test_metadata_propagation_indiv(self): # groupby df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], 'C': np.random.randn(8), 'D': np.random.randn(8)}) result = df.groupby('A').sum() self.check_metadata(df,result) # resample df = DataFrame(np.random.randn(1000,2), index=date_range('20130101',periods=1000,freq='s')) result = df.resample('1T') self.check_metadata(df,result) # merging with override # GH 6923 _metadata = DataFrame._metadata _finalize = DataFrame.__finalize__ np.random.seed(10) df1 = DataFrame(np.random.randint(0, 4, (3, 2)), columns=['a', 'b']) df2 = DataFrame(np.random.randint(0, 4, (3, 2)), columns=['c', 'd']) DataFrame._metadata = ['filename'] df1.filename = 'fname1.csv' df2.filename = 'fname2.csv' def finalize(self, other, method=None, **kwargs): for name in self._metadata: if method == 'merge': left, right = other.left, other.right value = getattr(left, name, '') + '|' + getattr(right, name, '') object.__setattr__(self, name, value) else: object.__setattr__(self, name, getattr(other, name, '')) return self DataFrame.__finalize__ = finalize result = df1.merge(df2, left_on=['a'], right_on=['c'], how='inner') self.assertEquals(result.filename,'fname1.csv|fname2.csv') DataFrame._metadata = _metadata DataFrame.__finalize__ = _finalize
def dfm_A_intersect_B(A: DataFrame, B: DataFrame, key_cols: list) -> DataFrame:
    """
    A ∩ B
    return the entries which are in both A and B based on key cols,
    it is mainly used to identify duplicate entries and then update them in the DB
    :param A:
    :param B:
    :return:
    """
    if len(B) == 0:
        return DataFrame(columns=list(A.columns))
    B_tmp = B[key_cols].copy()
    B_tmp['tmp_col_duplicated'] = 'Y'
    dfm_merge_by_keycols = A.merge(B_tmp, how='left', on=key_cols)
    # dfm_merge_by_keycols.dropna() cannot be used here: dropna would drop any
    # row containing a None value and remove rows by mistake.
    dfm_merge_by_keycols = dfm_merge_by_keycols[dfm_merge_by_keycols.tmp_col_duplicated == 'Y']
    del dfm_merge_by_keycols['tmp_col_duplicated']
    return dfm_merge_by_keycols
def populate_fake_dwc_data(apps, schema_editor):
    '''
    This migration will be removed, and we will prefer the "initial_meta_data"
    ingestion and rely on DocTransform, RefreshMaster and AggRefresh in order
    to populate the datapoint_with_computed table.. however, so that we can
    have ample data to show on the dashboards, i will take the cartesian
    product of campaigns, indicators and selected locations ( provinces and
    LPDS ) and dump that data in to datapoint_with_computed.

    It would be nice to somehow set this up so that when a new developer spins
    up the app locally.. they can populate this 'fake' data.  Maybe something
    like.. if SETTINGS.debug = True, then ingest fake data.
    '''

    ind_df = DataFrame(list(Indicator.objects.all()\
        .values_list('id', 'short_name', 'data_format')), columns=['indicator_id', 'short_name', 'data_format'])
    campaign_df = DataFrame(list(Campaign.objects.all()\
        .values_list('id', 'name')), columns=['campaign_id', 'campaign_name'])

    country_id_list = list(Location.objects\
        .filter(location_type_id=1)\
        .values_list('id', flat=True))

    lpd_id_qs = list(Location.objects\
        .filter(lpd_status__in=[1, 2])\
        .values_list('id', 'parent_location_id'))

    province_id_list = [y for x, y in lpd_id_qs]
    lpd_id_list = [x for x, y in lpd_id_qs]
    location_ids = country_id_list + province_id_list + lpd_id_list

    location_df = DataFrame(list(Location.objects\
        .filter(id__in=location_ids)\
        .values_list('id', 'name')), columns=['location_id', 'name'])

    ind_df['join_col'] = 1
    campaign_df['join_col'] = 1
    location_df['join_col'] = 1

    first_merged_df = ind_df.merge(campaign_df, on='join_col')
    final_merged_df = first_merged_df.merge(location_df, on='join_col')

    upsert_df_data(final_merged_df)
def junta_tabelas():
    #locais = quebra_secoes()
    votos = arruma_votos()
    print(votos)
    locais = read_csv("locais_com_votacao_trabalhada.csv")
    #votos = read_csv("voto_secao_partido_trabalhada.csv")
    saida = DataFrame.merge(locais, votos, left_on="id", right_on="id", how="outer")
    saida = DataFrame(saida.groupby(["lat", "long", "aptos_por_local", "local_de_votacao", "zona_eleitoral_nro", "bairro", "endereco", "secoes_eleitorais", "zona_eleitoral_nome"]).sum().reset_index())
    saida = saida.fillna(0)
    saida = saida[saida.secao != 0]
    saida["lat_real"] = saida["long"]
    saida["long"] = saida["lat"]
    saida["lat"] = saida["lat_real"]
    del saida["lat_real"]
    saida.to_csv("secoes_com_votacao.csv", index=False)
def process_location_tree_lvl(self, location_type_id):

    lt_batch = []

    location_df = DataFrame(list(Location.objects\
        .filter(location_type_id=location_type_id)\
        .values_list('id', 'parent_location_id')), columns=self.location_tree_columns)

    merged_df = location_df.merge(self.location_tree_df,
                                  left_on='location_id',
                                  right_on='parent_location_id')

    cleaned_merge_df = merged_df[['location_id_y', 'parent_location_id_x']]
    cleaned_merge_df.columns = self.location_tree_columns

    self.location_tree_df = concat([self.location_tree_df, location_df,\
        cleaned_merge_df])
    self.location_tree_df.drop_duplicates()
def mark_datapoints_with_needs_campaign(self):

    new_dp_df = DataFrame(list(DataPoint.objects\
        .filter(source_submission_id__in=\
            self.ss_ids_to_process).values()))

    date_series = new_dp_df['data_date']
    mn_date, mx_date = min(date_series).date(), max(date_series).date()

    office_lookup_df = DataFrame(list(Location.objects\
        .filter(id__in=list(set(new_dp_df['location_id'])))\
        .values_list('id', 'office_id')),
        columns=['location_id', 'office_id'])

    campaign_qs = Campaign.objects.filter(
        end_date__gte=mn_date,
        start_date__lte=mx_date,
        office_id__in=office_lookup_df['office_id'].unique())

    campaign_df = DataFrame(list(campaign_qs\
        .values('office_id', 'start_date', 'end_date')))

    if len(campaign_df) == 0:
        ## no campaigns match the datapoints so update all with cj_id = -2
        DataPoint.objects.filter(id__in=new_dp_df['id'].unique())\
            .update(cache_job_id=-2)
        return

    dp_merged_df = new_dp_df.merge(office_lookup_df)
    cleaned_dp_df = dp_merged_df[['id', 'office_id', 'data_date']]

    dp_ids_that_need_campaign = []
    dp_merged_with_campaign = cleaned_dp_df.merge(campaign_df)

    ## iterate over the dps and check if there is a campaign ##
    for ix, r in dp_merged_with_campaign.iterrows():
        ## convert date time to date
        r_date = r.data_date.date()
        if r_date >= r.end_date or r_date < r.start_date:
            dp_ids_that_need_campaign.append(r.id)

    DataPoint.objects.filter(id__in=dp_ids_that_need_campaign)\
        .update(cache_job_id=-2)
def dfm_A_minus_B(A: DataFrame, B: DataFrame, key_cols: list) -> DataFrame:
    """
    A - B
    return the entries which are in A but not in B based on key cols,
    it is mainly used to remove duplicate entries and then insert to DB
    :param A:
    :param B:
    :return:
    """
    if len(B) == 0:
        return A
    # dfmprint(A[0:10])
    # dfmprint(B[0:10])
    B_tmp = B[key_cols].copy()
    B_tmp['tmp_col_duplicated'] = 'Y'
    dfm_merge_by_keycols = A.merge(B_tmp, how='left', on=key_cols)
    dfm_merge_by_keycols.fillna({'tmp_col_duplicated': 'N'}, inplace=True)
    dfm_dif_by_keycols = dfm_merge_by_keycols[dfm_merge_by_keycols.tmp_col_duplicated.isin(['N'])]
    del dfm_dif_by_keycols['tmp_col_duplicated']
    return dfm_dif_by_keycols
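# Hedged usage sketch for dfm_A_minus_B / dfm_A_intersect_B (not part of the
# original module): the frames and key columns below are illustrative
# assumptions, used only to show the anti-join / semi-join behaviour on key_cols.
from pandas import DataFrame

incoming = DataFrame({'id': [1, 2, 3], 'val': ['a', 'b', 'c']})
existing = DataFrame({'id': [2, 3], 'val': ['b-old', 'c-old']})

to_insert = dfm_A_minus_B(incoming, existing, key_cols=['id'])      # rows not yet in the DB
to_update = dfm_A_intersect_B(incoming, existing, key_cols=['id'])  # rows already present
print(to_insert['id'].tolist(), to_update['id'].tolist())           # [1] [2, 3]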
class CubeJoin(object):
    def __init__(self, cube):
        self.cube = cube
        self.data = DataFrame({})

        MyClient = riak.RiakClient(
            protocol=conf("riak")["protocol"],
            http_port=conf("riak")["http_port"],
            host=conf("riak")["host"])

        self.MyBucket = MyClient.bucket(conf("riak")["bucket"])
        self.MyBucket.enable_search()

        method = getattr(self, cube.get('cube_join_type', 'none'))
        method()

    def inner(self):
        fields = set([rel['field'] for rel in self.cube.get('relationship')])
        self.data = concat([DataFrame(self.MyBucket.get(rel['cube']).data)
                            for rel in self.cube.get('relationship')],
                           keys=fields, join='inner',
                           ignore_index=True, axis=1)
        return self.data

    def left(self):
        fields = [rel['field'] for rel in self.cube.get('relationship')]
        self.data = DataFrame({fields[0]: []})
        for rel in self.cube.get('relationship'):
            self.data = self.data.merge(DataFrame(
                self.MyBucket.get(rel['cube']).data),
                how='outer', on=fields[0])
        return self.data

    def append(self):
        # DataFrame.append returns a new frame, so assign the result back
        self.data = DataFrame({}).append(
            [DataFrame(self.MyBucket.get(rel['cube']).data)
             for rel in self.cube.get('relationship')],
            ignore_index=True)
        return self.data

    def none(self):
        return self.data
def base_transform(self): results = [] df_columns = ['id', 'indicator_id', 'campaign_id', 'location_id',\ 'value'] computed_datapoints = DataPointComputed.objects.filter( campaign__in=self.parsed_params['campaign__in'], location__in=self.location_ids, indicator__in=self.parsed_params['indicator__in']) dwc_df = DataFrame(list(computed_datapoints.values_list(*df_columns)),\ columns=df_columns) # do an inner join on the filter indicator if self.parsed_params['filter_indicator'] and self.parsed_params['filter_value']: merge_columns = ['campaign_id', 'location_id'] indicator_id = Indicator.objects.get(short_name = self.parsed_params['filter_indicator']) filter_value_list = [self.parsed_params['filter_value']] if filter_value_list == ['-1']: ## this means "show all classes" filter_value_list = [1,2,3] ## this only works for LPDS... this should be -- ## IndicatorClassMap.objects.filter(indicator = indicator)\ ## .values_list(enum_value, flat = True) filter_datapoints = DataPointComputed.objects.filter( campaign__in=self.parsed_params['campaign__in'], location__in=self.location_ids, indicator_id=indicator_id, value__in = filter_value_list ) filter_df =DataFrame(list(filter_datapoints.values_list(*merge_columns)),\ columns=merge_columns) dwc_df = dwc_df.merge(filter_df, how='inner', on=merge_columns) ## now only show the locations that match that filter.. location_ids_in_filter = set(filter_df['location_id']) self.location_ids = set(self.location_ids)\ .intersection(location_ids_in_filter) dwc_df = dwc_df.apply(self.add_class_indicator_val, axis=1) return dwc_df
def shift_data(filtereddf: pd.DataFrame, locdf: pd.DataFrame,
               filename: str) -> pd.DataFrame:
    """
    This function merges the filtered weather data pandas DataFrame and the
    location dataframe, and shifts the data by 6 months if the location is in
    the southern hemisphere. Return the merged and shifted pandas DataFrame

    Inputs:
    ==========
    filtereddf: pandas DataFrame
        filtereddf from datafiltering()

    locdf: pandas DataFrame
        location dataframe from read_history()

    filename: str
        name of the file to be stored
    """
    # merge the dataframe
    overall_df = filtereddf.merge(locdf, how='inner', on='stn')

    # find the data with latitude < 0
    south_ind = overall_df['LAT'] < 0.0

    # shift the series
    for ind in range(1, 7):
        for txt in ['tmp', 'dew', 'stp', 'wpd', 'prec', 'sndp']:
            for suffix in ['mean', 'max', 'min']:
                now_mn = ''.join([txt, '%02i' % ind, suffix])
                fut_mn = ''.join([txt, '%02i' % (ind + 6), suffix])
                temp_series = overall_df.loc[south_ind, now_mn]
                overall_df.loc[south_ind, now_mn] = overall_df.loc[south_ind, fut_mn]
                overall_df.loc[south_ind, fut_mn] = temp_series

    # save the data
    overall_df.to_csv(filename)

    return overall_df
def add_within_category_edges(nodes: pd.DataFrame, edges: pd.DataFrame):
    # make edges among nodes of same category
    self_join = nodes.merge(nodes, on="category")
    self_join = self_join[self_join.id_x.ne(self_join.id_y)]
    one_edge_per_category = self_join.groupby("id_x").first().reset_index()

    # append them to actual edges after reshaping
    within_category_edges = one_edge_per_category[["id_x", "id_y", "category"]]
    within_category_edges.rename(
        columns={
            "id_x": "from",
            "id_y": "to",
            "category": "category_source"
        },
        inplace=True,
    )
    within_category_edges["category_target"] = within_category_edges["category_source"]
    within_category_edges["edge_count"] = 1
    within_category_edges["ppmi"] = 0.5
    return edges.append(within_category_edges)
def get_clade(sample_variants: pd.DataFrame, clades: pd.DataFrame) -> str:
    # special case: wildtype (should have . as ref in all its "mutations")
    # so, if we don't find any mutation in the same positions, we assume it's wildtype
    wt = clades.query("ref == '.'")
    wt_name = wt.clade.unique()[0]
    if len(wt.merge(sample_variants, on=["pos"])) == 0:
        return wt_name

    # count how many mutations each clade has
    clade_nmutations = (clades.query("clade != @wt_name")
                        .groupby("clade").size().to_frame("n").reset_index())

    selected_clade = ("None", 0)
    for t in clade_nmutations.itertuples():
        name = t.clade
        matched = len(sample_variants.merge(clades.query("clade == @name")))
        # if we find all the mutations of this clade and the number of mutations
        # is higher than the current selected clade, choose this clade
        if matched == t.n and t.n > selected_clade[1]:
            selected_clade = (name, t.n)

    return selected_clade[0]
def _fill_hist_columns(self, ob_df: pd.DataFrame) -> pd.DataFrame:
    if len(self.known_observations_data_frame) > 0:
        ob_df = ob_df.drop(
            columns=[
                self.project_config.hist_view_column_name,
                self.project_config.hist_output_column_name,
            ],
            errors="ignore",
        )
        ob_df = ob_df.merge(
            self.hist_data_frame,
            how="left",
            left_on=[
                self.project_config.user_column.name,
                self.project_config.item_column.name,
            ],
            right_index=True,
        ).fillna(0)
    else:
        ob_df[self.project_config.hist_view_column_name] = 0
        ob_df[self.project_config.hist_output_column_name] = 0
    return ob_df
def add_department_info(
    data: pd.DataFrame,
    left_on: str = "dept_name",
    right_on: str = "alias",
    match_missing: bool = True,
) -> pd.DataFrame:
    """
    Add department info to the input data.

    Parameters
    ----------
    data :
        The input dataframe.
    left_on :
        The column in the input data to merge on
    right_on :
        The column in the dept info data to merge on
    match_missing :
        Whether to attempt to match missing departments.
    """
    # Load the department info with aliases and subitems
    dept_info = load_city_departments(include_aliases=True, include_line_items=True)

    # Merge into the info
    data = data.merge(
        dept_info,
        left_on=left_on,
        right_on=right_on,
        how="left",
        validate="1:1",
        suffixes=("_raw", ""),
    )

    # Match missing departments
    if match_missing:
        data = match_missing_departments(data)

    return data
def pipe_vaccinations_csv(self, df: pd.DataFrame, df_iso: pd.DataFrame) -> pd.DataFrame:
    return (df.merge(df_iso, on="location").rename(
        columns={
            "new_vaccinations_smoothed": "daily_vaccinations",
            "new_vaccinations_smoothed_per_million": "daily_vaccinations_per_million",
            "new_vaccinations": "daily_vaccinations_raw",
        })[[
            'location',
            'iso_code',
            'date',
            'total_vaccinations',
            'people_vaccinated',
            'people_fully_vaccinated',
            'daily_vaccinations_raw',
            'daily_vaccinations',
            'total_vaccinations_per_hundred',
            'people_vaccinated_per_hundred',
            'people_fully_vaccinated_per_hundred',
            'daily_vaccinations_per_million',
        ]])
def combine_data(df_covid: pd.DataFrame, df_population: pd.DataFrame)\
        -> pd.DataFrame:
    """
    Function to combine covid19 data with population data using
    left join on "fips" column

    Explanation:
        We are applying the left join because we need latest population
        estimate data from df_population. The combined dataframe can later
        be used for generating the stats

    Parameter:
    ---------
    df_covid: pd.DataFrame object with processed New York Times COVID-19 Data
        having columns:
            "fips": string
            "date": datetime64[ns]
            "cases": float
            "deaths": float

    df_population: pd.DataFrame object with Population Estimate Data 2019
        having columns:
            "fips": string
            "POPESTIMATE2019": float

    Returns:
    -------
    df_combined: pd.DataFrame object with combined data having columns:
        "fips": string
        "date": datetime64[ns]
        "cases": float
        "deaths": float
        "POPESTIMATE2019": float
    """
    df_combined = df_covid.merge(df_population, on="fips", how="left")
    feature_list = ["fips", "date", "cases", "deaths", "POPESTIMATE2019"]
    return df_combined[feature_list]
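# Hedged usage sketch for combine_data (not from the original project): the
# two tiny frames below are illustrative assumptions matching the documented
# schemas, showing that the left join on "fips" keeps every COVID row.
import pandas as pd

covid = pd.DataFrame({
    "fips": ["01001", "01003"],
    "date": pd.to_datetime(["2020-06-01", "2020-06-01"]),
    "cases": [10.0, 5.0],
    "deaths": [1.0, 0.0],
})
population = pd.DataFrame({"fips": ["01001"], "POPESTIMATE2019": [55869.0]})

combined = combine_data(covid, population)
print(combined)  # 2 rows; POPESTIMATE2019 is NaN for the unmatched fips 01003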
def predict(self, validation_x: pd.DataFrame, validation_y: pd.Series): if 'per_raisha_baseline' in str.lower(self.model_name): validation_x = validation_x.merge(self.per_raisha, left_on='raisha', right_index=True) validation_x.index = validation_x.sample_id predictions = validation_x.predictions else: validation_x = validation_x[self.features] predictions = self.model.predict(validation_x) validation_y.name = 'labels' predictions = pd.Series(predictions, index=validation_y.index, name='predictions') if predictions.dtype == float: # regression- create bins to measure the F-score bin_prediction, bin_test_y = utils.create_bin_columns(predictions, validation_y) four_bin_prediction, four_bin_test_y = utils.create_4_bin_columns(predictions, validation_y) else: bin_prediction, bin_test_y = pd.Series(name='bin_prediction'), pd.Series(name='bin_label') four_bin_prediction, four_bin_test_y =\ pd.Series(name='four_bin_prediction'), pd.Series(name='four_bin_label') predictions = pd.DataFrame(predictions).join(validation_y).join(bin_test_y).join(bin_prediction) predictions = predictions.join(four_bin_test_y).join(four_bin_prediction) return predictions
def join_dates(df: pd.DataFrame):
    """
    Merges nearby dates into a single date.

    :param df: Data table for a single client
    :return: Data table with the collapsed values
    """
    df = df.sort_values(by=['date'], ascending=[True])
    unique_dates = pd.DataFrame(df['date'].unique(), columns=['date'])

    # Return the original table if there was only one date
    if len(unique_dates) == 1:
        return df

    unique_dates['new_date'] = unique_dates['date']

    dates_list = list(unique_dates['date'])
    for i in range(0, len(dates_list) - 1, 2):
        if dates_list[i] == dates_list[i + 1] - timedelta(days=1):
            unique_dates.iat[i, 1] = dates_list[i + 1]
        elif dates_list[i] == dates_list[i + 1] - timedelta(days=2):
            unique_dates.iat[i, 1] = dates_list[i + 1]

    new_dates_list = list(unique_dates['new_date'])
    for i in range(len(new_dates_list) - 1, 0, -1):
        if new_dates_list[i] == new_dates_list[i - 1] + timedelta(days=1):
            unique_dates.iat[i, 1] = new_dates_list[i - 1]

    df = df.merge(unique_dates, on='date')

    print(unique_dates)
    print(len(list(unique_dates['date'].unique())))
    print(len(list(unique_dates['new_date'].unique())))

    df['date'] = df['new_date']
    df = df.groupby(['date', 'product', 'client']).sum().reset_index()

    return df
def rank_cumulative_change(df: pd.DataFrame, timeframe: Timeframe): cum_sum = defaultdict(float) # print(df) for date in filter(lambda k: k in df.columns, timeframe.all_dates()): for code, price_change in df[date].fillna(0.0).iteritems(): cum_sum[code] += price_change rank = pd.Series(cum_sum).rank(method="first", ascending=False) df[date] = rank all_available_dates = df.columns avgs = df.mean(axis=1) # NB: do this BEFORE adding columns... assert len(avgs) == len(df) df["x"] = all_available_dates[-1] df["y"] = df[all_available_dates[-1]] bins = ["top", "bin2", "bin3", "bin4", "bin5", "bottom"] average_rank_binned = pd.cut(avgs, len(bins), bins) assert len(average_rank_binned) == len(df) df["bin"] = average_rank_binned df["asx_code"] = df.index stock_sector_df = ( stocks_by_sector() ) # make one DB call (cached) rather than lots of round-trips # print(stock_sector_df) stock_sector_df = stock_sector_df.set_index("asx_code") # print(df.index) df = df.merge( stock_sector_df, left_index=True, right_on="asx_code" ) # NB: this merge will lose rows: those that dont have a sector eg. ETF's df = pd.melt( df, id_vars=["asx_code", "bin", "sector_name", "x", "y"], var_name="date", value_name="rank", value_vars=all_available_dates, ) df["date"] = pd.to_datetime(df["date"], format="%Y-%m-%d") df["x"] = pd.to_datetime(df["x"], format="%Y-%m-%d") return df
def get_metadata(moa_df: pd.DataFrame, image_df: pd.DataFrame) -> pd.DataFrame:
    """Merges and preprocesses metadata files.

    Reads the image and moa metadata dataframes, creates the site column,
    merges the metadata and fills missing values with null.

    Returns
    -------
    pandas.DataFrame
        The processed metadata DataFrame
    """
    image_df["Image_Metadata_Site"] = image_df.Image_FileName_DAPI.transform(
        lambda x: int(re.search("_s[1-4]_", x).group()[2])  # type: ignore
    )
    return (image_df.merge(
        moa_df,
        how="left",
        left_on=[
            "Image_Metadata_Compound",
            "Image_Metadata_Concentration",
        ],
        right_on=["compound", "concentration"],
    ).drop(columns=["compound", "concentration"]).fillna("null"))
def combine_clusters(data, centroids, stages, cluster_combine, cluster_col="cluster"): ratio = cluster_combine['ratio'] for stage in stages.keys(): if stage in centroids.keys(): stage_vars = stages[stage] for key in ratio.keys(): if key in stage_vars: col = cluster_col + "_" + stage cols = [key, col] df = DataFrame(centroids[stage][cols]) df['dummy'] = 1 df = df.merge(df, on=['dummy'], how="outer") df1 = df[df[col + "_x"] != df[col + "_y"]] df = df1[df1[key + "_x"] > df1[key + "_y"]] df['ratio'] = df[key + "_x"] / df[key + "_y"] df1['ratio'] = df1[key + "_x"] / df1[key + "_y"] cols = [col + "_x", 'ratio'] df = df[cols].groupby([col + "_x" ]).min().reset_index(drop=False) df['combine'] = df['ratio'] < ratio[key] df1 = df1.merge(df, on=[col + "_x", "ratio"], how="outer") df1 = df1.drop(["dummy", key + "_y", key + "_x"], axis=1) df1.columns = [ sub(pattern="_x$", repl="", string=x) for x in df1.columns.tolist() ] df1[col] = df1[col].apply(str) data[col] = data[col].apply(str) data = data.merge(df1, on=[col], how="outer") condition = data['combine'].apply(type) == float data['combine'][condition] = False data[col][~condition] = data[col + "_y"][~condition] data = data.drop([col + "_y", "combine"], axis=1) return data
def populate_fake_dwc_data(apps, schema_editor):
    '''
    This migration will be removed, and we will prefer the "initial_meta_data"
    ingestion and rely on DocTransform, RefreshMaster and AggRefresh in order
    to populate the datapoint_with_computed table.. however, so that we can
    have ample data to show on the dashboards, i will take the cartesian
    product of campaigns, indicators and selected locations ( provinces and
    LPDS ) and dump that data in to datapoint_with_computed.

    It would be nice to somehow set this up so that when a new developer spins
    up the app locally.. they can populate this 'fake' data.  Maybe something
    like.. if SETTINGS.debug = True, then ingest fake data.
    '''

    document = Document.objects.create(doc_title='Initial FAKE Data Load')

    ind_df = DataFrame(list(Indicator.objects.all()\
        .values_list('id', 'short_name', 'data_format')), columns=['indicator_id', 'short_name', 'data_format'])
    campaign_df = DataFrame(list(Campaign.objects.all()\
        .values_list('id', 'name')), columns=['campaign_id', 'campaign_name'])

    country_id_list = list(Location.objects\
        .filter(location_type_id=1)\
        .values_list('id', flat=True))

    location_df = DataFrame(list(Location.objects\
        .filter(location_type_id__lte=3)\
        .values_list('id', 'name')), columns=['location_id', 'name'])

    ind_df['join_col'] = 1
    campaign_df['join_col'] = 1
    location_df['join_col'] = 1

    first_merged_df = ind_df.merge(campaign_df, on='join_col')
    final_merged_df = first_merged_df.merge(location_df, on='join_col')

    upsert_df_data(final_merged_df, document.id)
def pipe_capita(self, df: pd.DataFrame) -> pd.DataFrame: logger.info("Adding per-capita variables") # Get data df_subnational = pd.read_csv( self.inputs.population_sub, usecols=["location", "population"] ) pop = self.get_population(df_subnational) df = df.merge(pop, on="location") # Get covered countries locations = df.location.unique() ncountries = df_subnational.location.tolist() + list(self.aggregates.keys()) self._countries_covered = list(filter(lambda x: x not in ncountries, locations)) # Obtain per-capita metrics df = df.assign( total_vaccinations_per_hundred=( df.total_vaccinations * 100 / df.population ).round(2), people_vaccinated_per_hundred=( df.people_vaccinated * 100 / df.population ).round(2), people_fully_vaccinated_per_hundred=( df.people_fully_vaccinated * 100 / df.population ).round(2), total_boosters_per_hundred=(df.total_boosters * 100 / df.population).round( 2 ), new_vaccinations_smoothed_per_million=( df.new_vaccinations_smoothed * 1000000 / df.population ).round(), ) df.loc[:, "people_fully_vaccinated"] = df.people_fully_vaccinated.replace( {0: pd.NA} ) df.loc[ df.people_fully_vaccinated.isnull(), "people_fully_vaccinated_per_hundred" ] = pd.NA df.loc[:, "total_boosters"] = df.total_boosters.replace({0: pd.NA}) df.loc[df.total_boosters.isnull(), "total_boosters_per_hundred"] = pd.NA return df.drop(columns=["population"])
def assemble_episodic_data(stays, diagnoses):
    data = {
        'Icustay': stays.ICUSTAY_ID,
        'Age': stays.AGE,
        'Length of Stay': stays.LOS,
        'Mortality': stays.MORTALITY
    }
    # update: adds element(s) to the dict if key new, otherwise updates value
    data.update(transform_gender(stays.GENDER))
    data.update(transform_ethnicity(stays.ETHNICITY))
    data.update(transform_insurance(stays.INSURANCE))
    data['Height'] = np.nan
    data['Weight'] = np.nan
    data = DataFrame(data).set_index('Icustay')
    # reorder columns
    data = data[[
        'Ethnicity', 'Gender', 'Insurance', 'Age', 'Height', 'Weight',
        'Length of Stay', 'Mortality'
    ]]
    return data.merge(extract_diagnosis_labels(diagnoses),
                      left_index=True, right_index=True)
def validate_population_matches_data(population_df: pd.DataFrame,
                                     square_df: pd.DataFrame) -> None:
    """
    Validates that population estimate demographics match the square.

    A mismatch is possible when the population estimates are incorrect or when
    a modeler tries to run ST-GPR with demographics that are not present in
    the population estimates.
    """
    merged_df = square_df.merge(population_df, on=columns.DEMOGRAPHICS)
    if len(merged_df) != len(square_df):
        square_indices = square_df.set_index(columns.DEMOGRAPHICS).index
        merged_indices = merged_df.set_index(columns.DEMOGRAPHICS).index
        missing_rows = square_df[~square_indices.isin(merged_indices)]
        sample_missing_row = missing_rows[columns.DEMOGRAPHICS].iloc[0]
        raise ValueError(
            'There is a mismatch between the population estimate demographics '
            'and the square. The population estimates have '
            f'{len(population_df)} rows, and the square has {len(square_df)} '
            'rows. After merging population estimates with the square, there '
            f'are {len(merged_df)} rows. An example of a row that is present '
            'in the square but is missing from the population estimates is '
            f'{sample_missing_row.to_dict()}')
def n_fold_fit(self,train_data,cols,cate_col,test_data=None,label_col='Label',is_pred=True): #train by k_fold result_data=DataFrame() if is_pred: result_data['weight']=[0]*test_data.shape[0] fea_filter =[] n_split=10 rank=0 k=StratifiedKFold(n_splits=n_split,random_state=self.random_state,shuffle=True) all_feature_important=DataFrame() all_feature_important['feature']=cols for train_idx,test_idx in tqdm(k.split(train_data[cols],train_data[label_col]),desc='k_split_fitting'): X_train=train_data[cols].loc[train_idx] X_vail=train_data[cols].loc[test_idx] y_train=train_data[[label_col]].loc[train_idx] y_vail=train_data[[label_col]].loc[test_idx] if is_pred: result_,zero_fea,feature_important=self.model_fit(X_train=X_train,y_train=y_train,X_vail=X_vail,y_vail=y_vail,test_data=test_data[cols],cate_fea=cate_col,is_pred=is_pred) result_data['result_'+str(rank)]=result_['result'] result_data['weight_'+str(rank)]=result_['weight'] result_data['weight']+=result_['weight'] del result_ gc.collect() if not is_pred: zero_fea,feature_important=self.model_fit(X_train=X_train,y_train=y_train,X_vail=X_vail,y_vail=y_vail,cate_fea=cate_col,is_pred=is_pred) feature_important.columns=['feature']+[str(col)+'_'+str(rank) for col in feature_important.columns.tolist()[1:]] all_feature_important=all_feature_important.merge(feature_important,'left',on=['feature']) fea_filter.append(zero_fea) rank+=1 np.save(self.save_folder+'zero_feature',fea_filter) return result_data,n_split,all_feature_important,fea_filter
def group_features(
    df: pd.DataFrame,
    statistics: pd.DataFrame,
    column: str,
    group_columns: Iterable[str],
    dtype=np.float32,
) -> pd.DataFrame:
    res = df.merge(statistics, how="left", left_on=group_columns, right_index=True)
    eps = np.finfo(dtype).eps

    for statistic_column in statistics.columns:
        ratio_col = f"{statistic_column}_ratio"
        diff_col = f"{statistic_column}_diff"

        # Prevent division-by-zero error
        res[ratio_col] = res[column] / res[statistic_column].replace(0, eps)
        res[diff_col] = res[column] - res[statistic_column]

        res[statistic_column] = res[statistic_column].astype(dtype)
        res[ratio_col] = res[ratio_col].astype(dtype)
        res[diff_col] = res[diff_col].astype(dtype)

    return res
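# Hedged usage sketch for group_features (not from the original project): the
# frame, the "price"/"category" names, and the groupby statistics below are
# illustrative assumptions. It shows per-group statistics merged back onto the
# rows, with *_ratio and *_diff columns derived against them.
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "category": ["a", "a", "b"],
    "price": [10.0, 20.0, 5.0],
})
# statistics indexed by the group key, as group_features expects (right_index=True)
stats = df.groupby("category")["price"].agg(["mean", "max"])

enriched = group_features(df, stats, column="price", group_columns=["category"])
print(enriched[["price", "mean", "mean_ratio", "mean_diff"]])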
def create_master_table(
    shuttles: pd.DataFrame, companies: pd.DataFrame, reviews: pd.DataFrame
) -> pd.DataFrame:
    """Combines all data to create a master table.

    Args:
        shuttles: Preprocessed data for shuttles.
        companies: Preprocessed data for companies.
        reviews: Source data for reviews.
    Returns:
        Master table.
    """
    rated_shuttles = shuttles.merge(reviews, left_on="id", right_on="shuttle_id")
    with_companies = rated_shuttles.merge(
        companies, left_on="company_id", right_on="id"
    )
    master_table = with_companies.drop(["shuttle_id", "company_id"], axis=1)
    master_table = master_table.dropna()
    return master_table
def get_pairs_dataset(
        dataset: pd.DataFrame = None,
        task: str = '',
        corpus: str = '',
        query_limit: int = 0
) -> pd.DataFrame:
    projection = get_task_dataset_projection(task)

    # Query only non rejected documents
    query = {'rejected': False}

    df_task_dataset = load_dataframe_from_mongodb(
        database_name=corpus,
        collection_name=f"normalized_clear",
        query=query,
        projection=projection,
        sort_by='creation_ts',
        query_limit=query_limit
    )

    df_task_dataset['bug_id'] = pd.to_numeric(df_task_dataset['bug_id'])

    dataset_merged = dataset.merge(df_task_dataset, how='cross',
                                   suffixes=('_left', '_right'))

    return dataset_merged
def add_aggregated_columns( df: pd.DataFrame, group_parameters: Dict[str, str] = COLUMNS_GROUPED_BY, groupby_aggregators: List = GROUPBY_AGGREGATORS, columns_to_replace: List[str] = None) -> (pd.DataFrame, List[str]): """ Create aggregated columns to deal with missing values and non-numerical values :param df: input table :param group_parameters: parameter and column to group for. :param groupby_aggregators: aggregate function to use :param columns_to_replace: original columns to be replaced with grouped values :return df: dataframe with new aggregated columns :return column_names: names of added aggregated columns """ aggregated_column_names = [] for key, value in group_parameters.items(): df_grp = df.loc[:, [key, value]].dropna(axis=0).groupby( value, as_index=False)[key].agg(groupby_aggregators) column_names = [ f"{value}_{AGGREGATOR_COLUMNS[aggregator]}_{key}" for aggregator in groupby_aggregators ] df = df.merge(df_grp, on=value, how='left') aggregated_column_names += column_names df.rename(columns=dict( zip([ AGGREGATOR_COLUMNS[aggregator] for aggregator in groupby_aggregators ], column_names)), inplace=True) # drop at the end so order in group_parameters not important if columns_to_replace is not None: df.drop(labels=[c for c in columns_to_replace], axis=1, inplace=True) df.rename(columns={ 'property_subtype_median_facades_number': "facades_number" }, inplace=True) #24/11/20 fast fix return df, column_names
def merge_custom_inputs_onto_square(
        square_df: pd.DataFrame,
        custom_covariates_df: pd.DataFrame,
        custom_stage_1_df: pd.DataFrame) -> pd.DataFrame:
    """
    Adds custom covariates or custom stage 1 onto square.

    Args:
        square_df: the square dataframe
        custom_covariates_df: a possibly empty custom covariates dataframe
        custom_stage_1_df: a possibly empty custom stage 1 dataframe

    Returns:
        Square dataframe with custom covariates or custom stage 1 merged on

    Raises:
        ValueError: if custom covariates or custom stage 1 is not square
    """
    if custom_covariates_df is None and custom_stage_1_df is None:
        return square_df

    to_join = (custom_covariates_df
               if custom_covariates_df is not None
               else custom_stage_1_df)
    square_with_custom_df = square_df.merge(to_join, on=columns.DEMOGRAPHICS)
    if len(square_with_custom_df) != len(square_df):
        square_indices = square_df.set_index(columns.DEMOGRAPHICS).index
        merged_indices = square_with_custom_df.set_index(
            columns.DEMOGRAPHICS).index
        missing_rows = square_df[~square_indices.isin(merged_indices)]
        sample_missing_row = missing_rows[columns.DEMOGRAPHICS].iloc[0]
        raise ValueError(
            'Custom inputs are not square: your custom inputs have '
            f'{len(to_join)} rows, and the square has {len(square_df)} rows. '
            'After merging custom inputs with the square, there are '
            f'{len(square_with_custom_df)} rows. An example of a row that is '
            'present in the square but is missing from your custom inputs is '
            f'{sample_missing_row.to_dict()}')

    return square_with_custom_df
def stat_fea(self, df: DataFrame, cate_fea_list: list, num_fea_list: list, data_sign: str = '', agg_param=['mean', 'sum', 'std'], is_format_cate_input=False, is_save_df=True): ''' :param cate_fea_list: input_format=[[],[],[]] :param data_sign: give fea data sign,default='' ''' if is_format_cate_input: cate_fea_list = [[col] for col in cate_fea_list] cate_len = len(cate_fea_list) stat_fea_list = [] for cate_fea in tqdm(cate_fea_list, desc='by cate stat'): cate_len -= 1 by_agg_data = DataFrame( df.groupby(cate_fea)[num_fea_list].agg( agg_param)).reset_index() for num_fea in tqdm(num_fea_list, desc='_'.join(cate_fea) + '_stat_num_fea' + ' rest:' + str(cate_len)): agg_cols = [ data_sign + '_by_' + '_'.join(cate_fea) + '_on_' + num_fea + '_' + agg_operator for agg_operator in agg_param ] agg_data_ = by_agg_data[num_fea] agg_ = DataFrame(data=agg_data_.values, columns=agg_cols) agg_[cate_fea] = by_agg_data[cate_fea] if is_save_df: df = df.merge(agg_, 'left', on=cate_fea) else: df = agg_ stat_fea_list += agg_cols return df, stat_fea_list
def execute(self, context): from norm.engine import QuantifiedLambda if not isinstance(self.lam.cloned_from, QuantifiedLambda): inp = self.lam.cloned_from.data else: inp = self.lam.cloned_from.execute(context) if isinstance(inp, (DataFrame, Series)): if inp.index.name == self.lam.VAR_OID: inp = inp.reset_index() elif isinstance(inp, Index): inp = DataFrame(data=inp) equal_cols = list(self.equalities.items()) left_col, right_col = equal_cols.pop() to_merge = self.prepare_to_merge() joined = inp.merge(to_merge, left_on=left_col, right_on=self.outputs.get(right_col, right_col), how='left') if right_col not in self.outputs: joined = joined.drop(columns=[right_col]) if self.lam.VAR_OID not in joined.columns: joined.index.name = self.lam.VAR_OID else: joined = joined.set_index(self.lam.VAR_OID) condition = ' & '.join('({} == {})'.format( left_col, self.outputs.get(right_col, right_col)) for left_col, right_col in equal_cols) if condition != '': results = joined.query(condition) else: results = joined results = results.drop(columns=[ right_col for left_col, right_col in equal_cols if right_col not in self.outputs ]) self.lam.data = results return results
def accrete(
    df: pd.DataFrame,
    accrete_group_by: list,
    accretion_cols: (str, tuple),
    accretion_sep: str = " ",
) -> tuple:
    """
    Groups the dataframe by the passed group_by values and then
    combines text values in the accretion columns.

    Args:
        df: A DataFrame.
        accrete_group_by: A list of columns to group by.
        accretion_cols: The columns you want to accrete on within
            groups created by accrete_group_by.
        accretion_sep: A string indicating how you want the combined
            string values to be separated.

    Returns:
        The transformed DataFrame, and a metadata dictionary.
    """
    accretion_cols = u.tuplify(accretion_cols)
    md = u.gen_empty_md_df(df.columns)
    for c in accretion_cols:
        df[c] = df[c].fillna("")
        df[c] = df[c].astype(str)
        result = df.groupby(accrete_group_by)[c].apply(
            accretion_sep.join).reset_index()
        df = df.merge(result, on=accrete_group_by, suffixes=("", "_x"))
        cx = c + "_x"
        md[c] = (df[c] != df[cx]).sum()
        df[c] = df[cx]
        df.drop(columns=cx, inplace=True)
        df[c] = df[c].str.strip()
        df[c] = df[c].apply(
            lambda x: x if len(x) > 0 and x[-1] != accretion_sep else x[:-1])
        df[c] = df[c].replace("", nan)
    return df, {"metadata": md}
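# Hedged usage sketch for accrete (not from the original library): the frame
# and column names below are illustrative assumptions. It shows the text
# values in "note" being concatenated within each "order_id" group, so every
# row of a group carries the combined string.
import pandas as pd

orders = pd.DataFrame({
    "order_id": [1, 1, 2],
    "note": ["fragile", "gift wrap", None],
})
result, meta = accrete(orders, accrete_group_by=["order_id"],
                       accretion_cols="note", accretion_sep=" ")
print(result["note"].tolist())  # ['fragile gift wrap', 'fragile gift wrap', nan]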
def setUp(self): super(GeoResourceTest, self).setUp() self.ts = TestSetupHelpers() self.lt = LocationType.objects.create(name='Region', admin_level=1) self.distr, created = \ LocationType.objects.get_or_create(name='District',admin_level = 2) self.o = self.ts.create_arbitrary_office() location_df_from_csv = read_csv( 'rhizome/tests/_data/locations_nimroz.csv') locations = self.ts.model_df_to_data(location_df_from_csv, Location) # make sure that the proper level is set for the locs = Location.objects.filter(parent_location_id=6) for loc in locs: loc.location_type_id = self.distr.id loc.save() parent = Location.objects.get(id=6) parent.location_type_id = self.lt.id parent.save() geo_json_df = read_csv('rhizome/tests/_data/geo_json_small.txt', delimiter="|") location_df = DataFrame(list(Location.objects.all()\ .values_list('id','location_code')),columns=['location_id','location_code']) location_tree_df = DataFrame(list(Location.objects.all()\ .values_list('id','parent_location_id')),columns=['location_id','parent_location_id']) location_tree_df['lvl'] = Series(1, index=location_tree_df.index) location_tree = self.ts.model_df_to_data(location_tree_df, LocationTree) merged_df = location_df.merge(geo_json_df)[['location_id', 'geo_json']] self.ts.model_df_to_data(merged_df, LocationPolygon) minify_geo_json() LocationPermission.objects.create(user_id=self.ts.user.id, top_lvl_location_id=1)
def read_unemployment_data( date_range: pd.DataFrame, external_data_path: str = "./external_data" ) -> pd.DataFrame: files: Dict[str, int] = { "CA.csv": 0, "TX.csv": 1, "WI.csv": 2, } unemployment: pd.DataFrame = pd.DataFrame() with timer("load unemployment data"): if os.path.exists(f"{external_data_path}/unemployment"): for file_name, state_id in files.items(): _tmp_unemployment = pd.read_csv( f"{external_data_path}/unemployment/{file_name}" ) _tmp_unemployment["date"] = pd.to_datetime( _tmp_unemployment["DATE"] ).dt.strftime("%Y-%m-%d") _tmp_unemployment.drop("DATE", axis=1, inplace=True) _tmp_unemployment.rename( {"{}UR".format(file_name.replace(".csv", "")): "fe_unemployment"}, axis=1, inplace=True, ) _tmp_unemployment = date_range.merge( _tmp_unemployment, on="date", how="left" ) _tmp_unemployment["fe_unemployment"] = _tmp_unemployment[ "fe_unemployment" ].interpolate() _tmp_unemployment["fe_unemployment"] = _tmp_unemployment[ "fe_unemployment" ].fillna(method="bfill") _tmp_unemployment["state_id"] = state_id unemployment = pd.concat([unemployment, _tmp_unemployment], axis=0) del _tmp_unemployment return unemployment
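# Hedged, self-contained illustration (invented numbers) of the pattern used in
# read_unemployment_data() above: left-merge a sparse series onto a dense date
# range, interpolate the interior gaps, then back-fill the leading NaNs.
def _interpolation_fill_example():
    import pandas as pd

    date_range = pd.DataFrame(
        {"date": pd.date_range("2020-01-01", "2020-01-05").strftime("%Y-%m-%d")}
    )
    sparse = pd.DataFrame({"date": ["2020-01-02", "2020-01-04"],
                           "fe_unemployment": [4.0, 6.0]})

    merged = date_range.merge(sparse, on="date", how="left")
    merged["fe_unemployment"] = merged["fe_unemployment"].interpolate()
    merged["fe_unemployment"] = merged["fe_unemployment"].fillna(method="bfill")
    # 2020-01-01 is back-filled to 4.0, 2020-01-03 is interpolated to 5.0.
    return merged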
def optimize_prices(
        initial_price_list: pandas.DataFrame,
        shop_features_with_clusters: pandas.DataFrame) -> pandas.DataFrame:
    """
    This pricing function has been kept very simple because its main purpose is
    to serve the use case example. In real life it would typically be replaced
    by a more flexible price optimization engine with several parameters, such
    as that of ActiveViam.

    Args:
        initial_price_list: The initial SellingPrice list.
        shop_features_with_clusters: A dataframe containing information about
            the shops, including their cluster.

    Returns:
        A new SellingPrice list with optimized prices: shops with low
        competition get increased prices, while those with high competition
        get competitive prices.
    """
    new_price_list = initial_price_list.merge(shop_features_with_clusters,
                                              left_on="ShopId",
                                              right_on="ShopId")
    new_price_list.loc[(new_price_list["Cluster"] == 0),
                       "SellingPrice"] = (new_price_list["SellingPrice"] * 1.07)
    new_price_list.loc[(new_price_list["Cluster"] == 1),
                       "SellingPrice"] = (new_price_list["SellingPrice"] * 1.3)
    new_price_list.loc[(new_price_list["Cluster"] == 2),
                       "SellingPrice"] = (new_price_list["SellingPrice"] * 0.95)
    new_price_list.loc[(new_price_list["Cluster"] == 3),
                       "SellingPrice"] = (new_price_list["SellingPrice"] * 1.02)
    new_price_list.loc[(new_price_list["Cluster"] == 4),
                       "SellingPrice"] = (new_price_list["SellingPrice"] * 1.07)
    return new_price_list[[
        "ProductId", "ShopId", "SellingPrice", "PurchasePrice", "Quantity"
    ]]
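# Hedged usage sketch for optimize_prices(): the two toy frames below are
# invented, but the column names (ProductId, ShopId, SellingPrice, PurchasePrice,
# Quantity, Cluster) follow the function above.
def _optimize_prices_example():
    import pandas as pd

    price_list = pd.DataFrame({
        "ProductId": [1, 2],
        "ShopId": [10, 20],
        "SellingPrice": [100.0, 100.0],
        "PurchasePrice": [80.0, 80.0],
        "Quantity": [5, 5],
    })
    shop_clusters = pd.DataFrame({"ShopId": [10, 20], "Cluster": [1, 2]})

    optimized = optimize_prices(price_list, shop_clusters)
    # Shop 10 (cluster 1) is re-priced to 130.0 and shop 20 (cluster 2) to 95.0.
    return optimized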
def flag_imputed_data(statcast_df: pd.DataFrame) -> pd.DataFrame:
    """Function to flag possibly imputed data as a result of the no-nulls approach
        (see: https://tht.fangraphs.com/43416-2/)
    For derivation of values see pybaseball/EXAMPLES/imputed_derivation.ipynb
    Note that this imputation only occurred with TrackMan; it is not present in Hawk-Eye data (beyond 2020)
    Args:
        statcast_df (pd.DataFrame): Dataframe loaded via statcast.py, statcast_batter.py, or statcast_pitcher.py
    Returns:
        pd.DataFrame: Copy of original dataframe with "possible_imputation" flag
    """

    ParameterSet = namedtuple('ParameterSet', ["ev", "angle", "bb_type"])
    impute_combinations = []

    # Pop-ups
    impute_combinations.append(ParameterSet(ev=80.0, angle=69.0, bb_type="popup"))

    # Fly balls
    impute_combinations.append(ParameterSet(ev=89.2, angle=39.0, bb_type="fly_ball"))
    impute_combinations.append(ParameterSet(ev=102.8, angle=30.0, bb_type="fly_ball"))

    # Line drives
    impute_combinations.append(ParameterSet(ev=90.4, angle=15.0, bb_type="line_drive"))
    impute_combinations.append(ParameterSet(ev=91.1, angle=18.0, bb_type="line_drive"))

    # Ground balls
    impute_combinations.append(ParameterSet(ev=82.9, angle=-21.0, bb_type="ground_ball"))
    impute_combinations.append(ParameterSet(ev=90.3, angle=-17.0, bb_type="ground_ball"))

    df_imputations = pd.DataFrame(data=impute_combinations)
    df_imputations["possible_imputation"] = True
    df_return = statcast_df.merge(df_imputations, how="left",
                                  left_on=["launch_speed", "launch_angle", "bb_type"],
                                  right_on=["ev", "angle", "bb_type"])

    # Change NaNs to False for boolean consistency
    df_return["possible_imputation"] = df_return["possible_imputation"].fillna(False)
    df_return = df_return.drop(["ev", "angle"], axis=1)
    return df_return
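# Hedged, self-contained sketch of the flagging pattern used above: left-merge a
# small lookup of (value, category) combinations carrying a True flag, then turn
# the NaNs produced for unmatched rows into False. The toy data is invented.
def _flag_by_merge_example():
    import pandas as pd

    events = pd.DataFrame({"speed": [80.0, 95.5, 89.2],
                           "kind": ["popup", "fly_ball", "fly_ball"]})
    suspicious = pd.DataFrame({"speed": [80.0, 89.2],
                               "kind": ["popup", "fly_ball"],
                               "flag": True})

    flagged = events.merge(suspicious, how="left", on=["speed", "kind"])
    flagged["flag"] = flagged["flag"].fillna(False)
    return flagged  # rows 0 and 2 are flagged True, row 1 is False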
def normalize_by_pathway(self, pathway_feature, level=3):
    """
    pathway_feature can be n_protein or sequence_lenght
    """
    self.set_pathway_info()
    pathway_factor = {c.name: c.db_data[pathway_feature]
                      for c in self.ideograms[level - 1].chromosomes}
    nLevels = self.biodb_selector.getLevelCount()
    df_lengths = DataFrame(pathway_factor.items(),
                           index=range(len(pathway_factor)),
                           columns=["Level %s" % level, "Length"])
    # By merging according to the lengths dataframe, we also
    # filter the main dataframe at the same time.
    df_merged = df_lengths.merge(self.data_frame, on="Level %s" % level)
    # Columns have one extra level for the accession and one more
    # for the lengths in df_merged.
    df_merged[self.data_frame.columns[nLevels + 1:]] = \
        df_merged[self.data_frame.columns[nLevels + 1:]].divide(
            df_merged['Length'].values, axis=0)
    df_normalized = df_merged[self.data_frame.columns]
    self.update_ideograms_by_dataframe(df_normalized)
    self.update_dataframe()
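# Hedged, self-contained illustration (invented pathway names and counts) of the
# normalization step above: merge a per-pathway factor onto the data frame, then
# divide the value columns row-wise by that factor.
def _normalize_by_factor_example():
    import pandas as pd

    data = pd.DataFrame({"Level 3": ["glycolysis", "tca_cycle"],
                         "sample_a": [10.0, 9.0],
                         "sample_b": [20.0, 3.0]})
    factors = pd.DataFrame({"Level 3": ["glycolysis", "tca_cycle"],
                            "Length": [5, 3]})

    merged = factors.merge(data, on="Level 3")
    value_cols = ["sample_a", "sample_b"]
    merged[value_cols] = merged[value_cols].divide(merged["Length"].values, axis=0)
    # glycolysis becomes 2.0 / 4.0, tca_cycle becomes 3.0 / 1.0.
    return merged[data.columns]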
def get_services_weights(metrics: pd.Series, services: pd.DataFrame) -> Dict:
    """
    calculate weights for load balancer for each service,
    weights are proportional to the precision score but not less than MIN_WEIGHT,
    the sum of weights for all services is approximately equal to WEIGHT_SCALE
    if a service uses replication, its weight is divided by the number of replicas
    :param metrics: Series of precision scores ('p') keyed by model
    :param services: dataframe with active services (model, replicas, service)
    :return: dict with service as a key and weight as value
    """
    services = services.merge(pd.DataFrame(metrics), on='model')
    services['weight'] = (services['p'] / services['p'].sum()
                          * WEIGHT_SCALE / services['replicas']).fillna(0)
    services['weight'] = services['weight'].astype(int)
    services.loc[services['weight'] == 0, 'weight'] = MIN_WEIGHT
    return services[['service', 'weight']].set_index('service').to_dict()['weight']
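# Hedged usage sketch for get_services_weights(): WEIGHT_SCALE and MIN_WEIGHT are
# module-level constants in the original code; the values quoted in the comment
# below are assumptions made only for this example, as are the model and service
# names.
def _services_weights_example():
    import pandas as pd

    metrics = pd.Series({"model_a": 0.9, "model_b": 0.1}, name="p")
    metrics.index.name = "model"
    services = pd.DataFrame({"model": ["model_a", "model_b"],
                             "replicas": [1, 1],
                             "service": ["svc_a", "svc_b"]})

    weights = get_services_weights(metrics, services)
    # Assuming WEIGHT_SCALE = 100 and MIN_WEIGHT = 1, this would give roughly
    # {'svc_a': 90, 'svc_b': 10}.
    return weights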