def returns_customer_product_time(pwunsale_tidy, pw_ytdcust, pw_cusattr):
    '''
    Meant to feed into a Pivot requested by Mitch Turner.
    Aggregates the same as above but includes time and product data.
    '''
    dat = pwunsale_tidy['Date'].tolist()
    pwunsale_tidy['Month'] = [d.strftime('%B') for d in dat]

    print('Aggregating custom pivot for Mitch.')
    len_unique = lambda x: len(pd.unique(x))
    agg_funcs_returns = {'ExtCost': {'DollarsReturned|avg': np.mean,
                                     'DollarsReturned|sum': np.sum},
                         'CasesReturned': {'CasesReturned|avg': np.mean,
                                           'CasesReturned|sum': np.sum},
                         'Invoice': len_unique}

    custom_cols = ['Month', 'CustomerId', 'Customer', 'ProductId', 'Product']

    customer_returns = DataFrame(pwunsale_tidy.groupby(custom_cols)[['ExtCost', 'CasesReturned']].agg(agg_funcs_returns)).reset_index(drop=False)
    customer_returns.rename(columns={'<lambda>': 'Returns|count'}, inplace=True)
    customer_returns.drop('Customer', inplace=True, axis=1)

    print('Merging in YTD sales by Customer')
    customer_returns = customer_returns.merge(pw_ytdcust, on='CustomerId', how='left')

    print('Deriving returns as a percent of sales for each Customer.')
    customer_returns['PercentSales'] = np.divide(customer_returns['DollarsReturned|sum'],
                                                 customer_returns['DollarSales|bycustomer'])

    print('Merge in customer attributes.')
    customer_returns = customer_returns.merge(pw_cusattr, on='CustomerId', how='left')

    print('Sorting in descending order on Dollars returned.')
    customer_returns.sort_values('DollarsReturned|sum', ascending=False, inplace=True)

    return customer_returns
def test_merge_datetime_index(self, box):
    # see gh-19038
    df = DataFrame([1, 2, 3],
                   ["2016-01-01", "2017-01-01", "2018-01-01"],
                   columns=["a"])
    df.index = pd.to_datetime(df.index)
    on_vector = df.index.year

    if box is not None:
        on_vector = box(on_vector)

    expected = DataFrame(
        OrderedDict([
            ("a", [1, 2, 3]),
            ("key_1", [2016, 2017, 2018]),
        ])
    )

    result = df.merge(df, on=["a", on_vector], how="inner")
    tm.assert_frame_equal(result, expected)

    expected = DataFrame(
        OrderedDict([
            ("key_0", [2016, 2017, 2018]),
            ("a_x", [1, 2, 3]),
            ("a_y", [1, 2, 3]),
        ])
    )

    result = df.merge(df, on=[df.index.year], how="inner")
    tm.assert_frame_equal(result, expected)
def test_merge_na_keys(self):
    data = [[1950, "A", 1.5],
            [1950, "B", 1.5],
            [1955, "B", 1.5],
            [1960, "B", np.nan],
            [1970, "B", 4.],
            [1950, "C", 4.],
            [1960, "C", np.nan],
            [1965, "C", 3.],
            [1970, "C", 4.]]

    frame = DataFrame(data, columns=["year", "panel", "data"])

    other_data = [[1960, 'A', np.nan],
                  [1970, 'A', np.nan],
                  [1955, 'A', np.nan],
                  [1965, 'A', np.nan],
                  [1965, 'B', np.nan],
                  [1955, 'C', np.nan]]
    other = DataFrame(other_data, columns=['year', 'panel', 'data'])

    result = frame.merge(other, how='outer')

    expected = frame.fillna(-999).merge(other.fillna(-999), how='outer')
    expected = expected.replace(-999, np.nan)

    tm.assert_frame_equal(result, expected)
def calculate_weights(self, services: pd.DataFrame) -> Dict:
    """
    Calculate weights for the load balancer.

    For each service, weights are proportional to the precision score but not
    less than MIN_WEIGHT; the sum of weights for all services is approximately
    equal to WEIGHT_SCALE. If a service uses replication, its weight is divided
    by the number of replicas.

    :param services: dataframe with active services (block, model, n_replicas, service)
    :return: dict with service as a key and weight as value
    """
    start_time = int(time.time() * 1000) - TIME_WINDOW
    for block in services['block'].unique():
        if block:
            score = self.r.xrange(block, min=start_time, max='+')
            score = pd.DataFrame([x for _, x in score])
            score.columns = [c.decode() for c in score.columns]
            score['value'] = score['value'].astype(int)
            score['model'] = score['model'].apply(lambda x: x.decode())
            score = score.groupby('model', as_index=False).mean()
            score = services.merge(score, on='model')
            score['weight'] = (score['value'] / score['value'].sum()
                               * WEIGHT_SCALE / score['replicas']).fillna(0)
            score['weight'] = score['weight'].astype(int)
            score.loc[score['weight'] == 0, 'weight'] = MIN_WEIGHT
            return score.set_index('service').to_dict()['weight']
def alignTime(self, instruments):
    i = 0
    mergedSeries = None
    for instrument in instruments:
        if i == 0:
            # first instrument, nothing to merge yet
            i += 1
            # change col name in order not to overlap
            mergedSeries = instruments[instrument][["Date", "AdjClose"]]
            newColName = str("%s_Adjclose" % instrument)
            mergedSeries = mergedSeries.rename(columns={'AdjClose': newColName})
        else:
            newSeries = instruments[instrument][["Date", "AdjClose"]]
            newColName = str("%s_Adjclose" % instrument)
            newSeries = newSeries.rename(columns={'AdjClose': newColName})
            mergedSeries = pd.merge(mergedSeries, newSeries, on="Date", how="inner")

    # dropna returns a new frame, so the result must be assigned back
    mergedSeries = mergedSeries.dropna()

    # put result into dict, and recover name
    alignedSeries = dict()
    for instrument in instruments:
        colName = str("%s_Adjclose" % instrument)
        alignedSeries[instrument] = mergedSeries[["Date", colName]].rename(columns={colName: 'AdjClose'})

    mergedSeries = mergedSeries.set_index(mergedSeries["Date"].values)
    # mergedSeries.plot()
    # plt.show()
    return alignedSeries
def main(self):
    '''
    Find the office id for each indicator.
    '''
    ## find the data for the indicators requested ##
    df = DataFrame(list(DataPointComputed.objects.filter(indicator_id__in=\
        self.indicator_id_list).values_list('indicator_id', 'campaign_id')\
        .distinct()), columns=['indicator_id', 'campaign_id'])

    ## find all campaigns + office combinations
    office_lookup_df = DataFrame(list(Campaign.objects.all()\
        .values_list('id', 'office_id')), columns=['campaign_id', 'office_id'])

    ## Join the two dataframes and take the distinct office, indicator ##
    joined_df = df.merge(office_lookup_df)
    unique_df = joined_df[['indicator_id', 'office_id']].drop_duplicates()

    ## iterate through the DF, create objects and prep for bulk_create
    ind_to_office_batch = []
    for ix, data in unique_df.iterrows():
        ind_to_office_batch.append(IndicatorToOffice(**data.to_dict()))

    ## delete then re-insert ##
    IndicatorToOffice.objects.filter(indicator_id__in=\
        self.indicator_id_list).delete()
    IndicatorToOffice.objects.bulk_create(ind_to_office_batch)
def process_location_tree_lvl(self, location_type_id):
    '''
    Get and process data for a particular location type ( admin level ).
    '''
    lt_batch = []
    df_columns = ['location_id', 'parent_location_id']

    location_df = DataFrame(list(Location.objects
        .filter(location_type_id=location_type_id)
        .values_list('id', 'parent_location_id')), columns=df_columns)

    location_df['lvl'] = 1  # since this is a direct parent child relation

    merged_df = location_df.merge(self.location_tree_df,
                                  left_on='location_id',
                                  right_on='parent_location_id')

    cleaned_merge_df = merged_df[['location_id_y', 'parent_location_id_x', 'lvl_y']]
    cleaned_merge_df['lvl_y'] = cleaned_merge_df['lvl_y'] + 1
    cleaned_merge_df.columns = self.location_tree_columns

    self.location_tree_df = concat([self.location_tree_df, location_df,
                                    cleaned_merge_df])
    self.location_tree_df.drop_duplicates()
def test_join_dups(self): # joining dups df = concat([DataFrame(np.random.randn(10, 4), columns=['A', 'A', 'B', 'B']), DataFrame(np.random.randint(0, 10, size=20) .reshape(10, 2), columns=['A', 'C'])], axis=1) expected = concat([df, df], axis=1) result = df.join(df, rsuffix='_2') result.columns = expected.columns assert_frame_equal(result, expected) # GH 4975, invalid join on dups w = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) x = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) y = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) z = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) dta = x.merge(y, left_index=True, right_index=True).merge( z, left_index=True, right_index=True, how="outer") dta = dta.merge(w, left_index=True, right_index=True) expected = concat([x, y, z, w], axis=1) expected.columns = ['x_x', 'y_x', 'x_y', 'y_y', 'x_x', 'y_x', 'x_y', 'y_y'] assert_frame_equal(dta, expected)
def create_summaries(unsaleables_by_product, pw_ytdsupp):
    '''
    Creates useful one-look summaries for management.
    '''
    print('*'*100)
    print('Creating summaries.')
    print('*'*100)

    summary_cols = ['DollarsUnsaleable|sum', 'DollarsReturned|sum',
                    'CasesUnsaleable|sum', 'CasesReturned|sum']

    print('\n\n\nSummarizing Directors.')
    by_director = DataFrame(unsaleables_by_product.groupby('Director')[summary_cols].sum()).sort_values('DollarsUnsaleable|sum', ascending=False)

    print('Summarizing Suppliers.')
    by_supplier = DataFrame(unsaleables_by_product.groupby(['Director', 'SupplierId', 'Supplier'])[summary_cols].sum()).sort_values('DollarsUnsaleable|sum', ascending=False).reset_index(level=['Director', 'SupplierId', 'Supplier'], drop=False)

    print('Merging in YTD sales by supplier and deriving percent of sales.')
    by_supplier = by_supplier.merge(pw_ytdsupp, on='SupplierId', how='left')
    by_supplier['PercentSales'] = np.divide(by_supplier['DollarsUnsaleable|sum'],
                                            by_supplier['DollarSales|bysupplier'])

    print('Summarizing by Class.\n\n\n')
    by_class = DataFrame(unsaleables_by_product.groupby(['Class'])[summary_cols].sum()).sort_values('DollarsUnsaleable|sum', ascending=False)

    print('*'*100)
    print('Finished creating summaries.')
    print('*'*100)

    return by_supplier, by_director, by_class
def group_by_time_transform(self): dp_df_columns = ['data_date','indicator_id','location_id','value'] time_grouping = self.parsed_params['group_by_time'] # HACKK if self.parsed_params['chart_uuid'] ==\ '5599c516-d2be-4ed0-ab2c-d9e7e5fe33be': self.parsed_params['show_missing_data'] = 1 return self.handle_polio_case_table(dp_df_columns) cols = ['data_date','indicator_id','location_id','value'] dp_df = DataFrame(list(DataPoint.objects.filter( location_id__in = self.location_ids, indicator_id__in = self.parsed_params['indicator__in'] ).values(*cols)),columns=cols) if not dp_df.empty: dp_df = self.get_time_group_series(dp_df) gb_df = DataFrame(dp_df\ .groupby(['indicator_id','time_grouping','location_id'])['value']\ .sum())\ .reset_index() return gb_df # need to look at sublocations if the data isn't available at the current level else: depth_level, max_depth, sub_location_ids = 0, 3, self.location_ids while dp_df.empty and depth_level < max_depth: sub_location_ids = Location.objects\ .filter(parent_location_id__in=sub_location_ids)\ .values_list('id', flat=True) dp_df = DataFrame(list(DataPoint.objects.filter( location_id__in = sub_location_ids, indicator_id__in = self.parsed_params['indicator__in'] ).values(*cols)),columns=cols) depth_level += 1 dp_df = self.get_time_group_series(dp_df) if dp_df.empty: return [] location_tree_df = DataFrame(list(LocationTree.objects\ .filter(location_id__in = sub_location_ids)\ .values_list('location_id','parent_location_id')),\ columns=['location_id','parent_location_id']) merged_df = dp_df.merge(location_tree_df) filtered_df = merged_df[merged_df['parent_location_id']\ .isin(self.location_ids)] gb_df = DataFrame(filtered_df\ .groupby(['indicator_id','time_grouping','parent_location_id'])['value']\ .sum())\ .reset_index() gb_df = gb_df.rename(columns={'parent_location_id' : 'location_id'}) return gb_df
def customer_return_summary(pw_cusattr, pwunsale_tidy, pw_ytdcust):
    '''
    Derives intelligence out of MTC1 data on customer returns.
    '''
    print('*'*100)
    print('Creating summary of returns.')
    print('*'*100)

    len_unique = lambda x: len(pd.unique(x))
    agg_funcs_returns = {'ExtCost': {'DollarsReturned|avg': np.mean,
                                     'DollarsReturned|sum': np.sum},
                         'CasesReturned': {'CasesReturned|avg': np.mean,
                                           'CasesReturned|sum': np.sum},
                         'Invoice': len_unique}

    print('\n\n\nAggregating tidy dataset.')
    customer_returns = DataFrame(pwunsale_tidy.groupby(['CustomerId', 'Customer'])[['ExtCost', 'CasesReturned']].agg(agg_funcs_returns)).reset_index(drop=False)
    customer_returns.rename(columns={'<lambda>': 'Returns|count'}, inplace=True)
    customer_returns.drop('Customer', inplace=True, axis=1)

    print('Merging in YTD sales by Customer')
    customer_returns = customer_returns.merge(pw_ytdcust, on='CustomerId', how='left')

    print('Deriving returns as a percent of sales for each Customer.')
    customer_returns['PercentSales'] = np.divide(customer_returns['DollarsReturned|sum'],
                                                 customer_returns['DollarSales|bycustomer'])

    print('Merge in customer attributes.')
    customer_returns = customer_returns.merge(pw_cusattr, on='CustomerId', how='left')

    print('Sorting in descending order on Dollars returned.')
    customer_returns.sort_values('DollarsReturned|sum', ascending=False, inplace=True)

    print('Reorder columns for readability.\n\n\n')
    reorder_cols = ['CustomerId', 'Customer', 'Returns|count',
                    'PercentSales', 'DollarSales|bycustomer',
                    'DollarsReturned|sum', 'DollarsReturned|avg',
                    'CasesReturned|sum', 'CasesReturned|avg',
                    'OnPremise', 'Latitude', 'Longitude']
    customer_returns = customer_returns[reorder_cols]

    print('*'*100)
    print('Finished summarizing returns.')
    print('*'*100)

    return customer_returns
def get_merged_data():
    with open("resources/data/ner.json") as f:
        ner = DataFrame(json.load(f))
    with open("resources/data/publications.json") as f:
        publications = DataFrame(json.load(f))
    with open("resources/data/sweet.json") as f:
        sweet_features = DataFrame(json.load(f))
    return (ner.merge(publications, on=["id"])).merge(sweet_features, on=["id"]).T.to_dict().values()
def get_merged_data(solrdata):
    with open("ner.json") as f:
        ner = DataFrame(json.load(f))
    with open("publications.json") as f:
        publications = DataFrame(json.load(f))
    with open("geotopic.json") as f:
        sweet_features = DataFrame(json.load(f))
    return (ner.merge(publications)).merge(sweet_features).T.to_dict().values()
def assemble_episodic_data(stays, diagnoses):
    data = {
        'Icustay': stays.ICUSTAY_ID,
        'Age': stays.AGE,
        'Length of Stay': stays.LOS,
        'Mortality': stays.MORTALITY
    }
    data.update(transform_gender(stays.GENDER))
    data.update(transform_ethnicity(stays.ETHNICITY))
    data['Height'] = np.nan
    data['Weight'] = np.nan
    data = DataFrame(data).set_index('Icustay')
    data = data[['Ethnicity', 'Gender', 'Age', 'Height', 'Weight',
                 'Length of Stay', 'Mortality']]
    return data.merge(extract_diagnosis_labels(diagnoses),
                      left_index=True, right_index=True)
def merge(self, *args, **kwargs):
    result = DataFrame.merge(self, *args, **kwargs)
    geo_col = self._geometry_column_name
    if isinstance(result, DataFrame) and geo_col in result:
        result.__class__ = GeoDataFrame
        result.crs = self.crs
        result._geometry_column_name = geo_col
        result._invalidate_sindex()
    elif isinstance(result, DataFrame) and geo_col not in result:
        result.__class__ = DataFrame
    return result
def get_result(self):
    """ Get the result after processing the work log. """
    if (self._data is None) or (len(self._data) == 0) or\
       (self._proj is None) or (len(self._proj) == 0):
        return (None, None, None)

    task_frame = DataFrame(self._data,
                           columns=['line', 'date', 'hours', 'task'])
    proj_frame = DataFrame(self._proj,
                           columns=['line', 'date', 'task', 'project'])

    # Group projects assigned to tasks
    task_projs = (task_frame[['task']])\
        .merge(proj_frame[['task', 'project']], how='outer', on='task')\
        .drop_duplicates(['task', 'project'])\
        .groupby('task').project

    # Filter tasks without projects
    task_frame = task_frame.merge(proj_frame[['task']]\
        .drop_duplicates('task'))

    # Construct task table
    task_table = (task_frame[['task', 'date', 'hours']])\
        .groupby(['task', 'date']).sum()\
        .unstack()['hours'].fillna(0.0)

    # Assign tasks to projects
    # (sort_values replaces the long-removed DataFrame.sort(columns=...))
    join_frame = task_frame.merge(proj_frame, how='inner', on='task',
                                  suffixes=['_task', '_proj'])
    join_frame = \
        (join_frame[join_frame.line_task <= join_frame.line_proj])\
        .sort_values(by=['line_task', 'line_proj'])\
        .drop_duplicates('line_task')

    # Construct project table
    proj_table = (join_frame[['date_task', 'project', 'hours']])\
        .groupby(['project', 'date_task']).sum()\
        .unstack()['hours'].fillna(0.0)
    proj_table.columns.name = 'date'

    # Set totals (.loc replaces the removed .ix indexer)
    task_table['TOTAL'] = task_table.sum(axis=1)
    proj_table['TOTAL'] = proj_table.sum(axis=1)
    task_table.loc['TOTAL'] = task_table.sum()
    proj_table.loc['TOTAL'] = proj_table.sum()

    return (task_table, proj_table, task_projs)
class CubeJoin(object):
    def __init__(self, cube):
        self.cube = cube
        self.data = DataFrame({})
        method = getattr(self, cube.get('cube_join_type', 'none'))
        method()

    def inner(self):
        fields = [rel['field'] for rel in self.cube.get('relationship')]
        DW = DataWarehouse()
        for i, rel in enumerate(self.cube.get('relationship')):
            data = DW.get(rel['cube']).get('data')
            df = DataFrame(data)
            if i == 0:
                self.data = df
            else:
                self.data = self.data.merge(df, how='inner', on=fields[0])
        return self.data

    def left(self):
        fields = [rel['field'] for rel in self.cube.get('relationship')]
        self.data = DataFrame({fields[0]: []})
        DW = DataWarehouse()
        for rel in self.cube.get('relationship'):
            data = DW.get(rel['cube'])
            self.data = self.data.merge(DataFrame(data.get('data')),
                                        how='outer', on=fields[0])
        return self.data

    def append(self):
        DW = DataWarehouse()
        # DataFrame.append returns a new frame, so assign the result back
        self.data = DataFrame({}).append(
            [DataFrame(DW.get(rel['cube']).get('data'))
             for rel in self.cube.get('relationship')],
            ignore_index=True)
        return self.data

    def none(self):
        return self.data
def create_indirect_links_once(df: pd.DataFrame) -> pd.DataFrame:
    """ This function gets a Dataframe as input.
    The function then merges the Dataframe with itself on given keys.
    The function returns the Dataframe with newly added lines that result from indirect links.
    """
    # merge the Dataframe with itself based on keys of input study etc. and output study.
    # two rows match if the contents of the left side match the contents of the right side.
    # row 1
    # input_study, input_dataset, input_version, input_variable
    # 1, 1, 1, 1
    # matches row 2
    # output_study, output_dataset, output_version, output_variable
    # 1, 1, 1, 1
    temp = df.merge(
        df,
        right_on=["input_study", "input_dataset", "input_version", "input_variable"],
        left_on=["output_study", "output_dataset", "output_version", "output_variable"],
    )
    WANTED_COLUMNS = [
        "input_study_x",
        "input_dataset_x",
        "input_version_x",
        "input_variable_x",
        "output_study_y",
        "output_dataset_y",
        "output_version_y",
        "output_variable_y",
    ]
    # select only the columns for input study etc. from the left Dataframe
    # and the output study etc. from the right Dataframe
    temp = temp[WANTED_COLUMNS]
    # Rename the rows to be of the original format
    RENAME_COLUMNS = {
        "input_study_x": "input_study",
        "input_dataset_x": "input_dataset",
        "input_version_x": "input_version",
        "input_variable_x": "input_variable",
        "output_study_y": "output_study",
        "output_dataset_y": "output_dataset",
        "output_version_y": "output_version",
        "output_variable_y": "output_variable",
    }
    temp.rename(columns=RENAME_COLUMNS, inplace=True)
    # add new rows to the original Dataframe, dropping duplicates
    return df.append(temp).drop_duplicates().reset_index(drop=True)
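# Hedged usage sketch for create_indirect_links_once (not from the original
# project): the frame, its values, and the variable name `links` below are
# illustrative assumptions, and it assumes a pandas version that still
# provides DataFrame.append. It shows one pass resolving A -> B and B -> C
# into an added A -> C row.
import pandas as pd

links = pd.DataFrame(
    [
        ("s1", "d1", "v1", "x", "s2", "d2", "v2", "y"),  # A -> B
        ("s2", "d2", "v2", "y", "s3", "d3", "v3", "z"),  # B -> C
    ],
    columns=[
        "input_study", "input_dataset", "input_version", "input_variable",
        "output_study", "output_dataset", "output_version", "output_variable",
    ],
)
expanded = create_indirect_links_once(links)
print(len(expanded))  # 3 rows: the two direct links plus the indirect A -> C link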
def test_join_float64_float32(self):

    a = DataFrame(randn(10, 2), columns=['a', 'b'], dtype=np.float64)
    b = DataFrame(randn(10, 1), columns=['c'], dtype=np.float32)
    joined = a.join(b)
    assert joined.dtypes['a'] == 'float64'
    assert joined.dtypes['b'] == 'float64'
    assert joined.dtypes['c'] == 'float32'

    a = np.random.randint(0, 5, 100).astype('int64')
    b = np.random.random(100).astype('float64')
    c = np.random.random(100).astype('float32')
    df = DataFrame({'a': a, 'b': b, 'c': c})
    xpdf = DataFrame({'a': a, 'b': b, 'c': c})
    s = DataFrame(np.random.random(5).astype('float32'), columns=['md'])
    rs = df.merge(s, left_on='a', right_index=True)
    assert rs.dtypes['a'] == 'int64'
    assert rs.dtypes['b'] == 'float64'
    assert rs.dtypes['c'] == 'float32'
    assert rs.dtypes['md'] == 'float32'

    xp = xpdf.merge(s, left_on='a', right_index=True)
    assert_frame_equal(rs, xp)
def setUp(self): super(GeoResourceTest, self).setUp() self.ts = TestSetupHelpers() self.lt = LocationType.objects.create(name='Region',admin_level=1) self.distr, created = \ LocationType.objects.get_or_create(name='District',admin_level = 2) self.planet_location_type = LocationType.objects\ .create(name = 'Planet', admin_level = 0) self.ultimate_parent = Location.objects.create( id = 1, name = 'Earth', location_code = 'Earth', location_type_id = self.planet_location_type.id ) location_df_from_csv= read_csv('rhizome/tests/_data/locations_nimroz.csv') self.ts.model_df_to_data(location_df_from_csv,Location) # make sure that the proper level is set for the locs = Location.objects.filter(parent_location_id=6) for loc in locs: loc.location_type_id = self.distr.id loc.save() parent = Location.objects.get(id=6) parent.location_type_id = self.lt.id parent.save() geo_json_df = read_csv('rhizome/tests/_data/geo_json_small.txt',delimiter = "|") location_df = DataFrame(list(Location.objects.all()\ .values_list('id','location_code')),columns=['location_id','location_code']) location_tree_df = DataFrame(list(Location.objects.all()\ .values_list('id','parent_location_id'))\ ,columns=['location_id','parent_location_id']) location_tree_df['parent_location_id'].fillna(self.ultimate_parent.id,\ inplace=True) location_tree_df['lvl'] = Series(1, index=location_tree_df.index) self.ts.model_df_to_data(location_tree_df, LocationTree) merged_df = location_df.merge(geo_json_df)[['location_id','geo_json']] self.ts.model_df_to_data(merged_df, LocationPolygon) minify_geo_json() LocationPermission.objects.create(user_id = self.ts.user.id, top_lvl_location_id = 1)
def test_metadata_propagation_indiv(self): # groupby df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], 'C': np.random.randn(8), 'D': np.random.randn(8)}) result = df.groupby('A').sum() self.check_metadata(df,result) # resample df = DataFrame(np.random.randn(1000,2), index=date_range('20130101',periods=1000,freq='s')) result = df.resample('1T') self.check_metadata(df,result) # merging with override # GH 6923 _metadata = DataFrame._metadata _finalize = DataFrame.__finalize__ np.random.seed(10) df1 = DataFrame(np.random.randint(0, 4, (3, 2)), columns=['a', 'b']) df2 = DataFrame(np.random.randint(0, 4, (3, 2)), columns=['c', 'd']) DataFrame._metadata = ['filename'] df1.filename = 'fname1.csv' df2.filename = 'fname2.csv' def finalize(self, other, method=None, **kwargs): for name in self._metadata: if method == 'merge': left, right = other.left, other.right value = getattr(left, name, '') + '|' + getattr(right, name, '') object.__setattr__(self, name, value) else: object.__setattr__(self, name, getattr(other, name, '')) return self DataFrame.__finalize__ = finalize result = df1.merge(df2, left_on=['a'], right_on=['c'], how='inner') self.assertEquals(result.filename,'fname1.csv|fname2.csv') DataFrame._metadata = _metadata DataFrame.__finalize__ = _finalize
def dfm_A_intersect_B(A: DataFrame, B: DataFrame, key_cols: list) -> DataFrame:
    """
    A ∩ B
    return the entries which are in both A and B based on key cols,
    it is mainly used to identify duplicate entries and then update them in the DB
    :param A:
    :param B:
    :return:
    """
    if len(B) == 0:
        return DataFrame(columns=list(A.columns))
    B_tmp = B[key_cols].copy()
    B_tmp['tmp_col_duplicated'] = 'Y'
    dfm_merge_by_keycols = A.merge(B_tmp, how='left', on=key_cols)
    # dfm_merge_by_keycols.dropna() cannot be used here: dropna would drop any
    # row containing a None value and remove rows by mistake.
    dfm_merge_by_keycols = dfm_merge_by_keycols[dfm_merge_by_keycols.tmp_col_duplicated == 'Y']
    del dfm_merge_by_keycols['tmp_col_duplicated']
    return dfm_merge_by_keycols
def populate_fake_dwc_data(apps, schema_editor):
    '''
    This migration will be removed, and we will prefer the "initial_meta_data"
    ingestion and rely on DocTransform, RefreshMaster and AggRefresh in order
    to populate the datapoint_with_computed table.. however, so that we can
    have ample data to show on the dashboards, i will take the cartesian
    product of campaigns, indicators and selected locations ( provinces and
    LPDS ) and dump that data in to datapoint_with_computed.

    It would be nice to somehow set this up so that when a new developer spins
    up the app locally.. they can populate this 'fake' data.  Maybe something
    like.. if SETTINGS.debug = True, then ingest fake data.
    '''

    ind_df = DataFrame(list(Indicator.objects.all()\
        .values_list('id', 'short_name', 'data_format')), columns=['indicator_id', 'short_name', 'data_format'])
    campaign_df = DataFrame(list(Campaign.objects.all()\
        .values_list('id', 'name')), columns=['campaign_id', 'campaign_name'])

    country_id_list = list(Location.objects\
        .filter(location_type_id=1)\
        .values_list('id', flat=True))

    lpd_id_qs = list(Location.objects\
        .filter(lpd_status__in=[1, 2])\
        .values_list('id', 'parent_location_id'))

    province_id_list = [y for x, y in lpd_id_qs]
    lpd_id_list = [x for x, y in lpd_id_qs]
    location_ids = country_id_list + province_id_list + lpd_id_list

    location_df = DataFrame(list(Location.objects\
        .filter(id__in=location_ids)\
        .values_list('id', 'name')), columns=['location_id', 'name'])

    ind_df['join_col'] = 1
    campaign_df['join_col'] = 1
    location_df['join_col'] = 1

    first_merged_df = ind_df.merge(campaign_df, on='join_col')
    final_merged_df = first_merged_df.merge(location_df, on='join_col')

    upsert_df_data(final_merged_df)
def junta_tabelas():
    #locais = quebra_secoes()
    votos = arruma_votos()
    print(votos)
    locais = read_csv("locais_com_votacao_trabalhada.csv")
    #votos = read_csv("voto_secao_partido_trabalhada.csv")
    saida = DataFrame.merge(locais, votos, left_on="id", right_on="id", how="outer")
    saida = DataFrame(saida.groupby(["lat", "long", "aptos_por_local", "local_de_votacao", "zona_eleitoral_nro", "bairro", "endereco", "secoes_eleitorais", "zona_eleitoral_nome"]).sum().reset_index())
    saida = saida.fillna(0)
    saida = saida[saida.secao != 0]
    saida["lat_real"] = saida["long"]
    saida["long"] = saida["lat"]
    saida["lat"] = saida["lat_real"]
    del saida["lat_real"]
    saida.to_csv("secoes_com_votacao.csv", index=False)
def process_location_tree_lvl(self, location_type_id):

    lt_batch = []

    location_df = DataFrame(list(Location.objects\
        .filter(location_type_id=location_type_id)\
        .values_list('id', 'parent_location_id')), columns=self.location_tree_columns)

    merged_df = location_df.merge(self.location_tree_df,
                                  left_on='location_id',
                                  right_on='parent_location_id')

    cleaned_merge_df = merged_df[['location_id_y', 'parent_location_id_x']]
    cleaned_merge_df.columns = self.location_tree_columns

    self.location_tree_df = concat([self.location_tree_df, location_df,\
        cleaned_merge_df])
    self.location_tree_df.drop_duplicates()
def mark_datapoints_with_needs_campaign(self):

    new_dp_df = DataFrame(list(DataPoint.objects\
        .filter(source_submission_id__in=\
            self.ss_ids_to_process).values()))

    date_series = new_dp_df['data_date']
    mn_date, mx_date = min(date_series).date(), max(date_series).date()

    office_lookup_df = DataFrame(list(Location.objects\
        .filter(id__in=list(set(new_dp_df['location_id'])))\
        .values_list('id', 'office_id')),
        columns=['location_id', 'office_id'])

    campaign_qs = Campaign.objects.filter(
        end_date__gte=mn_date,
        start_date__lte=mx_date,
        office_id__in=office_lookup_df['office_id'].unique())

    campaign_df = DataFrame(list(campaign_qs\
        .values('office_id', 'start_date', 'end_date')))

    if len(campaign_df) == 0:
        ## no campaigns match the datapoints so update all with cj_id = -2
        DataPoint.objects.filter(id__in=new_dp_df['id'].unique())\
            .update(cache_job_id=-2)
        return

    dp_merged_df = new_dp_df.merge(office_lookup_df)
    cleaned_dp_df = dp_merged_df[['id', 'office_id', 'data_date']]

    dp_ids_that_need_campaign = []
    dp_merged_with_campaign = cleaned_dp_df.merge(campaign_df)

    ## iterate over the dps and check if there is a campaign ##
    for ix, r in dp_merged_with_campaign.iterrows():
        ## convert date time to date
        r_date = r.data_date.date()
        if r_date >= r.end_date or r_date < r.start_date:
            dp_ids_that_need_campaign.append(r.id)

    DataPoint.objects.filter(id__in=dp_ids_that_need_campaign)\
        .update(cache_job_id=-2)
def dfm_A_minus_B(A: DataFrame, B: DataFrame, key_cols: list) -> DataFrame:
    """
    A - B
    return the entries which are in A but not in B based on key cols,
    it is mainly used to remove duplicate entries and then insert to DB
    :param A:
    :param B:
    :return:
    """
    if len(B) == 0:
        return A
    # dfmprint(A[0:10])
    # dfmprint(B[0:10])
    B_tmp = B[key_cols].copy()
    B_tmp['tmp_col_duplicated'] = 'Y'
    dfm_merge_by_keycols = A.merge(B_tmp, how='left', on=key_cols)
    dfm_merge_by_keycols.fillna({'tmp_col_duplicated': 'N'}, inplace=True)
    dfm_dif_by_keycols = dfm_merge_by_keycols[dfm_merge_by_keycols.tmp_col_duplicated.isin(['N'])]
    del dfm_dif_by_keycols['tmp_col_duplicated']
    return dfm_dif_by_keycols
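# Hedged usage sketch for dfm_A_minus_B / dfm_A_intersect_B (not part of the
# original module): the frames and key columns below are illustrative
# assumptions, used only to show the anti-join / semi-join behaviour on key_cols.
from pandas import DataFrame

incoming = DataFrame({'id': [1, 2, 3], 'val': ['a', 'b', 'c']})
existing = DataFrame({'id': [2, 3], 'val': ['b-old', 'c-old']})

to_insert = dfm_A_minus_B(incoming, existing, key_cols=['id'])      # rows not yet in the DB
to_update = dfm_A_intersect_B(incoming, existing, key_cols=['id'])  # rows already present
print(to_insert['id'].tolist(), to_update['id'].tolist())           # [1] [2, 3]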
class CubeJoin(object):
    def __init__(self, cube):
        self.cube = cube
        self.data = DataFrame({})

        MyClient = riak.RiakClient(
            protocol=conf("riak")["protocol"],
            http_port=conf("riak")["http_port"],
            host=conf("riak")["host"])

        self.MyBucket = MyClient.bucket(conf("riak")["bucket"])
        self.MyBucket.enable_search()

        method = getattr(self, cube.get('cube_join_type', 'none'))
        method()

    def inner(self):
        fields = set([rel['field'] for rel in self.cube.get('relationship')])
        self.data = concat([DataFrame(self.MyBucket.get(rel['cube']).data)
                            for rel in self.cube.get('relationship')],
                           keys=fields, join='inner',
                           ignore_index=True, axis=1)
        return self.data

    def left(self):
        fields = [rel['field'] for rel in self.cube.get('relationship')]
        self.data = DataFrame({fields[0]: []})
        for rel in self.cube.get('relationship'):
            self.data = self.data.merge(DataFrame(
                self.MyBucket.get(rel['cube']).data),
                how='outer', on=fields[0])
        return self.data

    def append(self):
        # DataFrame.append returns a new frame, so assign the result back
        self.data = DataFrame({}).append(
            [DataFrame(self.MyBucket.get(rel['cube']).data)
             for rel in self.cube.get('relationship')],
            ignore_index=True)
        return self.data

    def none(self):
        return self.data
def base_transform(self): results = [] df_columns = ['id', 'indicator_id', 'campaign_id', 'location_id',\ 'value'] computed_datapoints = DataPointComputed.objects.filter( campaign__in=self.parsed_params['campaign__in'], location__in=self.location_ids, indicator__in=self.parsed_params['indicator__in']) dwc_df = DataFrame(list(computed_datapoints.values_list(*df_columns)),\ columns=df_columns) # do an inner join on the filter indicator if self.parsed_params['filter_indicator'] and self.parsed_params['filter_value']: merge_columns = ['campaign_id', 'location_id'] indicator_id = Indicator.objects.get(short_name = self.parsed_params['filter_indicator']) filter_value_list = [self.parsed_params['filter_value']] if filter_value_list == ['-1']: ## this means "show all classes" filter_value_list = [1,2,3] ## this only works for LPDS... this should be -- ## IndicatorClassMap.objects.filter(indicator = indicator)\ ## .values_list(enum_value, flat = True) filter_datapoints = DataPointComputed.objects.filter( campaign__in=self.parsed_params['campaign__in'], location__in=self.location_ids, indicator_id=indicator_id, value__in = filter_value_list ) filter_df =DataFrame(list(filter_datapoints.values_list(*merge_columns)),\ columns=merge_columns) dwc_df = dwc_df.merge(filter_df, how='inner', on=merge_columns) ## now only show the locations that match that filter.. location_ids_in_filter = set(filter_df['location_id']) self.location_ids = set(self.location_ids)\ .intersection(location_ids_in_filter) dwc_df = dwc_df.apply(self.add_class_indicator_val, axis=1) return dwc_df
def shift_data(filtereddf: pd.DataFrame, locdf: pd.DataFrame,
               filename: str) -> pd.DataFrame:
    """
    This function merges the filtered weather data pandas DataFrame and the
    location dataframe, and shifts the data by 6 months if the location is in
    the southern hemisphere. Return the merged and shifted pandas DataFrame

    Inputs:
    ==========
    filtereddf: pandas DataFrame
        filtereddf from datafiltering()

    locdf: pandas DataFrame
        location dataframe from read_history()

    filename: str
        name of the file to be stored
    """
    # merge the dataframe
    overall_df = filtereddf.merge(locdf, how='inner', on='stn')

    # find the data with latitude < 0
    south_ind = overall_df['LAT'] < 0.0

    # shift the series
    for ind in range(1, 7):
        for txt in ['tmp', 'dew', 'stp', 'wpd', 'prec', 'sndp']:
            for suffix in ['mean', 'max', 'min']:
                now_mn = ''.join([txt, '%02i' % ind, suffix])
                fut_mn = ''.join([txt, '%02i' % (ind + 6), suffix])
                temp_series = overall_df.loc[south_ind, now_mn]
                overall_df.loc[south_ind, now_mn] = overall_df.loc[south_ind, fut_mn]
                overall_df.loc[south_ind, fut_mn] = temp_series

    # save the data
    overall_df.to_csv(filename)

    return overall_df
def add_within_category_edges(nodes: pd.DataFrame, edges: pd.DataFrame):
    # make edges among nodes of same category
    self_join = nodes.merge(nodes, on="category")
    self_join = self_join[self_join.id_x.ne(self_join.id_y)]
    one_edge_per_category = self_join.groupby("id_x").first().reset_index()

    # append them to actual edges after reshaping
    within_category_edges = one_edge_per_category[["id_x", "id_y", "category"]]
    within_category_edges.rename(
        columns={
            "id_x": "from",
            "id_y": "to",
            "category": "category_source"
        },
        inplace=True,
    )
    within_category_edges["category_target"] = within_category_edges["category_source"]
    within_category_edges["edge_count"] = 1
    within_category_edges["ppmi"] = 0.5
    return edges.append(within_category_edges)
def get_clade(sample_variants: pd.DataFrame, clades: pd.DataFrame) -> str:
    # special case: wildtype (should have . as ref in all its "mutations")
    # so, if we don't find any mutation in the same positions, we assume it's wildtype
    wt = clades.query("ref == '.'")
    wt_name = wt.clade.unique()[0]
    if len(wt.merge(sample_variants, on=["pos"])) == 0:
        return wt_name

    # count how many mutations each clade has
    clade_nmutations = (clades.query("clade != @wt_name")
                        .groupby("clade").size().to_frame("n").reset_index())

    selected_clade = ("None", 0)
    for t in clade_nmutations.itertuples():
        name = t.clade
        matched = len(sample_variants.merge(clades.query("clade == @name")))
        # if we find all the mutations of this clade and the number of mutations
        # is higher than the current selected clade, choose this clade
        if matched == t.n and t.n > selected_clade[1]:
            selected_clade = (name, t.n)

    return selected_clade[0]
def _fill_hist_columns(self, ob_df: pd.DataFrame) -> pd.DataFrame:
    if len(self.known_observations_data_frame) > 0:
        ob_df = ob_df.drop(
            columns=[
                self.project_config.hist_view_column_name,
                self.project_config.hist_output_column_name,
            ],
            errors="ignore",
        )
        ob_df = ob_df.merge(
            self.hist_data_frame,
            how="left",
            left_on=[
                self.project_config.user_column.name,
                self.project_config.item_column.name,
            ],
            right_index=True,
        ).fillna(0)
    else:
        ob_df[self.project_config.hist_view_column_name] = 0
        ob_df[self.project_config.hist_output_column_name] = 0
    return ob_df
def add_department_info(
    data: pd.DataFrame,
    left_on: str = "dept_name",
    right_on: str = "alias",
    match_missing: bool = True,
) -> pd.DataFrame:
    """
    Add department info to the input data.

    Parameters
    ----------
    data :
        The input dataframe.
    left_on :
        The column in the input data to merge on
    right_on :
        The column in the dept info data to merge on
    match_missing :
        Whether to attempt to match missing departments.
    """
    # Load the department info with aliases and subitems
    dept_info = load_city_departments(include_aliases=True, include_line_items=True)

    # Merge into the info
    data = data.merge(
        dept_info,
        left_on=left_on,
        right_on=right_on,
        how="left",
        validate="1:1",
        suffixes=("_raw", ""),
    )

    # Match missing departments
    if match_missing:
        data = match_missing_departments(data)

    return data
def pipe_vaccinations_csv(self, df: pd.DataFrame, df_iso: pd.DataFrame) -> pd.DataFrame:
    return (df.merge(df_iso, on="location").rename(
        columns={
            "new_vaccinations_smoothed": "daily_vaccinations",
            "new_vaccinations_smoothed_per_million": "daily_vaccinations_per_million",
            "new_vaccinations": "daily_vaccinations_raw",
        })[[
            'location',
            'iso_code',
            'date',
            'total_vaccinations',
            'people_vaccinated',
            'people_fully_vaccinated',
            'daily_vaccinations_raw',
            'daily_vaccinations',
            'total_vaccinations_per_hundred',
            'people_vaccinated_per_hundred',
            'people_fully_vaccinated_per_hundred',
            'daily_vaccinations_per_million',
        ]])
def combine_data(df_covid: pd.DataFrame, df_population: pd.DataFrame)\
        -> pd.DataFrame:
    """
    Function to combine covid19 data with population data using
    left join on "fips" column

    Explanation:
        We are applying the left join because we need latest population
        estimate data from df_population. The combined dataframe can later
        be used for generating the stats

    Parameter:
    ---------
    df_covid: pd.DataFrame object with processed New York Times COVID-19 Data
        having columns:
            "fips": string
            "date": datetime64[ns]
            "cases": float
            "deaths": float

    df_population: pd.DataFrame object with Population Estimate Data 2019
        having columns:
            "fips": string
            "POPESTIMATE2019": float

    Returns:
    -------
    df_combined: pd.DataFrame object with combined data having columns:
        "fips": string
        "date": datetime64[ns]
        "cases": float
        "deaths": float
        "POPESTIMATE2019": float
    """
    df_combined = df_covid.merge(df_population, on="fips", how="left")
    feature_list = ["fips", "date", "cases", "deaths", "POPESTIMATE2019"]
    return df_combined[feature_list]
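# Hedged usage sketch for combine_data (not from the original project): the
# two tiny frames below are illustrative assumptions matching the documented
# schemas, showing that the left join on "fips" keeps every COVID row.
import pandas as pd

covid = pd.DataFrame({
    "fips": ["01001", "01003"],
    "date": pd.to_datetime(["2020-06-01", "2020-06-01"]),
    "cases": [10.0, 5.0],
    "deaths": [1.0, 0.0],
})
population = pd.DataFrame({"fips": ["01001"], "POPESTIMATE2019": [55869.0]})

combined = combine_data(covid, population)
print(combined)  # 2 rows; POPESTIMATE2019 is NaN for the unmatched fips 01003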
def predict(self, validation_x: pd.DataFrame, validation_y: pd.Series): if 'per_raisha_baseline' in str.lower(self.model_name): validation_x = validation_x.merge(self.per_raisha, left_on='raisha', right_index=True) validation_x.index = validation_x.sample_id predictions = validation_x.predictions else: validation_x = validation_x[self.features] predictions = self.model.predict(validation_x) validation_y.name = 'labels' predictions = pd.Series(predictions, index=validation_y.index, name='predictions') if predictions.dtype == float: # regression- create bins to measure the F-score bin_prediction, bin_test_y = utils.create_bin_columns(predictions, validation_y) four_bin_prediction, four_bin_test_y = utils.create_4_bin_columns(predictions, validation_y) else: bin_prediction, bin_test_y = pd.Series(name='bin_prediction'), pd.Series(name='bin_label') four_bin_prediction, four_bin_test_y =\ pd.Series(name='four_bin_prediction'), pd.Series(name='four_bin_label') predictions = pd.DataFrame(predictions).join(validation_y).join(bin_test_y).join(bin_prediction) predictions = predictions.join(four_bin_test_y).join(four_bin_prediction) return predictions
def join_dates(df: pd.DataFrame):
    """
    Merges nearby dates into a single date.

    :param df: Data table for a single client
    :return: Data table with the collapsed values
    """
    df = df.sort_values(by=['date'], ascending=[True])
    unique_dates = pd.DataFrame(df['date'].unique(), columns=['date'])

    # Return the original table if there was only one date
    if len(unique_dates) == 1:
        return df

    unique_dates['new_date'] = unique_dates['date']

    dates_list = list(unique_dates['date'])
    for i in range(0, len(dates_list) - 1, 2):
        if dates_list[i] == dates_list[i + 1] - timedelta(days=1):
            unique_dates.iat[i, 1] = dates_list[i + 1]
        elif dates_list[i] == dates_list[i + 1] - timedelta(days=2):
            unique_dates.iat[i, 1] = dates_list[i + 1]

    new_dates_list = list(unique_dates['new_date'])
    for i in range(len(new_dates_list) - 1, 0, -1):
        if new_dates_list[i] == new_dates_list[i - 1] + timedelta(days=1):
            unique_dates.iat[i, 1] = new_dates_list[i - 1]

    df = df.merge(unique_dates, on='date')

    print(unique_dates)
    print(len(list(unique_dates['date'].unique())))
    print(len(list(unique_dates['new_date'].unique())))

    df['date'] = df['new_date']
    df = df.groupby(['date', 'product', 'client']).sum().reset_index()

    return df
def rank_cumulative_change(df: pd.DataFrame, timeframe: Timeframe): cum_sum = defaultdict(float) # print(df) for date in filter(lambda k: k in df.columns, timeframe.all_dates()): for code, price_change in df[date].fillna(0.0).iteritems(): cum_sum[code] += price_change rank = pd.Series(cum_sum).rank(method="first", ascending=False) df[date] = rank all_available_dates = df.columns avgs = df.mean(axis=1) # NB: do this BEFORE adding columns... assert len(avgs) == len(df) df["x"] = all_available_dates[-1] df["y"] = df[all_available_dates[-1]] bins = ["top", "bin2", "bin3", "bin4", "bin5", "bottom"] average_rank_binned = pd.cut(avgs, len(bins), bins) assert len(average_rank_binned) == len(df) df["bin"] = average_rank_binned df["asx_code"] = df.index stock_sector_df = ( stocks_by_sector() ) # make one DB call (cached) rather than lots of round-trips # print(stock_sector_df) stock_sector_df = stock_sector_df.set_index("asx_code") # print(df.index) df = df.merge( stock_sector_df, left_index=True, right_on="asx_code" ) # NB: this merge will lose rows: those that dont have a sector eg. ETF's df = pd.melt( df, id_vars=["asx_code", "bin", "sector_name", "x", "y"], var_name="date", value_name="rank", value_vars=all_available_dates, ) df["date"] = pd.to_datetime(df["date"], format="%Y-%m-%d") df["x"] = pd.to_datetime(df["x"], format="%Y-%m-%d") return df
def get_metadata(moa_df: pd.DataFrame, image_df: pd.DataFrame) -> pd.DataFrame:
    """Merges and preprocesses metadata files.

    Reads the image and moa metadata dataframes, creates the site column,
    merges the metadata and fills missing values with null.

    Returns
    -------
    pandas.DataFrame
        The processed metadata DataFrame
    """
    image_df["Image_Metadata_Site"] = image_df.Image_FileName_DAPI.transform(
        lambda x: int(re.search("_s[1-4]_", x).group()[2])  # type: ignore
    )
    return (image_df.merge(
        moa_df,
        how="left",
        left_on=[
            "Image_Metadata_Compound",
            "Image_Metadata_Concentration",
        ],
        right_on=["compound", "concentration"],
    ).drop(columns=["compound", "concentration"]).fillna("null"))
def combine_clusters(data, centroids, stages, cluster_combine, cluster_col="cluster"): ratio = cluster_combine['ratio'] for stage in stages.keys(): if stage in centroids.keys(): stage_vars = stages[stage] for key in ratio.keys(): if key in stage_vars: col = cluster_col + "_" + stage cols = [key, col] df = DataFrame(centroids[stage][cols]) df['dummy'] = 1 df = df.merge(df, on=['dummy'], how="outer") df1 = df[df[col + "_x"] != df[col + "_y"]] df = df1[df1[key + "_x"] > df1[key + "_y"]] df['ratio'] = df[key + "_x"] / df[key + "_y"] df1['ratio'] = df1[key + "_x"] / df1[key + "_y"] cols = [col + "_x", 'ratio'] df = df[cols].groupby([col + "_x" ]).min().reset_index(drop=False) df['combine'] = df['ratio'] < ratio[key] df1 = df1.merge(df, on=[col + "_x", "ratio"], how="outer") df1 = df1.drop(["dummy", key + "_y", key + "_x"], axis=1) df1.columns = [ sub(pattern="_x$", repl="", string=x) for x in df1.columns.tolist() ] df1[col] = df1[col].apply(str) data[col] = data[col].apply(str) data = data.merge(df1, on=[col], how="outer") condition = data['combine'].apply(type) == float data['combine'][condition] = False data[col][~condition] = data[col + "_y"][~condition] data = data.drop([col + "_y", "combine"], axis=1) return data
def populate_fake_dwc_data(apps, schema_editor):
    '''
    This migration will be removed, and we will prefer the "initial_meta_data"
    ingestion and rely on DocTransform, RefreshMaster and AggRefresh in order
    to populate the datapoint_with_computed table.. however, so that we can
    have ample data to show on the dashboards, i will take the cartesian
    product of campaigns, indicators and selected locations ( provinces and
    LPDS ) and dump that data in to datapoint_with_computed.

    It would be nice to somehow set this up so that when a new developer spins
    up the app locally.. they can populate this 'fake' data.  Maybe something
    like.. if SETTINGS.debug = True, then ingest fake data.
    '''

    document = Document.objects.create(doc_title='Initial FAKE Data Load')

    ind_df = DataFrame(list(Indicator.objects.all()\
        .values_list('id', 'short_name', 'data_format')), columns=['indicator_id', 'short_name', 'data_format'])
    campaign_df = DataFrame(list(Campaign.objects.all()\
        .values_list('id', 'name')), columns=['campaign_id', 'campaign_name'])

    country_id_list = list(Location.objects\
        .filter(location_type_id=1)\
        .values_list('id', flat=True))

    location_df = DataFrame(list(Location.objects\
        .filter(location_type_id__lte=3)\
        .values_list('id', 'name')), columns=['location_id', 'name'])

    ind_df['join_col'] = 1
    campaign_df['join_col'] = 1
    location_df['join_col'] = 1

    first_merged_df = ind_df.merge(campaign_df, on='join_col')
    final_merged_df = first_merged_df.merge(location_df, on='join_col')

    upsert_df_data(final_merged_df, document.id)
def pipe_capita(self, df: pd.DataFrame) -> pd.DataFrame: logger.info("Adding per-capita variables") # Get data df_subnational = pd.read_csv( self.inputs.population_sub, usecols=["location", "population"] ) pop = self.get_population(df_subnational) df = df.merge(pop, on="location") # Get covered countries locations = df.location.unique() ncountries = df_subnational.location.tolist() + list(self.aggregates.keys()) self._countries_covered = list(filter(lambda x: x not in ncountries, locations)) # Obtain per-capita metrics df = df.assign( total_vaccinations_per_hundred=( df.total_vaccinations * 100 / df.population ).round(2), people_vaccinated_per_hundred=( df.people_vaccinated * 100 / df.population ).round(2), people_fully_vaccinated_per_hundred=( df.people_fully_vaccinated * 100 / df.population ).round(2), total_boosters_per_hundred=(df.total_boosters * 100 / df.population).round( 2 ), new_vaccinations_smoothed_per_million=( df.new_vaccinations_smoothed * 1000000 / df.population ).round(), ) df.loc[:, "people_fully_vaccinated"] = df.people_fully_vaccinated.replace( {0: pd.NA} ) df.loc[ df.people_fully_vaccinated.isnull(), "people_fully_vaccinated_per_hundred" ] = pd.NA df.loc[:, "total_boosters"] = df.total_boosters.replace({0: pd.NA}) df.loc[df.total_boosters.isnull(), "total_boosters_per_hundred"] = pd.NA return df.drop(columns=["population"])
def assemble_episodic_data(stays, diagnoses):
    data = {
        'Icustay': stays.ICUSTAY_ID,
        'Age': stays.AGE,
        'Length of Stay': stays.LOS,
        'Mortality': stays.MORTALITY
    }
    # update: adds element(s) to the dict if key new, otherwise updates value
    data.update(transform_gender(stays.GENDER))
    data.update(transform_ethnicity(stays.ETHNICITY))
    data.update(transform_insurance(stays.INSURANCE))
    data['Height'] = np.nan
    data['Weight'] = np.nan
    data = DataFrame(data).set_index('Icustay')
    # reorder columns
    data = data[[
        'Ethnicity', 'Gender', 'Insurance', 'Age', 'Height', 'Weight',
        'Length of Stay', 'Mortality'
    ]]
    return data.merge(extract_diagnosis_labels(diagnoses),
                      left_index=True, right_index=True)
def validate_population_matches_data(population_df: pd.DataFrame,
                                     square_df: pd.DataFrame) -> None:
    """
    Validates that population estimate demographics match the square.

    A mismatch is possible when the population estimates are incorrect or when
    a modeler tries to run ST-GPR with demographics that are not present in
    the population estimates.
    """
    merged_df = square_df.merge(population_df, on=columns.DEMOGRAPHICS)
    if len(merged_df) != len(square_df):
        square_indices = square_df.set_index(columns.DEMOGRAPHICS).index
        merged_indices = merged_df.set_index(columns.DEMOGRAPHICS).index
        missing_rows = square_df[~square_indices.isin(merged_indices)]
        sample_missing_row = missing_rows[columns.DEMOGRAPHICS].iloc[0]
        raise ValueError(
            'There is a mismatch between the population estimate demographics '
            'and the square. The population estimates have '
            f'{len(population_df)} rows, and the square has {len(square_df)} '
            'rows. After merging population estimates with the square, there '
            f'are {len(merged_df)} rows. An example of a row that is present '
            'in the square but is missing from the population estimates is '
            f'{sample_missing_row.to_dict()}')
def n_fold_fit(self,train_data,cols,cate_col,test_data=None,label_col='Label',is_pred=True): #train by k_fold result_data=DataFrame() if is_pred: result_data['weight']=[0]*test_data.shape[0] fea_filter =[] n_split=10 rank=0 k=StratifiedKFold(n_splits=n_split,random_state=self.random_state,shuffle=True) all_feature_important=DataFrame() all_feature_important['feature']=cols for train_idx,test_idx in tqdm(k.split(train_data[cols],train_data[label_col]),desc='k_split_fitting'): X_train=train_data[cols].loc[train_idx] X_vail=train_data[cols].loc[test_idx] y_train=train_data[[label_col]].loc[train_idx] y_vail=train_data[[label_col]].loc[test_idx] if is_pred: result_,zero_fea,feature_important=self.model_fit(X_train=X_train,y_train=y_train,X_vail=X_vail,y_vail=y_vail,test_data=test_data[cols],cate_fea=cate_col,is_pred=is_pred) result_data['result_'+str(rank)]=result_['result'] result_data['weight_'+str(rank)]=result_['weight'] result_data['weight']+=result_['weight'] del result_ gc.collect() if not is_pred: zero_fea,feature_important=self.model_fit(X_train=X_train,y_train=y_train,X_vail=X_vail,y_vail=y_vail,cate_fea=cate_col,is_pred=is_pred) feature_important.columns=['feature']+[str(col)+'_'+str(rank) for col in feature_important.columns.tolist()[1:]] all_feature_important=all_feature_important.merge(feature_important,'left',on=['feature']) fea_filter.append(zero_fea) rank+=1 np.save(self.save_folder+'zero_feature',fea_filter) return result_data,n_split,all_feature_important,fea_filter
def group_features(
    df: pd.DataFrame,
    statistics: pd.DataFrame,
    column: str,
    group_columns: Iterable[str],
    dtype=np.float32,
) -> pd.DataFrame:
    res = df.merge(statistics, how="left", left_on=group_columns, right_index=True)
    eps = np.finfo(dtype).eps

    for statistic_column in statistics.columns:
        ratio_col = f"{statistic_column}_ratio"
        diff_col = f"{statistic_column}_diff"

        # Prevent division-by-zero error
        res[ratio_col] = res[column] / res[statistic_column].replace(0, eps)
        res[diff_col] = res[column] - res[statistic_column]

        res[statistic_column] = res[statistic_column].astype(dtype)
        res[ratio_col] = res[ratio_col].astype(dtype)
        res[diff_col] = res[diff_col].astype(dtype)

    return res
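# Hedged usage sketch for group_features (not from the original project): the
# frame, the "price"/"category" names, and the groupby statistics below are
# illustrative assumptions. It shows per-group statistics merged back onto the
# rows, with *_ratio and *_diff columns derived against them.
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "category": ["a", "a", "b"],
    "price": [10.0, 20.0, 5.0],
})
# statistics indexed by the group key, as group_features expects (right_index=True)
stats = df.groupby("category")["price"].agg(["mean", "max"])

enriched = group_features(df, stats, column="price", group_columns=["category"])
print(enriched[["price", "mean", "mean_ratio", "mean_diff"]])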
def create_master_table(
    shuttles: pd.DataFrame, companies: pd.DataFrame, reviews: pd.DataFrame
) -> pd.DataFrame:
    """Combines all data to create a master table.

    Args:
        shuttles: Preprocessed data for shuttles.
        companies: Preprocessed data for companies.
        reviews: Source data for reviews.
    Returns:
        Master table.
    """
    rated_shuttles = shuttles.merge(reviews, left_on="id", right_on="shuttle_id")
    with_companies = rated_shuttles.merge(
        companies, left_on="company_id", right_on="id"
    )
    master_table = with_companies.drop(["shuttle_id", "company_id"], axis=1)
    master_table = master_table.dropna()
    return master_table
def get_pairs_dataset(
        dataset: pd.DataFrame = None,
        task: str = '',
        corpus: str = '',
        query_limit: int = 0
) -> pd.DataFrame:
    projection = get_task_dataset_projection(task)

    # Query only non rejected documents
    query = {'rejected': False}

    df_task_dataset = load_dataframe_from_mongodb(
        database_name=corpus,
        collection_name=f"normalized_clear",
        query=query,
        projection=projection,
        sort_by='creation_ts',
        query_limit=query_limit
    )

    df_task_dataset['bug_id'] = pd.to_numeric(df_task_dataset['bug_id'])

    dataset_merged = dataset.merge(df_task_dataset, how='cross',
                                   suffixes=('_left', '_right'))

    return dataset_merged
def add_aggregated_columns( df: pd.DataFrame, group_parameters: Dict[str, str] = COLUMNS_GROUPED_BY, groupby_aggregators: List = GROUPBY_AGGREGATORS, columns_to_replace: List[str] = None) -> (pd.DataFrame, List[str]): """ Create aggregated columns to deal with missing values and non-numerical values :param df: input table :param group_parameters: parameter and column to group for. :param groupby_aggregators: aggregate function to use :param columns_to_replace: original columns to be replaced with grouped values :return df: dataframe with new aggregated columns :return column_names: names of added aggregated columns """ aggregated_column_names = [] for key, value in group_parameters.items(): df_grp = df.loc[:, [key, value]].dropna(axis=0).groupby( value, as_index=False)[key].agg(groupby_aggregators) column_names = [ f"{value}_{AGGREGATOR_COLUMNS[aggregator]}_{key}" for aggregator in groupby_aggregators ] df = df.merge(df_grp, on=value, how='left') aggregated_column_names += column_names df.rename(columns=dict( zip([ AGGREGATOR_COLUMNS[aggregator] for aggregator in groupby_aggregators ], column_names)), inplace=True) # drop at the end so order in group_parameters not important if columns_to_replace is not None: df.drop(labels=[c for c in columns_to_replace], axis=1, inplace=True) df.rename(columns={ 'property_subtype_median_facades_number': "facades_number" }, inplace=True) #24/11/20 fast fix return df, column_names
def merge_custom_inputs_onto_square(
        square_df: pd.DataFrame,
        custom_covariates_df: pd.DataFrame,
        custom_stage_1_df: pd.DataFrame) -> pd.DataFrame:
    """
    Adds custom covariates or custom stage 1 onto square.

    Args:
        square_df: the square dataframe
        custom_covariates_df: a possibly empty custom covariates dataframe
        custom_stage_1_df: a possibly empty custom stage 1 dataframe

    Returns:
        Square dataframe with custom covariates or custom stage 1 merged on

    Raises:
        ValueError: if custom covariates or custom stage 1 is not square
    """
    if custom_covariates_df is None and custom_stage_1_df is None:
        return square_df

    to_join = (custom_covariates_df
               if custom_covariates_df is not None
               else custom_stage_1_df)
    square_with_custom_df = square_df.merge(to_join, on=columns.DEMOGRAPHICS)
    if len(square_with_custom_df) != len(square_df):
        square_indices = square_df.set_index(columns.DEMOGRAPHICS).index
        merged_indices = square_with_custom_df.set_index(
            columns.DEMOGRAPHICS).index
        missing_rows = square_df[~square_indices.isin(merged_indices)]
        sample_missing_row = missing_rows[columns.DEMOGRAPHICS].iloc[0]
        raise ValueError(
            'Custom inputs are not square: your custom inputs have '
            f'{len(to_join)} rows, and the square has {len(square_df)} rows. '
            'After merging custom inputs with the square, there are '
            f'{len(square_with_custom_df)} rows. An example of a row that is '
            'present in the square but is missing from your custom inputs is '
            f'{sample_missing_row.to_dict()}')

    return square_with_custom_df
def stat_fea(self, df: DataFrame, cate_fea_list: list, num_fea_list: list, data_sign: str = '', agg_param=['mean', 'sum', 'std'], is_format_cate_input=False, is_save_df=True): ''' :param cate_fea_list: input_format=[[],[],[]] :param data_sign: give fea data sign,default='' ''' if is_format_cate_input: cate_fea_list = [[col] for col in cate_fea_list] cate_len = len(cate_fea_list) stat_fea_list = [] for cate_fea in tqdm(cate_fea_list, desc='by cate stat'): cate_len -= 1 by_agg_data = DataFrame( df.groupby(cate_fea)[num_fea_list].agg( agg_param)).reset_index() for num_fea in tqdm(num_fea_list, desc='_'.join(cate_fea) + '_stat_num_fea' + ' rest:' + str(cate_len)): agg_cols = [ data_sign + '_by_' + '_'.join(cate_fea) + '_on_' + num_fea + '_' + agg_operator for agg_operator in agg_param ] agg_data_ = by_agg_data[num_fea] agg_ = DataFrame(data=agg_data_.values, columns=agg_cols) agg_[cate_fea] = by_agg_data[cate_fea] if is_save_df: df = df.merge(agg_, 'left', on=cate_fea) else: df = agg_ stat_fea_list += agg_cols return df, stat_fea_list
def execute(self, context): from norm.engine import QuantifiedLambda if not isinstance(self.lam.cloned_from, QuantifiedLambda): inp = self.lam.cloned_from.data else: inp = self.lam.cloned_from.execute(context) if isinstance(inp, (DataFrame, Series)): if inp.index.name == self.lam.VAR_OID: inp = inp.reset_index() elif isinstance(inp, Index): inp = DataFrame(data=inp) equal_cols = list(self.equalities.items()) left_col, right_col = equal_cols.pop() to_merge = self.prepare_to_merge() joined = inp.merge(to_merge, left_on=left_col, right_on=self.outputs.get(right_col, right_col), how='left') if right_col not in self.outputs: joined = joined.drop(columns=[right_col]) if self.lam.VAR_OID not in joined.columns: joined.index.name = self.lam.VAR_OID else: joined = joined.set_index(self.lam.VAR_OID) condition = ' & '.join('({} == {})'.format( left_col, self.outputs.get(right_col, right_col)) for left_col, right_col in equal_cols) if condition != '': results = joined.query(condition) else: results = joined results = results.drop(columns=[ right_col for left_col, right_col in equal_cols if right_col not in self.outputs ]) self.lam.data = results return results
def accrete(
    df: pd.DataFrame,
    accrete_group_by: list,
    accretion_cols: (str, tuple),
    accretion_sep: str = " ",
) -> tuple:
    """
    Groups the dataframe by the passed group_by values and then
    combines text values in the accretion columns.

    Args:
        df: A DataFrame.
        accrete_group_by: A list of columns to group by.
        accretion_cols: The columns you want to accrete on within
            groups created by accrete_group_by.
        accretion_sep: A string indicating how you want the combined
            string values to be separated.

    Returns:
        The transformed DataFrame, and a metadata dictionary.
    """
    accretion_cols = u.tuplify(accretion_cols)
    md = u.gen_empty_md_df(df.columns)
    for c in accretion_cols:
        df[c] = df[c].fillna("")
        df[c] = df[c].astype(str)
        result = df.groupby(accrete_group_by)[c].apply(
            accretion_sep.join).reset_index()
        df = df.merge(result, on=accrete_group_by, suffixes=("", "_x"))
        cx = c + "_x"
        md[c] = (df[c] != df[cx]).sum()
        df[c] = df[cx]
        df.drop(columns=cx, inplace=True)
        df[c] = df[c].str.strip()
        df[c] = df[c].apply(
            lambda x: x if len(x) > 0 and x[-1] != accretion_sep else x[:-1])
        df[c] = df[c].replace("", nan)
    return df, {"metadata": md}
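# Hedged usage sketch for accrete (not from the original library): the frame
# and column names below are illustrative assumptions. It shows the text
# values in "note" being concatenated within each "order_id" group, so every
# row of a group carries the combined string.
import pandas as pd

orders = pd.DataFrame({
    "order_id": [1, 1, 2],
    "note": ["fragile", "gift wrap", None],
})
result, meta = accrete(orders, accrete_group_by=["order_id"],
                       accretion_cols="note", accretion_sep=" ")
print(result["note"].tolist())  # ['fragile gift wrap', 'fragile gift wrap', nan]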
def setUp(self): super(GeoResourceTest, self).setUp() self.ts = TestSetupHelpers() self.lt = LocationType.objects.create(name='Region', admin_level=1) self.distr, created = \ LocationType.objects.get_or_create(name='District',admin_level = 2) self.o = self.ts.create_arbitrary_office() location_df_from_csv = read_csv( 'rhizome/tests/_data/locations_nimroz.csv') locations = self.ts.model_df_to_data(location_df_from_csv, Location) # make sure that the proper level is set for the locs = Location.objects.filter(parent_location_id=6) for loc in locs: loc.location_type_id = self.distr.id loc.save() parent = Location.objects.get(id=6) parent.location_type_id = self.lt.id parent.save() geo_json_df = read_csv('rhizome/tests/_data/geo_json_small.txt', delimiter="|") location_df = DataFrame(list(Location.objects.all()\ .values_list('id','location_code')),columns=['location_id','location_code']) location_tree_df = DataFrame(list(Location.objects.all()\ .values_list('id','parent_location_id')),columns=['location_id','parent_location_id']) location_tree_df['lvl'] = Series(1, index=location_tree_df.index) location_tree = self.ts.model_df_to_data(location_tree_df, LocationTree) merged_df = location_df.merge(geo_json_df)[['location_id', 'geo_json']] self.ts.model_df_to_data(merged_df, LocationPolygon) minify_geo_json() LocationPermission.objects.create(user_id=self.ts.user.id, top_lvl_location_id=1)
def read_unemployment_data( date_range: pd.DataFrame, external_data_path: str = "./external_data" ) -> pd.DataFrame: files: Dict[str, int] = { "CA.csv": 0, "TX.csv": 1, "WI.csv": 2, } unemployment: pd.DataFrame = pd.DataFrame() with timer("load unemployment data"): if os.path.exists(f"{external_data_path}/unemployment"): for file_name, state_id in files.items(): _tmp_unemployment = pd.read_csv( f"{external_data_path}/unemployment/{file_name}" ) _tmp_unemployment["date"] = pd.to_datetime( _tmp_unemployment["DATE"] ).dt.strftime("%Y-%m-%d") _tmp_unemployment.drop("DATE", axis=1, inplace=True) _tmp_unemployment.rename( {"{}UR".format(file_name.replace(".csv", "")): "fe_unemployment"}, axis=1, inplace=True, ) _tmp_unemployment = date_range.merge( _tmp_unemployment, on="date", how="left" ) _tmp_unemployment["fe_unemployment"] = _tmp_unemployment[ "fe_unemployment" ].interpolate() _tmp_unemployment["fe_unemployment"] = _tmp_unemployment[ "fe_unemployment" ].fillna(method="bfill") _tmp_unemployment["state_id"] = state_id unemployment = pd.concat([unemployment, _tmp_unemployment], axis=0) del _tmp_unemployment return unemployment
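# Hedged, self-contained illustration (invented numbers) of the pattern used in
# read_unemployment_data() above: left-merge a sparse series onto a dense date
# range, interpolate the interior gaps, then back-fill the leading NaNs.
def _interpolation_fill_example():
    import pandas as pd

    date_range = pd.DataFrame(
        {"date": pd.date_range("2020-01-01", "2020-01-05").strftime("%Y-%m-%d")}
    )
    sparse = pd.DataFrame({"date": ["2020-01-02", "2020-01-04"],
                           "fe_unemployment": [4.0, 6.0]})

    merged = date_range.merge(sparse, on="date", how="left")
    merged["fe_unemployment"] = merged["fe_unemployment"].interpolate()
    merged["fe_unemployment"] = merged["fe_unemployment"].fillna(method="bfill")
    # 2020-01-01 is back-filled to 4.0, 2020-01-03 is interpolated to 5.0.
    return merged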
def optimize_prices(
        initial_price_list: pandas.DataFrame,
        shop_features_with_clusters: pandas.DataFrame) -> pandas.DataFrame:
    """
    This pricing function has been kept very simple because its main purpose is
    to serve the use case example. In real life it would typically be replaced
    by a more flexible price optimization engine with several parameters, such
    as that of ActiveViam.

    Args:
        initial_price_list: The initial SellingPrice list.
        shop_features_with_clusters: A dataframe containing information about
            the shops, including their cluster.

    Returns:
        A new SellingPrice list with optimized prices: shops with low
        competition get increased prices, while those with high competition
        get competitive prices.
    """
    new_price_list = initial_price_list.merge(shop_features_with_clusters,
                                              left_on="ShopId",
                                              right_on="ShopId")
    new_price_list.loc[(new_price_list["Cluster"] == 0),
                       "SellingPrice"] = (new_price_list["SellingPrice"] * 1.07)
    new_price_list.loc[(new_price_list["Cluster"] == 1),
                       "SellingPrice"] = (new_price_list["SellingPrice"] * 1.3)
    new_price_list.loc[(new_price_list["Cluster"] == 2),
                       "SellingPrice"] = (new_price_list["SellingPrice"] * 0.95)
    new_price_list.loc[(new_price_list["Cluster"] == 3),
                       "SellingPrice"] = (new_price_list["SellingPrice"] * 1.02)
    new_price_list.loc[(new_price_list["Cluster"] == 4),
                       "SellingPrice"] = (new_price_list["SellingPrice"] * 1.07)
    return new_price_list[[
        "ProductId", "ShopId", "SellingPrice", "PurchasePrice", "Quantity"
    ]]
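# Hedged usage sketch for optimize_prices(): the two toy frames below are
# invented, but the column names (ProductId, ShopId, SellingPrice, PurchasePrice,
# Quantity, Cluster) follow the function above.
def _optimize_prices_example():
    import pandas as pd

    price_list = pd.DataFrame({
        "ProductId": [1, 2],
        "ShopId": [10, 20],
        "SellingPrice": [100.0, 100.0],
        "PurchasePrice": [80.0, 80.0],
        "Quantity": [5, 5],
    })
    shop_clusters = pd.DataFrame({"ShopId": [10, 20], "Cluster": [1, 2]})

    optimized = optimize_prices(price_list, shop_clusters)
    # Shop 10 (cluster 1) is re-priced to 130.0 and shop 20 (cluster 2) to 95.0.
    return optimized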
def flag_imputed_data(statcast_df: pd.DataFrame) -> pd.DataFrame:
    """Function to flag possibly imputed data as a result of the no-nulls approach
        (see: https://tht.fangraphs.com/43416-2/)
    For derivation of values see pybaseball/EXAMPLES/imputed_derivation.ipynb
    Note that this imputation only occurred with TrackMan; it is not present in Hawk-Eye data (beyond 2020)
    Args:
        statcast_df (pd.DataFrame): Dataframe loaded via statcast.py, statcast_batter.py, or statcast_pitcher.py
    Returns:
        pd.DataFrame: Copy of original dataframe with "possible_imputation" flag
    """

    ParameterSet = namedtuple('ParameterSet', ["ev", "angle", "bb_type"])
    impute_combinations = []

    # Pop-ups
    impute_combinations.append(ParameterSet(ev=80.0, angle=69.0, bb_type="popup"))

    # Fly balls
    impute_combinations.append(ParameterSet(ev=89.2, angle=39.0, bb_type="fly_ball"))
    impute_combinations.append(ParameterSet(ev=102.8, angle=30.0, bb_type="fly_ball"))

    # Line drives
    impute_combinations.append(ParameterSet(ev=90.4, angle=15.0, bb_type="line_drive"))
    impute_combinations.append(ParameterSet(ev=91.1, angle=18.0, bb_type="line_drive"))

    # Ground balls
    impute_combinations.append(ParameterSet(ev=82.9, angle=-21.0, bb_type="ground_ball"))
    impute_combinations.append(ParameterSet(ev=90.3, angle=-17.0, bb_type="ground_ball"))

    df_imputations = pd.DataFrame(data=impute_combinations)
    df_imputations["possible_imputation"] = True
    df_return = statcast_df.merge(df_imputations, how="left",
                                  left_on=["launch_speed", "launch_angle", "bb_type"],
                                  right_on=["ev", "angle", "bb_type"])

    # Change NaNs to False for boolean consistency
    df_return["possible_imputation"] = df_return["possible_imputation"].fillna(False)
    df_return = df_return.drop(["ev", "angle"], axis=1)
    return df_return
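# Hedged, self-contained sketch of the flagging pattern used above: left-merge a
# small lookup of (value, category) combinations carrying a True flag, then turn
# the NaNs produced for unmatched rows into False. The toy data is invented.
def _flag_by_merge_example():
    import pandas as pd

    events = pd.DataFrame({"speed": [80.0, 95.5, 89.2],
                           "kind": ["popup", "fly_ball", "fly_ball"]})
    suspicious = pd.DataFrame({"speed": [80.0, 89.2],
                               "kind": ["popup", "fly_ball"],
                               "flag": True})

    flagged = events.merge(suspicious, how="left", on=["speed", "kind"])
    flagged["flag"] = flagged["flag"].fillna(False)
    return flagged  # rows 0 and 2 are flagged True, row 1 is False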
def normalize_by_pathway(self, pathway_feature, level=3):
    """
    pathway_feature can be n_protein or sequence_lenght
    """
    self.set_pathway_info()
    pathway_factor = {c.name: c.db_data[pathway_feature]
                      for c in self.ideograms[level - 1].chromosomes}
    nLevels = self.biodb_selector.getLevelCount()
    df_lengths = DataFrame(pathway_factor.items(),
                           index=range(len(pathway_factor)),
                           columns=["Level %s" % level, "Length"])
    # By merging according to the lengths dataframe, we also
    # filter the main dataframe at the same time.
    df_merged = df_lengths.merge(self.data_frame, on="Level %s" % level)
    # Columns have one extra level for the accession and one more
    # for the lengths in df_merged.
    df_merged[self.data_frame.columns[nLevels + 1:]] = \
        df_merged[self.data_frame.columns[nLevels + 1:]].divide(
            df_merged['Length'].values, axis=0)
    df_normalized = df_merged[self.data_frame.columns]
    self.update_ideograms_by_dataframe(df_normalized)
    self.update_dataframe()
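# Hedged, self-contained illustration (invented pathway names and counts) of the
# normalization step above: merge a per-pathway factor onto the data frame, then
# divide the value columns row-wise by that factor.
def _normalize_by_factor_example():
    import pandas as pd

    data = pd.DataFrame({"Level 3": ["glycolysis", "tca_cycle"],
                         "sample_a": [10.0, 9.0],
                         "sample_b": [20.0, 3.0]})
    factors = pd.DataFrame({"Level 3": ["glycolysis", "tca_cycle"],
                            "Length": [5, 3]})

    merged = factors.merge(data, on="Level 3")
    value_cols = ["sample_a", "sample_b"]
    merged[value_cols] = merged[value_cols].divide(merged["Length"].values, axis=0)
    # glycolysis becomes 2.0 / 4.0, tca_cycle becomes 3.0 / 1.0.
    return merged[data.columns]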
def get_services_weights(metrics: pd.Series, services: pd.DataFrame) -> Dict:
    """
    calculate weights for load balancer for each service,
    weights are proportional to the precision score but not less than MIN_WEIGHT,
    the sum of weights for all services is approximately equal to WEIGHT_SCALE
    if a service uses replication, its weight is divided by the number of replicas
    :param metrics: Series of precision scores ('p') keyed by model
    :param services: dataframe with active services (model, replicas, service)
    :return: dict with service as a key and weight as value
    """
    services = services.merge(pd.DataFrame(metrics), on='model')
    services['weight'] = (services['p'] / services['p'].sum()
                          * WEIGHT_SCALE / services['replicas']).fillna(0)
    services['weight'] = services['weight'].astype(int)
    services.loc[services['weight'] == 0, 'weight'] = MIN_WEIGHT
    return services[['service', 'weight']].set_index('service').to_dict()['weight']
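# Hedged usage sketch for get_services_weights(): WEIGHT_SCALE and MIN_WEIGHT are
# module-level constants in the original code; the values quoted in the comment
# below are assumptions made only for this example, as are the model and service
# names.
def _services_weights_example():
    import pandas as pd

    metrics = pd.Series({"model_a": 0.9, "model_b": 0.1}, name="p")
    metrics.index.name = "model"
    services = pd.DataFrame({"model": ["model_a", "model_b"],
                             "replicas": [1, 1],
                             "service": ["svc_a", "svc_b"]})

    weights = get_services_weights(metrics, services)
    # Assuming WEIGHT_SCALE = 100 and MIN_WEIGHT = 1, this would give roughly
    # {'svc_a': 90, 'svc_b': 10}.
    return weights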