Example #1
    def test_merge_datetime_index(self, box):
        # see gh-19038
        df = DataFrame([1, 2, 3],
                       ["2016-01-01", "2017-01-01", "2018-01-01"],
                       columns=["a"])
        df.index = pd.to_datetime(df.index)
        on_vector = df.index.year

        if box is not None:
            on_vector = box(on_vector)

        expected = DataFrame(
            OrderedDict([
                ("a", [1, 2, 3]),
                ("key_1", [2016, 2017, 2018]),
            ])
        )

        result = df.merge(df, on=["a", on_vector], how="inner")
        tm.assert_frame_equal(result, expected)

        expected = DataFrame(
            OrderedDict([
                ("key_0", [2016, 2017, 2018]),
                ("a_x", [1, 2, 3]),
                ("a_y", [1, 2, 3]),
            ])
        )

        result = df.merge(df, on=[df.index.year], how="inner")
        tm.assert_frame_equal(result, expected)
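For readers skimming the test, the notable behaviour is that `on` accepts a mix of column labels and array-like keys, with unnamed array-likes labelled positionally (key_0, key_1, ...). A minimal standalone sketch of that call on toy data (not part of the original test suite):

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]},
                  index=pd.to_datetime(["2016-01-01", "2017-01-01", "2018-01-01"]))

# `on` mixes the column label "a" with the (unnamed) years of the index;
# pandas names the array-like key positionally, here "key_1"
result = df.merge(df, on=["a", df.index.year], how="inner")
print(result.columns.tolist())  # ['a', 'key_1']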
Example #2
def returns_customer_product_time(pwunsale_tidy, pw_ytdcust, pw_cusattr):
    '''
    Meant to feed into a Pivot requested by Mitch Turner.
    
    Aggregates the same as above but includes time and product data.
    '''
    dat = pwunsale_tidy['Date'].tolist()
    pwunsale_tidy['Month'] = [d.strftime('%B') for d in dat]    
    
    print('Aggregating custom pivot for Mitch.')
    len_unique = lambda x: len(pd.unique(x))
    agg_funcs_returns = {'ExtCost': {'DollarsReturned|avg':np.mean, 'DollarsReturned|sum':np.sum},
                         'CasesReturned': {'CasesReturned|avg':np.mean, 'CasesReturned|sum':np.sum},
                         'Invoice':len_unique }
    
    custom_cols = ['Month','CustomerId','Customer','ProductId','Product']    
    
    customer_returns = DataFrame(pwunsale_tidy.groupby(custom_cols)[['ExtCost','CasesReturned']].agg(agg_funcs_returns)).reset_index(drop=False)
    customer_returns.rename(columns={'<lambda>':'Returns|count'}, inplace=True) 
    customer_returns.drop('Customer', inplace=True, axis=1)
    
    print('Merging in YTD sales by Customer')
    customer_returns = customer_returns.merge(pw_ytdcust, on='CustomerId', how='left')
    
    print('Deriving returns as a percent of sales for each Customer.')
    customer_returns['PercentSales'] = np.divide(customer_returns['DollarsReturned|sum'], customer_returns['DollarSales|bycustomer'])
    
    print('Merge in customer attributes.')
    customer_returns = customer_returns.merge(pw_cusattr, on='CustomerId', how='left')
    
    print('Sorting in descending order on Dollars returned.')
    customer_returns.sort_values('DollarsReturned|sum', ascending=False, inplace=True)

    return customer_returns
Example #3
    def process_location_tree_lvl(self, location_type_id):
        '''
        Get and process data for a particular location type ( admin level ).
        '''

        lt_batch = []
        df_columns = ['location_id', 'parent_location_id']
        location_df = DataFrame(list(Location.objects
                             .filter(location_type_id=location_type_id)
                             .values_list('id', 'parent_location_id')),\
                              columns= df_columns)
        location_df['lvl'] = 1 # since this is a direct parent child relation

        merged_df = location_df.merge(self.location_tree_df\
                        ,left_on='location_id'\
                        ,right_on='parent_location_id')

        cleaned_merge_df = merged_df[['location_id_y', 'parent_location_id_x'\
            ,'lvl_y']]

        cleaned_merge_df['lvl_y'] = cleaned_merge_df['lvl_y'] + 1
        cleaned_merge_df.columns = self.location_tree_columns

        self.location_tree_df = concat([self.location_tree_df, location_df,
                                        cleaned_merge_df])
        self.location_tree_df = self.location_tree_df.drop_duplicates()
Example #4
    def main(self):
        '''
        Find the office id for each indicator.
        '''

        ## find the data for the indicators requested ##
        df = DataFrame(list(DataPointComputed.objects.filter(indicator_id__in=\
            self.indicator_id_list).values_list('indicator_id','campaign_id')\
            .distinct()),columns=['indicator_id','campaign_id'])

        ## find all campaigns + office combinations
        office_lookup_df = DataFrame(list(Campaign.objects.all()\
            .values_list('id','office_id')),columns=['campaign_id','office_id'])

        ## Join the two dataframes and take the distinct office, indicator ##
        joined_df = df.merge(office_lookup_df)
        unique_df = joined_df[['indicator_id','office_id']].drop_duplicates()

        ## iterate through the DF, create objects and prep for bulk_create
        ind_to_office_batch = []
        for ix, data in unique_df.iterrows():
            ind_to_office_batch.append(IndicatorToOffice(**data.to_dict()))

        ## delete then re-insert  ##
        IndicatorToOffice.objects.filter(indicator_id__in = \
            self.indicator_id_list).delete()
        IndicatorToOffice.objects.bulk_create(ind_to_office_batch)
Example #5
    def test_join_dups(self):

        # joining dups
        df = concat([DataFrame(np.random.randn(10, 4),
                               columns=['A', 'A', 'B', 'B']),
                     DataFrame(np.random.randint(0, 10, size=20)
                               .reshape(10, 2),
                               columns=['A', 'C'])],
                    axis=1)

        expected = concat([df, df], axis=1)
        result = df.join(df, rsuffix='_2')
        result.columns = expected.columns
        assert_frame_equal(result, expected)

        # GH 4975, invalid join on dups
        w = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        x = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        y = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        z = DataFrame(np.random.randn(4, 2), columns=["x", "y"])

        dta = x.merge(y, left_index=True, right_index=True).merge(
            z, left_index=True, right_index=True, how="outer")
        dta = dta.merge(w, left_index=True, right_index=True)
        expected = concat([x, y, z, w], axis=1)
        expected.columns = ['x_x', 'y_x', 'x_y',
                            'y_y', 'x_x', 'y_x', 'x_y', 'y_y']
        assert_frame_equal(dta, expected)
Example #6
    def calculate_weights(self, services: pd.DataFrame) -> Dict:
        """
        calculate weights for load balancer for each service,
        weights are proportional to precision score but not less than MIN_WEIGHT,
        sum of weights for all services approximately equal to WEIGHT_SCALE

        if service use replication, weight will be divided by replicas number

        :param services: dataframe with active services (block, model, n_replicas, service)
        :return: dict with service as a key and weight as value
        """
        start_time = int(time.time() * 1000) - TIME_WINDOW

        for block in services['block'].unique():
            if block:
                score = self.r.xrange(block, min=start_time, max='+')

        score = pd.DataFrame([x for _, x in score])
        score.columns = [c.decode() for c in score.columns]
        score['value'] = score['value'].astype(int)
        score['model'] = score['model'].apply(lambda x: x.decode())
        score = score.groupby('model', as_index=False).mean()
        score = services.merge(score, on='model')
        score['weight'] = (score['value']
                           / score['value'].sum()
                           * WEIGHT_SCALE
                           / score['replicas']
                           ).fillna(0)

        score['weight'] = score['weight'].astype(int)
        score.loc[score['weight'] == 0, 'weight'] = MIN_WEIGHT

        return score.set_index('service').to_dict()['weight']
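A minimal sketch of just the merge-and-weighting step on toy data; WEIGHT_SCALE and MIN_WEIGHT are stand-in values rather than the module's real constants, and the Redis scoring that produces `score` is skipped:

import pandas as pd

WEIGHT_SCALE, MIN_WEIGHT = 100, 1  # stand-ins for the real constants

services = pd.DataFrame({'service': ['svc-a', 'svc-b'],
                         'model': ['m1', 'm2'],
                         'replicas': [1, 2]})
score = pd.DataFrame({'model': ['m1', 'm2'], 'value': [30, 70]})

score = services.merge(score, on='model')
score['weight'] = (score['value'] / score['value'].sum()
                   * WEIGHT_SCALE / score['replicas']).fillna(0).astype(int)
score.loc[score['weight'] == 0, 'weight'] = MIN_WEIGHT
print(score.set_index('service').to_dict()['weight'])  # {'svc-a': 30, 'svc-b': 35}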
Example #7
 def alignTime(self,instruments):
     i = 0
     mergedSeries = None
     for instrument in instruments :
         if i == 0:
             # first instrument: nothing to merge yet
             i += 1
             # change col name in order not to overlap
             mergedSeries = instruments[instrument][["Date","AdjClose"]]
             newColName = str("%s_Adjclose" %instrument)
             mergedSeries = mergedSeries.rename(columns={'AdjClose': newColName})
         else :
             newSeries = instruments[instrument][["Date","AdjClose"]]
             newColName = str("%s_Adjclose" %instrument)
             newSeries = newSeries.rename(columns={'AdjClose': newColName})
             
             mergedSeries = pd.merge(mergedSeries,newSeries, on="Date", how = "inner")
     mergedSeries = mergedSeries.dropna()
     # put each aligned series into a dict, restoring the original column name
     alignedSeries = dict()
     for instrument in instruments :
         colName = str("%s_Adjclose" %instrument)
         alignedSeries[instrument] = mergedSeries[["Date",colName]].rename(columns={colName : 'AdjClose'})
     mergedSeries = mergedSeries.set_index(mergedSeries["Date"].values)
     #mergedSeries.plot()
     #plt.show()
     return alignedSeries
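The inner join on Date is what does the aligning: only dates present in every instrument survive. A standalone sketch of that core step on two made-up instruments:

import pandas as pd

# toy per-instrument frames with the Date/AdjClose layout the method expects
aaa = pd.DataFrame({'Date': ['2020-01-01', '2020-01-02', '2020-01-03'],
                    'AdjClose': [10.0, 10.5, 10.2]})
bbb = pd.DataFrame({'Date': ['2020-01-02', '2020-01-03', '2020-01-04'],
                    'AdjClose': [20.0, 19.5, 19.8]})

merged = pd.merge(aaa.rename(columns={'AdjClose': 'AAA_Adjclose'}),
                  bbb.rename(columns={'AdjClose': 'BBB_Adjclose'}),
                  on='Date', how='inner')
print(merged['Date'].tolist())  # ['2020-01-02', '2020-01-03'] -- only the shared dates remain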
Example #8
    def test_merge_na_keys(self):
        data = [[1950, "A", 1.5],
                [1950, "B", 1.5],
                [1955, "B", 1.5],
                [1960, "B", np.nan],
                [1970, "B", 4.],
                [1950, "C", 4.],
                [1960, "C", np.nan],
                [1965, "C", 3.],
                [1970, "C", 4.]]

        frame = DataFrame(data, columns=["year", "panel", "data"])

        other_data = [[1960, 'A', np.nan],
                      [1970, 'A', np.nan],
                      [1955, 'A', np.nan],
                      [1965, 'A', np.nan],
                      [1965, 'B', np.nan],
                      [1955, 'C', np.nan]]
        other = DataFrame(other_data, columns=['year', 'panel', 'data'])

        result = frame.merge(other, how='outer')

        expected = frame.fillna(-999).merge(other.fillna(-999), how='outer')
        expected = expected.replace(-999, np.nan)

        tm.assert_frame_equal(result, expected)
Example #9
def create_summaries(unsaleables_by_product, pw_ytdsupp):
    '''
    Creates useful one-look summaries for management.
    '''
    print('*'*100)
    print('Creating summaries.')
    print('*'*100)    
    
    summary_cols = ['DollarsUnsaleable|sum', 'DollarsReturned|sum', 
                    'CasesUnsaleable|sum', 'CasesReturned|sum']
    
    print('\n\n\nSummarizing Directors.')
    by_director = DataFrame(unsaleables_by_product.groupby('Director')[summary_cols].sum()).sort_values('DollarsUnsaleable|sum', ascending=False)
    
    print('Summarizing Suppliers.')
    by_supplier = DataFrame(unsaleables_by_product.groupby(['Director','SupplierId','Supplier'])[summary_cols].sum()).sort_values('DollarsUnsaleable|sum', ascending=False).reset_index(level=['Director','SupplierId','Supplier'], drop=False)

    print('Merging in YTD sales by supplier and deriving percent of sales.')    
    by_supplier = by_supplier.merge(pw_ytdsupp, on='SupplierId', how='left')
    by_supplier['PercentSales'] = np.divide(by_supplier['DollarsUnsaleable|sum'], by_supplier['DollarSales|bysupplier'])
    
    print('Summarizing by Class.\n\n\n')
    by_class = DataFrame(unsaleables_by_product.groupby(['Class'])[summary_cols].sum()).sort_values('DollarsUnsaleable|sum', ascending=False)

    print('*'*100)
    print('Finished creating summaries.')   
    print('*'*100)
    
    return by_supplier, by_director, by_class
Example #10
    def group_by_time_transform(self):
        dp_df_columns = ['data_date','indicator_id','location_id','value']
        time_grouping =  self.parsed_params['group_by_time']

        # HACK
        if self.parsed_params['chart_uuid'] ==\
            '5599c516-d2be-4ed0-ab2c-d9e7e5fe33be':

            self.parsed_params['show_missing_data'] = 1
            return self.handle_polio_case_table(dp_df_columns)

        cols = ['data_date','indicator_id','location_id','value']
        dp_df = DataFrame(list(DataPoint.objects.filter(
            location_id__in = self.location_ids,
            indicator_id__in = self.parsed_params['indicator__in']
        ).values(*cols)),columns=cols)

        if not dp_df.empty:
            dp_df = self.get_time_group_series(dp_df)
            gb_df = DataFrame(dp_df\
                .groupby(['indicator_id','time_grouping','location_id'])['value']\
                .sum())\
                .reset_index()
            return gb_df
        # need to look at sublocations if the data isn't available at the current level
        else:
            depth_level, max_depth, sub_location_ids = 0, 3, self.location_ids
            while dp_df.empty and depth_level < max_depth:
                sub_location_ids = Location.objects\
                    .filter(parent_location_id__in=sub_location_ids)\
                    .values_list('id', flat=True)

                dp_df = DataFrame(list(DataPoint.objects.filter(
                    location_id__in = sub_location_ids,
                    indicator_id__in = self.parsed_params['indicator__in']
                ).values(*cols)),columns=cols)
                depth_level += 1

            dp_df = self.get_time_group_series(dp_df)
            if dp_df.empty:
                return []
            location_tree_df = DataFrame(list(LocationTree.objects\
                .filter(location_id__in = sub_location_ids)\
                .values_list('location_id','parent_location_id')),\
                    columns=['location_id','parent_location_id'])

            merged_df = dp_df.merge(location_tree_df)
            filtered_df = merged_df[merged_df['parent_location_id']\
                .isin(self.location_ids)]

            gb_df = DataFrame(filtered_df\
                .groupby(['indicator_id','time_grouping','parent_location_id'])['value']\
                .sum())\
                .reset_index()

            gb_df = gb_df.rename(columns={'parent_location_id' : 'location_id'})
            return gb_df
Example #11
def customer_return_summary(pw_cusattr, pwunsale_tidy, pw_ytdcust):
    '''
    Derives intelligence out of MTC1 data 
    on customer returns. 
    '''
    print('*'*100)
    print('Creating summary of returns.')
    print('*'*100)
    
    len_unique = lambda x: len(pd.unique(x))
    agg_funcs_returns = {'ExtCost': {'DollarsReturned|avg':np.mean, 'DollarsReturned|sum':np.sum},
                         'CasesReturned': {'CasesReturned|avg':np.mean, 'CasesReturned|sum':np.sum},
                         'Invoice':len_unique }
    
    print('\n\n\nAggregating tidy dataset.')
    customer_returns = DataFrame(pwunsale_tidy.groupby(['CustomerId','Customer'])[['ExtCost','CasesReturned']].agg(agg_funcs_returns)).reset_index(drop=False)
    customer_returns.rename(columns={'<lambda>':'Returns|count'}, inplace=True) 
    customer_returns.drop('Customer', inplace=True, axis=1)
    
    print('Merging in YTD sales by Customer')
    customer_returns = customer_returns.merge(pw_ytdcust, on='CustomerId', how='left')
    
    print('Deriving returns as a percent of sales for each Customer.')
    customer_returns['PercentSales'] = np.divide(customer_returns['DollarsReturned|sum'], customer_returns['DollarSales|bycustomer'])
    
    print('Merge in customer attributes.')
    customer_returns = customer_returns.merge(pw_cusattr, on='CustomerId', how='left')
    
    print('Sorting in descending order on Dollars returned.')
    customer_returns.sort_values('DollarsReturned|sum', ascending=False, inplace=True)
    
    print('Reorder columns for readability.\n\n\n')
    reorder_cols = ['CustomerId','Customer','Returns|count',
                    'PercentSales','DollarSales|bycustomer',
                    'DollarsReturned|sum','DollarsReturned|avg',
                    'CasesReturned|sum','CasesReturned|avg',
                    'OnPremise','Latitude','Longitude']
    customer_returns = customer_returns[reorder_cols]
    
    print('*'*100)
    print('Finished summarizing returns.')
    print('*'*100)
    
    return customer_returns
Example #12
def get_merged_data():
	
	with open("resources/data/ner.json") as f:
		ner = DataFrame(json.load(f))
	with open("resources/data/publications.json") as f:
		publications = DataFrame(json.load(f))
	with open("resources/data/sweet.json") as f:
		sweet_features = DataFrame(json.load(f))

	return (ner.merge(publications,on=["id"])).merge(sweet_features,on=["id"]).T.to_dict().values()
Example #13
def assemble_episodic_data(stays, diagnoses):
    data = { 'Icustay': stays.ICUSTAY_ID, 'Age': stays.AGE, 'Length of Stay': stays.LOS,
                    'Mortality': stays.MORTALITY }
    data.update(transform_gender(stays.GENDER))
    data.update(transform_ethnicity(stays.ETHNICITY))
    data['Height'] = np.nan
    data['Weight'] = np.nan
    data = DataFrame(data).set_index('Icustay')
    data = data[['Ethnicity', 'Gender', 'Age', 'Height', 'Weight', 'Length of Stay', 'Mortality']]
    return data.merge(extract_diagnosis_labels(diagnoses), left_index=True, right_index=True)
Example #14
def get_merged_data(solrdata):
	
	with open("ner.json") as f:
		ner = DataFrame(json.load(f))
	with open("publications.json") as f:
		publications = DataFrame(json.load(f))
	with open("geotopic.json") as f:
		sweet_features = DataFrame(json.load(f))

	return (ner.merge(publications)).merge(sweet_features).T.to_dict().values()
Example #15
 def merge(self, *args, **kwargs):
     result = DataFrame.merge(self, *args, **kwargs)
     geo_col = self._geometry_column_name
     if isinstance(result, DataFrame) and geo_col in result:
         result.__class__ = GeoDataFrame
         result.crs = self.crs
         result._geometry_column_name = geo_col
         result._invalidate_sindex()
     elif isinstance(result, DataFrame) and geo_col not in result:
         result.__class__ = DataFrame
     return result
Example #16
 def get_result(self):
     """
     Get the result after processing the work log.
     """
     if (self._data is None) or (len(self._data) == 0) or\
        (self._proj is None) or (len(self._proj) == 0):
         return (None, None, None)
     task_frame = DataFrame(self._data, 
         columns = ['line', 'date', 'hours', 'task'])
     proj_frame = DataFrame(self._proj, 
         columns = ['line', 'date', 'task', 'project'])
     # Group projects assigned to tasks
     task_projs = (task_frame[['task']])\
         .merge(proj_frame[['task', 'project']], how='outer', on='task')\
         .drop_duplicates(['task', 'project'])\
         .groupby('task').project
     # Filter tasks without projects
     task_frame = task_frame.merge(proj_frame[['task']]\
         .drop_duplicates('task'))
     # Construct task table
     task_table = (task_frame[['task', 'date', 'hours']])\
         .groupby(['task', 'date']).sum()\
         .unstack()['hours'].fillna(0.0)
     # Assign tasks to projects
     join_frame = task_frame.merge(proj_frame, 
         how='inner', on='task', suffixes=['_task', '_proj'])
     join_frame = \
         (join_frame[join_frame.line_task <= join_frame.line_proj])\
         .sort_values(['line_task', 'line_proj'])\
         .drop_duplicates('line_task')
     # Construct project table
     proj_table = (join_frame[['date_task', 'project', 'hours']])\
         .groupby(['project', 'date_task']).sum()\
         .unstack()['hours'].fillna(0.0)
     proj_table.columns.name = 'date'
     # Set totals
     task_table['TOTAL'] = task_table.sum(axis=1)
     proj_table['TOTAL'] = proj_table.sum(axis=1)
     task_table.loc['TOTAL'] = task_table.sum()
     proj_table.loc['TOTAL'] = proj_table.sum()
     return (task_table, proj_table, task_projs)
Example #17
class CubeJoin(object):
    def __init__(self, cube):
        self.cube = cube
        self.data = DataFrame({})
        method = getattr(self, cube.get('cube_join_type', 'none'))
        method()

    def inner(self):
        fields = [rel['field'] for rel in self.cube.get('relationship')]
        DW = DataWarehouse()
        for i, rel in enumerate(self.cube.get('relationship')):
            data = DW.get(rel['cube']).get('data')
            df = DataFrame(data)
            if i == 0:
                self.data = df
            else:
                self.data = self.data.merge(df, how='inner', on=fields[0])
        return self.data

    def left(self):
        fields = [rel['field'] for rel in self.cube.get('relationship')]
        self.data = DataFrame({fields[0]: []})
        DW = DataWarehouse()
        for rel in self.cube.get('relationship'):
            data = DW.get(rel['cube'])
            self.data = self.data.merge(DataFrame(data.get('data')),
                how='outer', on=fields[0])
        return self.data

    def append(self):
        self.data = DataFrame({})
        DW = DataWarehouse()
        self.data = self.data.append(
            [DataFrame(DW.get(rel['cube']).get('data'))
             for rel in self.cube.get('relationship')],
            ignore_index=True)
        return self.data

    def none(self):
        return self.data
Example #18
def create_indirect_links_once(df: pd.DataFrame) -> pd.DataFrame:
    """ This function gets a Dataframe as input.

        The function then merges the Dataframe with itself on given keys.
        The function returns the Dataframe with newly added lines that result from indirect links.
    """

    # merge the Dataframe with itself based on keys of input study etc. and output study.
    # two rows match if the contents of the left side match the contents of the right side.

    # row 1
    # input_study, input_dataset, input_version, input_variable
    # 1, 1, 1, 1

    # matches row 2
    # output_study, output_dataset, output_version, output_variable
    # 1, 1, 1, 1

    temp = df.merge(
        df,
        right_on=["input_study", "input_dataset", "input_version", "input_variable"],
        left_on=["output_study", "output_dataset", "output_version", "output_variable"],
    )
    WANTED_COLUMNS = [
        "input_study_x",
        "input_dataset_x",
        "input_version_x",
        "input_variable_x",
        "output_study_y",
        "output_dataset_y",
        "output_version_y",
        "output_variable_y",
    ]
    # select only the columns for
    # input study etc. from the left Dataframe and the output study etc. from the right Dataframe
    temp = temp[WANTED_COLUMNS]

    # Rename the rows to be of the original format
    RENAME_COLUMNS = {
        "input_study_x": "input_study",
        "input_dataset_x": "input_dataset",
        "input_version_x": "input_version",
        "input_variable_x": "input_variable",
        "output_study_y": "output_study",
        "output_dataset_y": "output_dataset",
        "output_version_y": "output_version",
        "output_variable_y": "output_variable",
    }
    temp.rename(columns=RENAME_COLUMNS, inplace=True)

    # add new rows to the original Dataframe, dropping duplicates
    return df.append(temp).drop_duplicates().reset_index(drop=True)
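A toy demonstration of the transitive row the self-merge adds, with made-up study/dataset keys; like the function itself, it assumes a pandas version that still provides DataFrame.append:

import pandas as pd

# toy links: A -> B and B -> C, each key given as (study, dataset, version, variable)
links = pd.DataFrame({
    "input_study": ["s1", "s1"], "input_dataset": ["A", "B"],
    "input_version": ["v1", "v1"], "input_variable": ["x", "x"],
    "output_study": ["s1", "s1"], "output_dataset": ["B", "C"],
    "output_version": ["v1", "v1"], "output_variable": ["x", "x"],
})

result = create_indirect_links_once(links)
# the self-merge adds the transitive row A -> C
print(result[["input_dataset", "output_dataset"]].values.tolist())
# [['A', 'B'], ['B', 'C'], ['A', 'C']]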
Example #19
    def test_join_float64_float32(self):

        a = DataFrame(randn(10, 2), columns=['a', 'b'], dtype=np.float64)
        b = DataFrame(randn(10, 1), columns=['c'], dtype=np.float32)
        joined = a.join(b)
        assert joined.dtypes['a'] == 'float64'
        assert joined.dtypes['b'] == 'float64'
        assert joined.dtypes['c'] == 'float32'

        a = np.random.randint(0, 5, 100).astype('int64')
        b = np.random.random(100).astype('float64')
        c = np.random.random(100).astype('float32')
        df = DataFrame({'a': a, 'b': b, 'c': c})
        xpdf = DataFrame({'a': a, 'b': b, 'c': c})
        s = DataFrame(np.random.random(5).astype('float32'), columns=['md'])
        rs = df.merge(s, left_on='a', right_index=True)
        assert rs.dtypes['a'] == 'int64'
        assert rs.dtypes['b'] == 'float64'
        assert rs.dtypes['c'] == 'float32'
        assert rs.dtypes['md'] == 'float32'

        xp = xpdf.merge(s, left_on='a', right_index=True)
        assert_frame_equal(rs, xp)
Example #20
    def setUp(self):
        super(GeoResourceTest, self).setUp()

        self.ts = TestSetupHelpers()
        self.lt = LocationType.objects.create(name='Region',admin_level=1)

        self.distr, created = \
            LocationType.objects.get_or_create(name='District',admin_level = 2)

        self.planet_location_type = LocationType.objects\
            .create(name = 'Planet', admin_level = 0)

        self.ultimate_parent = Location.objects.create(
            id = 1,
            name = 'Earth',
            location_code = 'Earth',
            location_type_id = self.planet_location_type.id
        )

        location_df_from_csv= read_csv('rhizome/tests/_data/locations_nimroz.csv')
        self.ts.model_df_to_data(location_df_from_csv,Location)

        # make sure that the proper location type is set for the districts
        locs = Location.objects.filter(parent_location_id=6)
        for loc in locs:
            loc.location_type_id = self.distr.id
            loc.save()

        parent = Location.objects.get(id=6)
        parent.location_type_id = self.lt.id
        parent.save()

        geo_json_df = read_csv('rhizome/tests/_data/geo_json_small.txt',delimiter = "|")
        location_df = DataFrame(list(Location.objects.all()\
            .values_list('id','location_code')),columns=['location_id','location_code'])
        location_tree_df = DataFrame(list(Location.objects.all()\
            .values_list('id','parent_location_id'))\
            ,columns=['location_id','parent_location_id'])

        location_tree_df['parent_location_id'].fillna(self.ultimate_parent.id,\
            inplace=True)

        location_tree_df['lvl'] = Series(1, index=location_tree_df.index)
        self.ts.model_df_to_data(location_tree_df, LocationTree)
        merged_df = location_df.merge(geo_json_df)[['location_id','geo_json']]
        self.ts.model_df_to_data(merged_df, LocationPolygon)
        minify_geo_json()

        LocationPermission.objects.create(user_id = self.ts.user.id,
            top_lvl_location_id = 1)
Example #21
    def test_metadata_propagation_indiv(self):

        # groupby
        df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                              'foo', 'bar', 'foo', 'foo'],
                        'B': ['one', 'one', 'two', 'three',
                              'two', 'two', 'one', 'three'],
                        'C': np.random.randn(8),
                        'D': np.random.randn(8)})
        result = df.groupby('A').sum()
        self.check_metadata(df,result)

        # resample
        df = DataFrame(np.random.randn(1000,2),
                       index=date_range('20130101',periods=1000,freq='s'))
        result = df.resample('1T')
        self.check_metadata(df,result)

        # merging with override
        # GH 6923
        _metadata = DataFrame._metadata
        _finalize = DataFrame.__finalize__

        np.random.seed(10)
        df1 = DataFrame(np.random.randint(0, 4, (3, 2)), columns=['a', 'b'])
        df2 = DataFrame(np.random.randint(0, 4, (3, 2)), columns=['c', 'd'])
        DataFrame._metadata = ['filename']
        df1.filename = 'fname1.csv'
        df2.filename = 'fname2.csv'

        def finalize(self, other, method=None, **kwargs):

            for name in self._metadata:
                if method == 'merge':
                    left, right = other.left, other.right
                    value = getattr(left, name, '') + '|' + getattr(right, name, '')
                    object.__setattr__(self, name, value)
                else:
                    object.__setattr__(self, name, getattr(other, name, ''))

            return self

        DataFrame.__finalize__ = finalize
        result = df1.merge(df2, left_on=['a'], right_on=['c'], how='inner')
        self.assertEquals(result.filename,'fname1.csv|fname2.csv')

        DataFrame._metadata = _metadata
        DataFrame.__finalize__ = _finalize
Example #22
def populate_fake_dwc_data(apps, schema_editor):
    '''
    This migration will be removed; we will prefer the "initial_meta_data"
    ingestion and rely on DocTransform, RefreshMaster and AggRefresh to
    populate the datapoint_with_computed table. However, so that we can have
    ample data to show on the dashboards, I will take the cartesian product
    of campaigns, indicators and selected locations (provinces and LPDS)
    and dump that data into datapoint_with_computed.

    It would be nice to somehow set this up so that when a new developer spins
    up the app locally, they can populate this 'fake' data.

    Maybe something like: if SETTINGS.debug = True, then ingest fake data.
    '''

    ind_df = DataFrame(list(Indicator.objects.all()\
        .values_list('id','short_name','data_format')),columns = ['indicator_id','short_name','data_format'])

    campaign_df = DataFrame(list(Campaign.objects.all()\
        .values_list('id','name')),columns = ['campaign_id','campaign_name'])

    country_id_list = list(Location.objects\
        .filter(location_type_id = 1)\
        .values_list('id',flat=True))

    lpd_id_qs = list(Location.objects\
        .filter(lpd_status__in=[1,2])\
        .values_list('id','parent_location_id'))

    province_id_list = [y for x, y in lpd_id_qs]
    lpd_id_list = [x for x, y in lpd_id_qs]

    location_ids = country_id_list + province_id_list + lpd_id_list

    location_df = DataFrame(list(Location.objects\
        .filter(id__in=location_ids)\
        .values_list('id','name')),columns = ['location_id','name'])

    ind_df['join_col'] = 1
    campaign_df['join_col'] = 1
    location_df['join_col'] = 1

    first_merged_df = ind_df.merge(campaign_df,on='join_col')
    final_merged_df = first_merged_df.merge(location_df, on='join_col')

    upsert_df_data(final_merged_df)
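The constant join_col is the classic trick for building a cartesian product with merge; newer pandas (1.2+) can express the same thing with how="cross". A small sketch on toy frames:

import pandas as pd

ind_df = pd.DataFrame({'indicator_id': [1, 2]})
campaign_df = pd.DataFrame({'campaign_id': [10, 20]})

# the join_col trick used above ...
ind_df['join_col'] = 1
campaign_df['join_col'] = 1
crossed = ind_df.merge(campaign_df, on='join_col').drop(columns='join_col')

# ... is equivalent to (pandas >= 1.2)
crossed_alt = ind_df.drop(columns='join_col').merge(
    campaign_df.drop(columns='join_col'), how='cross')

print(len(crossed), len(crossed_alt))  # 4 4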
Example #23
def dfm_A_intersect_B(A:DataFrame,B:DataFrame, key_cols:list)->DataFrame:
    """
    A - B return the entries which in A and in B based on key cols, it is mainly used to identify duplicate entries and
    then update to DB
    :param A:
    :param B:
    :return:
    """
    if len(B) == 0:
        return DataFrame(columns=list(A.columns))
    B_tmp=B[key_cols].copy()
    B_tmp['tmp_col_duplicated'] = 'Y'
    dfm_merge_by_keycols = A.merge(B_tmp, how='left', on = key_cols)
    # dfm_merge_by_keycols.dropna()  -- dropna cannot be used here: it drops a row whenever any value in it is None, which would remove rows by mistake.
    dfm_merge_by_keycols = dfm_merge_by_keycols[dfm_merge_by_keycols.tmp_col_duplicated == 'Y']
    del dfm_merge_by_keycols['tmp_col_duplicated']
    return dfm_merge_by_keycols
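A toy run of the helper above, with a made-up key column:

import pandas as pd
from pandas import DataFrame

A = DataFrame({'key': [1, 2, 3], 'val': ['a', 'b', 'c']})
B = DataFrame({'key': [2, 3, 4], 'other': ['x', 'y', 'z']})

print(dfm_A_intersect_B(A, B, ['key']))
# rows of A whose key also appears in B: (2, 'b') and (3, 'c')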
Example #24
def junta_tabelas():
    #locais = quebra_secoes()
    votos = arruma_votos()
    print(votos)

    locais = read_csv("locais_com_votacao_trabalhada.csv")
    #votos = read_csv("voto_secao_partido_trabalhada.csv")
    saida = DataFrame.merge(locais,votos, left_on="id",right_on="id",how="outer")

    saida = DataFrame(saida.groupby(["lat","long","aptos_por_local","local_de_votacao","zona_eleitoral_nro","bairro","endereco","secoes_eleitorais","zona_eleitoral_nome"]).sum().reset_index())
    saida = saida.fillna(0)
    saida = saida[saida.secao != 0]
    saida["lat_real"] = saida["long"]
    saida["long"] = saida["lat"]
    saida["lat"] = saida["lat_real"]
    del saida["lat_real"]

    saida.to_csv("secoes_com_votacao.csv",index=False)
Example #25
    def process_location_tree_lvl(self, location_type_id):

        lt_batch = []

        location_df = DataFrame(list(Location.objects\
            .filter(location_type_id = location_type_id)\
            .values_list('id','parent_location_id')),columns=self.location_tree_columns)

        merged_df = location_df.merge(self.location_tree_df
            ,left_on='location_id',right_on='parent_location_id')

        cleaned_merge_df = merged_df[['location_id_y','parent_location_id_x']]
        cleaned_merge_df.columns = self.location_tree_columns

        self.location_tree_df = concat([self.location_tree_df,location_df,\
            cleaned_merge_df])

        self.location_tree_df = self.location_tree_df.drop_duplicates()
Example #26
    def mark_datapoints_with_needs_campaign(self):

        new_dp_df = DataFrame(list(DataPoint.objects\
            .filter(source_submission_id__in = \
                self.ss_ids_to_process).values()))

        date_series = new_dp_df['data_date']
        mn_date, mx_date = min(date_series).date(), max(date_series).date()

        office_lookup_df = DataFrame(list(Location.objects\
            .filter(id__in = list(set(new_dp_df['location_id'])))\
            .values_list('id','office_id')), \
             columns = ['location_id', 'office_id'])

        campaign_qs = Campaign.objects.filter(
            end_date__gte = mn_date, start_date__lte = mx_date,
            office_id__in = office_lookup_df\
            ['office_id'].unique())

        campaign_df = DataFrame(list(campaign_qs\
            .values('office_id','start_date','end_date')))

        if len(campaign_df) == 0:
            ## no campaigns match the datapoints, so update all with cj_id = -2
            DataPoint.objects.filter(id__in=new_dp_df['id'].unique())\
                .update(cache_job_id = -2)
            return

        dp_merged_df = new_dp_df.merge(office_lookup_df)
        cleaned_dp_df = dp_merged_df[['id','office_id','data_date']]

        dp_ids_that_need_campaign = []
        dp_merged_with_campaign = cleaned_dp_df.merge(campaign_df)

        ## iterate over the dps and check if there is a campaign ##
        for ix, r in dp_merged_with_campaign.iterrows():
            ## convert date time to date
            r_date = r.data_date.date()
            if r_date >= r.end_date or r_date < r.start_date:
                dp_ids_that_need_campaign.append(r.id)

        DataPoint.objects.filter(id__in=dp_ids_that_need_campaign)\
            .update(cache_job_id = -2)
Example #27
def dfm_A_minus_B(A:DataFrame,B:DataFrame, key_cols:list)->DataFrame:
    """
    A - B return the entries which in A but not in B based on key cols, it is mainly used to remove duplicate entries and
    then insert to DB
    :param A:
    :param B:
    :return:
    """
    if len(B) == 0:
        return A
    # dfmprint(A[0:10])
    # dfmprint(B[0:10])
    B_tmp=B[key_cols].copy()
    B_tmp['tmp_col_duplicated'] = 'Y'
    dfm_merge_by_keycols = A.merge(B_tmp, how='left', on = key_cols)
    dfm_merge_by_keycols.fillna({'tmp_col_duplicated':'N'},inplace = True)
    dfm_dif_by_keycols = dfm_merge_by_keycols[dfm_merge_by_keycols.tmp_col_duplicated.isin(['N'])]
    del dfm_dif_by_keycols['tmp_col_duplicated']
    return dfm_dif_by_keycols
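A toy run of the anti-join helper above, with a made-up key column:

import pandas as pd
from pandas import DataFrame

A = DataFrame({'key': [1, 2, 3], 'val': ['a', 'b', 'c']})
B = DataFrame({'key': [2, 3, 4], 'other': ['x', 'y', 'z']})

print(dfm_A_minus_B(A, B, ['key']))
# only (1, 'a') remains: the entry of A whose key does not appear in B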
Example #28
class CubeJoin(object):
    def __init__(self, cube):
        self.cube = cube
        self.data = DataFrame({})

        MyClient = riak.RiakClient(
            protocol=conf("riak")["protocol"],
            http_port=conf("riak")["http_port"],
            host=conf("riak")["host"])

        self.MyBucket = MyClient.bucket(conf("riak")["bucket"])
        self.MyBucket.enable_search()

        method = getattr(self, cube.get('cube_join_type', 'none'))
        method()

    def inner(self):
        fields = set([rel['field'] for rel in self.cube.get('relationship')])
        self.data = concat([DataFrame(self.MyBucket.get(rel['cube']).data)
                            for rel in self.cube.get('relationship')],
                           keys=fields, join='inner', ignore_index=True,
                           axis=1)
        return self.data

    def left(self):
        fields = [rel['field'] for rel in self.cube.get('relationship')]
        self.data = DataFrame({fields[0]: []})
        for rel in self.cube.get('relationship'):
            self.data = self.data.merge(DataFrame(
                self.MyBucket.get(rel['cube']).data),
                how='outer', on=fields[0])
        return self.data

    def append(self):
        self.data = DataFrame({})
        self.data = self.data.append(
            [DataFrame(self.MyBucket.get(rel['cube']).data)
             for rel in self.cube.get('relationship')],
            ignore_index=True)
        return self.data

    def none(self):
        return self.data
Example #29
    def base_transform(self):
        results = []

        df_columns = ['id', 'indicator_id', 'campaign_id', 'location_id',\
            'value']
        computed_datapoints = DataPointComputed.objects.filter(
                campaign__in=self.parsed_params['campaign__in'],
                location__in=self.location_ids,
                indicator__in=self.parsed_params['indicator__in'])

        dwc_df = DataFrame(list(computed_datapoints.values_list(*df_columns)),\
            columns=df_columns)

        # do an inner join on the filter indicator
        if self.parsed_params['filter_indicator'] and self.parsed_params['filter_value']:
            merge_columns = ['campaign_id', 'location_id']
            indicator_id = Indicator.objects.get(short_name = self.parsed_params['filter_indicator'])
            filter_value_list = [self.parsed_params['filter_value']]

            if filter_value_list == ['-1']: ## this means "show all classes"
                filter_value_list = [1,2,3]
                ## this only works for LPDS... this should be --
                ## IndicatorClassMap.objects.filter(indicator = indicator)\
                ##    .values_list(enum_value, flat = True)

            filter_datapoints = DataPointComputed.objects.filter(
                campaign__in=self.parsed_params['campaign__in'],
                location__in=self.location_ids,
                indicator_id=indicator_id,
                value__in = filter_value_list
                )
            filter_df = DataFrame(list(filter_datapoints.values_list(*merge_columns)),\
                columns=merge_columns)
            dwc_df = dwc_df.merge(filter_df, how='inner', on=merge_columns)

            ## now only show the locations that match that filter..
            location_ids_in_filter = set(filter_df['location_id'])
            self.location_ids = set(self.location_ids)\
                .intersection(location_ids_in_filter)

        dwc_df = dwc_df.apply(self.add_class_indicator_val, axis=1)
        return dwc_df
Example #30
    def normalize_by_pathway(self, pathway_feature, level=3): 
        """
            pathway_feature can be n_protein or sequence_lenght
        """
        self.set_pathway_info()

        pathway_factor= {c.name: c.db_data[pathway_feature] for c in self.ideograms[level-1].chromosomes}
        nLevels= self.biodb_selector.getLevelCount()
        
        df_lengths= DataFrame(pathway_factor.items(), index= range(len(pathway_factor)), columns=["Level %s" % level,"Length"])
        #return df_lengths
        ## by merging acording to the lengths dataframe, we also 
        ## filter the main dataframe in the meanwhile
        
        df_merged= df_lengths.merge(self.data_frame, on= "Level %s" % level)
        ### columns have one extra level for accession and one more
        ### for the lengths in df_merged
        
        df_merged[self.data_frame.columns[nLevels+1:]]=  df_merged[self.data_frame.columns[nLevels+1:]].divide(df_merged['Length'].values, axis= 0)
        
        df_normalized= df_merged[self.data_frame.columns]       
        self.update_ideograms_by_dataframe(df_normalized)
        self.update_dataframe()