Example #1
    def range(cls, columns, dimension):
        import dask.dataframe as dd
        column = columns.data[columns.get_dimension(dimension).name]
        if column.dtype.kind == 'O':
            # Object dtype: materialize, drop nulls and sort to find the ends.
            column = np.sort(column[column.notnull()].compute())
            return column[0], column[-1]
        else:
            # Numeric dtype: one compute() evaluates both lazy reductions.
            return dd.compute(column.min(), column.max())
Example #2
    def range(cls, dataset, dimension):
        import dask.dataframe as dd
        column = dataset.data[dataset.get_dimension(dimension).name]
        if column.dtype.kind == 'O':
            column = np.sort(column[column.notnull()].compute())
            return (column[0], column[-1]) if len(column) else (None, None)
        else:
            return dd.compute(column.min(), column.max())
Example #3
    def range(cls, dataset, dimension):
        import dask.dataframe as dd
        column = dataset.data[dataset.get_dimension(dimension).name]
        if column.dtype.kind == 'O':
            column = np.sort(column[column.notnull()].compute())
            return (column[0], column[-1]) if len(column) else (None, None)
        else:
            return dd.compute(column.min(), column.max())
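Examples #1 through #3 share one idea: build both lazy aggregates first, then hand them to a single dd.compute call so Dask evaluates them over one pass of the data instead of two. A minimal, self-contained sketch of that pattern on toy data (not taken from any of the projects above):

import numpy as np
import pandas as pd
import dask.dataframe as dd

# Toy frame standing in for dataset.data; names are illustrative only.
df = dd.from_pandas(pd.DataFrame({"value": np.arange(1000.0)}), npartitions=4)
column = df["value"]

# Both reductions are lazy; one compute() evaluates them together.
lo, hi = dd.compute(column.min(), column.max())
assert (lo, hi) == (0.0, 999.0)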
Example #4
def complicated_arithmetic_operation(df):
    # Haversine-style great-circle arithmetic over taxi coordinate columns.
    import numpy as np
    import dask.dataframe as dd

    theta_1 = df.pickup_longitude
    phi_1 = df.pickup_latitude
    theta_2 = df.dropoff_longitude
    phi_2 = df.dropoff_latitude
    temp = (np.sin((theta_2 - theta_1) / 2 * np.pi / 180) ** 2
            + np.cos(theta_1 * np.pi / 180) * np.cos(theta_2 * np.pi / 180) * np.sin((phi_2 - phi_1) / 2 * np.pi / 180) ** 2)
    ret = 2 * np.arctan2(np.sqrt(temp), np.sqrt(1 - temp))
    return dd.compute(ret)
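Note that dd.compute(ret) wraps the computed series in a one-element tuple. A hedged usage sketch on a toy frame (column values are made up):

import pandas as pd
import dask.dataframe as dd

toy = dd.from_pandas(pd.DataFrame({
    "pickup_longitude": [0.0, 10.0],
    "pickup_latitude": [0.0, 10.0],
    "dropoff_longitude": [1.0, 11.0],
    "dropoff_latitude": [1.0, 11.0],
}), npartitions=1)

(angles,) = complicated_arithmetic_operation(toy)  # unpack the 1-tuple
print(angles)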
Example #5
def test_consistency_interactions_episode_numbers(dataset: ContentWiseImpressions):
    na_episode_number_mask: ddf.Series = dataset.interactions.episode_number.isna()
    invalid_episode_number_mask: ddf.Series = (dataset.interactions.episode_number < 0)

    (na_episode_number_mask,
     invalid_episode_number_mask,) = ddf.compute(na_episode_number_mask,
                                                 invalid_episode_number_mask,)

    assert not na_episode_number_mask.any()
    assert not invalid_episode_number_mask.any()
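These ContentWiseImpressions tests all batch several lazy boolean masks into one ddf.compute call (the tests alias dask.dataframe as ddf), which lets Dask run a single task graph for all of them. A toy sketch of that pattern, not using the real dataset:

import pandas as pd
import dask.dataframe as ddf

toy = ddf.from_pandas(pd.DataFrame({"episode_number": [1.0, None, -3.0]}),
                      npartitions=1)
na_mask = toy.episode_number.isna()       # lazy
negative_mask = toy.episode_number < 0    # lazy

na_mask, negative_mask = ddf.compute(na_mask, negative_mask)  # one graph run
assert na_mask.any() and negative_mask.any()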
Example #6
def test_consistency_impressions_non_direct_link_recommended_lists_with_at_least_one_item(dataset: ContentWiseImpressions):
    empty_recommendation_list_mask = (dataset
                                      .impressions_non_direct_link
                                      .recommended_series_list
                                      .map(lambda recommended_series_list: recommended_series_list.shape[0] == 0,
                                           meta=("empty_recommendation_list_mask", "bool")))

    (empty_recommendation_list_mask,) = ddf.compute(empty_recommendation_list_mask,)

    assert not empty_recommendation_list_mask.any(skipna=False)
Example #7
def test_consistency_impressions_non_direct_link_recommended_series(dataset: ContentWiseImpressions):
    na_recommended_series_map_mask: ddf.Series = (dataset
                                                  .impressions_non_direct_link
                                                  .recommended_series_list
                                                  .map(lambda recommended_series_list: np.any(np.isnan(recommended_series_list)),
                                                       meta=("na_recommended_series_mask", "bool")))

    (na_recommended_series_map_mask,) = ddf.compute(na_recommended_series_map_mask)

    assert not na_recommended_series_map_mask.any(skipna=False)
Example #8
def test_consistency_impressions_non_direct_link_row_position(dataset: ContentWiseImpressions):
    na_row_position_mask: ddf.Series = dataset.impressions_non_direct_link.row_position.isna()
    row_position_less_than_zero_mask: ddf.Series = (dataset.impressions_non_direct_link.row_position < 0)

    (na_row_position_mask,
     row_position_less_than_zero_mask,) = ddf.compute(na_row_position_mask,
                                                      row_position_less_than_zero_mask, )

    assert not na_row_position_mask.any(skipna=False)
    assert not row_position_less_than_zero_mask.any(skipna=False)
Example #9
def test_consistency_interactions_series_length(dataset: ContentWiseImpressions):
    na_series_length_mask: ddf.Series = dataset.interactions.series_length.isna()
    invalid_series_length_mask: ddf.Series = (dataset.interactions.series_length < 0)

    (na_series_length_mask,
     invalid_series_length_mask,) = ddf.compute(na_series_length_mask,
                                                invalid_series_length_mask,)

    assert not na_series_length_mask.any()
    assert not invalid_series_length_mask.any()
Example #10
    def format_source_data(self, data):
        """
        Description:
            format the source data and compute the x/y ranges
        -------------------------------------------
        Input:
        source_dict = {
            'X': [],
            'Y': []
        }
        -------------------------------------------

        Output:
        """
        self.source = data
        self.x_range = (self.source[self.x].min(), self.source[self.x].max())
        self.y_range = (self.source[self.y].min(), self.source[self.y].max())
        if isinstance(data, dask_cudf.core.DataFrame):
            self.x_range = dd.compute(*self.x_range)
            self.y_range = dd.compute(*self.y_range)
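format_source_data only needs dd.compute when the source is a lazy (dask_cudf) frame; for an in-memory frame the min/max are already concrete. A sketch of the same guard using plain dask.dataframe, with illustrative names:

import pandas as pd
import dask.dataframe as dd

source = dd.from_pandas(pd.DataFrame({"x": [3.0, 1.0, 2.0]}), npartitions=1)
x_range = (source["x"].min(), source["x"].max())  # lazy scalars

if isinstance(source, dd.DataFrame):
    x_range = dd.compute(*x_range)  # materialize both ends at once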
Example #11
def test_consistency_impressions_direct_link_recommendation_list_length(dataset: ContentWiseImpressions):
    na_recommendation_list_length_mask: ddf.Series = dataset.impressions_direct_link.recommendation_list_length.isna()
    recommendation_list_length_less_than_zero_mask: ddf.Series = (
            dataset.impressions_direct_link.recommendation_list_length < 0)

    (na_recommendation_list_length_mask,
     recommendation_list_length_less_than_zero_mask,) = ddf.compute(na_recommendation_list_length_mask,
                                                                    recommendation_list_length_less_than_zero_mask,)

    assert not na_recommendation_list_length_mask.any(skipna=False)
    assert not recommendation_list_length_less_than_zero_mask.any(skipna=False)
Example #12
    def range(cls, dataset, dimension):
        import dask.dataframe as dd
        dimension = dataset.get_dimension(dimension, strict=True)
        column = dataset.data[dimension.name]
        if column.dtype.kind == 'O':
            column = np.sort(column[column.notnull()].compute())
            return (column[0], column[-1]) if len(column) else (None, None)
        else:
            if dimension.nodata is not None:
                column = cls.replace_value(column, dimension.nodata)
            return dd.compute(column.min(), column.max())
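This variant additionally masks a nodata sentinel before taking min/max. cls.replace_value is not shown in these examples; a plausible stand-in (an assumption, not the original project's implementation) would mask the sentinel lazily so it cannot win the reduction:

import numpy as np

def replace_value(column, nodata):
    # Hypothetical helper: NaN out the sentinel; evaluated later by dd.compute.
    return column.where(column != nodata, np.nan)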
Example #13
def test_consistency_interactions_index(dataset: ContentWiseImpressions):
    na_index_mask: ddf.Series = dataset.interactions.index.isna()
    min_index: int = dataset.interactions.index.min()
    max_index: int = dataset.interactions.index.max()

    (na_index_mask,
     min_index,
     max_index,) = ddf.compute(na_index_mask,
                               min_index,
                               max_index,)

    assert not na_index_mask.any()
Example #14
def test_consistency_interactions_items_have_same_series_length(dataset: ContentWiseImpressions):
    pairs_item_id_with_series_length = (dataset
                                        .interactions[["item_id", "series_length"]]
                                        .groupby("item_id")
                                        .series_length
                                        .agg(["min", "max"]))

    invalid_pairs_mask = (pairs_item_id_with_series_length["min"] != pairs_item_id_with_series_length["max"])

    (invalid_pairs_mask,) = ddf.compute(invalid_pairs_mask, scheduler="threads")

    assert not invalid_pairs_mask.any()
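The groupby consistency checks compute min and max per group in one agg, then compare them; a group is consistent when the two agree. A toy version under the same threaded scheduler:

import pandas as pd
import dask.dataframe as ddf

toy = ddf.from_pandas(
    pd.DataFrame({"item_id": [1, 1, 2], "series_length": [10, 10, 7]}),
    npartitions=1)

agg = toy.groupby("item_id").series_length.agg(["min", "max"])
(mask,) = ddf.compute(agg["min"] != agg["max"], scheduler="threads")
assert not mask.any()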
Example #15
def test_consistency_interactions_items_have_only_one_episode_number(dataset: ContentWiseImpressions):
    pairs_item_id_with_episode_number = (dataset
                                         .interactions[["item_id", "episode_number"]]
                                         .groupby("item_id")
                                         .episode_number
                                         .agg(["min", "max"]))

    invalid_pairs_mask = (pairs_item_id_with_episode_number["min"] != pairs_item_id_with_episode_number["max"])

    (invalid_pairs_mask, ) = ddf.compute(invalid_pairs_mask, scheduler="threads")

    assert not invalid_pairs_mask.any()
Example #16
def test_consistency_interactions_item_types(dataset: ContentWiseImpressions):
    na_item_type_mask: ddf.Series = dataset.interactions.item_type.isna()
    invalid_item_types_mask: ddf.Series = (dataset.interactions
                                           .item_type
                                           .map(lambda item_type: item_type not in {0, 1, 2, 3}))

    (na_item_type_mask,
     invalid_item_types_mask,) = ddf.compute(na_item_type_mask,
                                             invalid_item_types_mask,)

    assert not na_item_type_mask.any()
    assert not invalid_item_types_mask.any()
Example #17
def test_consistency_interactions_series_ids(dataset: ContentWiseImpressions):
    expected_number_series: int = dataset.metadata["num_series"]
    na_series_id_mask: ddf.Series = dataset.interactions.series_id.isna()
    invalid_series_id_mask: ddf.Series = ((dataset.interactions.series_id < 0) |
                                          (dataset.interactions.series_id > expected_number_series))

    (na_series_id_mask,
     invalid_series_id_mask,) = ddf.compute(na_series_id_mask,
                                            invalid_series_id_mask,)

    assert not na_series_id_mask.any()
    assert not invalid_series_id_mask.any()
Example #18
def test_consistency_impressions_non_direct_link_reported_length_equal_to_actual_length(dataset: ContentWiseImpressions):
    recommendation_list_length = dataset.impressions_non_direct_link.recommendation_list_length

    actual_length_of_recommended_series = (dataset
                                           .impressions_non_direct_link
                                           .recommended_series_list
                                           .map(lambda series: series.shape[0],
                                                meta=("actual_length_of_recommended_series", "int")))

    impressions_with_mismatching_length_mask = (recommendation_list_length != actual_length_of_recommended_series)

    (impressions_with_mismatching_length_mask,) = ddf.compute(impressions_with_mismatching_length_mask)

    assert not impressions_with_mismatching_length_mask.any(skipna=False)
Example #19
    def create_posjac(self):
        '''
        Replace our sparse jacobian with a positive variation (negative links
        are reversed). Self reactions and nonexistent species are removed.
        '''
        print('computing the posjac array')

        try:
            self.posjac
            print('posjac already exists, use "del <name>.posjac" to remove it')
        except AttributeError:
            pass

        # remove nonexistent species
        #rm = re.compile(r'\b%s\b'%'|'.join(set('->'.join(self.jacsp.columns).split('->'))-set(self.spec.columns)))
        #self.posjac = self.jacsp[filter(lambda x: not rm.search(x), self.jacsp.columns)]

        # drop self reactions, then materialize the remaining columns
        selfself = set('%s->%s' % (i, i) for i in self.spec.columns)
        rxns = list(set(self.jacsp.columns) - selfself)
        self.posjac = dd.compute(self.jacsp[rxns])[0]

        rev = re.compile(r'(.+)->(.+)')
        # split each reaction into positive and negative contributions
        for h in rxns:
            dummy = self.posjac[h]
            # keep static positive values unchanged
            self.posjac[h] = dummy * (dummy > 0).astype(float)

            # negative (reversed) values only
            lt = dummy < 0
            mx = np.array(dummy * (-lt.astype(float)))

            # add to the reverse link
            hp = rev.sub(r'\2->\1', h)
            try:
                self.posjac[hp] = self.posjac[hp] + mx
            except KeyError:
                self.posjac[hp] = mx

        # remove empty columns
        self.posjac = self.posjac[self.posjac.columns[(self.posjac != 0).sum().astype(bool)]]
Example #20
    def normalize(self, df):
        try:
            min_date, max_date = dd.compute(df.block_timestamp.min(),
                                            df.block_timestamp.max())
            self.day_diff = abs((max_date - min_date).days)
            logger.warning("NORMALIZATION started for day-diff:%s day(s)", self.day_diff)
            if self.day_diff > 0:
                for col in df.columns:
                    if isinstance(col, int) or isinstance(col, float):
                        logger.warning("NORMALIZATION ONGOING FOR %s", col)
                        df[col] = df[col].map(self.divide_by_day_diff)
            logger.warning("NORMALIZATION ended for day-diff:%s days", self.day_diff)
            return df
        except Exception:
            logger.error('normalize:', exc_info=True)
Example #21
def test_consistency_interactions_impressions_direct_link_only_common_recommendation_ids(dataset: ContentWiseImpressions):
    unique_shared_recommendation_ids = (dataset
                                        .interactions
                                        .merge(right=dataset.impressions_direct_link,
                                               how="inner",
                                               left_on="recommendation_id",
                                               right_index=True)
                                        .recommendation_id
                                        .unique())

    # The missing recommendation id (-1) counts as one extra recommendation id. The inner merge above
    # drops it, so we add it back to the count here.
    num_unique_shared_recommendation_ids = unique_shared_recommendation_ids.shape[0] + 1

    (num_unique_shared_recommendation_ids,) = ddf.compute(num_unique_shared_recommendation_ids)

    assert num_unique_shared_recommendation_ids == dataset.metadata["num_recommendations"]
Example #22
    def get_albedo(self, datapaths, albedo_type):  # load albedos and extract data

        DS = xr.open_mfdataset(datapaths[:], parallel=True, chunks='auto') #loading ncdf files
    
        ## Get QFLAG
        ## Give dtype here because lazy loading can't infer it (float by default but right_shift requires int)
        da_qflag = DS['QFLAG'].astype(np.uint8)
        #da_snowmask = np.logical_and(np.right_shift(da_qflag, 5), 1)==1 # True if snow, False otherwise
        da_snowmask = (da_qflag & 32)==32 # True if snow, False otherwise

        ## Get albedo data
        da_al = DS[albedo_type] #getting data for specific band
        if self.mode=='nosnow':
            da_al = da_al.where(~da_snowmask) # filter out snow: set to NaN where da_snowmask is True

        elif self.mode=='snowmask':
            da_al = da_snowmask
        
        #da_mean_lowres = da_al.sel(lat=slice(None, None, 50), lon=slice(None, None, 50)).mean('time') #downsampling for faster plotting
        
        #da_mean_lowres = da_al.isel(lon=slice(5400, 6100), lat=slice(2100,2700)).mean('time') # center of Africa

        #da_mean_lowres = da_al.isel(lon=slice(6000, 9000), lat=slice(None,2000)).mean('time') # Asia
        da_mean_lowres = da_al.isel(lat=slice(None,1600)).mean('time') # High latitudes
        #da_mean_lowres = da_al.isel(lon=slice(22400, 33600, 10), lat=slice(None,7467, 10)).mean('time') # Asia for 1KM
        #da_mean_lowres = da_al.isel(lon=slice(7000, 9000), lat=slice(700,1700)).mean('time') # himalaya
        #da_mean_lowres = da_al.isel(lon=slice(7500, 8300), lat=slice(1200,1600)).mean('time') # himalaya zoom
        #da_mean_lowres = da_al.isel(lon=slice(7640, 7760), lat=slice(1300,1360)).mean('time') # himalaya big zoom

        # da_mean_lowres = da_al.mean('time') # Full res
    
        if self.mode=='snowmask':
            da_mean_lowres = da_mean_lowres.where(da_mean_lowres>1.e-6)

        #getting average, min and max albedos for each time step (used to plot timeline)
        if 0:
            da_timeline_mean = da_al.mean(['lon','lat'])
            da_timeline_max  = da_al.max(['lon','lat'])
            da_timeline_min  = da_al.min(['lon','lat'])
        da_timeline_mean = da_al.sel(lat=slice(None, None, 50), lon=slice(None, None, 50)).mean(['lon','lat'])
        da_timeline_max  = da_al.sel(lat=slice(None, None, 50), lon=slice(None, None, 50)).max(['lon','lat'])
        da_timeline_min  = da_al.sel(lat=slice(None, None, 50), lon=slice(None, None, 50)).min(['lon','lat'])
    
        res_comp = dd.compute(da_mean_lowres, da_timeline_mean, da_timeline_max, da_timeline_min) 

        return res_comp
Example #23
def main():
    input_path = "/Users/Jason/Downloads/tempdata/Run06136_r0.tio"
    pedestal_path = "/Users/Jason/Downloads/tempdata/Run06136_ped.tcal"
    max_events = None

    reader = TIOReader(input_path, max_events=max_events)
    wf_calib = WaveformCalibrator(
        pedestal_path, reader.n_pixels, reader.n_samples
    )

    dtio = DaskTIO(reader, wf_calib)
    ddf = dtio.get_file_df()
    # print("here")
    df_0, df_2 = dd.compute(
        ddf.groupby(['ipix', 'fblock', 'fbpisam'])['r0'].std(),
        ddf.groupby(['ipix', 'fci', 'fbpisam'])['r0'].std(),
    )
    embed()
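Passing both groupby reductions to one dd.compute, as main() does, lets Dask share the scan of ddf between them. A self-contained toy version of that call (the real frame comes from DaskTIO.get_file_df):

import pandas as pd
import dask.dataframe as dd

toy = dd.from_pandas(pd.DataFrame({
    "ipix": [0, 0, 1, 1], "fblock": [0, 0, 1, 1],
    "fci": [0, 1, 0, 1], "fbpisam": [0, 0, 0, 0],
    "r0": [1.0, 2.0, 3.0, 4.0],
}), npartitions=2)

df_0, df_2 = dd.compute(
    toy.groupby(["ipix", "fblock", "fbpisam"])["r0"].std(),
    toy.groupby(["ipix", "fci", "fbpisam"])["r0"].std(),
)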
Example #24
def test_consistency_interactions_vision_factor(dataset: ContentWiseImpressions):
    # Vision factor values should only be set when the interaction type is "Viewed" (0).
    # We verify that all "viewed" interactions have valid values (from 0.0 to 5.0).
    # For any other interaction, we verify that the value is -1.
    na_vision_factor_mask: ddf.Series = dataset.interactions.vision_factor.isna()
    viewed_interactions_vision_factors = dataset.interactions[dataset.interactions.interaction_type == 0].vision_factor
    viewed_invalid_vision_factor_mask: ddf.Series = ((viewed_interactions_vision_factors < 0) |
                                                     (viewed_interactions_vision_factors > 5.0))

    other_interactions_vision_factors = dataset.interactions[dataset.interactions.interaction_type != 0].vision_factor
    other_invalid_vision_factor_mask: ddf.Series = (other_interactions_vision_factors != -1.0)

    (na_vision_factor_mask,
     viewed_invalid_vision_factor_mask,
     other_invalid_vision_factor_mask,) = ddf.compute(na_vision_factor_mask,
                                                      viewed_invalid_vision_factor_mask,
                                                      other_invalid_vision_factor_mask,)

    assert not na_vision_factor_mask.any()
    assert not viewed_invalid_vision_factor_mask.any()
    assert not other_invalid_vision_factor_mask.any()
Example #25
def min_max_count(x, column=0):
    """ min_max_count

    Handles min, max and count. This works on numpy, lists, pandas and dask dataframes.

    :param column:
    :param x: list, numpy array, series, pandas or dask dataframe
    :return: min, max and count
    """
    if dd and type(x) in (dd.core.DataFrame, dd.core.Series):
        omin, omax, count = dd.compute(x.min(), x.max(), x.count())
    elif type(x) in (pd.DataFrame, pd.Series):
        omin = x.min()
        omax = x.max()
        count = len(x)
    else:
        omin = min(x)
        omax = max(x)
        count = len(x)

    return omin, omax, int(count)
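Possible usage of min_max_count, assuming dask.dataframe was imported as dd at module level (the guard above implies dd may also be None when dask is unavailable) and the classic, non-expression dask.dataframe types:

import pandas as pd
import dask.dataframe as dd

series = dd.from_pandas(pd.Series([5, 3, 9]), npartitions=2)
omin, omax, count = min_max_count(series)  # -> (3, 9, 3)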
Example #26
def missing_spectrum(df: dd.DataFrame, num_bins: int,
                     num_cols: int) -> Intermediate:
    """
    Calculate a missing spectrum for each column
    """
    # pylint: disable=too-many-locals
    num_bins = min(num_bins, len(df) - 1)

    df = df.iloc[:, :num_cols]
    cols = df.columns[:num_cols]
    ncols = len(cols)
    nrows = len(df)
    chunk_size = len(df) // num_bins

    data = df.isnull().to_dask_array()
    data.compute_chunk_sizes()
    data = data.rechunk((chunk_size, None))

    (missing_fraction, ) = dd.compute(data.sum(axis=0) / data.shape[0])
    missing_percent = {
        col: missing_fraction[idx]
        for idx, col in enumerate(cols)
    }

    missing_percs = data.map_blocks(missing_perc_blockwise,
                                    dtype=float).compute()
    locs0 = np.arange(len(missing_percs)) * chunk_size
    locs1 = np.minimum(locs0 + chunk_size, nrows)
    locs_middle = locs0 + chunk_size / 2

    df = pd.DataFrame({
        "column": np.repeat(cols.values, len(missing_percs)),
        "location": np.tile(locs_middle, ncols),
        "missing_rate": missing_percs.T.ravel(),
        "loc_start": np.tile(locs0, ncols),
        "loc_end": np.tile(locs1, ncols),
    })
    return Intermediate(data=df,
                        missing_percent=missing_percent,
                        visual_type="missing_spectrum")
Example #27
def measureValue(sessionStoreData, relayoutData, selectedData, measuresValue, fixFilterValue, clearFiltersButton):
    # avoid shadowing the builtins min/max
    year_min = inventory.begin.min()
    year_max = inventory.end.max()
    startValue = 1970

    ctx = dash.callback_context

    if ctx.triggered[0]['prop_id'].split('.')[0] == 'measures' and len(ctx.triggered) > 1:
        value = [startValue, year_max]
    elif ctx.triggered[0]['prop_id'].split('.')[0] == 'clearFiltersButton':
        value = [startValue, year_max]
    elif fixFilterValue == 'Time':
        raise PreventUpdate
    elif relayoutData.get('dragmode') == 'lasso' and selectedData is None:
        raise PreventUpdate
    else:
        df = inventory
        # avoid shadowing the dask.dataframe alias "dd"
        filtered = filter_by_mapbox_data(df, relayoutData, selectedData)
        df = filtered.compute()
        value = [df.begin.min(), df.end.max()]

    markColor = '#EBEBEB'
    marks = {}
    for year in range(year_min, year_max, 20):
        marks.update({year: {'label': str(year), 'style': {'color': markColor}}})
    marks.update({year_max: {'label': str(year_max), 'style': {'color': markColor}}})

    sliderDict = {'min': year_min, 'max': year_max, 'value': value, 'marks': marks}

    setRedis('sliderValue', sliderDict, sessionStoreData)

    return 'Computed'
Example #28
    def create_posjac(self):
        '''
        Replace our sparse jacobian with a positive variation (negative links
        are reversed). Self reactions and nonexistent species are removed.
        '''
        print('computing the posjac array')

        try:
            return self.posjac
        except AttributeError:
            pass

        #remove nonexistent species
        #rm = re.compile(r'\b%s\b'%'|'.join(set('->'.join(self.jacsp.columns).split('->'))-set(self.spec.columns)))
        #self.posjac = self.jacsp[filter(lambda x: not rm.search(x), self.jacsp.columns)]

        #self reactions and negatives
        contains = set(self.jacsp.columns[(self.jacsp < 0).sum().astype(bool)])
        selfself = set('%s->%s' % (i, i) for i in self.spec.columns)
        self.posjac = dd.compute(
            self.jacsp[list(set(self.jacsp.columns) - selfself)])[0]

        rev = re.compile(r'(.+)->(.+)')
        #for each negative reaction
        for h in contains - selfself:
            dummy = self.posjac[h]
            lt = dummy < 0
            mx = np.array(dummy * (-lt.astype(float)))
            self.posjac[h] = dummy * (dummy > 0)

            hp = rev.sub(r'\2->\1', h)
            try:
                self.posjac[hp] = self.posjac[hp] + mx
            except KeyError:
                self.posjac[hp] = mx

        #remove empty columns
        self.posjac = self.posjac[self.posjac.columns[(self.posjac > 0).any()]]
Example #29
def test_consistency_interactions_impressions_direct_link_interacted_items_are_inside_recommendation_list(dataset: ContentWiseImpressions):
    def get_series_index_on_recommendation_list(row) -> int:
        results: np.ndarray = np.where(row.recommended_series_list == row.series_id)
        indices: np.ndarray = results[0]

        if len(indices) == 0:
            return -1
        return indices[0]

    dataset: ddf.DataFrame = dataset.interactions.merge(right=dataset.impressions_direct_link,
                                                        how="inner",
                                                        left_on="recommendation_id",
                                                        right_index=True)

    dataset["recommendation_index"] = dataset.apply(get_series_index_on_recommendation_list,
                                                    axis="columns",
                                                    meta=("recommendation_index", "int32"))

    series_not_found_on_recommendation_mask: ddf.Series = (dataset.recommendation_index == -1)

    (series_not_found_on_recommendation_mask,) = ddf.compute(series_not_found_on_recommendation_mask)

    assert not series_not_found_on_recommendation_mask.any(skipna=False)
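The row-wise apply above needs an explicit meta so Dask knows the output column name and dtype without sampling. A toy version of the same apply-then-compute pattern:

import numpy as np
import pandas as pd
import dask.dataframe as ddf

toy = ddf.from_pandas(pd.DataFrame({
    "series_id": [2, 9],
    "recommended_series_list": [np.array([1, 2, 3]), np.array([4, 5])],
}), npartitions=1)

def index_in_list(row) -> int:
    hits = np.where(row.recommended_series_list == row.series_id)[0]
    return hits[0] if len(hits) else -1

toy["recommendation_index"] = toy.apply(index_in_list, axis="columns",
                                        meta=("recommendation_index", "int32"))
(not_found,) = ddf.compute(toy.recommendation_index == -1)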
Example #30
    def transform(self, input_scores, calibrated_scores):
        """
        Calibrates scores

        Parameters
        ----------

           input_scores: list
              Input score files to be calibrated

           calibrated_scores: list
              Output score files

        """

        assert isinstance(input_scores, (list, tuple))
        assert isinstance(calibrated_scores, (list, tuple))
        assert len(calibrated_scores) == len(input_scores)
        for file_name, output_file_name in zip(input_scores,
                                               calibrated_scores):
            # Fetching scores
            dataframe = dask.dataframe.read_csv(file_name)
            dataframe = dataframe.compute()
            X = dataframe["score"].to_numpy()

            # avoid shadowing the calibrated_scores parameter
            scores = np.vstack([
                fitter.predict_proba(X) for fitter in self._categorical_fitters
            ]).T
            scores = self.reduction_function(scores, axis=1)
            dataframe["score"] = scores

            dataframe.to_csv(output_file_name, index=False)

        return scores
Example #31
def test_consistency_interactions_explicit_ratings(dataset: ContentWiseImpressions):
    # Explicit ratings values should only be set when the interaction type is "Rated" (2).
    # We verify that all "rated" interactions have valid values (from 0.0 to 5.0 with steps of 0.5)
    # For any other interaction, we verify that the value is -1.
    na_explicit_ratings_mask: ddf.Series = dataset.interactions.explicit_rating.isna()

    rated_interactions_explicit_ratings: ddf.Series = dataset.interactions[dataset.interactions.interaction_type ==
                                                                           2].explicit_rating
    rated_invalid_explicit_ratings_mask: ddf.Series = (rated_interactions_explicit_ratings
                                                       .map(lambda rating: rating not in np.linspace(0.0, 5.0,
                                                                                                     num=11)))

    other_interactions_explicit_rating = dataset.interactions[
        dataset.interactions.interaction_type != 2].explicit_rating
    other_invalid_explicit_ratings_mask: ddf.Series = (other_interactions_explicit_rating != -1.0)

    (na_explicit_ratings_mask,
     rated_invalid_explicit_ratings_mask,
     other_invalid_explicit_ratings_mask,) = ddf.compute(na_explicit_ratings_mask,
                                                         rated_invalid_explicit_ratings_mask,
                                                         other_invalid_explicit_ratings_mask,)
    assert not na_explicit_ratings_mask.any()
    assert not rated_invalid_explicit_ratings_mask.any()
    assert not other_invalid_explicit_ratings_mask.any()
Example #32
def test_consistency_interactions_impressions_non_direct_link_only_common_user_ids(dataset: ContentWiseImpressions):
    # NOTE: We calculate uniqueness of user_ids on the impressions_non_direct_link due to the high impact on memory
    # that the merges take if not done in this way.
    unique_user_ids_on_impressions_non_direct_link = (dataset
                                                      .impressions_non_direct_link
                                                      .reset_index(drop=False)
                                                      .user_id
                                                      .unique()
                                                      .to_frame(name='user_id'))

    unique_shared_user_ids = (dataset
                              .interactions
                              .merge(right=unique_user_ids_on_impressions_non_direct_link,
                                     how="inner",
                                     left_on="user_id",
                                     right_on="user_id")
                              .user_id
                              .unique())

    num_unique_shared_user_ids = unique_shared_user_ids.shape[0]

    (num_unique_shared_user_ids,) = ddf.compute(num_unique_shared_user_ids)

    assert num_unique_shared_user_ids == dataset.metadata["num_users"]
Example #33
def missing_impact_1v1(  # pylint: disable=too-many-locals
    df: dd.DataFrame,
    x: str,
    y: str,
    bins: int,
    ndist_sample: int,
    dtype: Optional[DTypeDef] = None,
) -> Intermediate:
    # pylint: disable=too-many-arguments
    """
    Calculate the distribution change on another column y when
    the missing values in x is dropped.
    """

    df0 = df[[x, y]]
    df1 = df.dropna(subset=[x])

    srs0, srs1 = df0[y], df1[y]
    minimum, maximum = srs0.min(), srs0.max()

    hists = [
        histogram(srs, dtype=dtype, bins=bins, return_edges=True)
        for srs in [srs0, srs1]
    ]
    hists = da.compute(*hists)

    meta = ColumnsMetadata()
    meta["y", "dtype"] = detect_dtype(df[y], dtype)

    if is_dtype(detect_dtype(df[y], dtype), Continuous()):
        dists = [rv_histogram((hist[0], hist[2]))
                 for hist in hists]  # type: ignore
        xs = np.linspace(minimum, maximum, ndist_sample)

        pdfs = [dist.pdf(xs) for dist in dists]
        cdfs = [dist.cdf(xs) for dist in dists]

        distdf = pd.DataFrame({
            "x": np.tile(xs, 2),
            "pdf": np.concatenate(pdfs),
            "cdf": np.concatenate(cdfs),
            "label": np.repeat(LABELS, ndist_sample),
        })

        counts, xs, edges = zip(*hists)

        lower_bounds: List[float] = []
        upper_bounds: List[float] = []

        for edge in edges:
            lower_bounds.extend(edge[:-1])
            upper_bounds.extend(edge[1:])

        histdf = pd.DataFrame({
            "x": np.concatenate(xs),
            "count": np.concatenate(counts),
            "label": np.repeat(LABELS, [len(count) for count in counts]),
            "lower_bound": lower_bounds,
            "upper_bound": upper_bounds,
        })

        quantiles = [[srs.quantile(q) for q in [0, 0.25, 0.5, 0.75, 1]]
                     for srs in [srs0, srs1]]
        quantiles = dd.compute(*quantiles)

        boxdf = pd.DataFrame(quantiles)
        boxdf.columns = ["min", "q1", "q2", "q3", "max"]

        iqr = boxdf["q3"] - boxdf["q1"]
        boxdf["upper"] = np.minimum(boxdf["q3"] + 1.5 * iqr, boxdf["max"])
        boxdf["lower"] = np.maximum(boxdf["q3"] - 1.5 * iqr, boxdf["min"])
        boxdf["label"] = LABELS

        itmdt = Intermediate(
            dist=distdf,
            hist=histdf,
            box=boxdf,
            meta=meta["y"],
            x=x,
            y=y,
            visual_type="missing_impact_1v1",
        )
        return itmdt
    else:
        counts, xs = zip(*hists)

        df = pd.DataFrame({
            "x": np.concatenate(xs, axis=0),
            "count": np.concatenate(counts, axis=0),
            "label": np.repeat(LABELS, [len(count) for count in counts]),
        })

        # If the cardinality of a categorical column is too large,
        # we show the top `num_bins` values, sorted by their count before drop
        if len(counts[0]) > bins:
            sortidx = np.argsort(-counts[0])
            selected_xs = xs[0][sortidx[:bins]]
            df = df[df["x"].isin(selected_xs)]
            partial = (bins, len(counts[0]))
        else:
            partial = (len(counts[0]), len(counts[0]))

        meta["y", "partial"] = partial

        itmdt = Intermediate(
            hist=df,
            x=x,
            y=y,
            meta=meta["y"],
            visual_type="missing_impact_1v1",
        )
        return itmdt
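Note how dd.compute(*quantiles) above hands over a list of lists of lazy scalars in one call; dask.compute traverses nested containers. A small sketch:

import pandas as pd
import dask.dataframe as dd

srs = dd.from_pandas(pd.Series(range(101)), npartitions=4)
lazy = [[srs.quantile(q) for q in [0, 0.25, 0.5, 0.75, 1]]]
(computed,) = dd.compute(*lazy)  # all five quantiles in one traversal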
Example #34
def missing_impact_1vn(  # pylint: disable=too-many-locals
    df: dd.DataFrame,
    x: str,
    bins: int,
    dtype: Optional[DTypeDef] = None,
) -> Intermediate:
    """
    Calculate the distribution change on other columns when
    the missing values in x is dropped.
    """
    df0 = df
    df1 = df.dropna(subset=[x])
    cols = [col for col in df.columns if col != x]

    hists = {}
    hists_restore_dtype = {}

    for col in cols:
        range = None  # pylint: disable=redefined-builtin
        if is_dtype(detect_dtype(df0[col], dtype), Continuous()):
            range = (df0[col].min(axis=0), df0[col].max(axis=0))

        hists[col] = [
            histogram(df[col],
                      dtype=dtype,
                      bins=bins,
                      return_edges=True,
                      range=range) for df in [df0, df1]
        ]

        # In some cases (Issue #98), dd.compute() can change the feature dtypes and cause errors,
        # so we need to restore the feature dtypes after dd.compute().
        centers_dtypes = (hists[col][0][1].dtype, hists[col][1][1].dtype)
        (hists, ) = dd.compute(hists)
        dict_value = []

        # Here we do not reassign to the "hists" variable, as
        # dd.compute() can change variables' types and break the mypy check in CircleCI.
        # Instead, we assign to a new variable, hists_restore_dtype.
        for i in [0, 1]:
            intermediate = list(hists[col][i])
            intermediate[1] = intermediate[1].astype(centers_dtypes[i])
            dict_value.append(tuple(intermediate))
        hists_restore_dtype[col] = dict_value

    dfs = {}

    meta = ColumnsMetadata()

    for col, hists_ in hists_restore_dtype.items():
        counts, xs, *edges = zip(*hists_)

        labels = np.repeat(LABELS, [len(x) for x in xs])

        data = {
            "x": np.concatenate(xs),
            "count": np.concatenate(counts),
            "label": labels,
        }

        if edges:
            lower_bound: List[float] = []
            upper_bound: List[float] = []

            for edge in edges[0]:
                lower_bound.extend(edge[:-1])
                upper_bound.extend(edge[1:])

            data["lower_bound"] = lower_bound
            data["upper_bound"] = upper_bound

        df = pd.DataFrame(data)

        # If the cardinality of a categorical column is too large,
        # we show the top `num_bins` values, sorted by their count before drop
        if len(counts[0]) > bins and is_dtype(detect_dtype(df0[col], dtype),
                                              Nominal()):
            sortidx = np.argsort(-counts[0])
            selected_xs = xs[0][sortidx[:bins]]
            df = df[df["x"].isin(selected_xs)]
            meta[col, "partial"] = (bins, len(counts[0]))
        else:
            meta[col, "partial"] = (len(counts[0]), len(counts[0]))
        meta[col, "dtype"] = detect_dtype(df0[col], dtype)
        dfs[col] = df

    return Intermediate(data=dfs,
                        x=x,
                        meta=meta,
                        visual_type="missing_impact_1vn")