def test_nonvectorized_math_apply_on_small_dataframe(self):
    """Swifter's DataFrame apply must equal pandas' apply on a 1000-row frame."""
    LOG.info("test_nonvectorized_math_apply_on_small_dataframe")
    n_rows = 1000
    frame = pd.DataFrame(
        {
            "x": np.random.normal(size=n_rows),
            "y": np.random.uniform(size=n_rows),
        }
    )

    # Reference result: pandas apply wrapped in a tqdm progress bar.
    tqdm.pandas(desc="Pandas Nonvec math apply ~ DF")
    expected = frame.progress_apply(math_agg_foo)

    # Result under test: the same function routed through the swifter accessor.
    accessor = frame.swifter.progress_bar(desc="Vec math apply ~ DF")
    actual = accessor.apply(math_agg_foo)

    self.assertEqual(expected, actual)  # equality test
def _dask_apply(self, func, *args, **kwds):
    """Run a rolling apply through dask, falling back to pandas on failure.

    The dask rolling apply is first executed on ``self._sample_original`` and
    compared with the pandas result on ``self._sample_pd``. If that check
    passes, ``func`` is applied to ``self._obj_dd`` and computed. If any of
    the handled exceptions is raised along the way, the pandas object
    ``self._obj_pd`` is used instead.
    """

    def _compute_full():
        # The dask computation on the complete object.
        return self._obj_dd.apply(func, *args, **kwds).compute(scheduler=self._scheduler)

    try:
        # check that the dask rolling apply matches the pandas apply
        with suppress_stdout_stderr():
            sample_dd = dd.from_pandas(self._sample_original, npartitions=self._npartitions)
            # "on" and "closed" are filtered out before the kwargs are
            # forwarded to the dask rolling call.
            dask_rolling_kwds = {
                key: value
                for key, value in self._rolling_kwds.items()
                if key not in ["on", "closed"]
            }
            dask_sample = (
                sample_dd.rolling(**dask_rolling_kwds)
                .apply(func, *args, **kwds)
                .compute(scheduler=self._scheduler)
            )
            pandas_sample = self._sample_pd.apply(func, *args, **kwds)
            self._validate_apply(
                dask_sample.equals(pandas_sample),
                error_message="Dask rolling apply sample does not match pandas rolling apply sample.",
            )

        if not self._progress_bar:
            return _compute_full()
        with TQDMDaskProgressBar(desc=self._progress_bar_desc or "Dask Apply"):
            return _compute_full()
    except (AttributeError, ValueError, TypeError, KeyError):
        # Fallback: plain pandas, with or without a tqdm progress bar.
        if not self._progress_bar:
            return self._obj_pd.apply(func, *args, **kwds)
        tqdm.pandas(desc=self._progress_bar_desc or "Pandas Apply")
        return self._obj_pd.progress_apply(func, *args, **kwds)
def add_calendar_features(df, use_lunar=True, use_holiday=True):
    """
    Add hour, weekday, month, day-of-year, weekend and, optionally,
    lunar-calendar and holiday features to the dataframe.

    The basic columns are written onto ``df`` itself. The index is read
    through the ``.dt`` accessor, so it is presumably a datetime index
    (TODO confirm against callers).

    :param df: dataframe whose index provides the timestamps
    :param use_lunar: if True, merge in the output of
        ``get_lunar_calendar_features`` and cast ``LeapMonth`` to int
    :param use_holiday: if True, add ``Holiday`` (from ``get_holiday``) and
        the 0/1 flag ``IsHoliday``
    :return: the dataframe with the feature columns added
    """
    timestamps = df.index.to_series()
    df['Hour'] = timestamps.dt.hour
    df['DayOfWeek'] = timestamps.dt.dayofweek
    df['Month'] = timestamps.dt.month
    df['DayOfYear'] = timestamps.dt.dayofyear
    # Saturday or Sunday
    df['Weekend'] = df['DayOfWeek'].isin([5, 6]).astype(int)

    # progress_apply is only needed by the optional features below
    if use_lunar or use_holiday:
        tqdm.pandas()

    if use_lunar:
        lunar_features = timestamps.progress_apply(get_lunar_calendar_features)
        df = df.merge(lunar_features, left_index=True, right_index=True)
        df['LeapMonth'] = df['LeapMonth'].astype(int)

    if use_holiday:
        # Running this will take a lot of time.
        # The index is re-read here because the merge above may have
        # produced a new frame.
        holiday = df.index.to_series().progress_apply(get_holiday)
        df['Holiday'] = holiday
        df['IsHoliday'] = (df['Holiday'] != 'No').astype(int)

    return df
def apply(self, func, convert_dtype=True, args=(), **kwds):
    """
    Apply the function to the Series using swifter.

    Strategy, in order: return early with pandas on an empty Series; try to
    call ``func`` on the whole Series (vectorized) after validating it on a
    small sample; otherwise time the pandas apply on a sample and pick dask
    or pandas based on the estimated duration.
    """

    def _pandas_apply(series):
        return series.apply(func, convert_dtype=convert_dtype, args=args, **kwds)

    # if the series is empty, return early using Pandas
    if not self._nrows:
        return _pandas_apply(self._obj)

    sample = self._obj.iloc[:self._npartitions * 2]

    # check if input is string or if the user is overriding the string processing default
    if self._allow_dask_on_strings:
        allow_dask_processing = True
    else:
        allow_dask_processing = sample.dtype != "object"

    if "axis" in kwds:
        kwds.pop("axis")
        warnings.warn("Axis keyword not necessary because applying on a Series.")

    try:
        # try to vectorize
        with suppress_stdout_stderr_logging():
            vectorized_sample = func(sample, *args, **kwds)
            pandas_sample = _pandas_apply(sample)
            self._validate_apply(
                np.array_equal(pandas_sample, vectorized_sample)
                & (pandas_sample.shape == vectorized_sample.shape),
                error_message="Vectorized function sample doesn't match pandas apply sample.",
            )
        return func(self._obj, *args, **kwds)
    except ERRORS_TO_HANDLE:
        # if can't vectorize, estimate time to pandas apply
        wrapped = self._wrapped_apply(func, convert_dtype=convert_dtype, args=args, **kwds)
        sample_proc_est = timeit.timeit(wrapped, number=N_REPEATS) / N_REPEATS
        est_apply_duration = sample_proc_est / self._SAMPLE_SIZE * self._obj.shape[0]

        # if pandas sample apply takes too long and not performing str processing, use dask
        if (est_apply_duration > self._dask_threshold) and allow_dask_processing:
            return self._dask_apply(func, convert_dtype, *args, **kwds)

        # use pandas
        if not self._progress_bar:
            return _pandas_apply(self._obj)
        tqdm.pandas(desc=self._progress_bar_desc or "Pandas Apply")
        return self._obj.progress_apply(func, convert_dtype=convert_dtype, args=args, **kwds)
def extract_feature(self):
    """Multi-hot encode the number of impressions of each last clickout.

    Returns one row per index found by ``find_last_clickout_indices``,
    indexed by the original dataframe index, with ``user_id``,
    ``session_id`` and 25 columns ``impr_c0`` .. ``impr_c24``; column ``k``
    is 1 when the clickout showed more than ``k`` impressions.
    """
    tqdm.pandas()

    sort_keys = ['user_id', 'session_id', 'timestamp', 'step']
    # reset_index() keeps the original index as the 'index' column
    full = data.full_df().sort_values(sort_keys).reset_index()

    # find the last clickout rows
    last_idxs = find_last_clickout_indices(full)
    wanted_cols = ['user_id', 'session_id', 'impressions', 'index']
    clickouts = full.loc[last_idxs, wanted_cols]

    impressions_per_row = clickouts.impressions.str.split('|').str.len()
    clickouts['impressions_count'] = impressions_per_row
    clickouts = clickouts.drop('impressions', axis=1)

    # multi-hot the counts: the first `count` cells of each row are set to 1
    multi_hot = np.zeros((clickouts.shape[0], 25), dtype=np.int8)
    for row_pos, count in tqdm(enumerate(clickouts.impressions_count.values)):
        multi_hot[row_pos, 0:count] = 1

    # add to the clickouts
    for col in range(25):
        clickouts['impr_c{}'.format(col)] = multi_hot[:, col]

    without_count = clickouts.drop('impressions_count', axis=1)
    return without_count.set_index('index')
def _dask_applymap(self, func):
    """Run ``applymap`` through dask, falling back to pandas on failure.

    The pandas result on a small sample serves both as dask ``meta`` and as
    the reference the dask sample result is validated against.
    """
    sample = self._obj.iloc[:self._npartitions * 2, :]
    with suppress_stdout_stderr_logging():
        meta = sample.applymap(func)

    def _run_dask(frame):
        # Same dask pipeline for the sample check and the full computation.
        lazy = dd.from_pandas(frame, npartitions=self._npartitions).applymap(func, meta=meta)
        return lazy.compute(scheduler=self._scheduler)

    try:
        with suppress_stdout_stderr_logging():
            # check that the dask apply matches the pandas apply
            dask_sample = _run_dask(sample)
            self._validate_apply(
                dask_sample.equals(meta),
                error_message="Dask applymap sample does not match pandas applymap sample.",
            )

        if not self._progress_bar:
            return _run_dask(self._obj)
        with TQDMDaskProgressBar(desc=self._progress_bar_desc or "Dask Applymap"):
            return _run_dask(self._obj)
    except ERRORS_TO_HANDLE:
        # if dask apply doesn't match pandas apply, fallback to pandas
        if self._progress_bar:
            tqdm.pandas(desc=self._progress_bar_desc or "Pandas Apply")
            return self._obj.progress_applymap(func)
        return self._obj.applymap(func)
def _dask_apply(self, func, *args, **kwds):
    """Run a rolling apply through dask, falling back to pandas on failure.

    The dask rolling apply on ``self._comparison_pd`` is validated against
    the pandas rolling apply on the same data before ``func`` is applied to
    ``self._obj_dd``. Any exception in ``ERRORS_TO_HANDLE`` switches to
    ``self._obj_pd``.
    """

    def _compute_full():
        return self._obj_dd.apply(func, *args, **kwds).compute(scheduler=self._scheduler)

    try:
        # check that the dask rolling apply matches the pandas apply
        with suppress_stdout_stderr_logging():
            comparison_dd = dd.from_pandas(self._comparison_pd, npartitions=self._npartitions)
            # "on" and "closed" are filtered out before the kwargs are
            # forwarded to the dask rolling call.
            dask_rolling_kwds = {
                key: value
                for key, value in self._rolling_kwds.items()
                if key not in ["on", "closed"]
            }
            dask_sample = (
                comparison_dd.rolling(**dask_rolling_kwds)
                .apply(func, *args, **kwds)
                .compute(scheduler=self._scheduler)
            )
            pandas_sample = self._comparison_pd.rolling(**self._rolling_kwds).apply(
                func, *args, **kwds
            )
            self._validate_apply(
                dask_sample.equals(pandas_sample),
                error_message="Dask rolling apply sample does not match pandas rolling apply sample.",
            )

        if not self._progress_bar:
            return _compute_full()
        with TQDMDaskProgressBar(desc=self._progress_bar_desc or "Dask Apply"):
            return _compute_full()
    except ERRORS_TO_HANDLE:
        if not self._progress_bar:
            return self._obj_pd.apply(func, *args, **kwds)
        tqdm.pandas(desc=self._progress_bar_desc or "Pandas Apply")
        return self._obj_pd.progress_apply(func, *args, **kwds)
def test_nonvectorized_math_apply_on_large_dataframe_broadcast(self):
    """Row-wise broadcast apply: swifter must equal pandas and, on multi-core, be faster."""
    LOG.info("test_nonvectorized_math_apply_on_large_dataframe_broadcast")
    n_rows = 250_000
    frame = pd.DataFrame(
        {
            "x": np.random.normal(size=n_rows),
            "y": np.random.uniform(size=n_rows),
        }
    )
    apply_kwargs = {"axis": 1, "result_type": "broadcast"}

    # Time the pandas reference.
    tqdm.pandas(desc="Pandas Nonvec math apply + broadcast ~ DF")
    pd_started = time.time()
    pd_val = frame.progress_apply(math_agg_foo, **apply_kwargs)
    pd_time = time.time() - pd_started

    # Time swifter on 4 partitions.
    swifter_started = time.time()
    accessor = frame.swifter.set_npartitions(4)
    accessor = accessor.progress_bar(desc="Nonvec math apply + broadcast ~ DF")
    swifter_val = accessor.apply(math_agg_foo, **apply_kwargs)
    swifter_time = time.time() - swifter_started

    self.assertEqual(pd_val, swifter_val)  # equality test
    if self.ncores > 1:  # speed test
        self.assertLess(swifter_time, pd_time)
def extract_feature(self):
    """Return log-scaled, min-max normalised prices of every clickout.

    For each row of ``data.full_df()`` whose ``action_type`` is
    ``'clickout item'`` the pipe-separated ``prices`` string is split into
    one column per position, missing positions are filled with 0,
    ``log(price + 1)`` is taken and the result is min-max scaled over all
    cells. The first 25 positions are returned as ``price_0`` ..
    ``price_24``, indexed like the clickout rows.

    Changes from the previous version:

    * the clickout rows are taken as an explicit copy, so adding the
      ``price_*`` columns no longer writes onto a slice of the full frame
      (pandas ``SettingWithCopyWarning``);
    * when no clickout has 25 prices, the missing positions are zero-padded
      (the same padding short rows already receive) instead of raising
      ``KeyError`` on ``log_prices.loc[:, i]``.
    """
    tqdm.pandas()

    df = data.full_df()

    # find the clickout interactions (explicit copy: columns are added below)
    is_clickout = df.action_type == 'clickout item'
    res_df = df.loc[is_clickout, ['user_id', 'session_id', 'prices']].copy()

    # expand the prices as vector
    expanded_prices = res_df.prices.str.split('|', expand=True).fillna(0).astype('int')

    # guarantee at least 25 price columns; never drop existing ones so the
    # scaling bounds below are unchanged when more than 25 are present
    n_cols = max(expanded_prices.shape[1], 25)
    expanded_prices = expanded_prices.reindex(columns=range(n_cols), fill_value=0)

    # scale log
    log_prices = np.log(expanded_prices + 1)
    max_price = log_prices.max().max()
    min_price = log_prices.min().min()
    log_prices = (log_prices - min_price) / (max_price - min_price)

    # add the prices to the resulting df
    for i in range(25):
        res_df['price_{}'.format(i)] = log_prices.loc[:, i]

    return res_df.drop(['user_id', 'session_id', 'prices'], axis=1)
def test_nonvectorized_text_modin_apply_on_large_dataframe(self):
    """Row-wise text apply on 3M rows: swifter must equal pandas and, on multi-core, be faster."""
    LOG.info("test_nonvectorized_text_modin_apply_on_large_dataframe")
    frame = pd.DataFrame(
        {
            "letter": ["I", "You", "We"] * 1_000_000,
            "value": ["want to break free"] * 3_000_000,
        }
    )

    # Time the pandas reference.
    tqdm.pandas(desc="Pandas Nonvec text apply ~ DF")
    pd_started = time.time()
    pd_val = frame.progress_apply(clean_text_foo, axis=1)
    pd_time = time.time() - pd_started

    # Time swifter with dask-on-strings disabled.
    swifter_started = time.time()
    ray_cpus = 2 if self.ncores >= 2 else 1
    accessor = frame.swifter.allow_dask_on_strings(False)
    accessor = accessor.set_npartitions(4)
    accessor = accessor.set_ray_compute(num_cpus=ray_cpus, memory=0.25)
    accessor = accessor.progress_bar(desc="Nonvec Modin text apply ~ DF")
    swifter_val = accessor.apply(clean_text_foo, axis=1)
    swifter_time = time.time() - swifter_started

    self.assertEqual(pd_val, swifter_val)  # equality test
    if self.ncores > 1:  # speed test
        self.assertLess(swifter_time, pd_time)
def test_nonvectorized_text_modin_apply_on_large_dataframe_returns_series(self):
    """Row-wise text apply that yields one value per row: swifter must equal pandas."""
    LOG.info("test_nonvectorized_text_modin_apply_on_large_dataframe_returns_series")
    frame = pd.DataFrame({"str_date": ["2000/01/01 00:00:00"] * 1_000_000})

    def date_part(row):
        # Text before the first whitespace of the "str_date" field.
        return row["str_date"].split()[0]

    # Time the pandas reference.
    tqdm.pandas(desc="Pandas Nonvec text apply ~ DF -> Srs")
    pd_started = time.time()
    pd_val = frame.progress_apply(date_part, axis=1)
    pd_time = time.time() - pd_started

    # Time swifter with dask-on-strings disabled.
    swifter_started = time.time()
    ray_cpus = 2 if self.ncores >= 2 else 1
    accessor = frame.swifter.allow_dask_on_strings(False)
    accessor = accessor.set_npartitions(4)
    accessor = accessor.set_ray_compute(num_cpus=ray_cpus, memory=0.25)
    accessor = accessor.progress_bar(desc="Nonvec Modin text apply ~ DF -> Srs")
    swifter_val = accessor.apply(date_part, axis=1)
    swifter_time = time.time() - swifter_started

    self.assertEqual(pd_val, swifter_val)  # equality test
    if self.ncores > 1:  # speed test
        self.assertLess(swifter_time, pd_time)
def test_nonvectorized_math_apply_on_small_series(self):
    """Swifter's Series apply must equal pandas' apply on a 1000-element Series."""
    LOG.info("test_nonvectorized_math_apply_on_small_series")
    frame = pd.DataFrame({"x": np.random.normal(size=1000)})
    values = frame["x"]

    # Reference result from pandas.
    tqdm.pandas(desc="Pandas Vec math apply ~ Series")
    expected = values.progress_apply(math_foo, compare_to=1)

    # Result under test from the swifter accessor.
    accessor = values.swifter.progress_bar(desc="Vec math apply ~ Series")
    actual = accessor.apply(math_foo, compare_to=1)

    self.assertEqual(expected, actual)  # equality test
def merge_speed_events(speed_df, events_df):
    """Attach to each event the road sensors lying inside its KM range.

    ``add_possible_sensors`` supplies a ``ROAD_SENSORS`` list per event; the
    entries within ``[KM_START, KM_END]`` are stored in a new ``sensors``
    column. Events with no sensor in range are dropped, and so is the
    ``ROAD_SENSORS`` column.

    Note: ``speed_df`` is accepted but not used by this function.
    """
    tqdm.pandas()
    events = add_possible_sensors(events_df)

    def sensors_in_range(row):
        return [km for km in row.ROAD_SENSORS if row.KM_START <= km <= row.KM_END]

    events['sensors'] = events.progress_apply(sensors_in_range, axis=1)

    has_sensor = events['sensors'].str.len() > 0
    events = events[has_sensor]
    return events.drop('ROAD_SENSORS', axis=1)
def clean_tweets(self, col_name):
    """Replace column ``col_name`` of ``self.df_`` with its processed tweets.

    ``self.process_tweet`` is applied to every value of the column behind a
    tqdm progress bar, and the elapsed time is printed in minutes.

    The function is applied to the column itself rather than row-wise over
    the whole frame. The previous ``apply(..., axis=1)`` built a Series for
    every row only to read one field from it, and on an empty frame it
    returned a DataFrame instead of an empty column.

    :param col_name: name of the column holding the tweets
    :return: ``self``, so calls can be chained
    """
    print('Cleaning tweets...')
    st = time()
    tqdm.pandas()
    self.df_[col_name] = self.df_[col_name].progress_apply(self.process_tweet)
    end = time()
    print('Finished in {0:.2f} minutes.'.format((end - st) / 60))
    return self
def _dask_apply(self, func, axis=0, broadcast=None, raw=False, reduce=None, result_type=None, *args, **kwds):
    """Apply ``func`` to the DataFrame through dask, falling back to pandas.

    The pandas result on a small sample is used as dask ``meta`` and as the
    reference for a sample run of the dask pipeline. When the two agree,
    the full frame is processed with dask; when they differ, or when one of
    the handled exceptions is raised, pandas is used instead.

    The sample comparison used to be a bare ``assert``, which is removed
    when Python runs with ``-O``; a mismatching dask result would then have
    been returned silently. It is now an explicit check raising
    ``ValueError``, which the handler below already catches.
    """
    samp = self._obj.iloc[: self._npartitions * 2, :]
    meta = samp.apply(
        func, axis=axis, broadcast=broadcast, raw=raw, reduce=reduce, result_type=result_type, args=args, **kwds
    )

    def _run_dask(frame):
        # Reads the current value of result_type, which may be overridden below.
        return (
            dd.from_pandas(frame, npartitions=self._npartitions)
            .apply(func, *args, axis=axis, raw=raw, result_type=result_type, meta=meta, **kwds)
            .compute(scheduler=self._scheduler)
        )

    try:
        # translate the legacy broadcast/reduce flags into result_type for dask
        if broadcast:
            result_type = "broadcast"
        elif reduce:
            result_type = "reduce"

        tmp_df = _run_dask(samp)
        if not tmp_df.equals(meta):
            raise ValueError("Dask apply sample does not match pandas apply sample.")

        if self._progress_bar:
            with TQDMDaskProgressBar(desc=self._progress_bar_desc or "Dask Apply"):
                return _run_dask(self._obj)
        else:
            return _run_dask(self._obj)
    except (AssertionError, AttributeError, ValueError, TypeError, KeyError):
        # fall back to pandas when dask cannot reproduce the pandas result
        if self._progress_bar:
            tqdm.pandas(desc=self._progress_bar_desc or "Pandas Apply")
            return self._obj.progress_apply(
                func, axis=axis, broadcast=broadcast, raw=raw, reduce=reduce, result_type=result_type, args=args, **kwds
            )
        else:
            return self._obj.apply(
                func, axis=axis, broadcast=broadcast, raw=raw, reduce=reduce, result_type=result_type, args=args, **kwds
            )
def _modin_apply(self, func, axis=0, raw=None, result_type=None, *args, **kwds):
    """Apply ``func`` to the DataFrame through modin, falling back to pandas.

    The modin result on a small sample is validated against the pandas
    result on the same sample before the full frame is processed. Any
    exception in ``ERRORS_TO_HANDLE`` switches to pandas.

    Fixes over the previous version:

    * the full-data call forwarded ``*args`` positionally, so a non-empty
      ``args`` collided with ``axis`` and always forced the pandas
      fallback; it now passes ``args=args`` like the sample call does;
    * when a one-column modin result is converted to a Series, the row
      index is kept instead of being replaced by a fresh default index.
    """
    sample = self._obj.iloc[:self._npartitions * 2, :]
    try:
        series = False
        with suppress_stdout_stderr_logging():
            import modin.pandas as md

            sample_df = sample.apply(func, axis=axis, raw=raw, result_type=result_type, args=args, **kwds)
            # check that the modin apply matches the pandas apply
            tmp_df = (
                md.DataFrame(sample)
                .apply(func, axis=axis, raw=raw, result_type=result_type, args=args, **kwds)
                ._to_pandas()
            )
            # pandas returned a Series but modin a DataFrame: compare on
            # modin's first column and remember to convert the final output
            if isinstance(sample_df, pd.Series) and isinstance(tmp_df, pd.DataFrame):
                tmp_df = pd.Series(tmp_df.values[:, 0], index=tmp_df.index)
                series = True
            self._validate_apply(
                tmp_df.equals(sample_df),
                error_message="Modin apply sample does not match pandas apply sample.",
            )

        output_df = (
            md.DataFrame(self._obj)
            .apply(func, axis=axis, raw=raw, result_type=result_type, args=args, **kwds)
            ._to_pandas()
        )
        if series:
            return pd.Series(output_df.values[:, 0], index=output_df.index)
        return output_df
    except ERRORS_TO_HANDLE:
        if self._progress_bar:
            tqdm.pandas(desc=self._progress_bar_desc or "Pandas Apply")
            apply_func = self._obj.progress_apply
        else:
            apply_func = self._obj.apply
        return apply_func(func, axis=axis, raw=raw, result_type=result_type, args=args, **kwds)
def value_overlap_matching(df, progress=True):
    """A schema matching method by calculating the similarities of link values.

    Every pair of columns whose name contains ``"http:"`` is scored with
    ``get_value_overlap``. Rows of the result are ordered by ``uri_1``; the
    previous version called ``sort_values`` but discarded its result, so
    the sort never took effect.

    Args:
        df (pd.DataFrame): The dataframe where matching attributes are
            supposed to be found.
        progress (bool, optional): If True, progress bars will be shown to
            inform the user about the progress made by the process.
            Defaults to True.

    Returns:
        pd.DataFrame: Two columns with matching links ("uri_1", "uri_2")
        and a third column with "value_overlap". The frame is empty when
        fewer than two URI columns exist.
    """
    df = df.copy()

    def _strip_prefix(name):
        # Drop everything in front of the (last) "http://" in the name.
        return re.sub(r"^.*http://", "http://", name)

    # get column names, strip URIs from them & create a dictionary that
    # maps the stripped name back to the original column name
    old_colnames = [col for col in df.columns if re.search("http:", col)]
    new_colnames = [_strip_prefix(col) for col in old_colnames]
    col_name_dict = dict(zip(new_colnames, old_colnames))

    # Create all unique combinations from the URIs and order each pair
    # alphabetically
    combinations_sorted = [
        sorted(pair) for pair in itertools.combinations(new_colnames, 2)
    ]

    # transform list into sorted DataFrame (stable sort keeps the original
    # relative order of rows sharing the same uri_1)
    df_combinations = pd.DataFrame(combinations_sorted, columns=["uri_1", "uri_2"])
    df_combinations = df_combinations.sort_values(
        by="uri_1", kind="mergesort").reset_index(drop=True)

    # Nothing to compare: return the empty frame with the expected columns
    if df_combinations.empty:
        df_combinations["value_overlap"] = pd.Series(dtype="float64")
        return df_combinations

    def _overlap(row):
        return get_value_overlap(df, col_name_dict, row["uri_1"], row["uri_2"])

    # For each combination in this DataFrame, calculate the similarity of
    # their values
    if progress:
        tqdm.pandas(desc="Value Overlap Matching: Calculate Value Overlaps")
        apply_rows = df_combinations.progress_apply
    else:
        apply_rows = df_combinations.apply
    df_combinations["value_overlap"] = apply_rows(_overlap, axis=1)

    return df_combinations
def main(args):
    """Generate synthetic (cmd, query) examples and write them to a CSV file.

    Steps: read the nl2bash JSON, let ``update_graph`` / ``add_utilities``
    collect a utilities graph and utility counts from its ``cmd`` column,
    add every key of the bashlint grammar, join with the man-page JSON to
    obtain synopsis and options, then call ``generate_commands``
    ``args.size`` times and save the results to ``args.output``.
    """
    nl2bash = pd.read_json(args.nl2bash).T

    # Pass 1: utilities graph (filled in place by update_graph).
    graph = defaultdict(set)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", FutureWarning)
        tqdm.pandas(desc="Extracting utilities graph")
        nl2bash['cmd'].progress_apply(partial(update_graph, graph=graph))

    # Pass 2: utility counts (filled in place by add_utilities).
    count_utilities = Counter()
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", FutureWarning)
        tqdm.pandas(desc="Extracting utilities from examples")
        nl2bash['cmd'].progress_apply(partial(add_utilities, counter=count_utilities))

    # Every grammar key is counted once more on top of the examples.
    count_utilities.update(bashlint.grammar.bg.grammar.keys())

    commands = pd.DataFrame.from_dict(count_utilities, orient='index', columns=["count"])
    commands = commands.reset_index().rename(columns={'index': 'cmd'})
    commands = commands.sort_values('count').reset_index(drop=True)
    commands['required'] = commands['cmd'].apply(number_of_required_arguments)
    print(f"Found {len(commands)} total utilities")

    manpage = pd.read_json(args.manpage, lines=True)
    commands = commands.merge(
        manpage[['name', 'synopsis']], left_on='cmd', right_on='name', how='left')
    commands.loc[commands['synopsis'].isna(), 'synopsis'] = ''

    # alias -> list of manpage row labels that declare it
    alias_to_idx = defaultdict(list)

    def get_aliases(row):
        row_label = row.name
        for alias in row['aliases']:
            alias_to_idx[alias[0]].append(row_label)

    # apply() is used only for its side effect on alias_to_idx
    manpage.apply(get_aliases, axis=1)

    commands = commands.drop_duplicates()
    print(f"Now {len(commands)} utilities")
    commands['options'] = commands['cmd'].apply(
        partial(get_options, manpage=manpage, alias_to_idx=alias_to_idx))
    del manpage

    results = [
        list(generate_commands(commands, graph))
        for _ in tqdm(range(args.size), desc="Generating examples")
    ]
    results = pd.DataFrame(results, columns=["cmd", "query"])
    results.to_csv(args.output, index=False)
def colocalize_apply(gdf1, gdf2, progress=False):
    """Colocalize gdf1 and gdf2.

    A row of gdf2 is colocated with a row of gdf1 when their date intervals
    overlap and, if geometries are available, when the gdf2 geometry
    contains, intersects or is within the gdf1 geometry.

    Side effect: when ``gdf1`` has no ``date_interval__`` column, one is
    added to it (built from ``startdate`` / ``stopdate``).

    Fixes over the previous version:

    * labels taken from gdf2 were wrapped with the constructor chosen for
      gdf1's index type, which is wrong when only one of the two frames has
      a MultiIndex; each side now uses its own constructor;
    * the results are collected in lists and appended once, instead of
      re-allocating both indexes for every colocated pair;
    * dead locals were removed. This includes the unused read of
      ``gdf2.geometry.name``; presumably that read raised for a geometry-less
      gdf2 and made the "no geometry" branch unreachable — TODO confirm.

    :param gdf1: frame with ``startdate``/``stopdate`` (or
        ``date_interval__``) and a geometry
    :param gdf2: frame with ``startdate``/``stopdate`` (or
        ``date_interval__``)
    :param progress: show a tqdm progress bar; forced off when stderr is
        not a tty and the tqdm in use is the ``tqdm.std`` one
    :return: 2 pandas Index idx1 and idx2, of the same size. idx1 are
        colocated index from gdf1 that colocalize with idx2 from gdf2 (note
        that index may not be unique if some are colocated more than once).
    """
    if not sys.stderr.isatty() and "tqdm.std" in str(tqdm):
        progress = False

    def row_coloc(gdf_item, gdf2, gdf_geometry_name='geometry'):
        # keep only the gdf2 rows whose date interval overlaps this row's
        timeok_gdf2 = gdf2[gdf2_date_interval.overlaps(gdf_item.date_interval__)]
        if hasattr(gdf_item, gdf_geometry_name) and hasattr(gdf2, 'geometry'):
            geom = getattr(gdf_item, gdf_geometry_name)
            selector = (
                timeok_gdf2.contains(geom)
                | timeok_gdf2.intersects(geom)
                | timeok_gdf2.geometry.within(geom)
            )
        else:
            # if the user gave no geometry : all index
            selector = slice(None)
        return timeok_gdf2[selector].index

    if 'date_interval__' not in gdf1:
        gdf1['date_interval__'] = pd.IntervalIndex.from_arrays(gdf1['startdate'], gdf1['stopdate'])

    if 'date_interval__' in gdf2:
        gdf2_date_interval = pd.IntervalIndex(gdf2['date_interval__'])
    else:
        gdf2_date_interval = pd.IntervalIndex.from_arrays(gdf2['startdate'], gdf2['stopdate'])

    gdf1_geometry_name = gdf1.geometry.name

    # empty index to store colocalization results
    idx1 = gdf1.index.delete(slice(None))
    idx2 = gdf2.index.delete(slice(None))

    def _index_builder(index):
        # MultiIndex labels are tuples and need the dedicated constructor.
        return pd.MultiIndex.from_tuples if isinstance(index, pd.MultiIndex) else pd.Index

    indexer1 = _index_builder(gdf1.index)
    indexer2 = _index_builder(gdf2.index)

    tqdm.pandas(disable=not progress, leave=False)
    gdf2_coloc_idx = gdf1.progress_apply(
        lambda row: row_coloc(row, gdf2, gdf_geometry_name=gdf1_geometry_name), axis=1)

    # one (gdf1 label, gdf2 label) pair per colocation
    labels1, labels2 = [], []
    for gdf1_idx, gdf2_idx_serie in gdf2_coloc_idx.items():
        for gdf2_idx in gdf2_idx_serie:
            labels1.append(gdf1_idx)
            labels2.append(gdf2_idx)

    # MultiIndex.from_tuples cannot be called with an empty list
    if labels1:
        idx1 = idx1.append(indexer1(labels1))
        idx2 = idx2.append(indexer2(labels2))

    return idx1, idx2
def handbuilt_featurizer(df_input):
    """
    Return a dataframe with all the handbuilt features added.

    ``handbuilt_featurizer_helper`` is applied to every entry of the
    ``structure_oxid`` column and its output is concatenated column-wise to
    the input.

    :param df_input: pandas DataFrame, the input dataframe
    :return: pandas DataFrame, the output dataframe with all the handbuilt features added
    """
    # register progress_apply with a labelled progress bar
    tqdm.pandas(desc="Handbuilt Featurizer")
    structures = df_input["structure_oxid"]
    handbuilt_features = structures.progress_apply(handbuilt_featurizer_helper)
    return pd.concat([df_input, handbuilt_features], axis=1)
def applymap(self, func):
    """
    Applymap the function to the DataFrame using swifter.

    Strategy, in order: return early with pandas on an empty frame; try to
    call ``func`` on the whole frame (vectorized) after validating it on a
    small sample; otherwise time the pandas applymap on a sample and pick
    dask or pandas based on the estimated duration.
    """
    # If there are no rows return early using Pandas
    if not self._nrows:
        return self._obj.applymap(func)

    sample = self._obj.iloc[:self._npartitions * 2, :]

    # check if input is string or if the user is overriding the string processing default
    if self._allow_dask_on_strings:
        allow_dask_processing = True
    else:
        allow_dask_processing = "object" not in sample.dtypes.values

    try:
        # try to vectorize
        with suppress_stdout_stderr():
            vectorized_sample = func(sample)
            pandas_sample = sample.applymap(func)
            self._validate_apply(
                np.array_equal(pandas_sample, vectorized_sample)
                & (pandas_sample.shape == vectorized_sample.shape),
                error_message="Vectorized function sample does not match pandas apply sample.",
            )
        return func(self._obj)
    except (AttributeError, ValueError, TypeError, TypingError, KeyError):
        # if can't vectorize, estimate time to pandas apply
        wrapped = self._wrapped_applymap(func)
        sample_proc_est = timeit.timeit(wrapped, number=N_REPEATS) / N_REPEATS
        est_apply_duration = sample_proc_est / self._SAMPLE_SIZE * self._obj.shape[0]

        # if pandas sample apply takes too long and not performing str processing, use dask
        if (est_apply_duration > self._dask_threshold) and allow_dask_processing:
            return self._dask_applymap(func)

        # use pandas
        if not self._progress_bar:
            return self._obj.applymap(func)
        tqdm.pandas(desc=self._progress_bar_desc or "Pandas Apply")
        return self._obj.progress_applymap(func)
def extract_feature(self):
    """Multi-hot encode, for every row, the impression count of its next clickout.

    Rows are walked in index order together with the clickout rows. A row
    receives the impression count of the first clickout at or after it,
    provided both belong to the same user and session; otherwise the count
    stays 0. The count is then expanded into 25 columns ``impr_c0`` ..
    ``impr_c24`` (column ``k`` is 1 when the count exceeds ``k``).
    """
    tqdm.pandas()

    df = data.full_df().sort_index()

    # find the clickout rows
    is_clickout = df.action_type == 'clickout item'
    clickouts = df[['user_id', 'session_id', 'action_type', 'impressions']][is_clickout]
    clickouts['impressions_count'] = clickouts.impressions.str.split('|').str.len()

    # prepare the resulting dataframe
    result = df[['user_id', 'session_id']].copy()
    result['impressions_count'] = 0

    # two-pointer walk over the sorted rows and the sorted clickouts
    ck_indices = clickouts.index.values
    pos = 0
    current_ck = ck_indices[0]
    ck_user = clickouts.at[current_ck, 'user_id']
    ck_session = clickouts.at[current_ck, 'session_id']
    last_ck = ck_indices[-1]

    for idx, row in tqdm(result.iterrows()):
        # rows after the last clickout have no following clickout
        if idx > last_ck:
            break

        # advance to the first clickout that is not before this row
        while idx > ck_indices[pos]:
            pos += 1
            current_ck = ck_indices[pos]
            ck_user = clickouts.at[current_ck, 'user_id']
            ck_session = clickouts.at[current_ck, 'session_id']

        # the clickout only counts when it belongs to the row's session
        same_session = row.user_id == ck_user and row.session_id == ck_session
        if same_session:
            result.at[idx, 'impressions_count'] = clickouts.at[current_ck, 'impressions_count']

    # create the 25 categories
    multi_hot = np.zeros((result.shape[0], 25), dtype=np.int8)
    for row_pos, count in enumerate(result.impressions_count.values):
        multi_hot[row_pos, 0:count] = 1
    for col in range(25):
        result['impr_c{}'.format(col)] = multi_hot[:, col]

    return result.drop(['user_id', 'session_id', 'impressions_count'], axis=1)
def merge_duplicates(df):
    """
    Collapse runs of consecutive identical actions into their first row.

    Consecutive rows with the same user, session, reference and action type
    form a run; 'clickout item' actions are never merged. The first row of a
    run is kept and its 'frequence' is set to the run length; the other rows
    of the run are dropped. Rows that are not part of a run get frequence 1.
    The 'frequence' column is written onto ``df`` itself.

    :param df: DataFrame to preprocess
    :return: df: preprocessed DataFrame df with 'frequence' column
    """
    tqdm.pandas()

    to_drop = []
    indices = df.index.values
    n_rows = len(df)

    # position of the next row that has not been absorbed into a run
    next_pos = 0
    # kept from the original: this read raises IndexError on an empty frame
    next_index = indices[next_pos]

    for pos, index in enumerate(tqdm(indices)):
        # rows before next_pos were already merged into an earlier run
        if pos < next_pos:
            continue

        action = df.at[index, 'action_type']
        run_length = 1
        next_pos += 1

        # extend the run while the following rows duplicate the current one
        while next_pos < n_rows:
            next_index = indices[next_pos]
            is_duplicate = (
                action != 'clickout item'
                and df.at[index, 'user_id'] == df.at[next_index, 'user_id']
                and df.at[index, 'session_id'] == df.at[next_index, 'session_id']
                and df.at[index, 'reference'] == df.at[next_index, 'reference']
                and action == df.at[next_index, 'action_type']
            )
            if not is_duplicate:
                # different interaction reached
                break
            next_pos += 1
            to_drop.append(next_index)
            run_length += 1

        df.at[index, 'frequence'] = run_length

    # drop the duplicated indices
    return df.drop(to_drop)
def make_history_df(self, type):
    '''Create dataframe with all players' gameweek or season histories.

    ``get_player_history`` is called once per entry of ``self.players.index``
    with ``type`` forwarded as a keyword; the results are concatenated and
    the ``element`` column is renamed to ``player_id``.
    '''
    print(f'Creating player {type} dataframe')
    tqdm.pandas()

    # get histories for each player
    player_ids = pd.Series(self.players.index)
    histories = player_ids.progress_apply(get_player_history, type=type)

    # combine results into single dataframe
    combined = pd.concat(list(histories))

    # rename columns
    return combined.rename(columns={'element': 'player_id'})
def load_concatenate_by_filename(needle: str, src_path="data/raw/pjud"):
    """Read every CSV in ``src_path`` whose file name contains ``needle`` and concatenate them.

    Files are read with ``;`` as separator, ``cp850`` encoding and all
    columns as strings, in the order returned by ``os.listdir``.
    """
    archivos = os.listdir(src_path)
    tqdm.pandas()
    frames = [
        pd.read_csv(
            f"{src_path}/{nombre}",
            sep=";",
            encoding='cp850',
            dtype='unicode',
            low_memory=True,
        )
        for nombre in archivos
        if needle in nombre
    ]
    return pd.concat(frames)
def __init__(self, pattern, backup=None, targetcol='text', lang='english', cases='lower', hashtag=True, mention=True):
    """Store the configuration and load the NLTK stop-word list for ``lang``.

    Also registers tqdm's pandas integration and downloads the NLTK
    ``stopwords`` corpus (quietly) before reading it.
    """
    import pandas as pd
    from tqdm.auto import tqdm
    tqdm.pandas()

    import nltk
    nltk.download('stopwords', quiet=True)
    from nltk.corpus import stopwords

    # configuration passed by the caller
    self.pattern = pattern
    self.backup = backup
    self.target = targetcol
    self.cases = cases
    self.hashtag = hashtag
    self.mention = mention

    # state that starts out empty
    self.positive = None
    self.negative = None
    self.dtm = None

    self.stopword = stopwords.words(lang)
def extract_feature(self):
    """Compute the label (position of the clicked reference) of each last clickout.

    Train and test cannot be concatenated because there are some sessions
    that are split: they have a first part in train and the last part in
    test. In those cases there would be only one label (they would be
    treated as one session by 'find_last_clickout_indices'), but they must
    have 2 different labels (1 for the train half, 1 for the test half).
    Labels are therefore computed separately and concatenated afterwards.
    """
    tqdm.pandas()

    def get_label(df):
        """ Return a dataframe with: index | user_id | session_id | label """
        # find the last clickout rows
        idxs = find_last_clickout_indices(df)
        labelled = df[['user_id', 'session_id', 'reference', 'impressions']].loc[idxs]

        # remove the test sessions with reference NaN
        labelled = labelled.dropna(subset=['reference']).astype({'reference': 'int'})

        # create impressions list
        labelled['impressions_list'] = labelled['impressions'].str.split('|').apply(
            lambda parts: [int(part) for part in parts])
        labelled = labelled.drop('impressions', axis=1)

        # the label is the position of the reference among the impressions;
        # it stays 0 when the reference is not among them
        labels = np.zeros(labelled.shape[0], dtype='int8')
        pairs = zip(labelled['reference'], labelled['impressions_list'])
        for k, (ref, impress) in enumerate(tqdm(pairs)):
            if ref in impress:
                labels[k] = impress.index(ref)

        # add the new column
        labelled['label'] = labels
        return labelled.drop(['reference', 'impressions_list'], axis=1)

    # compute the labels for train and test
    label_train = get_label(data.train_df(self.mode))
    label_test = get_label(data.test_df(self.mode))
    return pd.concat([label_train, label_test])
def extract_feature(self):
    """Return the log-scaled popularity of each impression of every clickout.

    Popularity of an item is the summed ``frequence`` of the clickouts whose
    (numeric) reference is that item, decreased by 1; items never clicked
    get 0. The 25 values per clickout are log-scaled, min-max normalised
    and returned as ``impr_pop0`` .. ``impr_pop24``.
    """
    tqdm.pandas()

    df = data.full_df()
    is_clickout = df.action_type == 'clickout item'

    # count the popularity
    # `== True` is deliberate: it maps missing isnumeric() results to False
    numeric_reference = df.reference.str.isnumeric() == True
    pop_df = (
        df[is_clickout & numeric_reference][['reference', 'frequence']]
        .astype('int')
        .groupby('reference')
        .sum()
    )
    cnt = pop_df.to_dict()['frequence']

    # find the clickout rows
    clickout_rows = df[df.action_type == 'clickout item'][['reference', 'impressions']]
    clickout_rows = clickout_rows.fillna(-1).astype({'reference': 'int'})
    clickout_rows['impressions'] = clickout_rows.apply(
        lambda row: [int(item) for item in row.impressions.split('|')], axis=1)

    # build the resulting matrix
    matrix = np.zeros((clickout_rows.shape[0], 25), dtype=int)
    for row_pos, impressions in enumerate(tqdm(clickout_rows.impressions)):
        for col, item in enumerate(impressions):
            # decrease 1 to all references
            matrix[row_pos, col] = cnt[item] - 1 if item in cnt else 0

    # scale log and min-max
    decreased = (pop_df['frequence'] - 1).clip(0)
    min_pop = np.log(decreased.min() + 1)
    max_pop = np.log(decreased.max() + 1)
    matrix = (np.log(matrix + 1) - min_pop) / (max_pop - min_pop)

    # add the columns to the resulting dataframe
    for col in range(25):
        clickout_rows['impr_pop{}'.format(col)] = matrix[:, col]

    return clickout_rows.drop(['reference', 'impressions'], axis=1)
def fill_skeleton(skeleton, sales):
    """Fill the 'Volume' and 'AvgPrice' columns of ``skeleton`` from ``sales``.

    ``sales`` is looked up with the key ``(EAN, Week, StoreKey)`` of each
    row and then with the variable name; a ``KeyError`` during that lookup
    yields 0. ``skeleton`` is modified in place and returned.
    """
    tqdm.pandas()

    def change_variable(row, sales, variable):
        # the key is built outside the try so a missing row field still raises
        key = (row['EAN'], row['Week'], row['StoreKey'])
        try:
            return sales[key][variable]
        except KeyError:
            return 0

    for var in ('Volume', 'AvgPrice'):
        skeleton[var] = skeleton.progress_apply(
            lambda row, variable=var: change_variable(row, sales, variable), axis=1)

    return skeleton
def _dask_apply(self, func, axis=0, broadcast=None, raw=False, reduce=None, result_type=None, *args, **kwds):
    """Apply ``func`` to the DataFrame through dask, falling back to pandas.

    The pandas result on a small sample is used as dask ``meta`` and as the
    reference the dask sample result is validated against. When validation
    fails or one of the handled exceptions is raised, pandas is used.
    """
    sample = self._obj.iloc[: self._npartitions * 2, :]
    with suppress_stdout_stderr():
        meta = sample.apply(
            func, axis=axis, broadcast=broadcast, raw=raw, reduce=reduce, result_type=result_type, args=args, **kwds
        )

    def _run_dask(frame):
        # Reads the current value of result_type, which may be overridden below.
        lazy = dd.from_pandas(frame, npartitions=self._npartitions).apply(
            func, *args, axis=axis, raw=raw, result_type=result_type, meta=meta, **kwds
        )
        return lazy.compute(scheduler=self._scheduler)

    try:
        # translate the legacy broadcast/reduce flags into result_type for dask
        if broadcast:
            result_type = "broadcast"
        elif reduce:
            result_type = "reduce"

        with suppress_stdout_stderr():
            # check that the dask apply matches the pandas apply
            dask_sample = _run_dask(sample)
            self._validate_apply(
                dask_sample.equals(meta),
                error_message="Dask apply sample does not match pandas apply sample.",
            )

        if not self._progress_bar:
            return _run_dask(self._obj)
        with TQDMDaskProgressBar(desc=self._progress_bar_desc or "Dask Apply"):
            return _run_dask(self._obj)
    except (AttributeError, ValueError, TypeError, KeyError):
        # if dask apply doesn't match pandas apply, fallback to pandas
        if self._progress_bar:
            tqdm.pandas(desc=self._progress_bar_desc or "Pandas Apply")
            pandas_apply = self._obj.progress_apply
        else:
            pandas_apply = self._obj.apply
        return pandas_apply(
            func, axis=axis, broadcast=broadcast, raw=raw, reduce=reduce, result_type=result_type, args=args, **kwds
        )
# In[ ]: import time import random import pandas as pd import numpy as np import gc import re import torch from torchtext import data import spacy from tqdm import tqdm_notebook, tnrange from tqdm.auto import tqdm tqdm.pandas(desc='Progress') from collections import Counter from textblob import TextBlob from nltk import word_tokenize import torch.nn as nn import torch.optim as optim import torch.nn.functional as F from torch.utils.data import Dataset, DataLoader from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence from torch.autograd import Variable from torchtext.data import Example from sklearn.metrics import f1_score import torchtext import os