class Iteration(object): goal_time = 0.2 def setup(self): N = 1000 self.df = DataFrame(np.random.randn(N * 10, N)) self.df2 = DataFrame(np.random.randn(N * 50, 10)) self.df3 = DataFrame(np.random.randn(N, 5 * N), columns=['C' + str(c) for c in range(N * 5)]) def time_iteritems(self): # (monitor no-copying behaviour) if hasattr(self.df, '_item_cache'): self.df._item_cache.clear() for name, col in self.df.iteritems(): pass def time_iteritems_cached(self): for name, col in self.df.iteritems(): pass def time_iteritems_indexing(self): for col in self.df3: self.df3[col] def time_itertuples(self): for row in self.df2.itertuples(): pass def time_iterrows(self): for row in self.df.iterrows(): pass
class Iteration(object): def setup(self): N = 1000 self.df = DataFrame(np.random.randn(N * 10, N)) self.df2 = DataFrame(np.random.randn(N * 50, 10)) self.df3 = DataFrame(np.random.randn(N, 5 * N), columns=['C' + str(c) for c in range(N * 5)]) def time_iteritems(self): # (monitor no-copying behaviour) if hasattr(self.df, '_item_cache'): self.df._item_cache.clear() for name, col in self.df.iteritems(): pass def time_iteritems_cached(self): for name, col in self.df.iteritems(): pass def time_iteritems_indexing(self): for col in self.df3: self.df3[col] def time_itertuples(self): for row in self.df2.itertuples(): pass def time_iterrows(self): for row in self.df.iterrows(): pass
def pivot_table(data: DataFrame, pivot_name: str = "pivot", value_name: str = "value") -> DataFrame: """ Put a table in our preferred format when the regions are columns and date is index """ dates = data.index.tolist() * len(data.columns) pivots: List[str] = sum([[name] * len(column) for name, column in data.iteritems()], []) values: List[Any] = sum([column.tolist() for name, column in data.iteritems()], []) records = zip(dates, pivots, values) return DataFrame.from_records(records, columns=[data.index.name, pivot_name, value_name])
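# A minimal usage sketch for the pivot_table helper above (names and values here are
# illustrative, and it assumes the function is defined in the same module and that the
# running pandas version still provides DataFrame.iteritems, i.e. pre-2.0).
import pandas as pd

wide = pd.DataFrame(
    {"North": [1.0, 2.0], "South": [3.0, 4.0]},
    index=pd.Index(["2020-01-01", "2020-01-02"], name="Date"),
)
long = pivot_table(wide, pivot_name="region", value_name="cases")
# One row per (Date, region) pair:
#          Date region  cases
# 0  2020-01-01  North    1.0
# 1  2020-01-02  North    2.0
# 2  2020-01-01  South    3.0
# 3  2020-01-02  South    4.0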
def pivot_table(data: DataFrame, pivot_name: str = 'Pivot'): ''' Put a table in our preferred format when the regions are columns and date is index ''' dates = data.index.tolist() * len(data.columns) pivots = sum([[name] * len(column) for name, column in data.iteritems()], []) values = sum([column.tolist() for name, column in data.iteritems()], []) records = zip(dates, pivots, values) return DataFrame.from_records(records, columns=['Date', pivot_name, 'Value'])
def distplot(df: pd.DataFrame, bins=10, hist=True, kde=True, rug=False, color=None, as_figure=False, legend=True, title=True, grid=True, figsize=None, subplots=False, layout=None, sharex=False, sharey=False, **kwargs): """mimic seaborn.distplot""" df = pd.DataFrame(df).rename(columns=str) if not subplots: hist_data = [v.values for k, v in df.iteritems()] group_labels = df.columns.tolist() bin_size = (np.max(df.max()) - np.min(df.min())) / bins curve_type = 'kde' if kde else 'normal' fig = ff.create_distplot(hist_data=hist_data, group_labels=group_labels, bin_size=bin_size, curve_type=curve_type, colors=color, show_hist=hist, show_rug=rug, **kwargs) else: figures = [ distplot1d(ss, bins=bins, hist=hist, kde=kde, rug=rug, color=color, as_figure=True, **kwargs) for _, ss in df.iteritems() ] fig = tools.get_subplots(figures, sharex=sharex, sharey=sharey, layout=layout) fig['layout'].update(showlegend=legend, title=title) if figsize: fig['layout'].update(width=figsize[0], height=figsize[1]) for k, v in fig['layout'].items(): if 'axis' in k: v.update(showgrid=grid) if as_figure: return fig cf.iplot(fig)
def debugFirstRow(df: pd.DataFrame):
    for name, values in df.iteritems():
        print('{name}: "{value}"'.format(name=name, value=values[0]))
    print("=+==============================")
    for name, values in df.iteritems():
        print('{name}: "{value}"'.format(name=name, value=values[1]))
    print("=+==============================")
    for name, values in df.iteritems():
        print('{name}: "{value}"'.format(name=name, value=values[2]))
def find_rows_with(df: DataFrame, tokens: List[str]) -> DataFrame: """ Finds the row number index for each string in a list of strings. The returned DataFrame is not representative of the order or the specific column of each specific string in the list of strings (i.e. tokens). It is simply a collective representing; "all of these strings are contained within these rows, and none others" :param df: DataFrame object to search through. :param tokens: The list of strings to look for within the DataFrame. :return: A DataFrame of rows that contain the list of strings passed. """ indices_containing_tokens: List[int] = list() for column_name, column_data in df.iteritems(): for token in tokens: if pandas.isna(token): contains_token = column_data.isna() else: contains_token = column_data.str.contains(token) token_rows = contains_token[contains_token == True] if len(token_rows) > 0: indices_containing_tokens = indices_containing_tokens + list( token_rows.index.values) return df.iloc[list(set(indices_containing_tokens))]
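# A small hedged usage sketch for find_rows_with above; the column names and tokens are
# made up, and it assumes string-typed columns plus a pandas version where the
# Series.str.contains / DataFrame.iteritems calls used above are still available.
import pandas as pd

frame = pd.DataFrame({
    "city": ["Berlin", "Paris", "Madrid"],
    "note": ["capital", "capital", "sunny"],
})
hits = find_rows_with(frame, ["Paris", "sunny"])
# Rows 1 and 2 each contain at least one of the tokens:
print(sorted(hits.index.tolist()))  # [1, 2]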
def calc_distance_matrix(self, vec_data: pd.DataFrame, single_data: dict) -> pd.DataFrame: """Constructs the distance matrix between the vectonumeric data and the individual BLM variables. Args: vec_data: pd.DataFrame containing the vectornumeric data. If None will fetch the vectornumeric data. single_data: dictionary containing the individual BLM data. If None will fetch individual BLM data. Returns: `pd.DataFrame` with as index the columns of the vector numeric data, as columns the blm names and contains the "distance" between each. """ self._logger.info( "Constructing distance matrix. This will take a while...") start_t = time.time() # calculate the distance matrix col_diff = partial(self._multi_column_diff, single_data=single_data) self._logger.debug("Using %i jobs.", self._n_jobs) # Split the vec_data into chunks for more efficient multiprocessing with Pool(self._n_jobs) as p: res = p.imap( col_diff, enumerate( chunkify([c for _, c in vec_data.iteritems()], self._n_jobs)), ) res = list(chain(*res)) self._logger.info("Time elapsed: %s s", round(time.time() - start_t)) return pd.DataFrame(res)
def ac_time(userID, startDate, endDate):
    # Last month: time distribution of access-control swipes.
    # 6:00-7:00 dorm swipes count as "early"; 23:00-5:00 swipes count as "night" (ACPeriodCate).
    from GetJson_ACPeriodCate import GetJson_ACPeriodCate
    json_ACPeriodCate = GetJson_ACPeriodCate(userID, 2, startDate, endDate)
    if "errMsg" in json_ACPeriodCate:
        return {"count_early": -1, "count_night": -1}
    timeDistri = json_ACPeriodCate["json_timeDistribution"]
    dict_vals = {}
    for item in timeDistri["seriesData"]:
        dict_vals[item["name"]] = item["data"]
    df = DataFrame(dict_vals, index=range(24))
    df["SUM"] = 0
    for col, vals in df.iteritems():
        if col == "SUM":
            break
        df["SUM"] += vals
    # 6:00 dorm value: total number of early risings
    count_early = df.loc[6]["dorm"] if "dorm" in df else 0
    # Total swipes between 23:00 and 5:00
    count_night = sum(df.loc[0:6]["SUM"].tolist()) + df.loc[23]["SUM"]
    return {"count_early": count_early, "count_night": count_night}
def get_metadata(data: pd.DataFrame, label: str, source_metadata=None): metadata = {} for count, (column, values) in enumerate(data.iteritems()): options = None data_type = values.dtype if column == label: data_type = "label" data[column] = data[column].astype(str) elif column == "id" or column == "iid": data_type = column elif data_type == "object": data_type = "categorical" options = list(set(values.tolist())) elif "int" in str(data_type) or "float" in str(data_type): data_type = "numeric" desc = { "fullname": column, "unit": None, "short": chars[count], "data_type": data_type, "options": ",".join(map(str, options)) if options else options, } metadata[column] = desc if source_metadata and "columns" in source_metadata: for col, val in source_metadata["columns"].items(): for k, v in val.items(): metadata[col][k] = v return metadata, data
def fit(self, df: pd.DataFrame):
    obj = {name: series for name, series in df.iteritems()}
    for step in self._steps:
        step.fit(obj)
        obj = step.transform(obj)
    return obj
def df_stats(df: pd.DataFrame, top_n=5): total = len(df) stats = pd.DataFrame(index=df.columns, columns=['dtype', 'distinct-cnt', 'non-null-cnt']) for col_name, series in df.iteritems(): series_dtype = series.dtype series_value_counts = series.value_counts() dist_cnt = len(series_value_counts) non_null_cnt = series.count() stats.loc[col_name] = [series_dtype, dist_cnt, non_null_cnt] name_str = f"{col_name}({series_dtype})".rjust(25, '-') dist_cnt_str = str(dist_cnt).rjust(6, ' ') non_null_cnt_str = str(non_null_cnt).rjust(6, ' ') print(f"{name_str} : count distinct - {dist_cnt_str} : " f"non-null/total - {non_null_cnt_str}/{total} = {non_null_cnt/total:.3f} ") if top_n > 0: col_value_count_list = [ "'" + str(c) + "'" + ":" + str(n) for c, n in sorted( series_value_counts.items(), key=lambda kv: kv[1], reverse=True ) ] print(", ".join(col_value_count_list[:min(len(col_value_count_list), top_n)])) stats['null-cnt'] = total - stats['non-null-cnt'] stats['non-null-ratio'] = stats['non-null-cnt'] / total stats['total'] = total return stats
def get_date_trend(self, mode_date):
    """
    :param mode_date: date granularity; data are merged down to this time unit.
                      0-day, 1-week, 2-month, 3-Quarter. (default 2)
    """
    axisLabels = self.oriDate[:]
    pointVals = [{copy.deepcopy(oriValue): 1} for oriValue in self.oriValues]
    rule_mode = {'0': 'D', '1': 'W', '2': 'M', '3': 'Q'}
    df = DataFrame(pointVals, index=axisLabels)
    df = df.resample(rule_mode[str(mode_date)], how='sum')
    df = df.fillna(0)
    """Sum over all categories"""
    # cols_name = []
    # for name, col in df.iteritems():
    #     cols_name.append(name)
    # df['SUM'] = 0
    # for i in xrange(len(cols_name)):
    #     df['SUM'] += df[cols_name[i]]
    """Share of dorm swipes"""
    # df['PER_DORM'] = df['dorm']/df['SUM'] if 'dorm' in df else 0  # only when a dorm column exists, otherwise 0
    # Pull the date labels used as the index out of the DataFrame into a list
    axisLabels = map(lambda x: x.strftime('%Y-%m-%d'), df.index.tolist())
    seriesData = []
    legendLabels = []
    for colName, col in df.iteritems():
        legendLabels.append(colName)
        data = map(lambda x: 0.0 if isnan(x) else float(x), col.tolist())
        seriesData.append({'name': colName, 'data': data})
    json_dateTrend = {'axisLabels': axisLabels,
                      'legendLabels': legendLabels,
                      'seriesData': seriesData}
    return json_dateTrend
def fill_old(self, df, year=None): """ Takes an age, sex profile (per capita transfers) in df to fill year 'year' or all years if year is None """ if isinstance(df, DataFrame): df1 = df else: df1 = DataFrame(df) for col_name in df1.columns: if col_name not in self._types: self.new_type(col_name) if year is None: for yr in sorted(self.index_sets['year']): self.fill(df, year=yr) else: yr = year if isinstance(df, DataFrame): df1 = df else: df1 = DataFrame(df) for col_name, column in df1.iteritems(): column = column.reset_index() column['year'] = yr column = column.set_index(['age', 'sex', 'year']) self.update(column)
def get_norm_metadata_dict( data_df: pd.DataFrame, exclude_features: List[str], feature_overrides: Dict[str, str], max_unique_enum_values: int, quantile_size: int, quantile_k2_threshold: int, skip_box_cox: int, skip_quantiles: int, skip_preprocessing: bool, ) -> Dict: exclude_features = set(exclude_features) output = {} for col, data in data_df.iteritems(): if col in exclude_features: pass else: output[col] = _get_single_feature_norm_metadata( col, list(data), feature_overrides, max_unique_enum_values, quantile_size, quantile_k2_threshold, skip_box_cox, skip_quantiles, skip_preprocessing, ) return output
def groupby_country_groups( df: pd.DataFrame, country_groups: pd.DataFrame, drop_elements: Optional[List[str]] = None, keep_elements: Optional[List[str]] = None) -> pd.DataFrame: new_df = [] country_groups = country_groups.groupby( ['countrygroupcode', 'countrygroup'])['areacode'].apply(set) for group, codes in country_groups.iteritems(): countrygroupcode, countrygroup = group fltrd = df[df['areacode'].isin(codes)].drop( columns=['area', 'areacode']) fltrd = fltrd.groupby( ['itemcode', 'item', 'elementcode', 'element', 'unit', 'year'])['value'].apply(list).reset_index() fltrd = fltrd.assign(areacode=countrygroupcode).assign( area=countrygroup) fltrd = fltrd.assign( flag=fltrd['value'].apply(lambda x: get_flag(x, codes))) fltrd['value'] = fltrd['value'].apply(np.nansum) new_df.append(fltrd) df = pd.concat(new_df, sort=False).reset_index(drop=True) if drop_elements is not None: df = df[~df.elementcode.isin(drop_elements)] if keep_elements is not None: df = df[df.elementcode.isin(keep_elements)] df['year'] = df.year.astype('int') return df
def insert_defaults(table: pd.DataFrame, variables: dict):
    '''Replaces null values (None/NaN) in a DataFrame with the corresponding
    default value from the variables. If a default value is not found (is None)
    the null is not changed.

    Parameters
        table: Required, Type DataFrame, The table to be searched.
        variables: Required, Type dict, The dictionary of Variable objects
            containing the default values.
    Returns
        updated_table: A copy of the table DataFrame with null values replaced
            with default values.
    '''
    # TODO make insert_defaults a Table method
    # TODO there is probably significant room for optimization
    updated_table = table.copy()
    # select columns with specified variables
    table_variables = set(table.columns.values)
    for (var_name, data) in table.iteritems():
        var = variables.get(var_name)
        if var is not None:
            if var.default is not None:
                updated_table[var_name] = data.fillna(value=var.default)
    return updated_table
def our_mean_std(df: pd.DataFrame): res = {} for col, xs in df.iteritems(): xs = xs.values max_length = max(len(x) for x in xs) masks = [ np.concatenate([np.ones(len(x)), np.zeros(max_length - len(x))]) for x in xs ] xs = [ np.concatenate([np.array(x), np.zeros(max_length - len(x))]) for x in xs ] xs = np.concatenate([x[None, :] for x in xs], axis=0) masks = np.concatenate([mask[None, :] for mask in masks], axis=0) count = np.sum(masks, axis=0) mean = np.sum(xs, axis=0) / count xs -= mean[None, :] masks = masks.astype(np.bool) xs[~masks] = 0 res[f'{col}_count'] = count.tolist() count = count - 1 single = count == 0 count[single] = 1 std = np.sqrt(np.sum(xs**2, axis=0) / count) std[single] = 0 res[f'{col}_mean'] = mean.tolist() res[f'{col}_std'] = std.tolist() return pd.Series(res)
def normalize(train: pn.DataFrame, test: pn.DataFrame):
    # Min-max scale each column of `train` in place and apply the same
    # train-derived statistics to `test`.
    for (columnName, columnData) in train.iteritems():
        col_max, col_min = train[columnName].max(), train[columnName].min()
        train[columnName] = (train[columnName] - col_min) / (col_max - col_min)
        test[columnName] = (test[columnName] - col_min) / (col_max - col_min)
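# Hedged usage sketch for normalize above: fit min/max on the training frame and reuse
# those statistics for the test frame (column names are made up; assumes a pandas
# version that still provides DataFrame.iteritems).
import pandas as pn

train = pn.DataFrame({"age": [20.0, 30.0, 40.0], "income": [1000.0, 2000.0, 3000.0]})
test = pn.DataFrame({"age": [25.0, 35.0], "income": [1500.0, 2500.0]})
normalize(train, test)
print(train["age"].tolist())  # [0.0, 0.5, 1.0]
print(test["age"].tolist())   # [0.25, 0.75]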
def _fit_catboost( self, X: pd.DataFrame, y: pd.Series, eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]] = None, tree_params: Optional[Dict[str, Any]] = None, fit_params: Optional[Dict[str, Any]] = None, ) -> 'catboost.CatBoostClassifier': if catboost is None: raise ImportError('catboost is not installed.') # Default settings if tree_params is None: tree_params = dict(eval_metric='BrierScore', loss_function='Logloss', iterations=100) if fit_params is None: is_cat_feature = [ c.dtype.name == 'category' for (_, c) in X.iteritems() ] fit_params = dict( cat_features=np.nonzero(is_cat_feature)[0].tolist(), verbose=True, ) if eval_set is not None: val_params = dict(early_stopping_rounds=10, eval_set=eval_set) fit_params = {**fit_params, **val_params} # Train the model model = catboost.CatBoostClassifier(**tree_params) return model.fit(X, y, **fit_params)
def prepare_data(
    dataset_df: pd.DataFrame,
    drop_na: bool = False,
    mean_int: bool = True,
    mean_float: bool = True,
    rescale_float: bool = True,
    standardize_float: bool = True,
) -> None:
    """Fill missing values and standardize float columns.

    :author: Robin Courant
    :param dataset_df: dataset to process.
    :param drop_na: whether to drop every row with at least one `NaN` cell.
    :param mean_int: whether to use mean or the median for missing integers.
    :param mean_float: whether to use mean or the median for missing floats.
    :param rescale_float: whether to rescale floats (standardize or normalize).
    :param standardize_float: whether to apply standardization or normalization.
    """
    if drop_na:
        # `dropna` returns a new frame; drop in place so the caller sees the change.
        dataset_df.dropna(inplace=True)
        return
    for column_name, column_series in dataset_df.iteritems():
        if is_integer_dtype(column_series):
            if set(column_series.unique()) == {0, 1}:
                dataset_df[column_name] = _prepare_bool(column_series)
            else:
                dataset_df[column_name] = _prepare_int(column_series, mean_int)
        elif is_float_dtype(column_series):
            dataset_df[column_name] = _prepare_float(column_series, mean_float,
                                                     rescale_float, standardize_float)
        # Raise an error if the column's type is not boolean, integer or float
        else:
            raise TypeError(f"Unrecognized type, column: {column_name}")
def code_categories(self, data: DataFrame, encoder) -> Tuple[DataFrame, Dict[str, Dict]]:
    """Encoding categorical parameters

    Args:
        data (DataFrame): input dataset
        encoder: any object with fit_transform method

    Returns:
        pd.DataFrame: output dataset with encoded parameters
        dict: dictionary with values and codes
    """
    columns = [col for col in data.columns.to_list() if self.nodes_types[col] == 'disc']
    df = data.copy()  # INPUT DF. Debugging SettingWithCopyWarning
    if not columns:
        return df, None
    data = df[columns]  # DATA TO CATEGORIZE
    encoder_dict = dict()
    for col_name, column in data.iteritems():  # Iterate over (column name, Series) pairs.
        try:
            df[col_name] = encoder.fit_transform(column.values)
        except TypeError as exc:
            logger_preprocessor.error(
                f"Wrong data types on {col_name} ({df[col_name].dtypes}). Message: {exc}")
        try:
            mapping = dict(zip(encoder.classes_, range(len(encoder.classes_))))
            encoder_dict[col_name] = mapping
        except Exception:
            pass
    return df, encoder_dict
def bert_predictions(tweet: pd.DataFrame, model: ClassificationModel):
    """
    Bert Inference for prediction.
    :param tweet: dataframe with tweets
    :param model: Bert Model
    :return: list of pr
    """
    tweet = tweet.values.tolist()
    try:
        predictions, raw_outputs = model.predict(tweet)
    except Exception:
        # Debugging path: predict tweet by tweet to locate the offending element.
        for element in tweet:
            model.predict([element])
        print("STOPP")
    auswertung = collections.Counter(predictions)
    gc.collect()
    # df = pd.DataFrame(raw_outputs)
    # df['predictions'] = pd.DataFrame(predictions)
    # df['tweets'] = pd.DataFrame(tweet)
    # df = df.replace(r'\n', ' ', regex=True)
    # df_softmax = pd.DataFrame(softmax(raw_outputs, axis=1))
    # df['softmax0'] = df_softmax[0]
    # df['softmax1'] = df_softmax[1]
    # db_functions.df_to_sql(df, 'temp_table', 'replace')
    return auswertung
def roc_analyze(classes: pandas.DataFrame, norm_data: pandas.DataFrame) -> Dict[str, ROC_curve_data]: result = {} for name, column in norm_data.iteritems(): fpr, tpr, threshold = metrics.roc_curve(classes, column) roc_auc = metrics.auc(fpr, tpr) result[name] = ROC_curve_data(fpr=fpr, tpr=tpr, threshold=threshold, auc=roc_auc) return result
def write_arff_file(dataset: pd.DataFrame, filename="dataset.arff", name="Universities"): with open(filename, "w", encoding="utf-8") as file: file.write(f"@RELATION {name}\n\n") max_len = len(max(dataset.columns, key=len)) for header in dataset.columns: if dataset[header].dtype == np.float64 or dataset[ header].dtype == np.int64: column_type = "NUMERIC" else: column_type = "STRING" file.write(f"@ATTRIBUTE {header.ljust(max_len)} {column_type}\n") file.write("\n@DATA\n") for _, column in dataset.iteritems(): if column.dtype == np.object: pattern = re.compile(r"^(.*)$") dataset[column.name] = column.str.replace(pattern, r'"\1"') for _, row in dataset.iterrows(): items = [str(x) for x in row] items = [x if x != "nan" else "?" for x in items] file.write(f"{', '.join(items)}\n")
def convert_units(df: pd.DataFrame) -> pd.DataFrame: """Change units of measurement from source to destination standards""" rename = dict() for col, s in df.iteritems(): (observed_property, unit_of_measurement) = col try: conversion = settings.UNIT_MAP[unit_of_measurement][ observed_property] except KeyError: LOGGER.error(col) raise # Map column names rename[col] = conversion['label'] # Calculate conversion factor = float(conversion['factor']) s = s.mul(factor) df[col] = s df = df.rename(columns=rename, errors='raise') return df
def get_product_parent(src: pd.DataFrame) -> tuple:
    src = src['product_parent'].value_counts()
    _product_parent = list()
    _review_count = list()
    for p, r in src.iteritems():
        _product_parent.append(p)
        _review_count.append(r)
    return _product_parent, _review_count
def __get_best_attribute(self, x: pandas.DataFrame, y: pandas.Series) -> str: x_entropy = self.__get_entropy(y) information_gains = pandas.Series([]) for attribute, series in x.iteritems(): attribute_entropy = [(y[series == value].size / y.size) * self.__get_entropy(y[series == value]) for value in series.unique()] information_gains[attribute] = x_entropy - sum(attribute_entropy) return information_gains.idxmax()
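# A standalone sketch of the information-gain computation used by the method above; the
# entropy helper and the toy data here are assumptions for illustration, not part of the
# original class.
import numpy as np
import pandas as pd

def entropy(y: pd.Series) -> float:
    # Shannon entropy of a label series
    probs = y.value_counts(normalize=True)
    return float(-(probs * np.log2(probs)).sum())

y = pd.Series(["yes", "yes", "no", "no", "yes"])
x = pd.Series(["sunny", "rain", "rain", "sunny", "sunny"])
gain = entropy(y) - sum(
    (y[x == v].size / y.size) * entropy(y[x == v]) for v in x.unique()
)
print(round(gain, 3))  # small positive gain (~0.02) for this toy split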
def get_unique_elements(df: pd.DataFrame) -> np.ndarray:
    """Returns all unique elements found in a multiple sequence alignment."""
    U = np.array([])
    for name, seq in df.iteritems():
        U = np.append(U, seq.unique())
    return np.unique(U)
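# Hedged usage sketch for get_unique_elements above: columns are alignment positions,
# rows are sequences (toy data; assumes a pandas version that still has iteritems).
import numpy as np
import pandas as pd

msa = pd.DataFrame({"pos1": ["A", "A", "G"], "pos2": ["C", "-", "C"]})
print(get_unique_elements(msa))  # ['-' 'A' 'C' 'G']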
def asset_beta(self, df: pandas.DataFrame, market_asset: str): import numpy as np beta_matrix = {} for index, col in df.iteritems(): beta = df[[index, market_asset ]].cov().iloc[0, 1] / df[market_asset].var() beta_matrix[index] = beta return beta_matrix
def aggregate_review(review_list):
    """ Aggregate the review records """
    aggregate = {}
    df = DataFrame(review_list)
    for col, item in df.iteritems():
        aggregate[col] = df.groupby(col).size().to_dict()
    return aggregate
def sharpe_ratio(self, df: pandas.DataFrame, market_asset: str): import numpy as np sharpe_matrix = {} for index, col in df.iteritems(): sharpe_ratio = np.sqrt(250) * ( df[index].mean() - self.risk_free_rate / 250) / df[index].std() sharpe_matrix[index] = sharpe_ratio return sharpe_matrix
def drop_outliers(features: pd.DataFrame) -> pd.DataFrame: outliers = set() for col, vals in features.iteritems(): lower, med, upper = np.percentile(vals, [25, 50, 75]) scale = np.abs(upper - lower) outliers.update(vals[(vals < med - 3 * scale) | (vals > med + 3 * scale)].index) LOG.info("%d outliers removed", len(outliers)) return features.drop(outliers, axis=0)
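# Standalone sketch of the outlier rule applied per column above (values outside
# median ± 3×IQR are flagged); the data here are made up for illustration.
import numpy as np
import pandas as pd

vals = pd.Series([1.0, 2.0, 2.5, 3.0, 100.0])
lower, med, upper = np.percentile(vals, [25, 50, 75])
scale = np.abs(upper - lower)
outliers = vals[(vals < med - 3 * scale) | (vals > med + 3 * scale)]
print(outliers.index.tolist())  # [4], the 100.0 entry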
def rolling_mean(data, window, min_periods=1, center=False): ''' Function that computes a rolling mean Parameters ---------- data : DataFrame or Series If a DataFrame is passed, the rolling_mean is computed for all columns. window : int or string If int is passed, window is the number of observations used for calculating the statistic, as defined by the function pd.rolling_mean() If a string is passed, it must be a frequency string, e.g. '90S'. This is internally converted into a DateOffset object, representing the window size. min_periods : int Minimum number of observations in window required to have a value. Returns ------- Series or DataFrame, if more than one column ''' def f(x): '''Function to apply that actually computes the rolling mean''' if center == False: dslice = col[x-pd.datetools.to_offset(window).delta+timedelta(0,0,1):x] # adding a microsecond because when slicing with labels start and endpoint # are inclusive else: dslice = col[x-pd.datetools.to_offset(window).delta/2+timedelta(0,0,1): x+pd.datetools.to_offset(window).delta/2] if dslice.size < min_periods: return np.nan else: return dslice.mean() data = DataFrame(data.copy()) dfout = DataFrame() if isinstance(window, int): dfout = pd.rolling_mean(data, window, min_periods=min_periods, center=center) elif isinstance(window, basestring): idx = Series(data.index.to_pydatetime(), index=data.index) for colname, col in data.iteritems(): result = idx.apply(f) result.name = colname dfout = dfout.join(result, how='outer') if dfout.columns.size == 1: dfout = dfout.ix[:,0] return dfout
def test_sequence_like_with_categorical(self):
    # GH 7839
    # make sure can iterate
    df = DataFrame({"id": [1, 2, 3, 4, 5, 6],
                    "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']})
    df['grade'] = Categorical(df['raw_grade'])

    # basic sequencing testing
    result = list(df.grade.values)
    expected = np.array(df.grade.values).tolist()
    tm.assert_almost_equal(result, expected)

    # iteration
    for t in df.itertuples(index=False):
        str(t)

    for row, s in df.iterrows():
        str(s)

    for c, col in df.iteritems():
        str(col)
def get_time_distribution(self): dates = self.oriDate[:] values = [{copy.deepcopy(oriValue): 1} for oriValue in self.oriValues] # 生成时间点和时间标签队列。 periods = [] axisLabels = [] for i in xrange(24): periods.append(time(i)) axisLabels.append(str(i) + u'点~' + str((i + 1) % 24) + u'点') # 时间点队列 -> 时间区间队列。 periodRanges = [] for i in xrange(len(periods)): periodRange = [periods[i], periods[(i + 1) % len(periods)]] periodRanges.append(periodRange) lTimes = map(lambda d: d.time(), dates) # Keep time. vals = [] # Init vals for i in xrange(len(periods)): vals.append({}) # Add to total vals. for i in xrange(len(lTimes)): for j in xrange(len(periodRanges)): if periodRanges[j][0] <= lTimes[i] < periodRanges[j][1]: vals[j + 1] = helpers.mergeDict(vals[j + 1], values[i]) df = DataFrame(vals) seriesData = [] legendLabels = [] for colName, col in df.iteritems(): legendLabels.append(colName) data = map(lambda x: 0 if isnan(x) else int(x), col.tolist()) seriesData.append({'name': colName, 'data': data}) json_timeDistribution = {'axisLabels': axisLabels, 'legendLabels': legendLabels, 'seriesData': seriesData} return json_timeDistribution
def upsert_unique_indices(apps, schema_editor): datapoint_values_list = ['id','created_at','indicator_id','location_id','campaign_id','data_date'] historical_dps = DataFrame(list(DataPoint.objects.all()\ .values_list('id','created_at','indicator_id','location_id','campaign_id','data_date')), columns=datapoint_values_list) # create the unique index historical_dps = historical_dps.apply(add_unique_index, axis=1) # group by and max on created at, get the most recent upload historical_dps = historical_dps.sort("created_at", ascending=False).groupby("unique_index", as_index=False).first() # get the ids into a list and select them dps_to_update = DataPoint.objects.filter(id__in=list(historical_dps['id'])) print 'dps to update' print len(dps_to_update) # then run a query and update each for dp in dps_to_update: unique_index = historical_dps[historical_dps['id'] == dp.id].iloc[0]['unique_index'] dp.unique_index = unique_index dp.save() # delete all the other duplicates dps_to_delete = DataPoint.objects.all().exclude(id__in=list(historical_dps['id'])) print 'dps_to_delete' print len(dps_to_delete) dps_to_delete.delete() dataframe_columns = ['id','created_at','indicator_id','location_id','campaign_id','data_date', 'unique_index'] # make sure there aren't duplicate dps now. all_dps = DataFrame(list(DataPoint.objects.all()\ .values_list('unique_index')), columns=['unique_index']) all_dps = all_dps.groupby('unique_index').size() for idx, dp in all_dps.iteritems(): if dp != 1: raise Exception("there are duplicate datapoints")
def run_clinical_real(cancer, clinical, data_path, gene_sets, survival_tests, real_variables, binary_variables, data_type='expression', drop_pc=False): if data_type == 'expression': data_matrix = read_rnaSeq(cancer, data_path) data_matrix = data_matrix.groupby(by=lambda n: n.split('|')[0]).mean() elif data_type == 'expression_array': data_matrix = read_mrna(cancer, data_path) elif data_type == 'methylation': data_matrix = read_methylation(cancer, data_path) if drop_pc: data_matrix = drop_first_norm_pc(data_matrix) pc = dict((p, extract_pc(data_matrix.ix[g])) for p, g in gene_sets.iteritems()) pc = DataFrame(dict((p, (v - v.mean()) / v.std()) for p,v in pc.iteritems() if type(v) != type(None))).T #clinical['pc'] = extract_pc(data_matrix.dropna(), pc_threshold=0) tests = get_tests(clinical, survival_tests, real_variables, binary_variables, var_type='real') #return locals() p_pathways, q_pathways = run_tests(tests, pc) return locals()
class FrameParser(Parser): _default_orient = 'columns' _split_keys = ('columns', 'index', 'data') def _parse_numpy(self): json = self.json orient = self.orient if orient == "columns": args = loads(json, dtype=None, numpy=True, labelled=True, precise_float=self.precise_float) if args: args = (args[0].T, args[2], args[1]) self.obj = DataFrame(*args) elif orient == "split": decoded = loads(json, dtype=None, numpy=True, precise_float=self.precise_float) decoded = dict((str(k), v) for k, v in compat.iteritems(decoded)) self.check_keys_split(decoded) self.obj = DataFrame(**decoded) elif orient == "values": self.obj = DataFrame(loads(json, dtype=None, numpy=True, precise_float=self.precise_float)) else: self.obj = DataFrame(*loads(json, dtype=None, numpy=True, labelled=True, precise_float=self.precise_float)) def _parse_no_numpy(self): json = self.json orient = self.orient if orient == "columns": self.obj = DataFrame( loads(json, precise_float=self.precise_float), dtype=None) elif orient == "split": decoded = dict((str(k), v) for k, v in compat.iteritems(loads( json, precise_float=self.precise_float))) self.check_keys_split(decoded) self.obj = DataFrame(dtype=None, **decoded) elif orient == "index": self.obj = DataFrame( loads(json, precise_float=self.precise_float), dtype=None).T else: self.obj = DataFrame( loads(json, precise_float=self.precise_float), dtype=None) def _process_converter(self, f, filt=None): """ take a conversion function and possibly recreate the frame """ if filt is None: filt = lambda col, c: True needs_new_obj = False new_obj = dict() for i, (col, c) in enumerate(self.obj.iteritems()): if filt(col, c): new_data, result = f(col, c) if result: c = new_data needs_new_obj = True new_obj[i] = c if needs_new_obj: # possibly handle dup columns new_obj = DataFrame(new_obj, index=self.obj.index) new_obj.columns = self.obj.columns self.obj = new_obj def _try_convert_types(self): if self.obj is None: return if self.convert_dates: self._try_convert_dates() self._process_converter( lambda col, c: self._try_convert_data(col, c, convert_dates=False)) def _try_convert_dates(self): if self.obj is None: return # our columns to parse convert_dates = self.convert_dates if convert_dates is True: convert_dates = [] convert_dates = set(convert_dates) def is_ok(col): """ return if this col is ok to try for a date parse """ if not isinstance(col, compat.string_types): return False col_lower = col.lower() if (col_lower.endswith('_at') or col_lower.endswith('_time') or col_lower == 'modified' or col_lower == 'date' or col_lower == 'datetime' or col_lower.startswith('timestamp')): return True return False self._process_converter( lambda col, c: self._try_convert_to_date(c), lambda col, c: ((self.keep_default_dates and is_ok(col)) or col in convert_dates))
def get_sensitivity_analysis(extracts, points, statics, initials, pickle=None): temps = range(-5, 6) all_pct = [x * 0.1 for x in range(5, 16)] ndvi_range = linspace(0.9, 1.7, 11) ndvi_range = array([round_to_value(x, 0.05) for x in ndvi_range]) var_arrs = [] y = 0 for x in range(0, 6): ones_ = ones((5, 11), dtype=float) zeros = [x * 0.0 for x in range(5, 16)] norm_ndvi = array([1.25 for x in zeros]) if y == 0: arr = insert(ones_, y, temps, axis=0) arr = insert(arr, 4, norm_ndvi, axis=0) arr = arr[0:6] var_arrs.append(arr) arr = [] elif y == 4: arr = insert(ones_, 0, zeros, axis=0) arr = insert(arr, y, ndvi_range, axis=0) arr = arr[0:6] var_arrs.append(arr) print 'shape arr: {}'.format(arr.shape) arr = [] elif y == 5: arr = insert(ones_, 0, zeros, axis=0) arr = insert(arr, 4, norm_ndvi, axis=0) arr = arr[0:5] arr = insert(arr, y, all_pct, axis=0) var_arrs.append(arr) arr = [] else: arr = insert(ones_, 0, zeros, axis=0) arr = insert(arr, y, all_pct, axis=0) arr = insert(arr, 4, norm_ndvi, axis=0) arr = arr[0:6] var_arrs.append(arr) arr = [] y += 1 print 'variable arrays: {}'.format(var_arrs) normalize_list = [2, 0.20, 0.20, 2, 0.20, 0.50] # site_list = ['Bateman', 'Navajo_Whiskey_Ck', 'Quemazon', 'Sierra_Blanca', 'SB_1', 'SB_2', 'SB_4', 'SB_5', 'VC_1', # 'VC_2', 'VC_3', 'CH_1', 'CH_3', 'MG_1', 'MG_2', 'WHLR_PK', 'LP', 'South_Baldy', # 'Water_Canyon', 'La_Jencia', 'Socorro'] site_list = ['Sierra_Blanca', 'Great_Western_Mine', 'Bonito', 'Nogal'] df = DataFrame(columns=FACTORS, index=site_list) df_norm = DataFrame(columns=FACTORS, index=site_list) site_dict = {'Sierra_Blanca': {}, 'Great_Western_Mine': {}, 'Bonito': {}, 'Nogal': {}} ds = Open(points) lyr = ds.GetLayer() # defs = lyr.GetLayerDefn() for j, feat in enumerate(lyr): name = feat.GetField("Name") name = name.replace(' ', '_') geom = feat.GetGeometryRef() mx, my = int(geom.GetX()), int(geom.GetY()) site_dict[name]['Coords'] = '{} {}'.format(mx, my) file_name = os.path.join(extracts, '{}.csv'.format(name)) print file_name site_dict[name]['etrm'] = get_etrm_time_series(file_name, single_file=True) # print 'site dict before running etrm: {}'.format(site_dict) for i, var_arr in enumerate(var_arrs): factor = FACTORS[i] print 'running modified factor: {}'.format(factor) print '' for key, val in site_dict.iteritems(): print '\n site: {} \n '.format(key) results = [] for col in var_arr.T: etrm = Processes(SIMULATION_PERIOD, static_inputs=statics, initial_inputs=initials, output_root=pickle, point_dict=site_dict) tracker = etrm.run(point_dict=site_dict, point_dict_key=key, sensitivity_matrix_column=col, sensitivity=True) # print 'tracker: {}'.format(tracker) results.append(tracker['tot_infil'][-1]) print 'total infil: {} \n results: {}'.format(tracker['tot_infil'][-1], results) df.iloc[site_list.index(key), FACTORS.index(factor)] = divide(array(results), 14.0) print 'df after site {}: \n {}'.format(key, df) print 'df: {}'.format(df) # tot_data : precip, et, tot_transp, tot_evap, infil, runoff, snow_fall, cum_mass, end_mass # "SI = [Q(Po + delP] -Q(Po - delP] / (2 * delP)" # where SI = Sensitivity Index, Q = recharge, Po = base value of input parameter, # delP = change in value input # find sensitivity index xx = 0 for param in df.iteritems(): data_cube = param[1] var_arr = var_arrs[xx] yy = 0 for site in data_cube: site_name = site_list[yy] normal = normalize_list[xx] site_obj = [x for x in site] sens_list = [] zz = 0 for var in var_arr[xx]: if var != var_arr[xx][5]: base = var_arr[xx][5] deltap = var - base obj = site_obj[zz] sen = ((obj * 
(base + deltap) - obj * (base - deltap)) / (2 * deltap)) * normal sens_list.append(sen) zz += 1 sens_list = array(sens_list) df_norm.iloc[site_list.index(site_name), FACTORS.index(param[0])] = sens_list if yy == 20: print 'done' break yy += 1 xx += 1 # why not save the data as pickle, so we don't have to do the analysis each time # we debug the plotting df.to_pickle(os.path.join(pickle, '_basic_sensitivity_2.pkl')) df_norm.to_pickle(os.path.join(pickle, 'norm_sensitivity_2.pkl'))
class FrameParser(Parser): _default_orient = 'columns' def _parse(self): json = self.json dtype = self.dtype orient = self.orient numpy = self.numpy if numpy: try: if orient == "columns": args = loads(json, dtype=dtype, numpy=True, labelled=True) if args: args = (args[0].T, args[2], args[1]) self.obj = DataFrame(*args) elif orient == "split": decoded = loads(json, dtype=dtype, numpy=True) decoded = dict((str(k), v) for k, v in decoded.iteritems()) self.obj = DataFrame(**decoded) elif orient == "values": self.obj = DataFrame(loads(json, dtype=dtype, numpy=True)) else: self.obj = DataFrame(*loads(json, dtype=dtype, numpy=True, labelled=True)) except ValueError: numpy = False if not numpy: if orient == "columns": self.obj = DataFrame(loads(json), dtype=dtype) elif orient == "split": decoded = dict((str(k), v) for k, v in loads(json).iteritems()) self.obj = DataFrame(dtype=dtype, **decoded) elif orient == "index": self.obj = DataFrame(loads(json), dtype=dtype).T else: self.obj = DataFrame(loads(json), dtype=dtype) def _convert_axes(self): """ try to axes if they are datelike """ if self.orient == 'columns': axis = 'index' elif self.orient == 'index': axis = 'columns' else: return try: a = getattr(self.obj,axis) setattr(self.obj,axis,self._try_parse_to_date(a)) except: pass def _try_parse_dates(self): if self.obj is None: return # our columns to parse parse_dates = self.parse_dates if parse_dates is True: parse_dates = [] parse_dates = set(parse_dates) def is_ok(col): """ return if this col is ok to try for a date parse """ if not isinstance(col, basestring): return False if (col.endswith('_at') or col.endswith('_time') or col.lower() == 'modified' or col.lower() == 'date' or col.lower() == 'datetime'): return True return False for col, c in self.obj.iteritems(): if (self.keep_default_dates and is_ok(col)) or col in parse_dates: self.obj[col] = self._try_parse_to_date(c)
class Iteration: # mem_itertuples_* benchmarks are slow timeout = 120 def setup(self): N = 1000 self.df = DataFrame(np.random.randn(N * 10, N)) self.df2 = DataFrame(np.random.randn(N * 50, 10)) self.df3 = DataFrame(np.random.randn(N, 5 * N), columns=['C' + str(c) for c in range(N * 5)]) self.df4 = DataFrame(np.random.randn(N * 1000, 10)) def time_iteritems(self): # (monitor no-copying behaviour) if hasattr(self.df, '_item_cache'): self.df._item_cache.clear() for name, col in self.df.iteritems(): pass def time_iteritems_cached(self): for name, col in self.df.iteritems(): pass def time_iteritems_indexing(self): for col in self.df3: self.df3[col] def time_itertuples_start(self): self.df4.itertuples() def time_itertuples_read_first(self): next(self.df4.itertuples()) def time_itertuples(self): for row in self.df4.itertuples(): pass def time_itertuples_to_list(self): list(self.df4.itertuples()) def mem_itertuples_start(self): return self.df4.itertuples() def peakmem_itertuples_start(self): self.df4.itertuples() def mem_itertuples_read_first(self): return next(self.df4.itertuples()) def peakmem_itertuples(self): for row in self.df4.itertuples(): pass def mem_itertuples_to_list(self): return list(self.df4.itertuples()) def peakmem_itertuples_to_list(self): list(self.df4.itertuples()) def time_itertuples_raw_start(self): self.df4.itertuples(index=False, name=None) def time_itertuples_raw_read_first(self): next(self.df4.itertuples(index=False, name=None)) def time_itertuples_raw_tuples(self): for row in self.df4.itertuples(index=False, name=None): pass def time_itertuples_raw_tuples_to_list(self): list(self.df4.itertuples(index=False, name=None)) def mem_itertuples_raw_start(self): return self.df4.itertuples(index=False, name=None) def peakmem_itertuples_raw_start(self): self.df4.itertuples(index=False, name=None) def peakmem_itertuples_raw_read_first(self): next(self.df4.itertuples(index=False, name=None)) def peakmem_itertuples_raw(self): for row in self.df4.itertuples(index=False, name=None): pass def mem_itertuples_raw_to_list(self): return list(self.df4.itertuples(index=False, name=None)) def peakmem_itertuples_raw_to_list(self): list(self.df4.itertuples(index=False, name=None)) def time_iterrows(self): for row in self.df.iterrows(): pass
class TestHashing(tm.TestCase): _multiprocess_can_split_ = True def setUp(self): self.df = DataFrame( {'i32': np.array([1, 2, 3] * 3, dtype='int32'), 'f32': np.array([None, 2.5, 3.5] * 3, dtype='float32'), 'cat': Series(['a', 'b', 'c'] * 3).astype('category'), 'obj': Series(['d', 'e', 'f'] * 3), 'bool': np.array([True, False, True] * 3), 'dt': Series(pd.date_range('20130101', periods=9)), 'dt_tz': Series(pd.date_range('20130101', periods=9, tz='US/Eastern')), 'td': Series(pd.timedelta_range('2000', periods=9))}) def test_consistency(self): # check that our hash doesn't change because of a mistake # in the actual code; this is the ground truth result = hash_pandas_object(Index(['foo', 'bar', 'baz'])) expected = Series(np.array([3600424527151052760, 1374399572096150070, 477881037637427054], dtype='uint64'), index=['foo', 'bar', 'baz']) tm.assert_series_equal(result, expected) def test_hash_array(self): for name, s in self.df.iteritems(): a = s.values tm.assert_numpy_array_equal(hash_array(a), hash_array(a)) def check_equal(self, obj, **kwargs): a = hash_pandas_object(obj, **kwargs) b = hash_pandas_object(obj, **kwargs) tm.assert_series_equal(a, b) kwargs.pop('index', None) a = hash_pandas_object(obj, **kwargs) b = hash_pandas_object(obj, **kwargs) tm.assert_series_equal(a, b) def check_not_equal_with_index(self, obj): # check that we are not hashing the same if # we include the index if not isinstance(obj, Index): a = hash_pandas_object(obj, index=True) b = hash_pandas_object(obj, index=False) self.assertFalse((a == b).all()) def test_hash_pandas_object(self): for obj in [Series([1, 2, 3]), Series([1.0, 1.5, 3.2]), Series([1.0, 1.5, np.nan]), Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]), Series(['a', 'b', 'c']), Series(['a', np.nan, 'c']), Series(['a', None, 'c']), Series([True, False, True]), Index([1, 2, 3]), Index([True, False, True]), DataFrame({'x': ['a', 'b', 'c'], 'y': [1, 2, 3]}), tm.makeMissingDataframe(), tm.makeMixedDataFrame(), tm.makeTimeDataFrame(), tm.makeTimeSeries(), tm.makeTimedeltaIndex()]: self.check_equal(obj) self.check_not_equal_with_index(obj) def test_hash_pandas_object2(self): for name, s in self.df.iteritems(): self.check_equal(s) self.check_not_equal_with_index(s) def test_hash_pandas_empty_object(self): for obj in [Series([], dtype='float64'), Series([], dtype='object'), Index([])]: self.check_equal(obj) # these are by-definition the same with # or w/o the index as the data is empty def test_errors(self): for obj in [pd.Timestamp('20130101'), tm.makePanel()]: def f(): hash_pandas_object(f) self.assertRaises(TypeError, f) def test_hash_keys(self): # using different hash keys, should have different hashes # for the same data # this only matters for object dtypes obj = Series(list('abc')) a = hash_pandas_object(obj, hash_key='9876543210123456') b = hash_pandas_object(obj, hash_key='9876543210123465') self.assertTrue((a != b).all()) def test_invalid_key(self): # this only matters for object dtypes def f(): hash_pandas_object(Series(list('abc')), hash_key='foo') self.assertRaises(ValueError, f) def test_unsupported_objects(self): # mixed objects are not supported obj = Series(['1', 2, 3]) def f(): hash_pandas_object(obj) self.assertRaises(TypeError, f) # MultiIndex are represented as tuples obj = Series([1, 2, 3], index=pd.MultiIndex.from_tuples( [('a', 1), ('a', 2), ('b', 1)])) def f(): hash_pandas_object(obj) self.assertRaises(TypeError, f) def test_alread_encoded(self): # if already encoded then ok obj = Series(list('abc')).str.encode('utf8') 
self.check_equal(obj) def test_alternate_encoding(self): obj = Series(list('abc')) self.check_equal(obj, encoding='ascii') def test_long_strings(self): obj = Index(tm.rands_array(nchars=10000, size=100)) self.check_equal(obj)
class TestHashing(tm.TestCase): _multiprocess_can_split_ = True def setUp(self): self.df = DataFrame( {'i32': np.array([1, 2, 3] * 3, dtype='int32'), 'f32': np.array([None, 2.5, 3.5] * 3, dtype='float32'), 'cat': Series(['a', 'b', 'c'] * 3).astype('category'), 'obj': Series(['d', 'e', 'f'] * 3), 'bool': np.array([True, False, True] * 3), 'dt': Series(pd.date_range('20130101', periods=9)), 'dt_tz': Series(pd.date_range('20130101', periods=9, tz='US/Eastern')), 'td': Series(pd.timedelta_range('2000', periods=9))}) def test_consistency(self): # check that our hash doesn't change because of a mistake # in the actual code; this is the ground truth result = hash_pandas_object(Index(['foo', 'bar', 'baz'])) expected = Series(np.array([3600424527151052760, 1374399572096150070, 477881037637427054], dtype='uint64'), index=['foo', 'bar', 'baz']) tm.assert_series_equal(result, expected) def test_hash_array(self): for name, s in self.df.iteritems(): a = s.values tm.assert_numpy_array_equal(hash_array(a), hash_array(a)) def check_equal(self, obj, **kwargs): a = hash_pandas_object(obj, **kwargs) b = hash_pandas_object(obj, **kwargs) tm.assert_series_equal(a, b) kwargs.pop('index', None) a = hash_pandas_object(obj, **kwargs) b = hash_pandas_object(obj, **kwargs) tm.assert_series_equal(a, b) def check_not_equal_with_index(self, obj): # check that we are not hashing the same if # we include the index if not isinstance(obj, Index): a = hash_pandas_object(obj, index=True) b = hash_pandas_object(obj, index=False) self.assertFalse((a == b).all()) def test_hash_pandas_object(self): for obj in [Series([1, 2, 3]), Series([1.0, 1.5, 3.2]), Series([1.0, 1.5, np.nan]), Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]), Series(['a', 'b', 'c']), Series(['a', np.nan, 'c']), Series(['a', None, 'c']), Series([True, False, True]), Index([1, 2, 3]), Index([True, False, True]), DataFrame({'x': ['a', 'b', 'c'], 'y': [1, 2, 3]}), tm.makeMissingDataframe(), tm.makeMixedDataFrame(), tm.makeTimeDataFrame(), tm.makeTimeSeries(), tm.makeTimedeltaIndex()]: self.check_equal(obj) self.check_not_equal_with_index(obj) def test_hash_pandas_object2(self): for name, s in self.df.iteritems(): self.check_equal(s) self.check_not_equal_with_index(s) def test_hash_pandas_empty_object(self): for obj in [Series([], dtype='float64'), Series([], dtype='object'), Index([])]: self.check_equal(obj) # these are by-definition the same with # or w/o the index as the data is empty def test_categorical_consistency(self): # GH15143 # Check that categoricals hash consistent with their values, not codes # This should work for categoricals of any dtype for s1 in [Series(['a', 'b', 'c', 'd']), Series([1000, 2000, 3000, 4000]), Series(pd.date_range(0, periods=4))]: s2 = s1.astype('category').cat.set_categories(s1) s3 = s2.cat.set_categories(list(reversed(s1))) for categorize in [True, False]: # These should all hash identically h1 = hash_pandas_object(s1, categorize=categorize) h2 = hash_pandas_object(s2, categorize=categorize) h3 = hash_pandas_object(s3, categorize=categorize) tm.assert_series_equal(h1, h2) tm.assert_series_equal(h1, h3) def test_errors(self): for obj in [pd.Timestamp('20130101'), tm.makePanel()]: def f(): hash_pandas_object(f) self.assertRaises(TypeError, f) def test_hash_keys(self): # using different hash keys, should have different hashes # for the same data # this only matters for object dtypes obj = Series(list('abc')) a = hash_pandas_object(obj, hash_key='9876543210123456') b = hash_pandas_object(obj, hash_key='9876543210123465') 
self.assertTrue((a != b).all()) def test_invalid_key(self): # this only matters for object dtypes def f(): hash_pandas_object(Series(list('abc')), hash_key='foo') self.assertRaises(ValueError, f) def test_unsupported_objects(self): # mixed objects are not supported obj = Series(['1', 2, 3]) def f(): hash_pandas_object(obj) self.assertRaises(TypeError, f) # MultiIndex are represented as tuples obj = Series([1, 2, 3], index=pd.MultiIndex.from_tuples( [('a', 1), ('a', 2), ('b', 1)])) def f(): hash_pandas_object(obj) self.assertRaises(TypeError, f) def test_alread_encoded(self): # if already encoded then ok obj = Series(list('abc')).str.encode('utf8') self.check_equal(obj) def test_alternate_encoding(self): obj = Series(list('abc')) self.check_equal(obj, encoding='ascii') def test_same_len_hash_collisions(self): for l in range(8): length = 2**(l + 8) + 1 s = tm.rands_array(length, 2) result = hash_array(s, 'utf8') self.assertFalse(result[0] == result[1]) for l in range(8): length = 2**(l + 8) s = tm.rands_array(length, 2) result = hash_array(s, 'utf8') self.assertFalse(result[0] == result[1]) def test_hash_collisions(self): # hash collisions are bad # https://github.com/pandas-dev/pandas/issues/14711#issuecomment-264885726 L = ['Ingrid-9Z9fKIZmkO7i7Cn51Li34pJm44fgX6DYGBNj3VPlOH50m7HnBlPxfIwFMrcNJNMP6PSgLmwWnInciMWrCSAlLEvt7JkJl4IxiMrVbXSa8ZQoVaq5xoQPjltuJEfwdNlO6jo8qRRHvD8sBEBMQASrRa6TsdaPTPCBo3nwIBpE7YzzmyH0vMBhjQZLx1aCT7faSEx7PgFxQhHdKFWROcysamgy9iVj8DO2Fmwg1NNl93rIAqC3mdqfrCxrzfvIY8aJdzin2cHVzy3QUJxZgHvtUtOLxoqnUHsYbNTeq0xcLXpTZEZCxD4PGubIuCNf32c33M7HFsnjWSEjE2yVdWKhmSVodyF8hFYVmhYnMCztQnJrt3O8ZvVRXd5IKwlLexiSp4h888w7SzAIcKgc3g5XQJf6MlSMftDXm9lIsE1mJNiJEv6uY6pgvC3fUPhatlR5JPpVAHNSbSEE73MBzJrhCAbOLXQumyOXigZuPoME7QgJcBalliQol7YZ9', # noqa 'Tim-b9MddTxOWW2AT1Py6vtVbZwGAmYCjbp89p8mxsiFoVX4FyDOF3wFiAkyQTUgwg9sVqVYOZo09Dh1AzhFHbgij52ylF0SEwgzjzHH8TGY8Lypart4p4onnDoDvVMBa0kdthVGKl6K0BDVGzyOXPXKpmnMF1H6rJzqHJ0HywfwS4XYpVwlAkoeNsiicHkJUFdUAhG229INzvIAiJuAHeJDUoyO4DCBqtoZ5TDend6TK7Y914yHlfH3g1WZu5LksKv68VQHJriWFYusW5e6ZZ6dKaMjTwEGuRgdT66iU5nqWTHRH8WSzpXoCFwGcTOwyuqPSe0fTe21DVtJn1FKj9F9nEnR9xOvJUO7E0piCIF4Ad9yAIDY4DBimpsTfKXCu1vdHpKYerzbndfuFe5AhfMduLYZJi5iAw8qKSwR5h86ttXV0Mc0QmXz8dsRvDgxjXSmupPxBggdlqUlC828hXiTPD7am0yETBV0F3bEtvPiNJfremszcV8NcqAoARMe'] # noqa # these should be different! result1 = hash_array(np.asarray(L[0:1], dtype=object), 'utf8') expected1 = np.array([14963968704024874985], dtype=np.uint64) self.assert_numpy_array_equal(result1, expected1) result2 = hash_array(np.asarray(L[1:2], dtype=object), 'utf8') expected2 = np.array([16428432627716348016], dtype=np.uint64) self.assert_numpy_array_equal(result2, expected2) result = hash_array(np.asarray(L, dtype=object), 'utf8') self.assert_numpy_array_equal( result, np.concatenate([expected1, expected2], axis=0))
def train(self, file_name=DEFAULT_TRAINING_FILE): """ Takes in a training file formatted where the entire body of text is parsed each individual word or punctuation. On each line, there is also an identification stating the actual part of speech for that word. Args: file_name (str): The path to the file contain the training data """ # Create connection to the file training_file = None # Open the specific training file if os.path.exists(file_name): try: training_file = open(file_name) except IOError: print("Unable to open the file at " + file_name + ".") # Pull probablistic data from these files tags = array_column(self.tags, "Tag") vocabulary = set() # Pandas copies memory over for appending so must iterate for unique # words first for line in training_file: # Ensure this doesn't just return an empty line line = line.strip() if len(line) > 0: # Parse line into 'observation/classification' words = line.split(' ') # Iterate over each word to get the word and classification for word in words: # Separate into tag & classification context = word.rsplit('/', maxsplit=1) word = context[0].lower().strip() if word not in vocabulary: vocabulary.add(word) # Prepare necessary data structures emission = DataFrame(index=vocabulary, columns=tags) transition = DataFrame(index=tags, columns=tags) emission.fillna(0, inplace=True) transition.fillna(0, inplace=True) last_class = None # Iterate to update the emissions and training_file.seek(0) for line in training_file: # Ensure this doesn't just return an empty line line = line.strip() if len(line) > 0: # Parse line into 'observation/classification' words = line.split(' ') # Iterate over each word to get the word and classification for word in words: # Separate into tag & classification context = word.rsplit('/', maxsplit=1) word = context[0].lower().strip() context_tags = context[1].split('+') # Update the emission matrix for context_tag in context_tags: emission[context_tag][word] += 1 # Update the transition if last_class != None: for context_tag in context_tags: for last_tag in last_class: transition[last_tag][context_tag] += 1 # Update the last_class last_class = context_tags # Pull info from the database that needs to be updated & merge arrays cursor = self.connection.cursor() word_totals = {} tag_totals = {} for dest_tag, row in transition.iteritems(): for origin_tag, occurrence in row.iteritems(): # Retrieve total occurence data if it could not be found if origin_tag not in tag_totals: cursor.execute('SELECT TotalOccurrences FROM Tags WHERE Tag = ?', (origin_tag.upper().strip(),)) tag_totals[origin_tag] = cursor.fetchone() if tag_totals[origin_tag] == None: cursor.execute('INSERT INTO Tags (Tag, TotalOccurrences) VALUES (?, ?)', (origin_tag.upper().strip(), 1)) tag_totals[origin_tag] = 1 else: tag_totals[origin_tag] = int(tag_totals[origin_tag]['TotalOccurrences']) tag_totals[origin_tag] += int(occurrence) # Grab data for this specific transition cursor.execute('SELECT Occurrences FROM Transitions WHERE OriginTag = ? AND DestTag = ?', (origin_tag.upper().strip(), dest_tag)) db_occurrence = cursor.fetchone() if db_occurrence == None: # We need to add one if it doesn't exists cursor.execute('INSERT INTO Transitions (OriginTag, DestTag, Occurrences) VALUES (?, ?, ?)', (origin_tag.upper().strip(), dest_tag, 0)) db_occurrence = 0 else: db_occurrence = db_occurrence['Occurrences'] # Update the data db_occurrence += int(occurrence) cursor.execute('UPDATE Transitions SET Occurrences = ? WHERE OriginTag = ? 
AND DestTag = ?', (int(db_occurrence), origin_tag.upper().strip(), dest_tag)) for tag, row in emission.iteritems(): for word, occurrence in row.iteritems(): # Retrieve total occurence data if it could not be found if word not in word_totals: cursor.execute('SELECT TotalOccurrences FROM Words WHERE Word = ?', (word,)) word_totals[word] = cursor.fetchone() if word_totals[word] == None: cursor.execute('INSERT INTO Words (Word, TotalOccurrences) VALUES (?, ?)', (word, 1)) word_totals[word] = 1 else: word_totals[word] = int(word_totals[word]['TotalOccurrences']) word_totals[word] += int(occurrence) # Grab data for this specific emission cursor.execute('SELECT Occurrences FROM Emissions WHERE Word = ? AND Tag = ?', (word, tag.upper().strip())) db_occurrence = cursor.fetchone() if db_occurrence == None: # We need to add the entry cursor.execute('INSERT INTO Emissions (Word, Tag, Occurrences) VALUES (?, ?, ?)', (word, tag.upper().strip(), 0)) db_occurrence = 0 else: db_occurrence = db_occurrence['Occurrences'] # Update the data db_occurrence += int(occurrence) cursor.execute('UPDATE Emissions SET Occurrences = ? WHERE Word = ? AND Tag = ?', (int(db_occurrence), word, tag.upper().strip())) # Update totals in general for word, occurence in word_totals.items(): cursor.execute('UPDATE Words SET TotalOccurrences = ? WHERE Word = ?', (int(occurence), word)) for tag, occurence in tag_totals.items(): cursor.execute('UPDATE Tags SET TotalOccurrences = ? WHERE tag = ?', (int(occurence), (tag.upper().strip()))) # Close unnecessary resources cursor.close() self.connection.commit() training_file.close()
class TestHashing(object): def setup_method(self, method): self.df = DataFrame( {'i32': np.array([1, 2, 3] * 3, dtype='int32'), 'f32': np.array([None, 2.5, 3.5] * 3, dtype='float32'), 'cat': Series(['a', 'b', 'c'] * 3).astype('category'), 'obj': Series(['d', 'e', 'f'] * 3), 'bool': np.array([True, False, True] * 3), 'dt': Series(pd.date_range('20130101', periods=9)), 'dt_tz': Series(pd.date_range('20130101', periods=9, tz='US/Eastern')), 'td': Series(pd.timedelta_range('2000', periods=9))}) def test_consistency(self): # check that our hash doesn't change because of a mistake # in the actual code; this is the ground truth result = hash_pandas_object(Index(['foo', 'bar', 'baz'])) expected = Series(np.array([3600424527151052760, 1374399572096150070, 477881037637427054], dtype='uint64'), index=['foo', 'bar', 'baz']) tm.assert_series_equal(result, expected) def test_hash_array(self): for name, s in self.df.iteritems(): a = s.values tm.assert_numpy_array_equal(hash_array(a), hash_array(a)) def test_hash_array_mixed(self): result1 = hash_array(np.array([3, 4, 'All'])) result2 = hash_array(np.array(['3', '4', 'All'])) result3 = hash_array(np.array([3, 4, 'All'], dtype=object)) tm.assert_numpy_array_equal(result1, result2) tm.assert_numpy_array_equal(result1, result3) def test_hash_array_errors(self): for val in [5, 'foo', pd.Timestamp('20130101')]: pytest.raises(TypeError, hash_array, val) def check_equal(self, obj, **kwargs): a = hash_pandas_object(obj, **kwargs) b = hash_pandas_object(obj, **kwargs) tm.assert_series_equal(a, b) kwargs.pop('index', None) a = hash_pandas_object(obj, **kwargs) b = hash_pandas_object(obj, **kwargs) tm.assert_series_equal(a, b) def check_not_equal_with_index(self, obj): # check that we are not hashing the same if # we include the index if not isinstance(obj, Index): a = hash_pandas_object(obj, index=True) b = hash_pandas_object(obj, index=False) if len(obj): assert not (a == b).all() def test_hash_tuples(self): tups = [(1, 'one'), (1, 'two'), (2, 'one')] result = hash_tuples(tups) expected = hash_pandas_object(MultiIndex.from_tuples(tups)).values tm.assert_numpy_array_equal(result, expected) result = hash_tuples(tups[0]) assert result == expected[0] def test_hash_tuple(self): # test equivalence between hash_tuples and hash_tuple for tup in [(1, 'one'), (1, np.nan), (1.0, pd.NaT, 'A'), ('A', pd.Timestamp("2012-01-01"))]: result = hash_tuple(tup) expected = hash_tuples([tup])[0] assert result == expected def test_hash_scalar(self): for val in [1, 1.4, 'A', b'A', u'A', pd.Timestamp("2012-01-01"), pd.Timestamp("2012-01-01", tz='Europe/Brussels'), datetime.datetime(2012, 1, 1), pd.Timestamp("2012-01-01", tz='EST').to_pydatetime(), pd.Timedelta('1 days'), datetime.timedelta(1), pd.Period('2012-01-01', freq='D'), pd.Interval(0, 1), np.nan, pd.NaT, None]: result = _hash_scalar(val) expected = hash_array(np.array([val], dtype=object), categorize=True) assert result[0] == expected[0] def test_hash_tuples_err(self): for val in [5, 'foo', pd.Timestamp('20130101')]: pytest.raises(TypeError, hash_tuples, val) def test_multiindex_unique(self): mi = MultiIndex.from_tuples([(118, 472), (236, 118), (51, 204), (102, 51)]) assert mi.is_unique result = hash_pandas_object(mi) assert result.is_unique def test_multiindex_objects(self): mi = MultiIndex(levels=[['b', 'd', 'a'], [1, 2, 3]], labels=[[0, 1, 0, 2], [2, 0, 0, 1]], names=['col1', 'col2']) recons = mi._sort_levels_monotonic() # these are equal assert mi.equals(recons) assert Index(mi.values).equals(Index(recons.values)) # 
_hashed_values and hash_pandas_object(..., index=False) # equivalency expected = hash_pandas_object( mi, index=False).values result = mi._hashed_values tm.assert_numpy_array_equal(result, expected) expected = hash_pandas_object( recons, index=False).values result = recons._hashed_values tm.assert_numpy_array_equal(result, expected) expected = mi._hashed_values result = recons._hashed_values # values should match, but in different order tm.assert_numpy_array_equal(np.sort(result), np.sort(expected)) def test_hash_pandas_object(self): for obj in [Series([1, 2, 3]), Series([1.0, 1.5, 3.2]), Series([1.0, 1.5, np.nan]), Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]), Series(['a', 'b', 'c']), Series(['a', np.nan, 'c']), Series(['a', None, 'c']), Series([True, False, True]), Series(), Index([1, 2, 3]), Index([True, False, True]), DataFrame({'x': ['a', 'b', 'c'], 'y': [1, 2, 3]}), DataFrame(), tm.makeMissingDataframe(), tm.makeMixedDataFrame(), tm.makeTimeDataFrame(), tm.makeTimeSeries(), tm.makeTimedeltaIndex(), tm.makePeriodIndex(), Series(tm.makePeriodIndex()), Series(pd.date_range('20130101', periods=3, tz='US/Eastern')), MultiIndex.from_product( [range(5), ['foo', 'bar', 'baz'], pd.date_range('20130101', periods=2)]), MultiIndex.from_product( [pd.CategoricalIndex(list('aabc')), range(3)])]: self.check_equal(obj) self.check_not_equal_with_index(obj) def test_hash_pandas_object2(self): for name, s in self.df.iteritems(): self.check_equal(s) self.check_not_equal_with_index(s) def test_hash_pandas_empty_object(self): for obj in [Series([], dtype='float64'), Series([], dtype='object'), Index([])]: self.check_equal(obj) # these are by-definition the same with # or w/o the index as the data is empty def test_categorical_consistency(self): # GH15143 # Check that categoricals hash consistent with their values, not codes # This should work for categoricals of any dtype for s1 in [Series(['a', 'b', 'c', 'd']), Series([1000, 2000, 3000, 4000]), Series(pd.date_range(0, periods=4))]: s2 = s1.astype('category').cat.set_categories(s1) s3 = s2.cat.set_categories(list(reversed(s1))) for categorize in [True, False]: # These should all hash identically h1 = hash_pandas_object(s1, categorize=categorize) h2 = hash_pandas_object(s2, categorize=categorize) h3 = hash_pandas_object(s3, categorize=categorize) tm.assert_series_equal(h1, h2) tm.assert_series_equal(h1, h3) def test_categorical_with_nan_consistency(self): c = pd.Categorical.from_codes( [-1, 0, 1, 2, 3, 4], categories=pd.date_range('2012-01-01', periods=5, name='B')) expected = hash_array(c, categorize=False) c = pd.Categorical.from_codes( [-1, 0], categories=[pd.Timestamp('2012-01-01')]) result = hash_array(c, categorize=False) assert result[0] in expected assert result[1] in expected def test_pandas_errors(self): for obj in [pd.Timestamp('20130101')]: with pytest.raises(TypeError): hash_pandas_object(obj) with catch_warnings(record=True): obj = tm.makePanel() with pytest.raises(TypeError): hash_pandas_object(obj) def test_hash_keys(self): # using different hash keys, should have different hashes # for the same data # this only matters for object dtypes obj = Series(list('abc')) a = hash_pandas_object(obj, hash_key='9876543210123456') b = hash_pandas_object(obj, hash_key='9876543210123465') assert (a != b).all() def test_invalid_key(self): # this only matters for object dtypes def f(): hash_pandas_object(Series(list('abc')), hash_key='foo') pytest.raises(ValueError, f) def test_alread_encoded(self): # if already encoded then ok obj = 
Series(list('abc')).str.encode('utf8') self.check_equal(obj) def test_alternate_encoding(self): obj = Series(list('abc')) self.check_equal(obj, encoding='ascii') def test_same_len_hash_collisions(self): for l in range(8): length = 2**(l + 8) + 1 s = tm.rands_array(length, 2) result = hash_array(s, 'utf8') assert not result[0] == result[1] for l in range(8): length = 2**(l + 8) s = tm.rands_array(length, 2) result = hash_array(s, 'utf8') assert not result[0] == result[1] def test_hash_collisions(self): # hash collisions are bad # https://github.com/pandas-dev/pandas/issues/14711#issuecomment-264885726 L = ['Ingrid-9Z9fKIZmkO7i7Cn51Li34pJm44fgX6DYGBNj3VPlOH50m7HnBlPxfIwFMrcNJNMP6PSgLmwWnInciMWrCSAlLEvt7JkJl4IxiMrVbXSa8ZQoVaq5xoQPjltuJEfwdNlO6jo8qRRHvD8sBEBMQASrRa6TsdaPTPCBo3nwIBpE7YzzmyH0vMBhjQZLx1aCT7faSEx7PgFxQhHdKFWROcysamgy9iVj8DO2Fmwg1NNl93rIAqC3mdqfrCxrzfvIY8aJdzin2cHVzy3QUJxZgHvtUtOLxoqnUHsYbNTeq0xcLXpTZEZCxD4PGubIuCNf32c33M7HFsnjWSEjE2yVdWKhmSVodyF8hFYVmhYnMCztQnJrt3O8ZvVRXd5IKwlLexiSp4h888w7SzAIcKgc3g5XQJf6MlSMftDXm9lIsE1mJNiJEv6uY6pgvC3fUPhatlR5JPpVAHNSbSEE73MBzJrhCAbOLXQumyOXigZuPoME7QgJcBalliQol7YZ9', # noqa 'Tim-b9MddTxOWW2AT1Py6vtVbZwGAmYCjbp89p8mxsiFoVX4FyDOF3wFiAkyQTUgwg9sVqVYOZo09Dh1AzhFHbgij52ylF0SEwgzjzHH8TGY8Lypart4p4onnDoDvVMBa0kdthVGKl6K0BDVGzyOXPXKpmnMF1H6rJzqHJ0HywfwS4XYpVwlAkoeNsiicHkJUFdUAhG229INzvIAiJuAHeJDUoyO4DCBqtoZ5TDend6TK7Y914yHlfH3g1WZu5LksKv68VQHJriWFYusW5e6ZZ6dKaMjTwEGuRgdT66iU5nqWTHRH8WSzpXoCFwGcTOwyuqPSe0fTe21DVtJn1FKj9F9nEnR9xOvJUO7E0piCIF4Ad9yAIDY4DBimpsTfKXCu1vdHpKYerzbndfuFe5AhfMduLYZJi5iAw8qKSwR5h86ttXV0Mc0QmXz8dsRvDgxjXSmupPxBggdlqUlC828hXiTPD7am0yETBV0F3bEtvPiNJfremszcV8NcqAoARMe'] # noqa # these should be different! result1 = hash_array(np.asarray(L[0:1], dtype=object), 'utf8') expected1 = np.array([14963968704024874985], dtype=np.uint64) tm.assert_numpy_array_equal(result1, expected1) result2 = hash_array(np.asarray(L[1:2], dtype=object), 'utf8') expected2 = np.array([16428432627716348016], dtype=np.uint64) tm.assert_numpy_array_equal(result2, expected2) result = hash_array(np.asarray(L, dtype=object), 'utf8') tm.assert_numpy_array_equal( result, np.concatenate([expected1, expected2], axis=0))