Example #1
class Iteration(object):

    goal_time = 0.2

    def setup(self):
        N = 1000
        self.df = DataFrame(np.random.randn(N * 10, N))
        self.df2 = DataFrame(np.random.randn(N * 50, 10))
        self.df3 = DataFrame(np.random.randn(N, 5 * N),
                             columns=['C' + str(c) for c in range(N * 5)])

    def time_iteritems(self):
        # (monitor no-copying behaviour)
        if hasattr(self.df, '_item_cache'):
            self.df._item_cache.clear()
        for name, col in self.df.iteritems():
            pass

    def time_iteritems_cached(self):
        for name, col in self.df.iteritems():
            pass

    def time_iteritems_indexing(self):
        for col in self.df3:
            self.df3[col]

    def time_itertuples(self):
        for row in self.df2.itertuples():
            pass

    def time_iterrows(self):
        for row in self.df.iterrows():
            pass
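Note on the API used throughout these examples: DataFrame.iteritems() yields (column name, Series) pairs; it was deprecated in pandas 1.5 and removed in pandas 2.0 in favor of the equivalent DataFrame.items(). A minimal migration sketch (hypothetical data):

import numpy as np
from pandas import DataFrame

df = DataFrame(np.random.randn(100, 5))
for name, col in df.items():  # drop-in replacement for df.iteritems()
    pass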
Example #2
class Iteration(object):

    def setup(self):
        N = 1000
        self.df = DataFrame(np.random.randn(N * 10, N))
        self.df2 = DataFrame(np.random.randn(N * 50, 10))
        self.df3 = DataFrame(np.random.randn(N, 5 * N),
                             columns=['C' + str(c) for c in range(N * 5)])

    def time_iteritems(self):
        # (monitor no-copying behaviour)
        if hasattr(self.df, '_item_cache'):
            self.df._item_cache.clear()
        for name, col in self.df.iteritems():
            pass

    def time_iteritems_cached(self):
        for name, col in self.df.iteritems():
            pass

    def time_iteritems_indexing(self):
        for col in self.df3:
            self.df3[col]

    def time_itertuples(self):
        for row in self.df2.itertuples():
            pass

    def time_iterrows(self):
        for row in self.df.iterrows():
            pass
Example #3
def pivot_table(data: DataFrame, pivot_name: str = "pivot", value_name: str = "value") -> DataFrame:
    """ Put a table in our preferred format when the regions are columns and date is index """
    dates = data.index.tolist() * len(data.columns)
    pivots: List[str] = sum([[name] * len(column) for name, column in data.iteritems()], [])
    values: List[Any] = sum([column.tolist() for name, column in data.iteritems()], [])
    records = zip(dates, pivots, values)
    return DataFrame.from_records(records, columns=[data.index.name, pivot_name, value_name])
Example #4
def pivot_table(data: DataFrame, pivot_name: str = 'Pivot'):
    ''' Put a table in our preferred format when the regions are columns and date is index '''
    dates = data.index.tolist() * len(data.columns)
    pivots = sum([[name] * len(column) for name, column in data.iteritems()], [])
    values = sum([column.tolist() for name, column in data.iteritems()], [])
    records = zip(dates, pivots, values)
    return DataFrame.from_records(records, columns=['Date', pivot_name, 'Value'])
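Both pivot_table variants above unpivot a wide table (dates as the index, regions as columns) into long (date, pivot, value) records. A minimal sketch of the same reshape using stack(), under the same layout assumption and with hypothetical data:

import pandas as pd

wide = pd.DataFrame(
    {'North': [1, 2], 'South': [3, 4]},
    index=pd.Index(['2020-01-01', '2020-01-02'], name='Date'),
)
long_df = (
    wide.stack()                      # Series indexed by (Date, column)
        .rename_axis(['Date', 'Pivot'])
        .reset_index(name='Value')    # columns: Date, Pivot, Value
)
print(long_df)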
Example #5
def distplot(df: pd.DataFrame,
             bins=10,
             hist=True,
             kde=True,
             rug=False,
             color=None,
             as_figure=False,
             legend=True,
             title=True,
             grid=True,
             figsize=None,
             subplots=False,
             layout=None,
             sharex=False,
             sharey=False,
             **kwargs):
    """mimic seaborn.distplot"""
    df = pd.DataFrame(df).rename(columns=str)
    if not subplots:
        hist_data = [v.values for k, v in df.iteritems()]
        group_labels = df.columns.tolist()
        bin_size = (np.max(df.max()) - np.min(df.min())) / bins
        curve_type = 'kde' if kde else 'normal'
        fig = ff.create_distplot(hist_data=hist_data,
                                 group_labels=group_labels,
                                 bin_size=bin_size,
                                 curve_type=curve_type,
                                 colors=color,
                                 show_hist=hist,
                                 show_rug=rug,
                                 **kwargs)
    else:
        figures = [
            distplot1d(ss,
                       bins=bins,
                       hist=hist,
                       kde=kde,
                       rug=rug,
                       color=color,
                       as_figure=True,
                       **kwargs) for _, ss in df.iteritems()
        ]
        fig = tools.get_subplots(figures,
                                 sharex=sharex,
                                 sharey=sharey,
                                 layout=layout)

    fig['layout'].update(showlegend=legend, title=title)
    if figsize:
        fig['layout'].update(width=figsize[0], height=figsize[1])
    for k, v in fig['layout'].items():
        if 'axis' in k:
            v.update(showgrid=grid)
    if as_figure:
        return fig
    cf.iplot(fig)
Example #6
def debugFirstRow(df: pd.DataFrame):
    for name, values in df.iteritems():
        print('{name}: "{value}"'.format(name=name, value=values[0]))

    print("=+==============================")
    for name, values in df.iteritems():
        print('{name}: "{value}"'.format(name=name, value=values[1]))

    print("=+==============================")

    for name, values in df.iteritems():
        print('{name}: "{value}"'.format(name=name, value=values[2]))
Example #7
def find_rows_with(df: DataFrame, tokens: List[str]) -> DataFrame:
    """
    Finds the row number index for each string in a list of strings. The
    returned DataFrame does not indicate the order or the specific column in
    which each string in the list (i.e. each token) was found.

    It is simply a collective statement that:

        "all of these strings are contained within these rows, and none others"

    :param df: DataFrame object to search through.
    :param tokens: The list of strings to look for within the DataFrame.
    :return: A DataFrame of rows that contain the list of strings passed.
    """

    indices_containing_tokens: List[int] = list()

    for column_name, column_data in df.iteritems():
        for token in tokens:
            if pandas.isna(token):
                contains_token = column_data.isna()
            else:
                contains_token = column_data.str.contains(token)

            token_rows = contains_token[contains_token == True]

            if len(token_rows) > 0:
                indices_containing_tokens = indices_containing_tokens + list(
                    token_rows.index.values)

    return df.iloc[list(set(indices_containing_tokens))]
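A hypothetical usage sketch for find_rows_with (toy data; the helper assumes string columns, since it relies on .str.contains):

import pandas as pd

df = pd.DataFrame({'a': ['foo', 'bar', 'baz'], 'b': ['x', 'y', 'foo']})
print(find_rows_with(df, ['foo', 'baz']))  # keeps rows 0 and 2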
Example #8
    def calc_distance_matrix(self, vec_data: pd.DataFrame,
                             single_data: dict) -> pd.DataFrame:
        """Constructs the distance matrix between the vectornumeric data and the
        individual BLM variables.

        Args:
            vec_data: pd.DataFrame containing the vectornumeric data. If None will
                fetch the vectornumeric data.
            single_data: dictionary containing the individual BLM data. If None
                will fetch individual BLM data.

        Returns:
            `pd.DataFrame` indexed by the columns of the vectornumeric data, with
                the BLM names as columns, holding the "distance" between each pair.
        """
        self._logger.info(
            "Constructing distance matrix. This will take a while...")
        start_t = time.time()
        # calculate the distance matrix
        col_diff = partial(self._multi_column_diff, single_data=single_data)
        self._logger.debug("Using %i jobs.", self._n_jobs)
        # Split the vec_data into chunks for more efficient multiprocessing
        with Pool(self._n_jobs) as p:
            res = p.imap(
                col_diff,
                enumerate(
                    chunkify([c for _, c in vec_data.iteritems()],
                             self._n_jobs)),
            )
            res = list(chain(*res))

        self._logger.info("Time elapsed: %s s", round(time.time() - start_t))
        return pd.DataFrame(res)
Example #9
def ac_time(userID, startDate, endDate):
    # Last month
    # Time distribution: 6-7 am dorm check-ins
    # Check-ins between 23:00 and 5:00 (ACPeriodCate)

    from GetJson_ACPeriodCate import GetJson_ACPeriodCate

    json_ACPeriodCate = GetJson_ACPeriodCate(userID, 2, startDate, endDate)

    if "errMsg" in json_ACPeriodCate:
        return {"count_early": -1, "count_night": -1}

    timeDistri = json_ACPeriodCate["json_timeDistribution"]
    dict_vals = {}
    for item in timeDistri["seriesData"]:
        dict_vals[item["name"]] = item["data"]

    df = DataFrame(dict_vals, index=range(24))

    df["SUM"] = 0
    for col, vals in df.iteritems():
        if col == "SUM":
            break
        df["SUM"] += vals

    count_early = df.loc[6]["dorm"] if "dorm" in df else 0  # 6 am dorm value: total number of early rises
    count_night = sum(df.loc[0:6]["SUM"].tolist()) + df.loc[23]["SUM"]  # total access-control count from 23:00 to 5:00

    return {"count_early": count_early, "count_night": count_night}
Example #10
def get_metadata(data: pd.DataFrame, label: str, source_metadata=None):
    metadata = {}

    for count, (column, values) in enumerate(data.iteritems()):
        options = None
        data_type = values.dtype
        if column == label:
            data_type = "label"
            data[column] = data[column].astype(str)
        elif column == "id" or column == "iid":
            data_type = column
        elif data_type == "object":
            data_type = "categorical"
            options = list(set(values.tolist()))
        elif "int" in str(data_type) or "float" in str(data_type):
            data_type = "numeric"

        desc = {
            "fullname": column,
            "unit": None,
            "short": chars[count],
            "data_type": data_type,
            "options": ",".join(map(str, options)) if options else options,
        }

        metadata[column] = desc

    if source_metadata and "columns" in source_metadata:
        for col, val in source_metadata["columns"].items():
            for k, v in val.items():
                metadata[col][k] = v

    return metadata, data
Example #11
    def fit(self, df: pd.DataFrame):
        obj = {name: series for name, series in df.iteritems()}
        for step in self._steps:
            step.fit(obj)
            obj = step.transform(obj)

        return obj
Example #12
def df_stats(df: pd.DataFrame, top_n=5):
    total = len(df)
    stats = pd.DataFrame(index=df.columns, columns=['dtype', 'distinct-cnt', 'non-null-cnt'])

    for col_name, series in df.iteritems():
        series_dtype = series.dtype
        series_value_counts = series.value_counts()
        dist_cnt = len(series_value_counts)
        non_null_cnt = series.count()

        stats.loc[col_name] = [series_dtype, dist_cnt, non_null_cnt]

        name_str = f"{col_name}({series_dtype})".rjust(25, '-')
        dist_cnt_str = str(dist_cnt).rjust(6, ' ')
        non_null_cnt_str = str(non_null_cnt).rjust(6, ' ')

        print(f"{name_str} : count distinct - {dist_cnt_str} : "
              f"non-null/total - {non_null_cnt_str}/{total} = {non_null_cnt/total:.3f} ")
        if top_n > 0:
            col_value_count_list = [
                "'" + str(c) + "'" + ":" + str(n) for c, n in sorted(
                    series_value_counts.items(),
                    key=lambda kv: kv[1],
                    reverse=True
                )
            ]
            print(", ".join(col_value_count_list[:min(len(col_value_count_list), top_n)]))

    stats['null-cnt'] = total - stats['non-null-cnt']
    stats['non-null-ratio'] = stats['non-null-cnt'] / total
    stats['total'] = total
    return stats
Example #13
    def get_date_trend(self, mode_date):
        """
        :param mode_date: date mode; aggregate to the smallest time unit. 0-day, 1-week, 2-month, 3-Quarter. (default 2)
        """
        axisLabels = self.oriDate[:]
        pointVals = [{copy.deepcopy(oriValue): 1} for oriValue in self.oriValues]

        rule_mode = {'0': 'D', '1': 'W', '2': 'M', '3': 'Q'}

        df = DataFrame(pointVals, index=axisLabels)
        df = df.resample(rule_mode[str(mode_date)], how='sum')
        df = df.fillna(0)

        """Sum of all columns"""
        # cols_name = []
        # for name, col in df.iteritems():
        #     cols_name.append(name)
        # df['SUM'] = 0
        # for i in xrange(len(cols_name)):
        #     df['SUM'] += df[cols_name[i]]

        """Dorm share"""
        # df['PER_DORM'] = df['dorm']/df['SUM'] if 'dorm' in df else 0  # only compute the dorm share when a dorm value exists; otherwise set it to 0

        axisLabels = map(lambda x: x.strftime('%Y-%m-%d'), df.index.tolist())  # turn the dates used as the index into a list of labels
        seriesData = []
        legendLabels = []
        for colName, col in df.iteritems():
            legendLabels.append(colName)
            data = map(lambda x: 0.0 if isnan(x) else float(x), col.tolist())
            seriesData.append({'name': colName, 'data': data})

        json_dateTrend = {'axisLabels': axisLabels, 'legendLabels': legendLabels, 'seriesData': seriesData}
        return json_dateTrend
Example #14
    def fill_old(self, df, year=None):
        """
        Takes an age, sex profile (per capita transfers) in df 
        to fill year 'year' or all years if year is None
        """
        if isinstance(df, DataFrame):
            df1 = df
        else:
            df1 = DataFrame(df)

        for col_name in df1.columns:
            if col_name not in self._types:
                self.new_type(col_name)

        if year is None:
            for yr in sorted(self.index_sets['year']):
                self.fill(df, year=yr)
        else:
            yr = year
            if isinstance(df, DataFrame):
                df1 = df
            else:
                df1 = DataFrame(df)

            for col_name, column in df1.iteritems():
                column = column.reset_index()
                column['year'] = yr
                column = column.set_index(['age', 'sex', 'year'])
                self.update(column)
Example #15
def get_norm_metadata_dict(
    data_df: pd.DataFrame,
    exclude_features: List[str],
    feature_overrides: Dict[str, str],
    max_unique_enum_values: int,
    quantile_size: int,
    quantile_k2_threshold: int,
    skip_box_cox: int,
    skip_quantiles: int,
    skip_preprocessing: bool,
) -> Dict:
    exclude_features = set(exclude_features)
    output = {}
    for col, data in data_df.iteritems():
        if col in exclude_features:
            pass
        else:
            output[col] = _get_single_feature_norm_metadata(
                col,
                list(data),
                feature_overrides,
                max_unique_enum_values,
                quantile_size,
                quantile_k2_threshold,
                skip_box_cox,
                skip_quantiles,
                skip_preprocessing,
            )
    return output
Example #16
def groupby_country_groups(
        df: pd.DataFrame,
        country_groups: pd.DataFrame,
        drop_elements: Optional[List[str]] = None,
        keep_elements: Optional[List[str]] = None) -> pd.DataFrame:
    new_df = []
    country_groups = country_groups.groupby(
        ['countrygroupcode', 'countrygroup'])['areacode'].apply(set)

    for group, codes in country_groups.iteritems():
        countrygroupcode, countrygroup = group

        fltrd = df[df['areacode'].isin(codes)].drop(
            columns=['area', 'areacode'])

        fltrd = fltrd.groupby(
            ['itemcode', 'item', 'elementcode', 'element', 'unit',
             'year'])['value'].apply(list).reset_index()
        fltrd = fltrd.assign(areacode=countrygroupcode).assign(
            area=countrygroup)

        fltrd = fltrd.assign(
            flag=fltrd['value'].apply(lambda x: get_flag(x, codes)))
        fltrd['value'] = fltrd['value'].apply(np.nansum)
        new_df.append(fltrd)

    df = pd.concat(new_df, sort=False).reset_index(drop=True)
    if drop_elements is not None:
        df = df[~df.elementcode.isin(drop_elements)]

    if keep_elements is not None:
        df = df[df.elementcode.isin(keep_elements)]
    df['year'] = df.year.astype('int')
    return df
Example #17
def insert_defaults(table: pd.DataFrame, variables: dict):
    '''Replaces null values (None/NaN) in a DataFrame with the corresponding
    default value from the variables.
    If a default value is not found (is None) the null is not changed
    Parameters
        table:
                Required,   Type DataFrame,
                The table to be searched.
        variables:
                Required,   Type dict,
                The dictionary of Variable objects containing the default values.
    Returns
        updated_table:
            A copy of the table DataFrame with null values replaced with
            default values.
    '''
    #TODO make insert_defaults a Table method
    #TODO there is probably significant room for optimization
    updated_table = table.copy()
    # select columns with specified variables
    table_variables = set(table.columns.values)
    for (var_name, data) in table.iteritems():
        var = variables.get(var_name)
        if var is not None:
            if var.default is not None:
                updated_table[var_name] = data.fillna(value=var.default)
    return updated_table
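A hypothetical usage sketch for insert_defaults, using SimpleNamespace objects as stand-ins for the Variable objects with a .default attribute that the docstring describes:

import pandas as pd
from types import SimpleNamespace

table = pd.DataFrame({'dose': [1.0, None], 'site': ['A', None]})
variables = {'dose': SimpleNamespace(default=0.0),
             'site': SimpleNamespace(default='unknown')}
print(insert_defaults(table, variables))  # NaN/None replaced by 0.0 and 'unknown'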
Example #18
def our_mean_std(df: pd.DataFrame):
    res = {}
    for col, xs in df.iteritems():
        xs = xs.values
        max_length = max(len(x) for x in xs)
        masks = [
            np.concatenate([np.ones(len(x)),
                            np.zeros(max_length - len(x))]) for x in xs
        ]
        xs = [
            np.concatenate([np.array(x),
                            np.zeros(max_length - len(x))]) for x in xs
        ]
        xs = np.concatenate([x[None, :] for x in xs], axis=0)
        masks = np.concatenate([mask[None, :] for mask in masks], axis=0)
        count = np.sum(masks, axis=0)
        mean = np.sum(xs, axis=0) / count
        xs -= mean[None, :]
        masks = masks.astype(bool)
        xs[~masks] = 0
        res[f'{col}_count'] = count.tolist()
        count = count - 1
        single = count == 0
        count[single] = 1
        std = np.sqrt(np.sum(xs**2, axis=0) / count)
        std[single] = 0
        res[f'{col}_mean'] = mean.tolist()
        res[f'{col}_std'] = std.tolist()
    return pd.Series(res)
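A hypothetical usage sketch for our_mean_std, assuming every cell holds a variable-length sequence of numbers; shorter sequences are masked out of the per-position statistics rather than padded into them:

import pandas as pd

df = pd.DataFrame({'x': [[1.0, 2.0], [3.0]]})
print(our_mean_std(df))  # x_count, x_mean and x_std per position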
Example #19
def normalize(train: pn.DataFrame, test: pn.DataFrame):
    for (columnName, columnData) in train.iteritems():
        max, min = train[columnName].max(), train[columnName].min()
        train[columnName] = (
            (train[columnName] - train[columnName].min()) /
            (train[columnName].max() - train[columnName].min()))
        test[columnName] = ((test[columnName] - min) / (max - min))
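A hypothetical usage sketch for normalize (assuming pn is the snippet's pandas alias); both frames are rescaled in place with the training minimum and maximum, so test values can fall outside [0, 1]:

import pandas as pd

train = pd.DataFrame({'a': [0.0, 5.0, 10.0]})
test = pd.DataFrame({'a': [2.5, 12.5]})
normalize(train, test)
print(train['a'].tolist())  # [0.0, 0.5, 1.0]
print(test['a'].tolist())   # [0.25, 1.25]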
Example #20
    def fill_old(self, df, year = None):
        """
        Takes an age, sex profile (per capita transfers) in df 
        to fill year 'year' or all years if year is None
        """
        if isinstance(df, DataFrame):
            df1  = df 
        else:
            df1 = DataFrame(df)

        for col_name in df1.columns:
            if col_name not in self._types:
                self.new_type(col_name)

        if year is None:
            for yr in sorted(self.index_sets['year']):
                self.fill(df, year = yr)
        else:
            yr = year
            if isinstance(df, DataFrame):
                df1  = df 
            else:
                df1 = DataFrame(df)
            
            for col_name, column in df1.iteritems():
                column = column.reset_index()
                column['year'] = yr
                column = column.set_index(['age','sex','year'])
                self.update(column)
Example #21
 def _fit_catboost(
     self,
     X: pd.DataFrame,
     y: pd.Series,
     eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]] = None,
     tree_params: Optional[Dict[str, Any]] = None,
     fit_params: Optional[Dict[str, Any]] = None,
 ) -> 'catboost.CatBoostClassifier':
     if catboost is None:
         raise ImportError('catboost is not installed.')
     # Default settings
     if tree_params is None:
         tree_params = dict(eval_metric='BrierScore',
                            loss_function='Logloss',
                            iterations=100)
     if fit_params is None:
         is_cat_feature = [
             c.dtype.name == 'category' for (_, c) in X.iteritems()
         ]
         fit_params = dict(
             cat_features=np.nonzero(is_cat_feature)[0].tolist(),
             verbose=True,
         )
     if eval_set is not None:
         val_params = dict(early_stopping_rounds=10, eval_set=eval_set)
         fit_params = {**fit_params, **val_params}
     # Train the model
     model = catboost.CatBoostClassifier(**tree_params)
     return model.fit(X, y, **fit_params)
Example #22
def prepare_data(
    dataset_df: pd.DataFrame,
    drop_na: bool = False,
    mean_int: bool = True,
    mean_float: bool = True,
    rescale_float: bool = True,
    standardize_float: bool = True,
) -> None:
    """Fill missing values and standardize float columns.

    :author: Robin Courant
    :param dataset_df: dataset to process.
    :param drop_na: whether to drop every row with at least one `NaN` cell.
    :param mean_int: whether to use the mean or the median for missing integers.
    :param mean_float: whether to use the mean or the median for missing floats.
    :param rescale_float: whether to rescale floats (standardize or normalize).
    :param standardize_float: whether to apply standardization or normalization.
    """
    if drop_na:
        dataset_df.dropna(inplace=True)
        return

    for column_name, column_series in dataset_df.iteritems():
        if is_integer_dtype(column_series):
            if set(column_series.unique()) == {0, 1}:
                dataset_df[column_name] = _prepare_bool(column_series)
            else:
                dataset_df[column_name] = _prepare_int(column_series, mean_int)
        elif is_float_dtype(column_series):
            dataset_df[column_name] = _prepare_float(column_series, mean_float,
                                                     rescale_float,
                                                     standardize_float)
        # Raise an error if the column's type is not boolean, integer or float
        else:
            raise TypeError(f"Unrecognized type, column: {column_name}")
Example #23
    def code_categories(self, data: DataFrame, encoder) -> Tuple[DataFrame, Dict[str, Dict]]:
        """Encoding categorical parameters

            Args:
                data (DataFrame): input dataset
                encoder: any object with fit_transform method

            Returns:
                pd.DataFrame: output dataset with encoded parameters
                dict: dictionary with values and codes
            """
        columns = [col for col in data.columns.to_list() if self.nodes_types[col] == 'disc']
        df = data.copy()  # INPUT DF. Debugging SettingWithCopyWarning
        if not columns:
            return df, None
        data = df[columns]  # DATA TO CATEGORIZE
        encoder_dict = dict()

        for col_name, column in data.iteritems():
            # Iterate over (column name, Series) pairs.
            try:
                df[col_name] = encoder.fit_transform(column.values)
            except TypeError as exc:
                logger_preprocessor.error(f"Wrong data types on {col_name} ({df[col_name].dtypes}). Message: {exc}")
            try:
                mapping = dict(zip(encoder.classes_, range(len(encoder.classes_))))
                encoder_dict[col_name] = mapping
            except:
                pass
        return df, encoder_dict
Example #24
def bert_predictions(tweet: pd.DataFrame, model: ClassificationModel):
    """
    Bert Inference for prediction.
    :param tweet: dataframe with tweets
    :param model: Bert Model
    :return: list of pr
    """
    tweet = tweet.values.tolist()
    try:
        predictions, raw_outputs = model.predict(tweet)
    except:
        for element in tweet:  # tweet is a plain list at this point
            model.predict([element])
        print("STOPP")
    auswertung = collections.Counter(predictions)
    gc.collect()

    # df = pd.DataFrame(raw_outputs)
    # df['predictions'] = pd.DataFrame(predictions)
    # df['tweets'] = pd.DataFrame(tweet)
    # df = df.replace(r'\n', ' ', regex=True)
    # df_softmax = pd.DataFrame(softmax(raw_outputs, axis=1))
    # df['softmax0'] = df_softmax[0]
    # df['softmax1'] = df_softmax[1]
    # db_functions.df_to_sql(df, 'temp_table', 'replace')

    return auswertung
Example #25
def roc_analyze(classes: pandas.DataFrame, norm_data: pandas.DataFrame) -> Dict[str, ROC_curve_data]:
    result = {}
    for name, column in norm_data.iteritems():
        fpr, tpr, threshold = metrics.roc_curve(classes, column)
        roc_auc = metrics.auc(fpr, tpr)
        result[name] = ROC_curve_data(fpr=fpr, tpr=tpr, threshold=threshold, auc=roc_auc)
    return result
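A hypothetical usage sketch for roc_analyze, assuming metrics is sklearn.metrics and ROC_curve_data is a record type defined alongside the snippet:

import pandas as pd

classes = pd.Series([0, 0, 1, 1])
norm_data = pd.DataFrame({'model_a': [0.1, 0.4, 0.35, 0.8]})
curves = roc_analyze(classes, norm_data)
print(curves['model_a'])  # fpr, tpr, threshold and auc for that column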
Example #26
def write_arff_file(dataset: pd.DataFrame,
                    filename="dataset.arff",
                    name="Universities"):
    with open(filename, "w", encoding="utf-8") as file:
        file.write(f"@RELATION {name}\n\n")
        max_len = len(max(dataset.columns, key=len))
        for header in dataset.columns:
            if dataset[header].dtype == np.float64 or dataset[
                    header].dtype == np.int64:
                column_type = "NUMERIC"
            else:
                column_type = "STRING"

            file.write(f"@ATTRIBUTE {header.ljust(max_len)} {column_type}\n")
        file.write("\n@DATA\n")

        for _, column in dataset.iteritems():
            if column.dtype == object:
                pattern = re.compile(r"^(.*)$")
                dataset[column.name] = column.str.replace(pattern, r'"\1"')

        for _, row in dataset.iterrows():
            items = [str(x) for x in row]
            items = [x if x != "nan" else "?" for x in items]
            file.write(f"{', '.join(items)}\n")
Example #27
def convert_units(df: pd.DataFrame) -> pd.DataFrame:
    """Change units of measurement from source to destination standards"""

    rename = dict()

    for col, s in df.iteritems():
        (observed_property, unit_of_measurement) = col

        try:
            conversion = settings.UNIT_MAP[unit_of_measurement][
                observed_property]
        except KeyError:
            LOGGER.error(col)
            raise

        # Map column names
        rename[col] = conversion['label']

        # Calculate conversion
        factor = float(conversion['factor'])
        s = s.mul(factor)

        df[col] = s

    df = df.rename(columns=rename, errors='raise')

    return df
Example #28
def get_product_parent(src: pd.DataFrame) -> tuple:
    src = src['product_parent'].value_counts()
    _product_parent = list()
    _review_count = list()
    for p, r in src.iteritems():
        _product_parent.append(p)
        _review_count.append(r)
    return _product_parent, _review_count
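The loop above only splits value_counts() back into two parallel lists; a minimal sketch of the same result without the explicit iteration (hypothetical data):

import pandas as pd

src = pd.DataFrame({'product_parent': [10, 10, 20]})
counts = src['product_parent'].value_counts()
product_parent, review_count = counts.index.tolist(), counts.tolist()
print(product_parent, review_count)  # [10, 20] [2, 1]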
Example #29
 def __get_best_attribute(self, x: pandas.DataFrame, y: pandas.Series) -> str:
     x_entropy = self.__get_entropy(y)
     information_gains = pandas.Series(dtype=float)
     for attribute, series in x.iteritems():
         attribute_entropy = [(y[series == value].size / y.size) * self.__get_entropy(y[series == value])
                              for value in series.unique()]
         information_gains[attribute] = x_entropy - sum(attribute_entropy)
     return information_gains.idxmax()
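The method picks the attribute with the highest information gain, IG(A) = H(Y) - sum over values v of p(v) * H(Y | A = v). A standalone worked sketch with a toy entropy helper (the class's __get_entropy is assumed to compute the same quantity):

import numpy as np
import pandas as pd

def entropy(y: pd.Series) -> float:
    p = y.value_counts(normalize=True)
    return float(-(p * np.log2(p)).sum())

y = pd.Series(['yes', 'yes', 'no', 'no'])
a = pd.Series(['hot', 'hot', 'cold', 'hot'])
gain = entropy(y) - sum(
    (y[a == v].size / y.size) * entropy(y[a == v]) for v in a.unique()
)
print(round(gain, 2))  # 0.31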
Example #30
def get_unique_elements(df: pd.DataFrame) -> np.ndarray:
    """Returns all unique elements found in a multiple
    sequence alignment."""
    U = np.array([])
    for name, seq in df.iteritems():
        U = np.append(U, seq.unique())

    return np.unique(U)
Example #31
 def asset_beta(self, df: pandas.DataFrame, market_asset: str):
     import numpy as np
     beta_matrix = {}
     for index, col in df.iteritems():
         beta = df[[index, market_asset
                    ]].cov().iloc[0, 1] / df[market_asset].var()
         beta_matrix[index] = beta
     return beta_matrix
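The loop computes beta_i = Cov(r_i, r_market) / Var(r_market) for every column. A minimal standalone sketch for a single asset, with hypothetical column names and toy returns:

import pandas as pd

df = pd.DataFrame({'mkt': [0.01, -0.02, 0.03], 'aapl': [0.02, -0.01, 0.04]})
beta = df[['aapl', 'mkt']].cov().iloc[0, 1] / df['mkt'].var()
print(beta)  # 1.0 here, since the toy series differ only by a constant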
Example #32
def aggregate_review(review_list):
    """
    Aggregate the review records
    """
    aggregate = {}
    df = DataFrame(review_list)
    for col, item in df.iteritems():
        aggregate[col] = df.groupby(col).size().to_dict()
    return aggregate
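A hypothetical usage sketch for aggregate_review (relying on the snippet's own DataFrame import); each review field is counted independently:

reviews = [{'score': 5, 'tag': 'ok'}, {'score': 5, 'tag': 'bad'}]
print(aggregate_review(reviews))  # {'score': {5: 2}, 'tag': {'bad': 1, 'ok': 1}}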
Example #33
 def sharpe_ratio(self, df: pandas.DataFrame, market_asset: str):
     import numpy as np
     sharpe_matrix = {}
     for index, col in df.iteritems():
         sharpe_ratio = np.sqrt(250) * (
             df[index].mean() -
             self.risk_free_rate / 250) / df[index].std()
         sharpe_matrix[index] = sharpe_ratio
     return sharpe_matrix
Example #34
def drop_outliers(features: pd.DataFrame) -> pd.DataFrame:
    outliers = set()
    for col, vals in features.iteritems():
        lower, med, upper = np.percentile(vals, [25, 50, 75])
        scale = np.abs(upper - lower)
        outliers.update(vals[(vals < med - 3 * scale) |
                             (vals > med + 3 * scale)].index)
    LOG.info("%d outliers removed", len(outliers))
    return features.drop(outliers, axis=0)
Example #35
def rolling_mean(data, window, min_periods=1, center=False):
    ''' Function that computes a rolling mean

    Parameters
    ----------
    data : DataFrame or Series
           If a DataFrame is passed, the rolling_mean is computed for all columns.
    window : int or string
             If int is passed, window is the number of observations used for calculating 
             the statistic, as defined by the function pd.rolling_mean()
             If a string is passed, it must be a frequency string, e.g. '90S'. This is
             internally converted into a DateOffset object, representing the window size.
    min_periods : int
                  Minimum number of observations in window required to have a value.

    Returns
    -------
    Series or DataFrame, if more than one column    
    '''
    def f(x):
        '''Function to apply that actually computes the rolling mean'''
        if center == False:
            dslice = col[x-pd.datetools.to_offset(window).delta+timedelta(0,0,1):x]
                # adding a microsecond because when slicing with labels start and endpoint
                # are inclusive
        else:
            dslice = col[x-pd.datetools.to_offset(window).delta/2+timedelta(0,0,1):
                         x+pd.datetools.to_offset(window).delta/2]
        if dslice.size < min_periods:
            return np.nan
        else:
            return dslice.mean()

    data = DataFrame(data.copy())
    dfout = DataFrame()
    if isinstance(window, int):
        dfout = pd.rolling_mean(data, window, min_periods=min_periods, center=center)
    elif isinstance(window, basestring):
        idx = Series(data.index.to_pydatetime(), index=data.index)
        for colname, col in data.iteritems():
            result = idx.apply(f)
            result.name = colname
            dfout = dfout.join(result, how='outer')
    if dfout.columns.size == 1:
        dfout = dfout.ix[:,0]
    return dfout
Example #36
    def test_sequence_like_with_categorical(self):

        # GH 7839
        # make sure can iterate
        df = DataFrame({"id": [1, 2, 3, 4, 5, 6],
                        "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']})
        df['grade'] = Categorical(df['raw_grade'])

        # basic sequencing testing
        result = list(df.grade.values)
        expected = np.array(df.grade.values).tolist()
        tm.assert_almost_equal(result, expected)

        # iteration
        for t in df.itertuples(index=False):
            str(t)

        for row, s in df.iterrows():
            str(s)

        for c, col in df.iteritems():
            str(col)
Example #37
    def get_time_distribution(self):
        dates = self.oriDate[:]
        values = [{copy.deepcopy(oriValue): 1} for oriValue in self.oriValues]

        # Build the time-point and axis-label lists.
        periods = []
        axisLabels = []
        for i in xrange(24):
            periods.append(time(i))
            axisLabels.append(str(i) + u'点~' + str((i + 1) % 24) + u'点')

        # Time-point list -> time-interval list.
        periodRanges = []
        for i in xrange(len(periods)):
            periodRange = [periods[i], periods[(i + 1) % len(periods)]]
            periodRanges.append(periodRange)

        lTimes = map(lambda d: d.time(), dates)  # Keep time.
        vals = []  # Init vals
        for i in xrange(len(periods)):
            vals.append({})

        # Add to total vals.
        for i in xrange(len(lTimes)):
            for j in xrange(len(periodRanges)):
                if periodRanges[j][0] <= lTimes[i] < periodRanges[j][1]:
                    vals[j + 1] = helpers.mergeDict(vals[j + 1], values[i])

        df = DataFrame(vals)

        seriesData = []
        legendLabels = []
        for colName, col in df.iteritems():
            legendLabels.append(colName)
            data = map(lambda x: 0 if isnan(x) else int(x), col.tolist())
            seriesData.append({'name': colName, 'data': data})

        json_timeDistribution = {'axisLabels': axisLabels, 'legendLabels': legendLabels, 'seriesData': seriesData}
        return json_timeDistribution
Example #38
def upsert_unique_indices(apps, schema_editor):
    datapoint_values_list = ['id','created_at','indicator_id','location_id','campaign_id','data_date']
    historical_dps = DataFrame(list(DataPoint.objects.all()\
        .values_list('id','created_at','indicator_id','location_id','campaign_id','data_date')), columns=datapoint_values_list)
    # create the unique index
    historical_dps = historical_dps.apply(add_unique_index, axis=1)

    # group by and max on created at, get the most recent upload
    historical_dps = historical_dps.sort("created_at", ascending=False).groupby("unique_index", as_index=False).first()

    # get the ids into a list and select them
    dps_to_update = DataPoint.objects.filter(id__in=list(historical_dps['id']))
    print 'dps to update'
    print len(dps_to_update)
    # then run a query and update each
    for dp in dps_to_update:
        unique_index = historical_dps[historical_dps['id'] == dp.id].iloc[0]['unique_index']
        dp.unique_index = unique_index
        dp.save()
    
    # delete all the other duplicates
    dps_to_delete = DataPoint.objects.all().exclude(id__in=list(historical_dps['id']))
    print 'dps_to_delete'
    print len(dps_to_delete)
    dps_to_delete.delete()


    dataframe_columns = ['id','created_at','indicator_id','location_id','campaign_id','data_date', 'unique_index']
    
    # make sure there aren't duplicate dps now.
    all_dps = DataFrame(list(DataPoint.objects.all()\
        .values_list('unique_index')), columns=['unique_index'])

    all_dps = all_dps.groupby('unique_index').size()

    for idx, dp in all_dps.iteritems():
        if dp != 1:
            raise Exception("there are duplicate datapoints")
Example #39
def run_clinical_real(cancer, clinical, data_path, gene_sets,
                      survival_tests, real_variables, binary_variables,
                      data_type='expression', drop_pc=False):
    
    if data_type == 'expression':
        data_matrix = read_rnaSeq(cancer, data_path)
        data_matrix = data_matrix.groupby(by=lambda n: n.split('|')[0]).mean()
    elif data_type == 'expression_array':
        data_matrix = read_mrna(cancer, data_path)
    elif data_type == 'methylation':
        data_matrix = read_methylation(cancer, data_path)
    if drop_pc:
        data_matrix = drop_first_norm_pc(data_matrix)
    pc = dict((p, extract_pc(data_matrix.ix[g])) for p, g in 
              gene_sets.iteritems())
    pc = DataFrame(dict((p, (v - v.mean()) / v.std()) for p,v in pc.iteritems() if 
                   type(v) != type(None))).T
    #clinical['pc'] = extract_pc(data_matrix.dropna(), pc_threshold=0)
    tests  = get_tests(clinical, survival_tests, real_variables, 
                       binary_variables, var_type='real')
    #return locals()
    p_pathways, q_pathways = run_tests(tests, pc)
    return locals()
Example #40
class FrameParser(Parser):
    _default_orient = 'columns'
    _split_keys = ('columns', 'index', 'data')

    def _parse_numpy(self):

        json = self.json
        orient = self.orient

        if orient == "columns":
            args = loads(json, dtype=None, numpy=True, labelled=True,
                         precise_float=self.precise_float)
            if args:
                args = (args[0].T, args[2], args[1])
            self.obj = DataFrame(*args)
        elif orient == "split":
            decoded = loads(json, dtype=None, numpy=True,
                            precise_float=self.precise_float)
            decoded = dict((str(k), v) for k, v in compat.iteritems(decoded))
            self.check_keys_split(decoded)
            self.obj = DataFrame(**decoded)
        elif orient == "values":
            self.obj = DataFrame(loads(json, dtype=None, numpy=True,
                                       precise_float=self.precise_float))
        else:
            self.obj = DataFrame(*loads(json, dtype=None, numpy=True,
                                        labelled=True,
                                        precise_float=self.precise_float))

    def _parse_no_numpy(self):

        json = self.json
        orient = self.orient

        if orient == "columns":
            self.obj = DataFrame(
                loads(json, precise_float=self.precise_float), dtype=None)
        elif orient == "split":
            decoded = dict((str(k), v)
                           for k, v in compat.iteritems(loads(
                               json,
                               precise_float=self.precise_float)))
            self.check_keys_split(decoded)
            self.obj = DataFrame(dtype=None, **decoded)
        elif orient == "index":
            self.obj = DataFrame(
                loads(json, precise_float=self.precise_float), dtype=None).T
        else:
            self.obj = DataFrame(
                loads(json, precise_float=self.precise_float), dtype=None)

    def _process_converter(self, f, filt=None):
        """ take a conversion function and possibly recreate the frame """

        if filt is None:
            filt = lambda col, c: True

        needs_new_obj = False
        new_obj = dict()
        for i, (col, c) in enumerate(self.obj.iteritems()):
            if filt(col, c):
                new_data, result = f(col, c)
                if result:
                    c = new_data
                    needs_new_obj = True
            new_obj[i] = c

        if needs_new_obj:

            # possibly handle dup columns
            new_obj = DataFrame(new_obj, index=self.obj.index)
            new_obj.columns = self.obj.columns
            self.obj = new_obj

    def _try_convert_types(self):
        if self.obj is None:
            return
        if self.convert_dates:
            self._try_convert_dates()

        self._process_converter(
            lambda col, c: self._try_convert_data(col, c, convert_dates=False))

    def _try_convert_dates(self):
        if self.obj is None:
            return

        # our columns to parse
        convert_dates = self.convert_dates
        if convert_dates is True:
            convert_dates = []
        convert_dates = set(convert_dates)

        def is_ok(col):
            """ return if this col is ok to try for a date parse """
            if not isinstance(col, compat.string_types):
                return False

            col_lower = col.lower()
            if (col_lower.endswith('_at') or
                    col_lower.endswith('_time') or
                    col_lower == 'modified' or
                    col_lower == 'date' or
                    col_lower == 'datetime' or
                    col_lower.startswith('timestamp')):
                return True
            return False

        self._process_converter(
            lambda col, c: self._try_convert_to_date(c),
            lambda col, c: ((self.keep_default_dates and is_ok(col)) or
                            col in convert_dates))
Example #41
def get_sensitivity_analysis(extracts, points, statics, initials, pickle=None):

    temps = range(-5, 6)
    all_pct = [x * 0.1 for x in range(5, 16)]
    ndvi_range = linspace(0.9, 1.7, 11)
    ndvi_range = array([round_to_value(x, 0.05) for x in ndvi_range])
    var_arrs = []
    y = 0
    for x in range(0, 6):
        ones_ = ones((5, 11), dtype=float)
        zeros = [x * 0.0 for x in range(5, 16)]
        norm_ndvi = array([1.25 for x in zeros])
        if y == 0:
            arr = insert(ones_, y, temps, axis=0)
            arr = insert(arr, 4, norm_ndvi, axis=0)
            arr = arr[0:6]
            var_arrs.append(arr)
            arr = []
        elif y == 4:
            arr = insert(ones_, 0, zeros, axis=0)
            arr = insert(arr, y, ndvi_range, axis=0)
            arr = arr[0:6]
            var_arrs.append(arr)
            print 'shape arr: {}'.format(arr.shape)
            arr = []
        elif y == 5:
            arr = insert(ones_, 0, zeros, axis=0)
            arr = insert(arr, 4, norm_ndvi, axis=0)
            arr = arr[0:5]
            arr = insert(arr, y, all_pct, axis=0)
            var_arrs.append(arr)
            arr = []
        else:
            arr = insert(ones_, 0, zeros, axis=0)
            arr = insert(arr, y, all_pct, axis=0)
            arr = insert(arr, 4, norm_ndvi, axis=0)
            arr = arr[0:6]
            var_arrs.append(arr)
            arr = []
        y += 1

    print 'variable arrays: {}'.format(var_arrs)
    normalize_list = [2, 0.20, 0.20, 2, 0.20, 0.50]

    # site_list = ['Bateman', 'Navajo_Whiskey_Ck', 'Quemazon', 'Sierra_Blanca', 'SB_1', 'SB_2', 'SB_4', 'SB_5', 'VC_1',
    #              'VC_2', 'VC_3', 'CH_1', 'CH_3', 'MG_1', 'MG_2', 'WHLR_PK', 'LP', 'South_Baldy',
    #              'Water_Canyon', 'La_Jencia', 'Socorro']

    site_list = ['Sierra_Blanca', 'Great_Western_Mine', 'Bonito', 'Nogal']
    df = DataFrame(columns=FACTORS, index=site_list)
    df_norm = DataFrame(columns=FACTORS, index=site_list)

    site_dict = {'Sierra_Blanca': {}, 'Great_Western_Mine': {}, 'Bonito': {}, 'Nogal': {}}
    ds = Open(points)
    lyr = ds.GetLayer()
    # defs = lyr.GetLayerDefn()

    for j, feat in enumerate(lyr):
        name = feat.GetField("Name")
        name = name.replace(' ', '_')
        geom = feat.GetGeometryRef()
        mx, my = int(geom.GetX()), int(geom.GetY())
        site_dict[name]['Coords'] = '{} {}'.format(mx, my)
        file_name = os.path.join(extracts, '{}.csv'.format(name))
        print file_name
        site_dict[name]['etrm'] = get_etrm_time_series(file_name, single_file=True)

    # print 'site dict before running etrm: {}'.format(site_dict)

    for i, var_arr in enumerate(var_arrs):
        factor = FACTORS[i]
        print 'running modified factor: {}'.format(factor)
        print ''
        for key, val in site_dict.iteritems():
            print '\n site: {} \n '.format(key)
            results = []
            for col in var_arr.T:
                etrm = Processes(SIMULATION_PERIOD, static_inputs=statics, initial_inputs=initials,
                                 output_root=pickle, point_dict=site_dict)
                tracker = etrm.run(point_dict=site_dict, point_dict_key=key, sensitivity_matrix_column=col,
                                   sensitivity=True)

                # print 'tracker: {}'.format(tracker)
                results.append(tracker['tot_infil'][-1])
                print 'total infil: {} \n results: {}'.format(tracker['tot_infil'][-1], results)

            df.iloc[site_list.index(key), FACTORS.index(factor)] = divide(array(results), 14.0)
        print 'df after site {}: \n {}'.format(key, df)
    print 'df: {}'.format(df)

    # tot_data : precip, et, tot_transp, tot_evap, infil, runoff, snow_fall, cum_mass, end_mass

    # "SI = [Q(Po + delP] -Q(Po - delP] / (2 * delP)"
    # where SI = Sensitivity Index, Q = recharge, Po = base value of input parameter,
    # delP = change in value input
    # find sensitivity index

    xx = 0
    for param in df.iteritems():
        data_cube = param[1]
        var_arr = var_arrs[xx]
        yy = 0
        for site in data_cube:
            site_name = site_list[yy]
            normal = normalize_list[xx]
            site_obj = [x for x in site]
            sens_list = []
            zz = 0
            for var in var_arr[xx]:
                if var != var_arr[xx][5]:
                    base = var_arr[xx][5]
                    deltap = var - base
                    obj = site_obj[zz]
                    sen = ((obj * (base + deltap) - obj * (base - deltap)) / (2 * deltap)) * normal
                    sens_list.append(sen)
                    zz += 1
            sens_list = array(sens_list)
            df_norm.iloc[site_list.index(site_name), FACTORS.index(param[0])] = sens_list
            if yy == 20:
                print 'done'
                break
            yy += 1
        xx += 1

    # why not save the data as pickle, so we don't have to do the analysis each time
    # we debug the plotting

    df.to_pickle(os.path.join(pickle, '_basic_sensitivity_2.pkl'))
    df_norm.to_pickle(os.path.join(pickle, 'norm_sensitivity_2.pkl'))
Example #42
class FrameParser(Parser):
    _default_orient = 'columns'

    def _parse(self):

        json = self.json
        dtype = self.dtype
        orient = self.orient
        numpy = self.numpy

        if numpy:
            try:
                if orient == "columns":
                    args = loads(json, dtype=dtype, numpy=True, labelled=True)
                    if args:
                        args = (args[0].T, args[2], args[1])
                    self.obj = DataFrame(*args)
                elif orient == "split":
                    decoded = loads(json, dtype=dtype, numpy=True)
                    decoded = dict((str(k), v) for k, v in decoded.iteritems())
                    self.obj = DataFrame(**decoded)
                elif orient == "values":
                    self.obj = DataFrame(loads(json, dtype=dtype, numpy=True))
                else:
                    self.obj = DataFrame(*loads(json, dtype=dtype, numpy=True,
                                         labelled=True))
            except ValueError:
                numpy = False

        if not numpy:
            if orient == "columns":
                self.obj = DataFrame(loads(json), dtype=dtype)
            elif orient == "split":
                decoded = dict((str(k), v)
                               for k, v in loads(json).iteritems())
                self.obj = DataFrame(dtype=dtype, **decoded)
            elif orient == "index":
                self.obj = DataFrame(loads(json), dtype=dtype).T
            else:
                self.obj = DataFrame(loads(json), dtype=dtype)

    def _convert_axes(self):
        """ try to convert the axes if they are datelike """
        if self.orient == 'columns':
            axis = 'index'
        elif self.orient == 'index':
            axis = 'columns'
        else:
            return

        try:
            a = getattr(self.obj,axis)
            setattr(self.obj,axis,self._try_parse_to_date(a))
        except:
            pass

    def _try_parse_dates(self):
        if self.obj is None: return

        # our columns to parse
        parse_dates = self.parse_dates
        if parse_dates is True:
            parse_dates = []
        parse_dates = set(parse_dates)

        def is_ok(col):
            """ return if this col is ok to try for a date parse """
            if not isinstance(col, basestring): return False

            if (col.endswith('_at') or
                col.endswith('_time') or
                col.lower() == 'modified' or
                col.lower() == 'date' or
                col.lower() == 'datetime'):
                    return True
            return False


        for col, c in self.obj.iteritems():
            if (self.keep_default_dates and is_ok(col)) or col in parse_dates:
                self.obj[col] = self._try_parse_to_date(c)
Example #43
class Iteration:
    # mem_itertuples_* benchmarks are slow
    timeout = 120

    def setup(self):
        N = 1000
        self.df = DataFrame(np.random.randn(N * 10, N))
        self.df2 = DataFrame(np.random.randn(N * 50, 10))
        self.df3 = DataFrame(np.random.randn(N, 5 * N),
                             columns=['C' + str(c) for c in range(N * 5)])
        self.df4 = DataFrame(np.random.randn(N * 1000, 10))

    def time_iteritems(self):
        # (monitor no-copying behaviour)
        if hasattr(self.df, '_item_cache'):
            self.df._item_cache.clear()
        for name, col in self.df.iteritems():
            pass

    def time_iteritems_cached(self):
        for name, col in self.df.iteritems():
            pass

    def time_iteritems_indexing(self):
        for col in self.df3:
            self.df3[col]

    def time_itertuples_start(self):
        self.df4.itertuples()

    def time_itertuples_read_first(self):
        next(self.df4.itertuples())

    def time_itertuples(self):
        for row in self.df4.itertuples():
            pass

    def time_itertuples_to_list(self):
        list(self.df4.itertuples())

    def mem_itertuples_start(self):
        return self.df4.itertuples()

    def peakmem_itertuples_start(self):
        self.df4.itertuples()

    def mem_itertuples_read_first(self):
        return next(self.df4.itertuples())

    def peakmem_itertuples(self):
        for row in self.df4.itertuples():
            pass

    def mem_itertuples_to_list(self):
        return list(self.df4.itertuples())

    def peakmem_itertuples_to_list(self):
        list(self.df4.itertuples())

    def time_itertuples_raw_start(self):
        self.df4.itertuples(index=False, name=None)

    def time_itertuples_raw_read_first(self):
        next(self.df4.itertuples(index=False, name=None))

    def time_itertuples_raw_tuples(self):
        for row in self.df4.itertuples(index=False, name=None):
            pass

    def time_itertuples_raw_tuples_to_list(self):
        list(self.df4.itertuples(index=False, name=None))

    def mem_itertuples_raw_start(self):
        return self.df4.itertuples(index=False, name=None)

    def peakmem_itertuples_raw_start(self):
        self.df4.itertuples(index=False, name=None)

    def peakmem_itertuples_raw_read_first(self):
        next(self.df4.itertuples(index=False, name=None))

    def peakmem_itertuples_raw(self):
        for row in self.df4.itertuples(index=False, name=None):
            pass

    def mem_itertuples_raw_to_list(self):
        return list(self.df4.itertuples(index=False, name=None))

    def peakmem_itertuples_raw_to_list(self):
        list(self.df4.itertuples(index=False, name=None))

    def time_iterrows(self):
        for row in self.df.iterrows():
            pass
Example #44
class TestHashing(tm.TestCase):

    _multiprocess_can_split_ = True

    def setUp(self):
        self.df = DataFrame(
            {'i32': np.array([1, 2, 3] * 3, dtype='int32'),
             'f32': np.array([None, 2.5, 3.5] * 3, dtype='float32'),
             'cat': Series(['a', 'b', 'c'] * 3).astype('category'),
             'obj': Series(['d', 'e', 'f'] * 3),
             'bool': np.array([True, False, True] * 3),
             'dt': Series(pd.date_range('20130101', periods=9)),
             'dt_tz': Series(pd.date_range('20130101', periods=9,
                                           tz='US/Eastern')),
             'td': Series(pd.timedelta_range('2000', periods=9))})

    def test_consistency(self):
        # check that our hash doesn't change because of a mistake
        # in the actual code; this is the ground truth
        result = hash_pandas_object(Index(['foo', 'bar', 'baz']))
        expected = Series(np.array([3600424527151052760, 1374399572096150070,
                                    477881037637427054], dtype='uint64'),
                          index=['foo', 'bar', 'baz'])
        tm.assert_series_equal(result, expected)

    def test_hash_array(self):
        for name, s in self.df.iteritems():
            a = s.values
            tm.assert_numpy_array_equal(hash_array(a), hash_array(a))

    def check_equal(self, obj, **kwargs):
        a = hash_pandas_object(obj, **kwargs)
        b = hash_pandas_object(obj, **kwargs)
        tm.assert_series_equal(a, b)

        kwargs.pop('index', None)
        a = hash_pandas_object(obj, **kwargs)
        b = hash_pandas_object(obj, **kwargs)
        tm.assert_series_equal(a, b)

    def check_not_equal_with_index(self, obj):

        # check that we are not hashing the same if
        # we include the index
        if not isinstance(obj, Index):
            a = hash_pandas_object(obj, index=True)
            b = hash_pandas_object(obj, index=False)
            self.assertFalse((a == b).all())

    def test_hash_pandas_object(self):

        for obj in [Series([1, 2, 3]),
                    Series([1.0, 1.5, 3.2]),
                    Series([1.0, 1.5, np.nan]),
                    Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]),
                    Series(['a', 'b', 'c']),
                    Series(['a', np.nan, 'c']),
                    Series(['a', None, 'c']),
                    Series([True, False, True]),
                    Index([1, 2, 3]),
                    Index([True, False, True]),
                    DataFrame({'x': ['a', 'b', 'c'], 'y': [1, 2, 3]}),
                    tm.makeMissingDataframe(),
                    tm.makeMixedDataFrame(),
                    tm.makeTimeDataFrame(),
                    tm.makeTimeSeries(),
                    tm.makeTimedeltaIndex()]:
            self.check_equal(obj)
            self.check_not_equal_with_index(obj)

    def test_hash_pandas_object2(self):
        for name, s in self.df.iteritems():
            self.check_equal(s)
            self.check_not_equal_with_index(s)

    def test_hash_pandas_empty_object(self):
        for obj in [Series([], dtype='float64'),
                    Series([], dtype='object'),
                    Index([])]:
            self.check_equal(obj)

            # these are by-definition the same with
            # or w/o the index as the data is empty

    def test_errors(self):

        for obj in [pd.Timestamp('20130101'), tm.makePanel()]:
            def f():
                hash_pandas_object(obj)

            self.assertRaises(TypeError, f)

    def test_hash_keys(self):
        # using different hash keys, should have different hashes
        # for the same data

        # this only matters for object dtypes
        obj = Series(list('abc'))
        a = hash_pandas_object(obj, hash_key='9876543210123456')
        b = hash_pandas_object(obj, hash_key='9876543210123465')
        self.assertTrue((a != b).all())

    def test_invalid_key(self):
        # this only matters for object dtypes
        def f():
            hash_pandas_object(Series(list('abc')), hash_key='foo')
        self.assertRaises(ValueError, f)

    def test_unsupported_objects(self):

        # mixed objects are not supported
        obj = Series(['1', 2, 3])

        def f():
            hash_pandas_object(obj)
        self.assertRaises(TypeError, f)

        # MultiIndex are represented as tuples
        obj = Series([1, 2, 3], index=pd.MultiIndex.from_tuples(
            [('a', 1), ('a', 2), ('b', 1)]))

        def f():
            hash_pandas_object(obj)
        self.assertRaises(TypeError, f)

    def test_alread_encoded(self):
        # if already encoded then ok

        obj = Series(list('abc')).str.encode('utf8')
        self.check_equal(obj)

    def test_alternate_encoding(self):

        obj = Series(list('abc'))
        self.check_equal(obj, encoding='ascii')

    def test_long_strings(self):

        obj = Index(tm.rands_array(nchars=10000, size=100))
        self.check_equal(obj)
Example #45
class TestHashing(tm.TestCase):

    _multiprocess_can_split_ = True

    def setUp(self):
        self.df = DataFrame(
            {'i32': np.array([1, 2, 3] * 3, dtype='int32'),
             'f32': np.array([None, 2.5, 3.5] * 3, dtype='float32'),
             'cat': Series(['a', 'b', 'c'] * 3).astype('category'),
             'obj': Series(['d', 'e', 'f'] * 3),
             'bool': np.array([True, False, True] * 3),
             'dt': Series(pd.date_range('20130101', periods=9)),
             'dt_tz': Series(pd.date_range('20130101', periods=9,
                                           tz='US/Eastern')),
             'td': Series(pd.timedelta_range('2000', periods=9))})

    def test_consistency(self):
        # check that our hash doesn't change because of a mistake
        # in the actual code; this is the ground truth
        result = hash_pandas_object(Index(['foo', 'bar', 'baz']))
        expected = Series(np.array([3600424527151052760, 1374399572096150070,
                                    477881037637427054], dtype='uint64'),
                          index=['foo', 'bar', 'baz'])
        tm.assert_series_equal(result, expected)

    def test_hash_array(self):
        for name, s in self.df.iteritems():
            a = s.values
            tm.assert_numpy_array_equal(hash_array(a), hash_array(a))

    def check_equal(self, obj, **kwargs):
        a = hash_pandas_object(obj, **kwargs)
        b = hash_pandas_object(obj, **kwargs)
        tm.assert_series_equal(a, b)

        kwargs.pop('index', None)
        a = hash_pandas_object(obj, **kwargs)
        b = hash_pandas_object(obj, **kwargs)
        tm.assert_series_equal(a, b)

    def check_not_equal_with_index(self, obj):

        # check that we are not hashing the same if
        # we include the index
        if not isinstance(obj, Index):
            a = hash_pandas_object(obj, index=True)
            b = hash_pandas_object(obj, index=False)
            self.assertFalse((a == b).all())

    def test_hash_pandas_object(self):

        for obj in [Series([1, 2, 3]),
                    Series([1.0, 1.5, 3.2]),
                    Series([1.0, 1.5, np.nan]),
                    Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]),
                    Series(['a', 'b', 'c']),
                    Series(['a', np.nan, 'c']),
                    Series(['a', None, 'c']),
                    Series([True, False, True]),
                    Index([1, 2, 3]),
                    Index([True, False, True]),
                    DataFrame({'x': ['a', 'b', 'c'], 'y': [1, 2, 3]}),
                    tm.makeMissingDataframe(),
                    tm.makeMixedDataFrame(),
                    tm.makeTimeDataFrame(),
                    tm.makeTimeSeries(),
                    tm.makeTimedeltaIndex()]:
            self.check_equal(obj)
            self.check_not_equal_with_index(obj)

    def test_hash_pandas_object2(self):
        for name, s in self.df.iteritems():
            self.check_equal(s)
            self.check_not_equal_with_index(s)

    def test_hash_pandas_empty_object(self):
        for obj in [Series([], dtype='float64'),
                    Series([], dtype='object'),
                    Index([])]:
            self.check_equal(obj)

            # these are by definition the same with
            # or without the index as the data is empty

    def test_categorical_consistency(self):
        # GH15143
        # Check that categoricals hash consistently with their values, not their codes
        # This should work for categoricals of any dtype
        for s1 in [Series(['a', 'b', 'c', 'd']),
                   Series([1000, 2000, 3000, 4000]),
                   Series(pd.date_range(0, periods=4))]:
            s2 = s1.astype('category').cat.set_categories(s1)
            s3 = s2.cat.set_categories(list(reversed(s1)))
            for categorize in [True, False]:
                # These should all hash identically
                h1 = hash_pandas_object(s1, categorize=categorize)
                h2 = hash_pandas_object(s2, categorize=categorize)
                h3 = hash_pandas_object(s3, categorize=categorize)
                tm.assert_series_equal(h1, h2)
                tm.assert_series_equal(h1, h3)

    def test_errors(self):

        for obj in [pd.Timestamp('20130101'), tm.makePanel()]:
            def f():
                hash_pandas_object(obj)

            self.assertRaises(TypeError, f)

    def test_hash_keys(self):
        # using different hash keys should produce different hashes
        # for the same data

        # this only matters for object dtypes
        obj = Series(list('abc'))
        a = hash_pandas_object(obj, hash_key='9876543210123456')
        b = hash_pandas_object(obj, hash_key='9876543210123465')
        self.assertTrue((a != b).all())

    def test_invalid_key(self):
        # this only matters for object dtypes
        def f():
            hash_pandas_object(Series(list('abc')), hash_key='foo')
        self.assertRaises(ValueError, f)

    def test_unsupported_objects(self):

        # mixed objects are not supported
        obj = Series(['1', 2, 3])

        def f():
            hash_pandas_object(obj)
        self.assertRaises(TypeError, f)

        # MultiIndex is represented as tuples
        obj = Series([1, 2, 3], index=pd.MultiIndex.from_tuples(
            [('a', 1), ('a', 2), ('b', 1)]))

        def f():
            hash_pandas_object(obj)
        self.assertRaises(TypeError, f)

    def test_already_encoded(self):
        # if already encoded then ok

        obj = Series(list('abc')).str.encode('utf8')
        self.check_equal(obj)

    def test_alternate_encoding(self):

        obj = Series(list('abc'))
        self.check_equal(obj, encoding='ascii')

    def test_same_len_hash_collisions(self):

        for l in range(8):
            length = 2**(l + 8) + 1
            s = tm.rands_array(length, 2)
            result = hash_array(s, 'utf8')
            self.assertFalse(result[0] == result[1])

        for l in range(8):
            length = 2**(l + 8)
            s = tm.rands_array(length, 2)
            result = hash_array(s, 'utf8')
            self.assertFalse(result[0] == result[1])

    def test_hash_collisions(self):

        # hash collisions are bad
        # https://github.com/pandas-dev/pandas/issues/14711#issuecomment-264885726
        L = ['Ingrid-9Z9fKIZmkO7i7Cn51Li34pJm44fgX6DYGBNj3VPlOH50m7HnBlPxfIwFMrcNJNMP6PSgLmwWnInciMWrCSAlLEvt7JkJl4IxiMrVbXSa8ZQoVaq5xoQPjltuJEfwdNlO6jo8qRRHvD8sBEBMQASrRa6TsdaPTPCBo3nwIBpE7YzzmyH0vMBhjQZLx1aCT7faSEx7PgFxQhHdKFWROcysamgy9iVj8DO2Fmwg1NNl93rIAqC3mdqfrCxrzfvIY8aJdzin2cHVzy3QUJxZgHvtUtOLxoqnUHsYbNTeq0xcLXpTZEZCxD4PGubIuCNf32c33M7HFsnjWSEjE2yVdWKhmSVodyF8hFYVmhYnMCztQnJrt3O8ZvVRXd5IKwlLexiSp4h888w7SzAIcKgc3g5XQJf6MlSMftDXm9lIsE1mJNiJEv6uY6pgvC3fUPhatlR5JPpVAHNSbSEE73MBzJrhCAbOLXQumyOXigZuPoME7QgJcBalliQol7YZ9',  # noqa
             'Tim-b9MddTxOWW2AT1Py6vtVbZwGAmYCjbp89p8mxsiFoVX4FyDOF3wFiAkyQTUgwg9sVqVYOZo09Dh1AzhFHbgij52ylF0SEwgzjzHH8TGY8Lypart4p4onnDoDvVMBa0kdthVGKl6K0BDVGzyOXPXKpmnMF1H6rJzqHJ0HywfwS4XYpVwlAkoeNsiicHkJUFdUAhG229INzvIAiJuAHeJDUoyO4DCBqtoZ5TDend6TK7Y914yHlfH3g1WZu5LksKv68VQHJriWFYusW5e6ZZ6dKaMjTwEGuRgdT66iU5nqWTHRH8WSzpXoCFwGcTOwyuqPSe0fTe21DVtJn1FKj9F9nEnR9xOvJUO7E0piCIF4Ad9yAIDY4DBimpsTfKXCu1vdHpKYerzbndfuFe5AhfMduLYZJi5iAw8qKSwR5h86ttXV0Mc0QmXz8dsRvDgxjXSmupPxBggdlqUlC828hXiTPD7am0yETBV0F3bEtvPiNJfremszcV8NcqAoARMe']  # noqa

        # these should be different!
        result1 = hash_array(np.asarray(L[0:1], dtype=object), 'utf8')
        expected1 = np.array([14963968704024874985], dtype=np.uint64)
        self.assert_numpy_array_equal(result1, expected1)

        result2 = hash_array(np.asarray(L[1:2], dtype=object), 'utf8')
        expected2 = np.array([16428432627716348016], dtype=np.uint64)
        self.assert_numpy_array_equal(result2, expected2)

        result = hash_array(np.asarray(L, dtype=object), 'utf8')
        self.assert_numpy_array_equal(
            result, np.concatenate([expected1, expected2], axis=0))
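
The property behind test_categorical_consistency (GH15143) can also be seen directly in a short sketch; this assumes pandas >= 0.20 with hash_pandas_object available from pandas.util, and is only an illustration, not part of the test suite:

import pandas as pd
from pandas.util import hash_pandas_object

s = pd.Series(['a', 'b', 'c', 'd'])
c = s.astype('category')

# Hashes depend on the category values, not on the integer codes, so the
# categorical column hashes the same as the plain object column
h_values = hash_pandas_object(s, index=False)
h_categorical = hash_pandas_object(c, index=False)
assert (h_values == h_categorical).all()
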
Beispiel #46
0
    def train(self, file_name=DEFAULT_TRAINING_FILE):
        """
        Reads a training file in which the text has been tokenised into
        individual words and punctuation marks. Each line holds space-separated
        tokens of the form 'word/TAG', where TAG is the actual part of speech
        for that token (multiple tags are joined with '+').

        Args:
            file_name (str): The path to the file containing the training data
        """
        # Open the training file; abort early if it is missing or unreadable,
        # since nothing below can run without it
        if not os.path.exists(file_name):
            raise FileNotFoundError("No training file found at " + file_name + ".")
        try:
            training_file = open(file_name)
        except IOError as error:
            raise IOError("Unable to open the file at " + file_name + ".") from error

        # Pull probabilistic count data from this file
        tags = array_column(self.tags, "Tag")
        vocabulary = set()

        # Appending rows to a DataFrame copies memory each time, so do a first
        # pass to collect the unique vocabulary before allocating the matrices
        for line in training_file:
            # Ensure this doesn't just return an empty line
            line = line.strip()
            if len(line) > 0:
                # Parse line into 'observation/classification'
                words = line.split(' ')

                # Iterate over each word to get the word and classification
                for word in words:
                    # Separate into word & classification
                    context = word.rsplit('/', maxsplit=1)
                    word = context[0].lower().strip()
                    if word not in vocabulary:
                        vocabulary.add(word)

        # Prepare necessary data structures
        emission = DataFrame(index=vocabulary, columns=tags)
        transition = DataFrame(index=tags, columns=tags)
        emission.fillna(0, inplace=True)
        transition.fillna(0, inplace=True)
        last_class = None

        # Second pass: update the emission and transition counts
        training_file.seek(0)
        for line in training_file:
            # Ensure this doesn't just return an empty line
            line = line.strip()
            if len(line) > 0:
                # Parse line into 'observation/classification'
                words = line.split(' ')

                # Iterate over each word to get the word and classification
                for word in words:
                    # Separate into word & classification
                    context = word.rsplit('/', maxsplit=1)
                    word = context[0].lower().strip()
                    context_tags = context[1].split('+')

                    # Update the emission matrix
                    for context_tag in context_tags:
                        emission[context_tag][word] += 1

                    # Update the transition matrix
                    if last_class is not None:
                        for context_tag in context_tags:
                            for last_tag in last_class:
                                transition[last_tag][context_tag] += 1

                    # Update the last_class
                    last_class = context_tags

        # Pull the existing counts from the database and merge in the new ones
        cursor = self.connection.cursor()
        word_totals = {}
        tag_totals = {}
        for dest_tag, row in transition.iteritems():
            for origin_tag, occurrence in row.iteritems():
                # Retrieve the total-occurrence count for this tag if it has
                # not been loaded yet
                if origin_tag not in tag_totals:
                    cursor.execute('SELECT TotalOccurrences FROM Tags WHERE Tag = ?', (origin_tag.upper().strip(),))
                    tag_totals[origin_tag] = cursor.fetchone()
                    if tag_totals[origin_tag] is None:
                        cursor.execute('INSERT INTO Tags (Tag, TotalOccurrences) VALUES (?, ?)', (origin_tag.upper().strip(), 1))
                        tag_totals[origin_tag] = 1
                    else:
                        tag_totals[origin_tag] = int(tag_totals[origin_tag]['TotalOccurrences'])
                tag_totals[origin_tag] += int(occurrence)

                # Grab data for this specific transition
                cursor.execute('SELECT Occurrences FROM Transitions WHERE OriginTag = ? AND DestTag = ?', (origin_tag.upper().strip(), dest_tag))
                db_occurrence = cursor.fetchone()
                if db_occurrence is None:
                    # Insert the row, since it doesn't exist yet
                    cursor.execute('INSERT INTO Transitions (OriginTag, DestTag, Occurrences) VALUES (?, ?, ?)', (origin_tag.upper().strip(), dest_tag, 0))
                    db_occurrence = 0
                else:
                    db_occurrence = db_occurrence['Occurrences']

                # Update the data
                db_occurrence += int(occurrence)
                cursor.execute('UPDATE Transitions SET Occurrences = ? WHERE OriginTag = ? AND DestTag = ?', (int(db_occurrence), origin_tag.upper().strip(), dest_tag))

        for tag, row in emission.iteritems():
            for word, occurrence in row.iteritems():
                # Retrieve the total-occurrence count for this word if it has
                # not been loaded yet
                if word not in word_totals:
                    cursor.execute('SELECT TotalOccurrences FROM Words WHERE Word = ?', (word,))
                    word_totals[word] = cursor.fetchone()
                    if word_totals[word] is None:
                        cursor.execute('INSERT INTO Words (Word, TotalOccurrences) VALUES (?, ?)', (word, 1))
                        word_totals[word] = 1
                    else:
                        word_totals[word] = int(word_totals[word]['TotalOccurrences'])
                word_totals[word] += int(occurrence)

                # Grab data for this specific emission
                cursor.execute('SELECT Occurrences FROM Emissions WHERE Word = ? AND Tag = ?', (word, tag.upper().strip()))
                db_occurrence = cursor.fetchone()
                if db_occurrence is None:
                    # We need to add the entry
                    cursor.execute('INSERT INTO Emissions (Word, Tag, Occurrences) VALUES (?, ?, ?)', (word, tag.upper().strip(), 0))
                    db_occurrence = 0
                else:
                    db_occurrence = db_occurrence['Occurrences']

                # Update the data
                db_occurrence += int(occurrence)
                cursor.execute('UPDATE Emissions SET Occurrences = ? WHERE Word = ? AND Tag = ?', (int(db_occurrence), word, tag.upper().strip()))

        # Write the aggregated totals back to the database
        for word, occurrence in word_totals.items():
            cursor.execute('UPDATE Words SET TotalOccurrences = ? WHERE Word = ?', (int(occurrence), word))
        for tag, occurrence in tag_totals.items():
            cursor.execute('UPDATE Tags SET TotalOccurrences = ? WHERE Tag = ?', (int(occurrence), tag.upper().strip()))

        # Commit the changes and release the cursor and file
        cursor.close()
        self.connection.commit()
        training_file.close()
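
To make the 'word/TAG' format described in the train() docstring concrete, a minimal parsing sketch follows; the sample sentence and tag names are invented for illustration and are not taken from DEFAULT_TRAINING_FILE:

# Hypothetical training line: tokens are 'word/TAG', multiple tags joined by '+'
line = "the/AT dog/NN barked/VBD ./."

for token in line.strip().split(' '):
    word, classification = token.rsplit('/', maxsplit=1)
    tags = classification.split('+')
    print(word.lower().strip(), tags)
# prints: the ['AT'] / dog ['NN'] / barked ['VBD'] / . ['.']
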
Beispiel #47
0
class TestHashing(object):

    def setup_method(self, method):
        self.df = DataFrame(
            {'i32': np.array([1, 2, 3] * 3, dtype='int32'),
             'f32': np.array([None, 2.5, 3.5] * 3, dtype='float32'),
             'cat': Series(['a', 'b', 'c'] * 3).astype('category'),
             'obj': Series(['d', 'e', 'f'] * 3),
             'bool': np.array([True, False, True] * 3),
             'dt': Series(pd.date_range('20130101', periods=9)),
             'dt_tz': Series(pd.date_range('20130101', periods=9,
                                           tz='US/Eastern')),
             'td': Series(pd.timedelta_range('2000', periods=9))})

    def test_consistency(self):
        # check that our hash doesn't change because of a mistake
        # in the actual code; this is the ground truth
        result = hash_pandas_object(Index(['foo', 'bar', 'baz']))
        expected = Series(np.array([3600424527151052760, 1374399572096150070,
                                    477881037637427054], dtype='uint64'),
                          index=['foo', 'bar', 'baz'])
        tm.assert_series_equal(result, expected)

    def test_hash_array(self):
        for name, s in self.df.iteritems():
            a = s.values
            tm.assert_numpy_array_equal(hash_array(a), hash_array(a))

    def test_hash_array_mixed(self):
        result1 = hash_array(np.array([3, 4, 'All']))
        result2 = hash_array(np.array(['3', '4', 'All']))
        result3 = hash_array(np.array([3, 4, 'All'], dtype=object))
        tm.assert_numpy_array_equal(result1, result2)
        tm.assert_numpy_array_equal(result1, result3)

    def test_hash_array_errors(self):

        for val in [5, 'foo', pd.Timestamp('20130101')]:
            pytest.raises(TypeError, hash_array, val)

    def check_equal(self, obj, **kwargs):
        a = hash_pandas_object(obj, **kwargs)
        b = hash_pandas_object(obj, **kwargs)
        tm.assert_series_equal(a, b)

        kwargs.pop('index', None)
        a = hash_pandas_object(obj, **kwargs)
        b = hash_pandas_object(obj, **kwargs)
        tm.assert_series_equal(a, b)

    def check_not_equal_with_index(self, obj):

        # check that we are not hashing the same if
        # we include the index
        if not isinstance(obj, Index):
            a = hash_pandas_object(obj, index=True)
            b = hash_pandas_object(obj, index=False)
            if len(obj):
                assert not (a == b).all()

    def test_hash_tuples(self):
        tups = [(1, 'one'), (1, 'two'), (2, 'one')]
        result = hash_tuples(tups)
        expected = hash_pandas_object(MultiIndex.from_tuples(tups)).values
        tm.assert_numpy_array_equal(result, expected)

        result = hash_tuples(tups[0])
        assert result == expected[0]

    def test_hash_tuple(self):
        # test equivalence between hash_tuples and hash_tuple
        for tup in [(1, 'one'), (1, np.nan), (1.0, pd.NaT, 'A'),
                    ('A', pd.Timestamp("2012-01-01"))]:
            result = hash_tuple(tup)
            expected = hash_tuples([tup])[0]
            assert result == expected

    def test_hash_scalar(self):
        for val in [1, 1.4, 'A', b'A', u'A', pd.Timestamp("2012-01-01"),
                    pd.Timestamp("2012-01-01", tz='Europe/Brussels'),
                    datetime.datetime(2012, 1, 1),
                    pd.Timestamp("2012-01-01", tz='EST').to_pydatetime(),
                    pd.Timedelta('1 days'), datetime.timedelta(1),
                    pd.Period('2012-01-01', freq='D'), pd.Interval(0, 1),
                    np.nan, pd.NaT, None]:
            result = _hash_scalar(val)
            expected = hash_array(np.array([val], dtype=object),
                                  categorize=True)
            assert result[0] == expected[0]

    def test_hash_tuples_err(self):

        for val in [5, 'foo', pd.Timestamp('20130101')]:
            pytest.raises(TypeError, hash_tuples, val)

    def test_multiindex_unique(self):
        mi = MultiIndex.from_tuples([(118, 472), (236, 118),
                                     (51, 204), (102, 51)])
        assert mi.is_unique
        result = hash_pandas_object(mi)
        assert result.is_unique

    def test_multiindex_objects(self):
        mi = MultiIndex(levels=[['b', 'd', 'a'], [1, 2, 3]],
                        labels=[[0, 1, 0, 2], [2, 0, 0, 1]],
                        names=['col1', 'col2'])
        recons = mi._sort_levels_monotonic()

        # these are equal
        assert mi.equals(recons)
        assert Index(mi.values).equals(Index(recons.values))

        # _hashed_values and hash_pandas_object(..., index=False)
        # equivalency
        expected = hash_pandas_object(
            mi, index=False).values
        result = mi._hashed_values
        tm.assert_numpy_array_equal(result, expected)

        expected = hash_pandas_object(
            recons, index=False).values
        result = recons._hashed_values
        tm.assert_numpy_array_equal(result, expected)

        expected = mi._hashed_values
        result = recons._hashed_values

        # values should match, but in different order
        tm.assert_numpy_array_equal(np.sort(result),
                                    np.sort(expected))

    def test_hash_pandas_object(self):

        for obj in [Series([1, 2, 3]),
                    Series([1.0, 1.5, 3.2]),
                    Series([1.0, 1.5, np.nan]),
                    Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]),
                    Series(['a', 'b', 'c']),
                    Series(['a', np.nan, 'c']),
                    Series(['a', None, 'c']),
                    Series([True, False, True]),
                    Series(),
                    Index([1, 2, 3]),
                    Index([True, False, True]),
                    DataFrame({'x': ['a', 'b', 'c'], 'y': [1, 2, 3]}),
                    DataFrame(),
                    tm.makeMissingDataframe(),
                    tm.makeMixedDataFrame(),
                    tm.makeTimeDataFrame(),
                    tm.makeTimeSeries(),
                    tm.makeTimedeltaIndex(),
                    tm.makePeriodIndex(),
                    Series(tm.makePeriodIndex()),
                    Series(pd.date_range('20130101',
                                         periods=3, tz='US/Eastern')),
                    MultiIndex.from_product(
                        [range(5),
                         ['foo', 'bar', 'baz'],
                         pd.date_range('20130101', periods=2)]),
                    MultiIndex.from_product(
                        [pd.CategoricalIndex(list('aabc')),
                         range(3)])]:
            self.check_equal(obj)
            self.check_not_equal_with_index(obj)

    def test_hash_pandas_object2(self):
        for name, s in self.df.iteritems():
            self.check_equal(s)
            self.check_not_equal_with_index(s)

    def test_hash_pandas_empty_object(self):
        for obj in [Series([], dtype='float64'),
                    Series([], dtype='object'),
                    Index([])]:
            self.check_equal(obj)

            # these are by definition the same with
            # or without the index as the data is empty

    def test_categorical_consistency(self):
        # GH15143
        # Check that categoricals hash consistently with their values, not their codes
        # This should work for categoricals of any dtype
        for s1 in [Series(['a', 'b', 'c', 'd']),
                   Series([1000, 2000, 3000, 4000]),
                   Series(pd.date_range(0, periods=4))]:
            s2 = s1.astype('category').cat.set_categories(s1)
            s3 = s2.cat.set_categories(list(reversed(s1)))
            for categorize in [True, False]:
                # These should all hash identically
                h1 = hash_pandas_object(s1, categorize=categorize)
                h2 = hash_pandas_object(s2, categorize=categorize)
                h3 = hash_pandas_object(s3, categorize=categorize)
                tm.assert_series_equal(h1, h2)
                tm.assert_series_equal(h1, h3)

    def test_categorical_with_nan_consistency(self):
        c = pd.Categorical.from_codes(
            [-1, 0, 1, 2, 3, 4],
            categories=pd.date_range('2012-01-01', periods=5, name='B'))
        expected = hash_array(c, categorize=False)
        c = pd.Categorical.from_codes(
            [-1, 0],
            categories=[pd.Timestamp('2012-01-01')])
        result = hash_array(c, categorize=False)
        assert result[0] in expected
        assert result[1] in expected

    def test_pandas_errors(self):

        for obj in [pd.Timestamp('20130101')]:
            with pytest.raises(TypeError):
                hash_pandas_object(obj)

        with catch_warnings(record=True):
            obj = tm.makePanel()
        with pytest.raises(TypeError):
            hash_pandas_object(obj)

    def test_hash_keys(self):
        # using different hash keys should produce different hashes
        # for the same data

        # this only matters for object dtypes
        obj = Series(list('abc'))
        a = hash_pandas_object(obj, hash_key='9876543210123456')
        b = hash_pandas_object(obj, hash_key='9876543210123465')
        assert (a != b).all()

    def test_invalid_key(self):
        # this only matters for object dtypes
        def f():
            hash_pandas_object(Series(list('abc')), hash_key='foo')
        pytest.raises(ValueError, f)

    def test_already_encoded(self):
        # if already encoded then ok

        obj = Series(list('abc')).str.encode('utf8')
        self.check_equal(obj)

    def test_alternate_encoding(self):

        obj = Series(list('abc'))
        self.check_equal(obj, encoding='ascii')

    def test_same_len_hash_collisions(self):

        for l in range(8):
            length = 2**(l + 8) + 1
            s = tm.rands_array(length, 2)
            result = hash_array(s, 'utf8')
            assert not result[0] == result[1]

        for l in range(8):
            length = 2**(l + 8)
            s = tm.rands_array(length, 2)
            result = hash_array(s, 'utf8')
            assert not result[0] == result[1]

    def test_hash_collisions(self):

        # hash collisions are bad
        # https://github.com/pandas-dev/pandas/issues/14711#issuecomment-264885726
        L = ['Ingrid-9Z9fKIZmkO7i7Cn51Li34pJm44fgX6DYGBNj3VPlOH50m7HnBlPxfIwFMrcNJNMP6PSgLmwWnInciMWrCSAlLEvt7JkJl4IxiMrVbXSa8ZQoVaq5xoQPjltuJEfwdNlO6jo8qRRHvD8sBEBMQASrRa6TsdaPTPCBo3nwIBpE7YzzmyH0vMBhjQZLx1aCT7faSEx7PgFxQhHdKFWROcysamgy9iVj8DO2Fmwg1NNl93rIAqC3mdqfrCxrzfvIY8aJdzin2cHVzy3QUJxZgHvtUtOLxoqnUHsYbNTeq0xcLXpTZEZCxD4PGubIuCNf32c33M7HFsnjWSEjE2yVdWKhmSVodyF8hFYVmhYnMCztQnJrt3O8ZvVRXd5IKwlLexiSp4h888w7SzAIcKgc3g5XQJf6MlSMftDXm9lIsE1mJNiJEv6uY6pgvC3fUPhatlR5JPpVAHNSbSEE73MBzJrhCAbOLXQumyOXigZuPoME7QgJcBalliQol7YZ9',  # noqa
             'Tim-b9MddTxOWW2AT1Py6vtVbZwGAmYCjbp89p8mxsiFoVX4FyDOF3wFiAkyQTUgwg9sVqVYOZo09Dh1AzhFHbgij52ylF0SEwgzjzHH8TGY8Lypart4p4onnDoDvVMBa0kdthVGKl6K0BDVGzyOXPXKpmnMF1H6rJzqHJ0HywfwS4XYpVwlAkoeNsiicHkJUFdUAhG229INzvIAiJuAHeJDUoyO4DCBqtoZ5TDend6TK7Y914yHlfH3g1WZu5LksKv68VQHJriWFYusW5e6ZZ6dKaMjTwEGuRgdT66iU5nqWTHRH8WSzpXoCFwGcTOwyuqPSe0fTe21DVtJn1FKj9F9nEnR9xOvJUO7E0piCIF4Ad9yAIDY4DBimpsTfKXCu1vdHpKYerzbndfuFe5AhfMduLYZJi5iAw8qKSwR5h86ttXV0Mc0QmXz8dsRvDgxjXSmupPxBggdlqUlC828hXiTPD7am0yETBV0F3bEtvPiNJfremszcV8NcqAoARMe']  # noqa

        # these should be different!
        result1 = hash_array(np.asarray(L[0:1], dtype=object), 'utf8')
        expected1 = np.array([14963968704024874985], dtype=np.uint64)
        tm.assert_numpy_array_equal(result1, expected1)

        result2 = hash_array(np.asarray(L[1:2], dtype=object), 'utf8')
        expected2 = np.array([16428432627716348016], dtype=np.uint64)
        tm.assert_numpy_array_equal(result2, expected2)

        result = hash_array(np.asarray(L, dtype=object), 'utf8')
        tm.assert_numpy_array_equal(
            result, np.concatenate([expected1, expected2], axis=0))
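
The equality checked by test_hash_array_mixed above is largely a NumPy coercion effect: np.array([3, 4, 'All']) is promoted to a unicode string dtype before anything is hashed. A small sketch, assuming pandas >= 0.20 where hash_array is public in pandas.util:

import numpy as np
from pandas.util import hash_array

mixed = np.array([3, 4, 'All'])       # NumPy coerces the integers to strings
strings = np.array(['3', '4', 'All'])

print(mixed.dtype)                    # a unicode dtype such as <U21
assert (hash_array(mixed) == hash_array(strings)).all()
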