Example #1
class Iteration(object):

    def setup(self):
        N = 1000
        self.df = DataFrame(np.random.randn(N * 10, N))
        self.df2 = DataFrame(np.random.randn(N * 50, 10))
        self.df3 = DataFrame(np.random.randn(N, 5 * N),
                             columns=['C' + str(c) for c in range(N * 5)])

    def time_iteritems(self):
        # (monitor no-copying behaviour)
        if hasattr(self.df, '_item_cache'):
            self.df._item_cache.clear()
        for name, col in self.df.iteritems():
            pass

    def time_iteritems_cached(self):
        for name, col in self.df.iteritems():
            pass

    def time_iteritems_indexing(self):
        for col in self.df3:
            self.df3[col]

    def time_itertuples(self):
        for row in self.df2.itertuples():
            pass

    def time_iterrows(self):
        for row in self.df.iterrows():
            pass
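
Note: DataFrame.iteritems() was deprecated in pandas 1.5 and removed in 2.0; DataFrame.items() is the drop-in replacement. A minimal sketch of the benchmark loop in the modern spelling:

import numpy as np
from pandas import DataFrame

df = DataFrame(np.random.randn(100, 10))

# items() yields the same (column_name, Series) pairs that iteritems() did.
for name, col in df.items():
    pass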
Example #2
    def get_date_trend(self, mode_date):
        """
        :param mode_date: date mode; aggregate down to the shortest time unit. 0-day, 1-week, 2-month, 3-Quarter. (default 2)
        """
        axisLabels = self.oriDate[:]
        pointVals = [{copy.deepcopy(oriValue): 1} for oriValue in self.oriValues]

        rule_mode = {'0': 'D', '1': 'W', '2': 'M', '3': 'Q'}

        df = DataFrame(pointVals, index=axisLabels)
        df = df.resample(rule_mode[str(mode_date)], how='sum')
        df = df.fillna(0)

        """各项总和"""
        # cols_name = []
        # for name, col in df.iteritems():
        #     cols_name.append(name)
        # df['SUM'] = 0
        # for i in xrange(len(cols_name)):
        #     df['SUM'] += df[cols_name[i]]

        """宿舍比重"""
        # df['PER_DORM'] = df['dorm']/df['SUM'] if 'dorm' in df else 0  # 仅当存在宿舍值时才计算宿舍比重,否则设为0

        axisLabels = map(lambda x: x.strftime('%Y-%m-%d'), df.index.tolist())  # turn the dates used as the DataFrame index into a list of labels
        seriesData = []
        legendLabels = []
        for colName, col in df.iteritems():
            legendLabels.append(colName)
            data = map(lambda x: 0.0 if isnan(x) else float(x), col.tolist())
            seriesData.append({'name': colName, 'data': data})

        json_dateTrend = {'axisLabels': axisLabels, 'legendLabels': legendLabels, 'seriesData': seriesData}
        return json_dateTrend
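
The how='sum' argument to resample() shown above is a long-removed API; since pandas 0.18 the aggregation is a method on the resampler. A minimal sketch of the same roll-up on toy data:

import pandas as pd

df = pd.DataFrame({'dorm': [1, 1, 1]},
                  index=pd.to_datetime(['2020-01-01', '2020-01-02', '2020-02-01']))

rule_mode = {'0': 'D', '1': 'W', '2': 'M', '3': 'Q'}
out = df.resample(rule_mode['2']).sum().fillna(0)  # .sum() replaces how='sum'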
Example #3
def ac_time(userID, startDate, endDate):
    # Last month.
    # Time distribution: dorm check-ins from 6:00 to 7:00.
    # Check-ins from 23:00 to 5:00: ACPeriodCate.

    from GetJson_ACPeriodCate import GetJson_ACPeriodCate

    json_ACPeriodCate = GetJson_ACPeriodCate(userID, 2, startDate, endDate)

    if "errMsg" in json_ACPeriodCate:
        return {"count_early": -1, "count_night": -1}

    timeDistri = json_ACPeriodCate["json_timeDistribution"]
    dict_vals = {}
    for item in timeDistri["seriesData"]:
        dict_vals[item["name"]] = item["data"]

    df = DataFrame(dict_vals, index=range(24))

    df["SUM"] = 0
    for col, vals in df.iteritems():
        if col == "SUM":
            break
        df["SUM"] += vals

    count_early = df.loc[6]["dorm"] if "dorm" in df else 0  # dorm value at 6:00, i.e. the early-rise count
    count_night = sum(df.loc[0:6]["SUM"].tolist()) + df.loc[23]["SUM"]  # total access events from 23:00 to 5:00

    return {"count_early": count_early, "count_night": count_night}
Example #4
    def fill_old(self, df, year = None):
        """
        Takes an age, sex profile (per capita transfers) in df 
        to fill year 'year' or all years if year is None
        """
        if isinstance(df, DataFrame):
            df1  = df 
        else:
            df1 = DataFrame(df)

        for col_name in df1.columns:
            if col_name not in self._types:
                self.new_type(col_name)

        if year is None:
            for yr in sorted(self.index_sets['year']):
                self.fill(df, year=yr)
        else:
            yr = year
            
            for col_name, column in df1.iteritems():
                column = column.reset_index()
                column['year'] = yr
                column = column.set_index(['age','sex','year'])
                self.update(column)
Example #5
def rolling_mean(data, window, min_periods=1, center=False):
    ''' Function that computes a rolling mean

    Parameters
    ----------
    data : DataFrame or Series
           If a DataFrame is passed, the rolling_mean is computed for all columns.
    window : int or string
             If int is passed, window is the number of observations used for calculating 
             the statistic, as defined by the function pd.rolling_mean()
             If a string is passed, it must be a frequency string, e.g. '90S'. This is
             internally converted into a DateOffset object, representing the window size.
    min_periods : int
                  Minimum number of observations in window required to have a value.

    Returns
    -------
    Series, or DataFrame if more than one column
    '''
    def f(x):
        '''Function applied per label that actually computes the rolling mean.

        Note: `col` is a closure variable bound in the loop over
        data.iteritems() below.
        '''
        if not center:
            # add a microsecond because when slicing with labels both the
            # start and the end point are inclusive
            dslice = col[x - pd.datetools.to_offset(window).delta + timedelta(0, 0, 1):x]
        else:
            dslice = col[x - pd.datetools.to_offset(window).delta / 2 + timedelta(0, 0, 1):
                         x + pd.datetools.to_offset(window).delta / 2]
        if dslice.size < min_periods:
            return np.nan
        else:
            return dslice.mean()

    data = DataFrame(data.copy())
    dfout = DataFrame()
    if isinstance(window, int):
        dfout = pd.rolling_mean(data, window, min_periods=min_periods, center=center)
    elif isinstance(window, basestring):
        idx = Series(data.index.to_pydatetime(), index=data.index)
        for colname, col in data.iteritems():
            result = idx.apply(f)
            result.name = colname
            dfout = dfout.join(result, how='outer')
    if dfout.columns.size == 1:
        dfout = dfout.ix[:,0]
    return dfout
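
Since pandas 0.19, rolling() itself accepts an offset string for a trailing time-based window on a datetime index, which covers the non-centered case this helper hand-rolls. A sketch on toy data:

import numpy as np
import pandas as pd

idx = pd.date_range('2020-01-01', periods=10, freq='30s')
s = pd.Series(np.arange(10.0), index=idx)

# Trailing 90-second window; min_periods plays the same role as above.
out = s.rolling('90s', min_periods=1).mean()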
Example #6
    def test_sequence_like_with_categorical(self):

        # GH 7839
        # make sure can iterate
        df = DataFrame({"id": [1, 2, 3, 4, 5, 6],
                        "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']})
        df['grade'] = Categorical(df['raw_grade'])

        # basic sequencing testing
        result = list(df.grade.values)
        expected = np.array(df.grade.values).tolist()
        tm.assert_almost_equal(result, expected)

        # iteration
        for t in df.itertuples(index=False):
            str(t)

        for row, s in df.iterrows():
            str(s)

        for c, col in df.iteritems():
            str(col)
Example #7
    def get_time_distribution(self):
        dates = self.oriDate[:]
        values = [{copy.deepcopy(oriValue): 1} for oriValue in self.oriValues]

        # Build the lists of time points and their display labels.
        periods = []
        axisLabels = []
        for i in xrange(24):
            periods.append(time(i))
            axisLabels.append(str(i) + u'点~' + str((i + 1) % 24) + u'点')

        # Turn the list of time points into a list of time ranges.
        periodRanges = []
        for i in xrange(len(periods)):
            periodRange = [periods[i], periods[(i + 1) % len(periods)]]
            periodRanges.append(periodRange)

        lTimes = map(lambda d: d.time(), dates)  # Keep time.
        vals = []  # Init vals
        for i in xrange(len(periods)):
            vals.append({})

        # Add each event to the bucket whose range contains its time.
        for i in xrange(len(lTimes)):
            for j in xrange(len(periodRanges)):
                start, end = periodRanges[j]
                # The last range wraps past midnight (23:00 -> 0:00), so test it separately.
                if (start <= lTimes[i] < end) or (end <= start <= lTimes[i]):
                    vals[j] = helpers.mergeDict(vals[j], values[i])

        df = DataFrame(vals)

        seriesData = []
        legendLabels = []
        for colName, col in df.iteritems():
            legendLabels.append(colName)
            data = map(lambda x: 0 if isnan(x) else int(x), col.tolist())
            seriesData.append({'name': colName, 'data': data})

        json_timeDistribution = {'axisLabels': axisLabels, 'legendLabels': legendLabels, 'seriesData': seriesData}
        return json_timeDistribution
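
For reference, the hour-of-day bucketing built by hand above can be computed directly from timestamps; a minimal sketch with hypothetical toy data:

import pandas as pd

times = pd.to_datetime(['2020-01-01 06:15', '2020-01-01 06:45', '2020-01-01 23:10'])
counts = pd.Series(1, index=times).groupby(times.hour).sum()
# counts is indexed by hour of day: 6 -> 2, 23 -> 1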
Example #8
def upsert_unique_indices(apps, schema_editor):
    datapoint_values_list = ['id','created_at','indicator_id','location_id','campaign_id','data_date']
    historical_dps = DataFrame(list(DataPoint.objects.all()\
        .values_list('id','created_at','indicator_id','location_id','campaign_id','data_date')), columns=datapoint_values_list)
    # create the unique index
    historical_dps = historical_dps.apply(add_unique_index, axis=1)

    # group by and max on created at, get the most recent upload
    historical_dps = historical_dps.sort("created_at", ascending=False).groupby("unique_index", as_index=False).first()

    # get the ids into a list and select them
    dps_to_update = DataPoint.objects.filter(id__in=list(historical_dps['id']))
    print 'dps to update'
    print len(dps_to_update)
    # then run a query and update each
    for dp in dps_to_update:
        unique_index = historical_dps[historical_dps['id'] == dp.id].iloc[0]['unique_index']
        dp.unique_index = unique_index
        dp.save()
    
    # delete all the other duplicates
    dps_to_delete = DataPoint.objects.all().exclude(id__in=list(historical_dps['id']))
    print 'dps_to_delete'
    print len(dps_to_delete)
    dps_to_delete.delete()


    dataframe_columns = ['id','created_at','indicator_id','location_id','campaign_id','data_date', 'unique_index']
    
    # make sure there aren't duplicate dps now.
    all_dps = DataFrame(list(DataPoint.objects.all()\
        .values_list('unique_index')), columns=['unique_index'])

    all_dps = all_dps.groupby('unique_index').size()

    for idx, dp in all_dps.iteritems():
        if dp != 1:
            raise Exception("there are duplicate datapoints")
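
The final duplicate check loops over the groupby result; pandas can answer the same question in one call on the unique_index column built just above. A sketch with toy data:

import pandas as pd

all_dps = pd.DataFrame({'unique_index': ['1-2-3', '1-2-4', '1-2-3']})  # toy data
# duplicated() flags every repeat of a value after its first occurrence.
assert all_dps['unique_index'].duplicated().any()  # '1-2-3' appears twice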
Example #9
def run_clinical_real(cancer, clinical, data_path, gene_sets,
                      survival_tests, real_variables, binary_variables,
                      data_type='expression', drop_pc=False):
    
    if data_type == 'expression':
        data_matrix = read_rnaSeq(cancer, data_path)
        data_matrix = data_matrix.groupby(by=lambda n: n.split('|')[0]).mean()
    elif data_type == 'expression_array':
        data_matrix = read_mrna(cancer, data_path)
    elif data_type == 'methylation':
        data_matrix = read_methylation(cancer, data_path)
    if drop_pc:
        data_matrix = drop_first_norm_pc(data_matrix)
    pc = dict((p, extract_pc(data_matrix.ix[g])) for p, g in 
              gene_sets.iteritems())
    pc = DataFrame(dict((p, (v - v.mean()) / v.std()) for p, v in pc.iteritems()
                        if v is not None)).T
    #clinical['pc'] = extract_pc(data_matrix.dropna(), pc_threshold=0)
    tests  = get_tests(clinical, survival_tests, real_variables, 
                       binary_variables, var_type='real')
    #return locals()
    p_pathways, q_pathways = run_tests(tests, pc)
    return locals()
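
Note that gene_sets.iteritems() and pc.iteritems() here are Python 2 dict methods; under Python 3 the same comprehensions use .items(). A sketch of the normalization step in Python 3 form:

import pandas as pd

pc = {'p1': pd.Series([1.0, 2.0, 3.0]), 'p2': pd.Series([4.0, 5.0, 6.0])}
norm = pd.DataFrame({p: (v - v.mean()) / v.std()
                     for p, v in pc.items() if v is not None}).T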
Example #10
class Iteration:
    # mem_itertuples_* benchmarks are slow
    timeout = 120

    def setup(self):
        N = 1000
        self.df = DataFrame(np.random.randn(N * 10, N))
        self.df2 = DataFrame(np.random.randn(N * 50, 10))
        self.df3 = DataFrame(np.random.randn(N, 5 * N),
                             columns=['C' + str(c) for c in range(N * 5)])
        self.df4 = DataFrame(np.random.randn(N * 1000, 10))

    def time_iteritems(self):
        # (monitor no-copying behaviour)
        if hasattr(self.df, '_item_cache'):
            self.df._item_cache.clear()
        for name, col in self.df.iteritems():
            pass

    def time_iteritems_cached(self):
        for name, col in self.df.iteritems():
            pass

    def time_iteritems_indexing(self):
        for col in self.df3:
            self.df3[col]

    def time_itertuples_start(self):
        self.df4.itertuples()

    def time_itertuples_read_first(self):
        next(self.df4.itertuples())

    def time_itertuples(self):
        for row in self.df4.itertuples():
            pass

    def time_itertuples_to_list(self):
        list(self.df4.itertuples())

    def mem_itertuples_start(self):
        return self.df4.itertuples()

    def peakmem_itertuples_start(self):
        self.df4.itertuples()

    def mem_itertuples_read_first(self):
        return next(self.df4.itertuples())

    def peakmem_itertuples(self):
        for row in self.df4.itertuples():
            pass

    def mem_itertuples_to_list(self):
        return list(self.df4.itertuples())

    def peakmem_itertuples_to_list(self):
        list(self.df4.itertuples())

    def time_itertuples_raw_start(self):
        self.df4.itertuples(index=False, name=None)

    def time_itertuples_raw_read_first(self):
        next(self.df4.itertuples(index=False, name=None))

    def time_itertuples_raw_tuples(self):
        for row in self.df4.itertuples(index=False, name=None):
            pass

    def time_itertuples_raw_tuples_to_list(self):
        list(self.df4.itertuples(index=False, name=None))

    def mem_itertuples_raw_start(self):
        return self.df4.itertuples(index=False, name=None)

    def peakmem_itertuples_raw_start(self):
        self.df4.itertuples(index=False, name=None)

    def peakmem_itertuples_raw_read_first(self):
        next(self.df4.itertuples(index=False, name=None))

    def peakmem_itertuples_raw(self):
        for row in self.df4.itertuples(index=False, name=None):
            pass

    def mem_itertuples_raw_to_list(self):
        return list(self.df4.itertuples(index=False, name=None))

    def peakmem_itertuples_raw_to_list(self):
        list(self.df4.itertuples(index=False, name=None))

    def time_iterrows(self):
        for row in self.df.iterrows():
            pass
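
The split between the plain and the *_raw benchmarks reflects a real cost: with index=False and name=None, itertuples() skips constructing named tuples and yields plain tuples, which is markedly faster on wide frames. A minimal sketch of the two spellings:

import numpy as np
from pandas import DataFrame

df = DataFrame(np.random.randn(1000, 10))

rows_named = list(df.itertuples())                      # Pandas(Index=0, _1=..., ...)
rows_raw = list(df.itertuples(index=False, name=None))  # plain tuples, less overhead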
Example #11
class TestHashing(object):

    def setup_method(self, method):
        self.df = DataFrame(
            {'i32': np.array([1, 2, 3] * 3, dtype='int32'),
             'f32': np.array([None, 2.5, 3.5] * 3, dtype='float32'),
             'cat': Series(['a', 'b', 'c'] * 3).astype('category'),
             'obj': Series(['d', 'e', 'f'] * 3),
             'bool': np.array([True, False, True] * 3),
             'dt': Series(pd.date_range('20130101', periods=9)),
             'dt_tz': Series(pd.date_range('20130101', periods=9,
                                           tz='US/Eastern')),
             'td': Series(pd.timedelta_range('2000', periods=9))})

    def test_consistency(self):
        # check that our hash doesn't change because of a mistake
        # in the actual code; this is the ground truth
        result = hash_pandas_object(Index(['foo', 'bar', 'baz']))
        expected = Series(np.array([3600424527151052760, 1374399572096150070,
                                    477881037637427054], dtype='uint64'),
                          index=['foo', 'bar', 'baz'])
        tm.assert_series_equal(result, expected)

    def test_hash_array(self):
        for name, s in self.df.iteritems():
            a = s.values
            tm.assert_numpy_array_equal(hash_array(a), hash_array(a))

    def test_hash_array_mixed(self):
        result1 = hash_array(np.array([3, 4, 'All']))
        result2 = hash_array(np.array(['3', '4', 'All']))
        result3 = hash_array(np.array([3, 4, 'All'], dtype=object))
        tm.assert_numpy_array_equal(result1, result2)
        tm.assert_numpy_array_equal(result1, result3)

    def test_hash_array_errors(self):

        for val in [5, 'foo', pd.Timestamp('20130101')]:
            pytest.raises(TypeError, hash_array, val)

    def check_equal(self, obj, **kwargs):
        a = hash_pandas_object(obj, **kwargs)
        b = hash_pandas_object(obj, **kwargs)
        tm.assert_series_equal(a, b)

        kwargs.pop('index', None)
        a = hash_pandas_object(obj, **kwargs)
        b = hash_pandas_object(obj, **kwargs)
        tm.assert_series_equal(a, b)

    def check_not_equal_with_index(self, obj):

        # check that we are not hashing the same if
        # we include the index
        if not isinstance(obj, Index):
            a = hash_pandas_object(obj, index=True)
            b = hash_pandas_object(obj, index=False)
            if len(obj):
                assert not (a == b).all()

    def test_hash_tuples(self):
        tups = [(1, 'one'), (1, 'two'), (2, 'one')]
        result = hash_tuples(tups)
        expected = hash_pandas_object(MultiIndex.from_tuples(tups)).values
        tm.assert_numpy_array_equal(result, expected)

        result = hash_tuples(tups[0])
        assert result == expected[0]

    def test_hash_tuple(self):
        # test equivalence between hash_tuples and hash_tuple
        for tup in [(1, 'one'), (1, np.nan), (1.0, pd.NaT, 'A'),
                    ('A', pd.Timestamp("2012-01-01"))]:
            result = hash_tuple(tup)
            expected = hash_tuples([tup])[0]
            assert result == expected

    def test_hash_scalar(self):
        for val in [1, 1.4, 'A', b'A', u'A', pd.Timestamp("2012-01-01"),
                    pd.Timestamp("2012-01-01", tz='Europe/Brussels'),
                    datetime.datetime(2012, 1, 1),
                    pd.Timestamp("2012-01-01", tz='EST').to_pydatetime(),
                    pd.Timedelta('1 days'), datetime.timedelta(1),
                    pd.Period('2012-01-01', freq='D'), pd.Interval(0, 1),
                    np.nan, pd.NaT, None]:
            result = _hash_scalar(val)
            expected = hash_array(np.array([val], dtype=object),
                                  categorize=True)
            assert result[0] == expected[0]

    def test_hash_tuples_err(self):

        for val in [5, 'foo', pd.Timestamp('20130101')]:
            pytest.raises(TypeError, hash_tuples, val)

    def test_multiindex_unique(self):
        mi = MultiIndex.from_tuples([(118, 472), (236, 118),
                                     (51, 204), (102, 51)])
        assert mi.is_unique
        result = hash_pandas_object(mi)
        assert result.is_unique

    def test_multiindex_objects(self):
        mi = MultiIndex(levels=[['b', 'd', 'a'], [1, 2, 3]],
                        labels=[[0, 1, 0, 2], [2, 0, 0, 1]],
                        names=['col1', 'col2'])
        recons = mi._sort_levels_monotonic()

        # these are equal
        assert mi.equals(recons)
        assert Index(mi.values).equals(Index(recons.values))

        # _hashed_values and hash_pandas_object(..., index=False)
        # equivalency
        expected = hash_pandas_object(
            mi, index=False).values
        result = mi._hashed_values
        tm.assert_numpy_array_equal(result, expected)

        expected = hash_pandas_object(
            recons, index=False).values
        result = recons._hashed_values
        tm.assert_numpy_array_equal(result, expected)

        expected = mi._hashed_values
        result = recons._hashed_values

        # values should match, but in different order
        tm.assert_numpy_array_equal(np.sort(result),
                                    np.sort(expected))

    def test_hash_pandas_object(self):

        for obj in [Series([1, 2, 3]),
                    Series([1.0, 1.5, 3.2]),
                    Series([1.0, 1.5, np.nan]),
                    Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]),
                    Series(['a', 'b', 'c']),
                    Series(['a', np.nan, 'c']),
                    Series(['a', None, 'c']),
                    Series([True, False, True]),
                    Series(),
                    Index([1, 2, 3]),
                    Index([True, False, True]),
                    DataFrame({'x': ['a', 'b', 'c'], 'y': [1, 2, 3]}),
                    DataFrame(),
                    tm.makeMissingDataframe(),
                    tm.makeMixedDataFrame(),
                    tm.makeTimeDataFrame(),
                    tm.makeTimeSeries(),
                    tm.makeTimedeltaIndex(),
                    tm.makePeriodIndex(),
                    Series(tm.makePeriodIndex()),
                    Series(pd.date_range('20130101',
                                         periods=3, tz='US/Eastern')),
                    MultiIndex.from_product(
                        [range(5),
                         ['foo', 'bar', 'baz'],
                         pd.date_range('20130101', periods=2)]),
                    MultiIndex.from_product(
                        [pd.CategoricalIndex(list('aabc')),
                         range(3)])]:
            self.check_equal(obj)
            self.check_not_equal_with_index(obj)

    def test_hash_pandas_object2(self):
        for name, s in self.df.iteritems():
            self.check_equal(s)
            self.check_not_equal_with_index(s)

    def test_hash_pandas_empty_object(self):
        for obj in [Series([], dtype='float64'),
                    Series([], dtype='object'),
                    Index([])]:
            self.check_equal(obj)

            # these are by-definition the same with
            # or w/o the index as the data is empty

    def test_categorical_consistency(self):
        # GH15143
        # Check that categoricals hash consistent with their values, not codes
        # This should work for categoricals of any dtype
        for s1 in [Series(['a', 'b', 'c', 'd']),
                   Series([1000, 2000, 3000, 4000]),
                   Series(pd.date_range(0, periods=4))]:
            s2 = s1.astype('category').cat.set_categories(s1)
            s3 = s2.cat.set_categories(list(reversed(s1)))
            for categorize in [True, False]:
                # These should all hash identically
                h1 = hash_pandas_object(s1, categorize=categorize)
                h2 = hash_pandas_object(s2, categorize=categorize)
                h3 = hash_pandas_object(s3, categorize=categorize)
                tm.assert_series_equal(h1, h2)
                tm.assert_series_equal(h1, h3)

    def test_categorical_with_nan_consistency(self):
        c = pd.Categorical.from_codes(
            [-1, 0, 1, 2, 3, 4],
            categories=pd.date_range('2012-01-01', periods=5, name='B'))
        expected = hash_array(c, categorize=False)
        c = pd.Categorical.from_codes(
            [-1, 0],
            categories=[pd.Timestamp('2012-01-01')])
        result = hash_array(c, categorize=False)
        assert result[0] in expected
        assert result[1] in expected

    def test_pandas_errors(self):

        for obj in [pd.Timestamp('20130101')]:
            with pytest.raises(TypeError):
                hash_pandas_object(obj)

        with catch_warnings(record=True):
            obj = tm.makePanel()
        with pytest.raises(TypeError):
            hash_pandas_object(obj)

    def test_hash_keys(self):
        # using different hash keys, should have different hashes
        # for the same data

        # this only matters for object dtypes
        obj = Series(list('abc'))
        a = hash_pandas_object(obj, hash_key='9876543210123456')
        b = hash_pandas_object(obj, hash_key='9876543210123465')
        assert (a != b).all()

    def test_invalid_key(self):
        # this only matters for object dtypes
        def f():
            hash_pandas_object(Series(list('abc')), hash_key='foo')
        pytest.raises(ValueError, f)

    def test_already_encoded(self):
        # if already encoded then ok

        obj = Series(list('abc')).str.encode('utf8')
        self.check_equal(obj)

    def test_alternate_encoding(self):

        obj = Series(list('abc'))
        self.check_equal(obj, encoding='ascii')

    def test_same_len_hash_collisions(self):

        for l in range(8):
            length = 2**(l + 8) + 1
            s = tm.rands_array(length, 2)
            result = hash_array(s, 'utf8')
            assert not result[0] == result[1]

        for l in range(8):
            length = 2**(l + 8)
            s = tm.rands_array(length, 2)
            result = hash_array(s, 'utf8')
            assert not result[0] == result[1]

    def test_hash_collisions(self):

        # hash collisions are bad
        # https://github.com/pandas-dev/pandas/issues/14711#issuecomment-264885726
        L = ['Ingrid-9Z9fKIZmkO7i7Cn51Li34pJm44fgX6DYGBNj3VPlOH50m7HnBlPxfIwFMrcNJNMP6PSgLmwWnInciMWrCSAlLEvt7JkJl4IxiMrVbXSa8ZQoVaq5xoQPjltuJEfwdNlO6jo8qRRHvD8sBEBMQASrRa6TsdaPTPCBo3nwIBpE7YzzmyH0vMBhjQZLx1aCT7faSEx7PgFxQhHdKFWROcysamgy9iVj8DO2Fmwg1NNl93rIAqC3mdqfrCxrzfvIY8aJdzin2cHVzy3QUJxZgHvtUtOLxoqnUHsYbNTeq0xcLXpTZEZCxD4PGubIuCNf32c33M7HFsnjWSEjE2yVdWKhmSVodyF8hFYVmhYnMCztQnJrt3O8ZvVRXd5IKwlLexiSp4h888w7SzAIcKgc3g5XQJf6MlSMftDXm9lIsE1mJNiJEv6uY6pgvC3fUPhatlR5JPpVAHNSbSEE73MBzJrhCAbOLXQumyOXigZuPoME7QgJcBalliQol7YZ9',  # noqa
             'Tim-b9MddTxOWW2AT1Py6vtVbZwGAmYCjbp89p8mxsiFoVX4FyDOF3wFiAkyQTUgwg9sVqVYOZo09Dh1AzhFHbgij52ylF0SEwgzjzHH8TGY8Lypart4p4onnDoDvVMBa0kdthVGKl6K0BDVGzyOXPXKpmnMF1H6rJzqHJ0HywfwS4XYpVwlAkoeNsiicHkJUFdUAhG229INzvIAiJuAHeJDUoyO4DCBqtoZ5TDend6TK7Y914yHlfH3g1WZu5LksKv68VQHJriWFYusW5e6ZZ6dKaMjTwEGuRgdT66iU5nqWTHRH8WSzpXoCFwGcTOwyuqPSe0fTe21DVtJn1FKj9F9nEnR9xOvJUO7E0piCIF4Ad9yAIDY4DBimpsTfKXCu1vdHpKYerzbndfuFe5AhfMduLYZJi5iAw8qKSwR5h86ttXV0Mc0QmXz8dsRvDgxjXSmupPxBggdlqUlC828hXiTPD7am0yETBV0F3bEtvPiNJfremszcV8NcqAoARMe']  # noqa

        # these should be different!
        result1 = hash_array(np.asarray(L[0:1], dtype=object), 'utf8')
        expected1 = np.array([14963968704024874985], dtype=np.uint64)
        tm.assert_numpy_array_equal(result1, expected1)

        result2 = hash_array(np.asarray(L[1:2], dtype=object), 'utf8')
        expected2 = np.array([16428432627716348016], dtype=np.uint64)
        tm.assert_numpy_array_equal(result2, expected2)

        result = hash_array(np.asarray(L, dtype=object), 'utf8')
        tm.assert_numpy_array_equal(
            result, np.concatenate([expected1, expected2], axis=0))
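
As a usage note, hash_pandas_object is public under pandas.util and returns a uint64 Series that is deterministic for the same data and hash key, which is the property check_equal exercises. A minimal sketch:

import pandas as pd
from pandas.util import hash_pandas_object

s = pd.Series(['a', 'b', 'c'])
assert (hash_pandas_object(s) == hash_pandas_object(s)).all()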
Example #12
class FrameParser(Parser):
    _default_orient = 'columns'

    def _parse(self):

        json = self.json
        dtype = self.dtype
        orient = self.orient
        numpy = self.numpy

        if numpy:
            try:
                if orient == "columns":
                    args = loads(json, dtype=dtype, numpy=True, labelled=True)
                    if args:
                        args = (args[0].T, args[2], args[1])
                    self.obj = DataFrame(*args)
                elif orient == "split":
                    decoded = loads(json, dtype=dtype, numpy=True)
                    decoded = dict((str(k), v) for k, v in decoded.iteritems())
                    self.obj = DataFrame(**decoded)
                elif orient == "values":
                    self.obj = DataFrame(loads(json, dtype=dtype, numpy=True))
                else:
                    self.obj = DataFrame(*loads(json, dtype=dtype, numpy=True,
                                         labelled=True))
            except ValueError:
                numpy = False

        if not numpy:
            if orient == "columns":
                self.obj = DataFrame(loads(json), dtype=dtype)
            elif orient == "split":
                decoded = dict((str(k), v)
                               for k, v in loads(json).iteritems())
                self.obj = DataFrame(dtype=dtype, **decoded)
            elif orient == "index":
                self.obj = DataFrame(loads(json), dtype=dtype).T
            else:
                self.obj = DataFrame(loads(json), dtype=dtype)

    def _convert_axes(self):
        """ try to axes if they are datelike """
        if self.orient == 'columns':
            axis = 'index'
        elif self.orient == 'index':
            axis = 'columns'
        else:
            return

        try:
            a = getattr(self.obj,axis)
            setattr(self.obj,axis,self._try_parse_to_date(a))
        except:
            pass

    def _try_parse_dates(self):
        if self.obj is None: return

        # our columns to parse
        parse_dates = self.parse_dates
        if parse_dates is True:
            parse_dates = []
        parse_dates = set(parse_dates)

        def is_ok(col):
            """ return if this col is ok to try for a date parse """
            if not isinstance(col, basestring): return False

            if (col.endswith('_at') or
                col.endswith('_time') or
                col.lower() == 'modified' or
                col.lower() == 'date' or
                col.lower() == 'datetime'):
                    return True
            return False


        for col, c in self.obj.iteritems():
            if (self.keep_default_dates and is_ok(col)) or col in parse_dates:
                self.obj[col] = self._try_parse_to_date(c)
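
This is the machinery behind read_json's default date handling: keep_default_dates tries a date parse for columns whose names match the heuristic above. From the public API (a sketch):

from io import StringIO
import pandas as pd

raw = '{"created_at": {"0": 1356998400000}, "value": {"0": 42}}'
df = pd.read_json(StringIO(raw))  # created_at parsed as a datetime by default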
Example #13
class TestHashing(tm.TestCase):

    _multiprocess_can_split_ = True

    def setUp(self):
        self.df = DataFrame(
            {'i32': np.array([1, 2, 3] * 3, dtype='int32'),
             'f32': np.array([None, 2.5, 3.5] * 3, dtype='float32'),
             'cat': Series(['a', 'b', 'c'] * 3).astype('category'),
             'obj': Series(['d', 'e', 'f'] * 3),
             'bool': np.array([True, False, True] * 3),
             'dt': Series(pd.date_range('20130101', periods=9)),
             'dt_tz': Series(pd.date_range('20130101', periods=9,
                                           tz='US/Eastern')),
             'td': Series(pd.timedelta_range('2000', periods=9))})

    def test_consistency(self):
        # check that our hash doesn't change because of a mistake
        # in the actual code; this is the ground truth
        result = hash_pandas_object(Index(['foo', 'bar', 'baz']))
        expected = Series(np.array([3600424527151052760, 1374399572096150070,
                                    477881037637427054], dtype='uint64'),
                          index=['foo', 'bar', 'baz'])
        tm.assert_series_equal(result, expected)

    def test_hash_array(self):
        for name, s in self.df.iteritems():
            a = s.values
            tm.assert_numpy_array_equal(hash_array(a), hash_array(a))

    def check_equal(self, obj, **kwargs):
        a = hash_pandas_object(obj, **kwargs)
        b = hash_pandas_object(obj, **kwargs)
        tm.assert_series_equal(a, b)

        kwargs.pop('index', None)
        a = hash_pandas_object(obj, **kwargs)
        b = hash_pandas_object(obj, **kwargs)
        tm.assert_series_equal(a, b)

    def check_not_equal_with_index(self, obj):

        # check that we are not hashing the same if
        # we include the index
        if not isinstance(obj, Index):
            a = hash_pandas_object(obj, index=True)
            b = hash_pandas_object(obj, index=False)
            self.assertFalse((a == b).all())

    def test_hash_pandas_object(self):

        for obj in [Series([1, 2, 3]),
                    Series([1.0, 1.5, 3.2]),
                    Series([1.0, 1.5, np.nan]),
                    Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]),
                    Series(['a', 'b', 'c']),
                    Series(['a', np.nan, 'c']),
                    Series(['a', None, 'c']),
                    Series([True, False, True]),
                    Index([1, 2, 3]),
                    Index([True, False, True]),
                    DataFrame({'x': ['a', 'b', 'c'], 'y': [1, 2, 3]}),
                    tm.makeMissingDataframe(),
                    tm.makeMixedDataFrame(),
                    tm.makeTimeDataFrame(),
                    tm.makeTimeSeries(),
                    tm.makeTimedeltaIndex()]:
            self.check_equal(obj)
            self.check_not_equal_with_index(obj)

    def test_hash_pandas_object2(self):
        for name, s in self.df.iteritems():
            self.check_equal(s)
            self.check_not_equal_with_index(s)

    def test_hash_pandas_empty_object(self):
        for obj in [Series([], dtype='float64'),
                    Series([], dtype='object'),
                    Index([])]:
            self.check_equal(obj)

            # these are by-definition the same with
            # or w/o the index as the data is empty

    def test_categorical_consistency(self):
        # GH15143
        # Check that categoricals hash consistent with their values, not codes
        # This should work for categoricals of any dtype
        for s1 in [Series(['a', 'b', 'c', 'd']),
                   Series([1000, 2000, 3000, 4000]),
                   Series(pd.date_range(0, periods=4))]:
            s2 = s1.astype('category').cat.set_categories(s1)
            s3 = s2.cat.set_categories(list(reversed(s1)))
            for categorize in [True, False]:
                # These should all hash identically
                h1 = hash_pandas_object(s1, categorize=categorize)
                h2 = hash_pandas_object(s2, categorize=categorize)
                h3 = hash_pandas_object(s3, categorize=categorize)
                tm.assert_series_equal(h1, h2)
                tm.assert_series_equal(h1, h3)

    def test_errors(self):

        for obj in [pd.Timestamp('20130101'), tm.makePanel()]:
            def f():
                hash_pandas_object(obj)

            self.assertRaises(TypeError, f)

    def test_hash_keys(self):
        # using different hash keys, should have different hashes
        # for the same data

        # this only matters for object dtypes
        obj = Series(list('abc'))
        a = hash_pandas_object(obj, hash_key='9876543210123456')
        b = hash_pandas_object(obj, hash_key='9876543210123465')
        self.assertTrue((a != b).all())

    def test_invalid_key(self):
        # this only matters for object dtypes
        def f():
            hash_pandas_object(Series(list('abc')), hash_key='foo')
        self.assertRaises(ValueError, f)

    def test_unsupported_objects(self):

        # mixed objects are not supported
        obj = Series(['1', 2, 3])

        def f():
            hash_pandas_object(obj)
        self.assertRaises(TypeError, f)

        # MultiIndex are represented as tuples
        obj = Series([1, 2, 3], index=pd.MultiIndex.from_tuples(
            [('a', 1), ('a', 2), ('b', 1)]))

        def f():
            hash_pandas_object(obj)
        self.assertRaises(TypeError, f)

    def test_already_encoded(self):
        # if already encoded then ok

        obj = Series(list('abc')).str.encode('utf8')
        self.check_equal(obj)

    def test_alternate_encoding(self):

        obj = Series(list('abc'))
        self.check_equal(obj, encoding='ascii')

    def test_same_len_hash_collisions(self):

        for l in range(8):
            length = 2**(l + 8) + 1
            s = tm.rands_array(length, 2)
            result = hash_array(s, 'utf8')
            self.assertFalse(result[0] == result[1])

        for l in range(8):
            length = 2**(l + 8)
            s = tm.rands_array(length, 2)
            result = hash_array(s, 'utf8')
            self.assertFalse(result[0] == result[1])

    def test_hash_collisions(self):

        # hash collisions are bad
        # https://github.com/pandas-dev/pandas/issues/14711#issuecomment-264885726
        L = ['Ingrid-9Z9fKIZmkO7i7Cn51Li34pJm44fgX6DYGBNj3VPlOH50m7HnBlPxfIwFMrcNJNMP6PSgLmwWnInciMWrCSAlLEvt7JkJl4IxiMrVbXSa8ZQoVaq5xoQPjltuJEfwdNlO6jo8qRRHvD8sBEBMQASrRa6TsdaPTPCBo3nwIBpE7YzzmyH0vMBhjQZLx1aCT7faSEx7PgFxQhHdKFWROcysamgy9iVj8DO2Fmwg1NNl93rIAqC3mdqfrCxrzfvIY8aJdzin2cHVzy3QUJxZgHvtUtOLxoqnUHsYbNTeq0xcLXpTZEZCxD4PGubIuCNf32c33M7HFsnjWSEjE2yVdWKhmSVodyF8hFYVmhYnMCztQnJrt3O8ZvVRXd5IKwlLexiSp4h888w7SzAIcKgc3g5XQJf6MlSMftDXm9lIsE1mJNiJEv6uY6pgvC3fUPhatlR5JPpVAHNSbSEE73MBzJrhCAbOLXQumyOXigZuPoME7QgJcBalliQol7YZ9',  # noqa
             'Tim-b9MddTxOWW2AT1Py6vtVbZwGAmYCjbp89p8mxsiFoVX4FyDOF3wFiAkyQTUgwg9sVqVYOZo09Dh1AzhFHbgij52ylF0SEwgzjzHH8TGY8Lypart4p4onnDoDvVMBa0kdthVGKl6K0BDVGzyOXPXKpmnMF1H6rJzqHJ0HywfwS4XYpVwlAkoeNsiicHkJUFdUAhG229INzvIAiJuAHeJDUoyO4DCBqtoZ5TDend6TK7Y914yHlfH3g1WZu5LksKv68VQHJriWFYusW5e6ZZ6dKaMjTwEGuRgdT66iU5nqWTHRH8WSzpXoCFwGcTOwyuqPSe0fTe21DVtJn1FKj9F9nEnR9xOvJUO7E0piCIF4Ad9yAIDY4DBimpsTfKXCu1vdHpKYerzbndfuFe5AhfMduLYZJi5iAw8qKSwR5h86ttXV0Mc0QmXz8dsRvDgxjXSmupPxBggdlqUlC828hXiTPD7am0yETBV0F3bEtvPiNJfremszcV8NcqAoARMe']  # noqa

        # these should be different!
        result1 = hash_array(np.asarray(L[0:1], dtype=object), 'utf8')
        expected1 = np.array([14963968704024874985], dtype=np.uint64)
        self.assert_numpy_array_equal(result1, expected1)

        result2 = hash_array(np.asarray(L[1:2], dtype=object), 'utf8')
        expected2 = np.array([16428432627716348016], dtype=np.uint64)
        self.assert_numpy_array_equal(result2, expected2)

        result = hash_array(np.asarray(L, dtype=object), 'utf8')
        self.assert_numpy_array_equal(
            result, np.concatenate([expected1, expected2], axis=0))
Example #14
def get_sensitivity_analysis(extracts, points, statics, initials, pickle=None):

    temps = range(-5, 6)
    all_pct = [x * 0.1 for x in range(5, 16)]
    ndvi_range = linspace(0.9, 1.7, 11)
    ndvi_range = array([round_to_value(x, 0.05) for x in ndvi_range])
    var_arrs = []
    y = 0
    for x in range(0, 6):
        ones_ = ones((5, 11), dtype=float)
        zeros = [x * 0.0 for x in range(5, 16)]
        norm_ndvi = array([1.25 for x in zeros])
        if y == 0:
            arr = insert(ones_, y, temps, axis=0)
            arr = insert(arr, 4, norm_ndvi, axis=0)
            arr = arr[0:6]
            var_arrs.append(arr)
            arr = []
        elif y == 4:
            arr = insert(ones_, 0, zeros, axis=0)
            arr = insert(arr, y, ndvi_range, axis=0)
            arr = arr[0:6]
            var_arrs.append(arr)
            print 'shape arr: {}'.format(arr.shape)
            arr = []
        elif y == 5:
            arr = insert(ones_, 0, zeros, axis=0)
            arr = insert(arr, 4, norm_ndvi, axis=0)
            arr = arr[0:5]
            arr = insert(arr, y, all_pct, axis=0)
            var_arrs.append(arr)
            arr = []
        else:
            arr = insert(ones_, 0, zeros, axis=0)
            arr = insert(arr, y, all_pct, axis=0)
            arr = insert(arr, 4, norm_ndvi, axis=0)
            arr = arr[0:6]
            var_arrs.append(arr)
            arr = []
        y += 1

    print 'variable arrays: {}'.format(var_arrs)
    normalize_list = [2, 0.20, 0.20, 2, 0.20, 0.50]

    # site_list = ['Bateman', 'Navajo_Whiskey_Ck', 'Quemazon', 'Sierra_Blanca', 'SB_1', 'SB_2', 'SB_4', 'SB_5', 'VC_1',
    #              'VC_2', 'VC_3', 'CH_1', 'CH_3', 'MG_1', 'MG_2', 'WHLR_PK', 'LP', 'South_Baldy',
    #              'Water_Canyon', 'La_Jencia', 'Socorro']

    site_list = ['Sierra_Blanca', 'Great_Western_Mine', 'Bonito', 'Nogal']
    df = DataFrame(columns=FACTORS, index=site_list)
    df_norm = DataFrame(columns=FACTORS, index=site_list)

    site_dict = {'Sierra_Blanca': {}, 'Great_Western_Mine': {}, 'Bonito': {}, 'Nogal': {}}
    ds = Open(points)
    lyr = ds.GetLayer()
    # defs = lyr.GetLayerDefn()

    for j, feat in enumerate(lyr):
        name = feat.GetField("Name")
        name = name.replace(' ', '_')
        geom = feat.GetGeometryRef()
        mx, my = int(geom.GetX()), int(geom.GetY())
        site_dict[name]['Coords'] = '{} {}'.format(mx, my)
        file_name = os.path.join(extracts, '{}.csv'.format(name))
        print file_name
        site_dict[name]['etrm'] = get_etrm_time_series(file_name, single_file=True)

    # print 'site dict before running etrm: {}'.format(site_dict)

    for i, var_arr in enumerate(var_arrs):
        factor = FACTORS[i]
        print 'running modified factor: {}'.format(factor)
        print ''
        for key, val in site_dict.iteritems():
            print '\n site: {} \n '.format(key)
            results = []
            for col in var_arr.T:
                etrm = Processes(SIMULATION_PERIOD, static_inputs=statics, initial_inputs=initials,
                                 output_root=pickle, point_dict=site_dict)
                tracker = etrm.run(point_dict=site_dict, point_dict_key=key, sensitivity_matrix_column=col,
                                   sensitivity=True)

                # print 'tracker: {}'.format(tracker)
                results.append(tracker['tot_infil'][-1])
                print 'total infil: {} \n results: {}'.format(tracker['tot_infil'][-1], results)

            df.iloc[site_list.index(key), FACTORS.index(factor)] = divide(array(results), 14.0)
        print 'df after site {}: \n {}'.format(key, df)
    print 'df: {}'.format(df)

    # tot_data : precip, et, tot_transp, tot_evap, infil, runoff, snow_fall, cum_mass, end_mass

    # "SI = [Q(Po + delP] -Q(Po - delP] / (2 * delP)"
    # where SI = Sensitivity Index, Q = recharge, Po = base value of input parameter,
    # delP = change in value input
    # find sensitivity index

    xx = 0
    for param in df.iteritems():
        data_cube = param[1]
        var_arr = var_arrs[xx]
        yy = 0
        for site in data_cube:
            site_name = site_list[yy]
            normal = normalize_list[xx]
            site_obj = [x for x in site]
            sens_list = []
            zz = 0
            for var in var_arr[xx]:
                if var != var_arr[xx][5]:
                    base = var_arr[xx][5]
                    deltap = var - base
                    obj = site_obj[zz]
                    sen = ((obj * (base + deltap) - obj * (base - deltap)) / (2 * deltap)) * normal
                    sens_list.append(sen)
                    zz += 1
            sens_list = array(sens_list)
            df_norm.iloc[site_list.index(site_name), FACTORS.index(param[0])] = sens_list
            if yy == 20:
                print 'done'
                break
            yy += 1
        xx += 1

    # why not save the data as pickle, so we don't have to do the analysis each time
    # we debug the plotting

    df.to_pickle(os.path.join(pickle, '_basic_sensitivity_2.pkl'))
    df_norm.to_pickle(os.path.join(pickle, 'norm_sensitivity_2.pkl'))
Example #15
class FrameParser(Parser):
    _default_orient = 'columns'
    _split_keys = ('columns', 'index', 'data')

    def _parse_numpy(self):

        json = self.json
        orient = self.orient

        if orient == "columns":
            args = loads(json, dtype=None, numpy=True, labelled=True,
                         precise_float=self.precise_float)
            if args:
                args = (args[0].T, args[2], args[1])
            self.obj = DataFrame(*args)
        elif orient == "split":
            decoded = loads(json, dtype=None, numpy=True,
                            precise_float=self.precise_float)
            decoded = dict((str(k), v) for k, v in compat.iteritems(decoded))
            self.check_keys_split(decoded)
            self.obj = DataFrame(**decoded)
        elif orient == "values":
            self.obj = DataFrame(loads(json, dtype=None, numpy=True,
                                       precise_float=self.precise_float))
        else:
            self.obj = DataFrame(*loads(json, dtype=None, numpy=True,
                                        labelled=True,
                                        precise_float=self.precise_float))

    def _parse_no_numpy(self):

        json = self.json
        orient = self.orient

        if orient == "columns":
            self.obj = DataFrame(
                loads(json, precise_float=self.precise_float), dtype=None)
        elif orient == "split":
            decoded = dict((str(k), v)
                           for k, v in compat.iteritems(loads(
                               json,
                               precise_float=self.precise_float)))
            self.check_keys_split(decoded)
            self.obj = DataFrame(dtype=None, **decoded)
        elif orient == "index":
            self.obj = DataFrame(
                loads(json, precise_float=self.precise_float), dtype=None).T
        else:
            self.obj = DataFrame(
                loads(json, precise_float=self.precise_float), dtype=None)

    def _process_converter(self, f, filt=None):
        """ take a conversion function and possibly recreate the frame """

        if filt is None:
            filt = lambda col, c: True

        needs_new_obj = False
        new_obj = dict()
        for i, (col, c) in enumerate(self.obj.iteritems()):
            if filt(col, c):
                new_data, result = f(col, c)
                if result:
                    c = new_data
                    needs_new_obj = True
            new_obj[i] = c

        if needs_new_obj:

            # possibly handle dup columns
            new_obj = DataFrame(new_obj, index=self.obj.index)
            new_obj.columns = self.obj.columns
            self.obj = new_obj

    def _try_convert_types(self):
        if self.obj is None:
            return
        if self.convert_dates:
            self._try_convert_dates()

        self._process_converter(
            lambda col, c: self._try_convert_data(col, c, convert_dates=False))

    def _try_convert_dates(self):
        if self.obj is None:
            return

        # our columns to parse
        convert_dates = self.convert_dates
        if convert_dates is True:
            convert_dates = []
        convert_dates = set(convert_dates)

        def is_ok(col):
            """ return if this col is ok to try for a date parse """
            if not isinstance(col, compat.string_types):
                return False

            col_lower = col.lower()
            if (col_lower.endswith('_at') or
                    col_lower.endswith('_time') or
                    col_lower == 'modified' or
                    col_lower == 'date' or
                    col_lower == 'datetime' or
                    col_lower.startswith('timestamp')):
                return True
            return False

        self._process_converter(
            lambda col, c: self._try_convert_to_date(c),
            lambda col, c: ((self.keep_default_dates and is_ok(col)) or
                            col in convert_dates))
Example #16
class TestHashing(tm.TestCase):

    _multiprocess_can_split_ = True

    def setUp(self):
        self.df = DataFrame(
            {'i32': np.array([1, 2, 3] * 3, dtype='int32'),
             'f32': np.array([None, 2.5, 3.5] * 3, dtype='float32'),
             'cat': Series(['a', 'b', 'c'] * 3).astype('category'),
             'obj': Series(['d', 'e', 'f'] * 3),
             'bool': np.array([True, False, True] * 3),
             'dt': Series(pd.date_range('20130101', periods=9)),
             'dt_tz': Series(pd.date_range('20130101', periods=9,
                                           tz='US/Eastern')),
             'td': Series(pd.timedelta_range('2000', periods=9))})

    def test_consistency(self):
        # check that our hash doesn't change because of a mistake
        # in the actual code; this is the ground truth
        result = hash_pandas_object(Index(['foo', 'bar', 'baz']))
        expected = Series(np.array([3600424527151052760, 1374399572096150070,
                                    477881037637427054], dtype='uint64'),
                          index=['foo', 'bar', 'baz'])
        tm.assert_series_equal(result, expected)

    def test_hash_array(self):
        for name, s in self.df.iteritems():
            a = s.values
            tm.assert_numpy_array_equal(hash_array(a), hash_array(a))

    def check_equal(self, obj, **kwargs):
        a = hash_pandas_object(obj, **kwargs)
        b = hash_pandas_object(obj, **kwargs)
        tm.assert_series_equal(a, b)

        kwargs.pop('index', None)
        a = hash_pandas_object(obj, **kwargs)
        b = hash_pandas_object(obj, **kwargs)
        tm.assert_series_equal(a, b)

    def check_not_equal_with_index(self, obj):

        # check that we are not hashing the same if
        # we include the index
        if not isinstance(obj, Index):
            a = hash_pandas_object(obj, index=True)
            b = hash_pandas_object(obj, index=False)
            self.assertFalse((a == b).all())

    def test_hash_pandas_object(self):

        for obj in [Series([1, 2, 3]),
                    Series([1.0, 1.5, 3.2]),
                    Series([1.0, 1.5, np.nan]),
                    Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]),
                    Series(['a', 'b', 'c']),
                    Series(['a', np.nan, 'c']),
                    Series(['a', None, 'c']),
                    Series([True, False, True]),
                    Index([1, 2, 3]),
                    Index([True, False, True]),
                    DataFrame({'x': ['a', 'b', 'c'], 'y': [1, 2, 3]}),
                    tm.makeMissingDataframe(),
                    tm.makeMixedDataFrame(),
                    tm.makeTimeDataFrame(),
                    tm.makeTimeSeries(),
                    tm.makeTimedeltaIndex()]:
            self.check_equal(obj)
            self.check_not_equal_with_index(obj)

    def test_hash_pandas_object2(self):
        for name, s in self.df.iteritems():
            self.check_equal(s)
            self.check_not_equal_with_index(s)

    def test_hash_pandas_empty_object(self):
        for obj in [Series([], dtype='float64'),
                    Series([], dtype='object'),
                    Index([])]:
            self.check_equal(obj)

            # these are by-definition the same with
            # or w/o the index as the data is empty

    def test_errors(self):

        for obj in [pd.Timestamp('20130101'), tm.makePanel()]:
            def f():
                hash_pandas_object(obj)

            self.assertRaises(TypeError, f)

    def test_hash_keys(self):
        # using different hash keys, should have different hashes
        # for the same data

        # this only matters for object dtypes
        obj = Series(list('abc'))
        a = hash_pandas_object(obj, hash_key='9876543210123456')
        b = hash_pandas_object(obj, hash_key='9876543210123465')
        self.assertTrue((a != b).all())

    def test_invalid_key(self):
        # this only matters for object dtypes
        def f():
            hash_pandas_object(Series(list('abc')), hash_key='foo')
        self.assertRaises(ValueError, f)

    def test_unsupported_objects(self):

        # mixed objects are not supported
        obj = Series(['1', 2, 3])

        def f():
            hash_pandas_object(obj)
        self.assertRaises(TypeError, f)

        # MultiIndex are represented as tuples
        obj = Series([1, 2, 3], index=pd.MultiIndex.from_tuples(
            [('a', 1), ('a', 2), ('b', 1)]))

        def f():
            hash_pandas_object(obj)
        self.assertRaises(TypeError, f)

    def test_already_encoded(self):
        # if already encoded then ok

        obj = Series(list('abc')).str.encode('utf8')
        self.check_equal(obj)

    def test_alternate_encoding(self):

        obj = Series(list('abc'))
        self.check_equal(obj, encoding='ascii')

    def test_long_strings(self):

        obj = Index(tm.rands_array(nchars=10000, size=100))
        self.check_equal(obj)
Example #17
    def train(self, file_name=DEFAULT_TRAINING_FILE):
        """
        Takes in a training file in which the body of text is split into
        individual words and punctuation tokens; each token is annotated
        with an identification stating its actual part of speech.

        Args:
            file_name (str): The path to the file containing the training data
        """
        # Handle to the training file
        training_file = None

        # Open the specific training file
        if os.path.exists(file_name):
            try:
                training_file = open(file_name)
            except IOError:
                print("Unable to open the file at " + file_name + ".")

        # Pull probabilistic data from these files
        tags = array_column(self.tags, "Tag")
        vocabulary = set()

        # Pandas copies memory on every append, so iterate once to collect the
        # unique vocabulary first
        for line in training_file:
            # Ensure this doesn't just return an empty line
            line = line.strip()
            if len(line) > 0:
                # Parse line into 'observation/classification'
                words = line.split(' ')

                # Iterate over each word to get the word and classification
                for word in words:
                    # Separate into tag & classification
                    context = word.rsplit('/', maxsplit=1)
                    word = context[0].lower().strip()
                    if word not in vocabulary:
                        vocabulary.add(word)

        # Prepare necessary data structures
        emission = DataFrame(index=vocabulary, columns=tags)
        transition = DataFrame(index=tags, columns=tags)
        emission.fillna(0, inplace=True)
        transition.fillna(0, inplace=True)
        last_class = None

        # Iterate again to update the emission and transition matrices
        training_file.seek(0)
        for line in training_file:
            # Ensure this doesn't just return an empty line
            line = line.strip()
            if len(line) > 0:
                # Parse line into 'observation/classification'
                words = line.split(' ')

                # Iterate over each word to get the word and classification
                for word in words:
                    # Separate into tag & classification
                    context = word.rsplit('/', maxsplit=1)
                    word = context[0].lower().strip()
                    context_tags = context[1].split('+')

                    # Update the emission matrix (.loc indexes row and column
                    # in one step, avoiding chained indexing)
                    for context_tag in context_tags:
                        emission.loc[word, context_tag] += 1

                    # Update the transition matrix
                    if last_class is not None:
                        for context_tag in context_tags:
                            for last_tag in last_class:
                                transition.loc[context_tag, last_tag] += 1

                    # Update the last_class
                    last_class = context_tags

        # Pull info from the database that needs to be updated & merge arrays
        cursor = self.connection.cursor()
        word_totals = {}
        tag_totals = {}
        for dest_tag, row in transition.iteritems():
            for origin_tag, occurrence in row.iteritems():
                # Fetch the tag's total occurrence count if it is not cached yet
                if origin_tag not in tag_totals:
                    cursor.execute('SELECT TotalOccurrences FROM Tags WHERE Tag = ?', (origin_tag.upper().strip(),))
                    tag_totals[origin_tag] = cursor.fetchone()
                    if tag_totals[origin_tag] is None:
                        cursor.execute('INSERT INTO Tags (Tag, TotalOccurrences) VALUES (?, ?)', (origin_tag.upper().strip(), 1))
                        tag_totals[origin_tag] = 1
                    else:
                        tag_totals[origin_tag] = int(tag_totals[origin_tag]['TotalOccurrences'])
                tag_totals[origin_tag] += int(occurrence)

                # Grab data for this specific transition
                cursor.execute('SELECT Occurrences FROM Transitions WHERE OriginTag = ? AND DestTag = ?', (origin_tag.upper().strip(), dest_tag))
                db_occurrence = cursor.fetchone()
                if db_occurrence is None:
                    # We need to add the row if it doesn't exist
                    cursor.execute('INSERT INTO Transitions (OriginTag, DestTag, Occurrences) VALUES (?, ?, ?)', (origin_tag.upper().strip(), dest_tag, 0))
                    db_occurrence = 0
                else:
                    db_occurrence = db_occurrence['Occurrences']

                # Update the data
                db_occurrence += int(occurrence)
                cursor.execute('UPDATE Transitions SET Occurrences = ? WHERE OriginTag = ? AND DestTag = ?', (int(db_occurrence), origin_tag.upper().strip(), dest_tag))

        for tag, row in emission.iteritems():
            for word, occurrence in row.iteritems():
                # Fetch the word's total occurrence count if it is not cached yet
                if word not in word_totals:
                    cursor.execute('SELECT TotalOccurrences FROM Words WHERE Word = ?', (word,))
                    word_totals[word] = cursor.fetchone()
                    if word_totals[word] is None:
                        cursor.execute('INSERT INTO Words (Word, TotalOccurrences) VALUES (?, ?)', (word, 1))
                        word_totals[word] = 1
                    else:
                        word_totals[word] = int(word_totals[word]['TotalOccurrences'])
                word_totals[word] += int(occurrence)

                # Grab data for this specific emission
                cursor.execute('SELECT Occurrences FROM Emissions WHERE Word = ? AND Tag = ?', (word, tag.upper().strip()))
                db_occurrence = cursor.fetchone()
                if db_occurrence is None:
                    # We need to add the entry
                    cursor.execute('INSERT INTO Emissions (Word, Tag, Occurrences) VALUES (?, ?, ?)', (word, tag.upper().strip(), 0))
                    db_occurrence = 0
                else:
                    db_occurrence = db_occurrence['Occurrences']

                # Update the data
                db_occurrence += int(occurrence)
                cursor.execute('UPDATE Emissions SET Occurrences = ? WHERE Word = ? AND Tag = ?', (int(db_occurrence), word, tag.upper().strip()))

        # Update totals in general
        for word, occurrence in word_totals.items():
            cursor.execute('UPDATE Words SET TotalOccurrences = ? WHERE Word = ?', (int(occurrence), word))
        for tag, occurrence in tag_totals.items():
            cursor.execute('UPDATE Tags SET TotalOccurrences = ? WHERE tag = ?', (int(occurrence), tag.upper().strip()))

        # Close unnecessary resources
        cursor.close()
        self.connection.commit()
        training_file.close()
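
For bulk counting like the emission matrix here, pandas can also build the whole contingency table in one call from parallel lists of observations; a sketch with hypothetical word/tag pairs:

import pandas as pd

words = ['the', 'dog', 'runs', 'the']
tags = ['AT', 'NN', 'VBZ', 'AT']
emission = pd.crosstab(pd.Series(words, name='word'), pd.Series(tags, name='tag'))
# emission.loc['the', 'AT'] == 2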