Beispiel #1
0
def test_nat_operations():
    """Check median/min/max of a timedelta Series holding one value and NaT.

    Regression test for GH 8617: each reduction must equal the single
    non-NaT element.
    """
    deltas = Series([0, pd.NaT], dtype='m8[ns]')
    expected = deltas[0]
    for reduction in ('median', 'min', 'max'):
        assert getattr(deltas, reduction)() == expected
    def test_median(self):
        """Compare Series.median with np.median on two inputs."""
        # Series produced by tm.makeStringSeries(), renamed to 'series'.
        self._check_stat_op(
            'median', np.median, tm.makeStringSeries().rename('series'))

        # Integer-dtype input: the pandas median must agree with numpy's.
        ones = Series(np.ones(10, dtype=int), index=lrange(10))
        numpy_median = np.median(ones)
        tm.assert_almost_equal(numpy_median, ones.median())
Beispiel #3
0
	def _crunch_all(self, unit):
		"""Call all statistic-calculating methods for each unit with data."""

		unit.calculate_GVI_and_PGS()

		s = Series(unit.just_readings)

		unit.summary = s.describe()

		unit.median = s.median()
Beispiel #4
0
  def get_summary_indicators_from_hist(sf, hist, int_index=False):
    """Summarise a histogram given as a ``{bin label: frequency}`` mapping.

    Args:
      sf: Not used by this function.
      hist: Mapping from bin label to frequency. When ``int_index`` is true
        every label must be convertible to ``int``.
      int_index: When true, additionally compute statistics over the integer
        values of the labels and the frequency-weighted sum of those values.

    Returns:
      A dict with the keys ``'means'``, ``'medians'``, ``'stds'``, ``'max'``
      and ``'index_total'``. The first three hold a ``'freq'`` entry (plus an
      ``'index'`` entry when ``int_index`` is true). ``'max'`` holds
      ``'freq'`` -> ``{'freq': largest frequency, 'index': its label}`` and,
      when ``int_index`` is true, ``'index'`` -> ``{'index': largest label,
      'freq': its frequency}``. ``'index_total'`` is ``'NA'`` unless
      ``int_index`` is true.
    """
    freqs = Series(hist)

    means = {'freq': freqs.mean()}
    medians = {'freq': freqs.median()}
    stds = {'freq': freqs.std()}
    maxs = {'freq': {'freq': freqs.max(), 'index': freqs.idxmax()}}
    index_total = 'NA'

    if int_index:
      labels = freqs.index.tolist()
      index_list = freqs.index.astype(int).tolist()

      # Pair frequencies with labels by position. Indexing the Series with an
      # integer key (``freqs[i]``) is a label lookup in current pandas, so the
      # positional fallback the old code relied on is no longer available.
      index_total = sum(
        freq * idx for freq, idx in zip(freqs.values, index_list))
      index_series = Series(index_list)

      means['index'] = index_series.mean()
      medians['index'] = index_series.median()
      stds['index'] = index_series.std()

      maxs['freq']['index'] = int(maxs['freq']['index'])

      max_index = max(index_list)
      # Look the frequency up through the original label rather than
      # ``str(max_index)``: labels such as '02' convert to 2 but are not
      # equal to the string '2'.
      max_label = labels[index_list.index(max_index)]
      maxs['index'] = {'index': max_index, 'freq': hist[max_label]}

    return {
      'means': means,
      'medians': medians,
      'stds': stds,
      'max': maxs,
      'index_total': index_total
    }
Beispiel #5
0
def _esd(x, max_outlier, alpha, direction):
    """
    The ESD test using median and MAD in the calculation of the test statistic.

    Args:
        x: Values to test; wrapped in a ``Series`` before use.
        max_outlier: Maximum number of outliers to look for (one per
            iteration).
        alpha: Significance level used for the critical value.
        direction: ``'both'`` ranks by absolute deviation from the median,
            ``'pos'`` by deviation above it, ``'neg'`` by deviation below it.

    Returns:
        List of index labels of ``x`` flagged as outliers, in the order they
        were detected.

    Raises:
        ValueError: If an iteration needs the test statistic and
            ``direction`` is not one of the supported values.
    """
    x = Series(x)
    n = len(x)
    outlier_index = []
    for i in range(1, max_outlier + 1):
        median = x.median()
        mad = np.median(np.abs(x.values - median)) * _MAD_CONSTANT
        if mad == 0:
            # No spread left: the statistic would divide by zero.
            break

        if direction == 'both':
            ares = (x - median).abs() / mad
        elif direction == 'pos':
            ares = (x - median) / mad
        elif direction == 'neg':
            ares = (median - x) / mad
        else:
            # Previously this fell through and failed later with an
            # UnboundLocalError on `ares`.
            raise ValueError(
                "direction must be 'both', 'pos' or 'neg', got %r" % (direction,))

        r_idx = ares.idxmax()
        r = ares[r_idx]

        # The two-sided test splits alpha across both tails.
        if direction == 'both':
            p = 1.0 - alpha / (2 * (n - i + 1))
        else:
            p = 1.0 - alpha / (n - i + 1)
        crit = t.ppf(p, n-i-1)
        lam = (n-i)*crit / np.sqrt((n-i-1+crit**2) * (n-i+1))

        # Lazy %-style arguments: the message is only built when DEBUG is on.
        logger.debug("%s/%s outlier. median=%s, mad=%s, r_idx=%s, r=%s, crit=%s, lam=%s",
                     i, max_outlier, median, mad, r_idx, r, crit, lam)

        if r <= lam:
            # The r keeps decreasing while lam keeps increasing. Therefore, when r is less than lam for the first time,
            # we can stop.
            break
        outlier_index.append(r_idx)
        x = x.drop(r_idx)
    return outlier_index
# A Series does not have to hold values of a single type.
# However, many of the manipulations that you will perform on one probably
# assume homogeneity.
s = Series( { "a": 42, "b": "foo", "c": 42J } )
pprint( s )

# <demo> --- stop ---

# A Series built from five random draws.
s = Series(
    randn( 5 )
)
pprint( s )

# Data series are not only Python sequences but also act like NumPy 'ndarray'
# objects: they accept a boolean mask as an index and can be passed to NumPy
# functions such as 'np.exp'.
print( s[s > s.median()] )
pprint( np.exp(s) )

# Data series act like Python 'dict' objects as well.
print( 1 in s )

# Labels with no corresponding values use NaN for their missing values.
# This is true both during series initialization and alignment.
# The two slices added below do not cover the same labels.
pprint( s[ 1 : ] + s[ : -1 ] )

# <demo> --- stop ---

# 'DataFrame' objects are 2D.
# They can be created from a 'dict' of Series objects, a 2D array, etc...

df = DataFrame( { "col1": randn( 4 ), "col2": randn( 4 ) } )
# Count how often each three-character prefix occurs in `lines`, skipping
# prefixes shorter than three characters or matched by `nonalphabet`.
trigrams = {}
for line in lines:
    trigram = line.strip().lower()[0:3]
    if len(trigram) >= 3 and not nonalphabet.search(trigram):
        if trigram == "aaa":
            print("line: {0} trigram: {1}".format(line, trigram))
        trigrams[trigram] = trigrams.get(trigram, 0) + 1

trigram_series = Series(list(trigrams.values()), index=list(trigrams.keys()))
# Series.sort(inplace=True) no longer exists in pandas; sort_values returns
# the sorted Series instead.
trigram_series = trigram_series.sort_values(ascending=True)
print(trigram_series)
print("quartiles:\n{0}".format(trigram_series.quantile([.25, .50, .75, .99]).to_string()))

print("median is: {0}".format(trigram_series.median()))

# Keep only the trigrams that occur more often than the median count. The
# threshold is computed once instead of on every iteration, and each trigram
# is added at most once (the previous loop also appended every trigram
# unconditionally, which defeated the filter and produced duplicates).
median_count = trigram_series.quantile(.50)
unique_trigrams = [
    trigram for trigram, count in trigrams.items() if count > median_count
]

print("saving trigrams")
with open("trigrams.json", "w") as f:
    json.dump(unique_trigrams, f)
print("saved {0} trigrams".format(len(unique_trigrams)))

trie = {}
for trigram in unique_trigrams:
    current_dict = trie
    for index, letter in enumerate(trigram):
# <codecell>

# Build a Series from `d` (defined in an earlier cell, not visible here)
# using the listed labels, in that order, as its index.
Series(d, index=['b', 'c', 'd', 'a'])

# <codecell>

# Look up a single element of `s` with the integer key 1.
# NOTE(review): whether this is positional or label-based depends on the
# index of `s` and the pandas version — confirm.
s[1]

# <codecell>

# Slice `s` with [:3].
s[:3]   

# <codecell>

# Boolean-mask selection: keep the entries greater than the median of `s`.
s[s > s.median()]

# <codecell>

# Pass the Series to a NumPy function.
np.exp(s)

# <codecell>

# Assign 9 under the key 'evan', then display `s`.
s['evan']=9
s

# <codecell>

# Membership test with the key 'bob'.
'bob' in s

# <codecell>