import argparse
import copy
import cPickle
import pickle
from collections import Counter, Mapping
from random import randrange

import numpy

from test import test_support


def collect_counts(data, lex):
    # L and M are assumed module-level constants: the tag-transition order
    # and the width of the word-context window, respectively.
    tag_pair_counts = [Counter() for l in range(L)]
    tag_word_pair_counts = [Counter() for m in range(M)]
    sent = data

    def get_word(pos):
        if pos < 0 or pos >= len(sent):
            word = "<S>"
        else:
            word = sent[pos][0]
        return lex["words"][word]

    def get_tag(pos):
        if pos < 0 or pos >= len(sent):
            tag = "<T>"
        else:
            tag = sent[pos][1]
        return lex["tags"][tag]

    for i in range(len(sent)):
        for l in range(L):
            tag_pair_counts[l][get_tag(i - l - 1), get_tag(i)] += 1
        start = i - ((M - 1) / 2)
        for m in range(M):
            tag_word_pair_counts[m][get_tag(i), get_word(start + m)] += 1
    return {"tag": tag_pair_counts, "word": tag_word_pair_counts,
            "total": len(sent)}

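# Usage sketch for collect_counts above; the input format is inferred from
# the code, and L/M are assumed module-level constants. The `lex` mapping
# mirrors what make_vocab further down returns.
def _collect_counts_example():
    global L, M
    L, M = 1, 3
    sent = [("the", "DT"), ("dog", "NN"), ("barks", "VBZ")]
    lex = {"words": {"<S>": 0, "the": 1, "dog": 2, "barks": 3},
           "tags": {"<T>": 0, "DT": 1, "NN": 2, "VBZ": 3}}
    counts = collect_counts(sent, lex)
    # counts["tag"][0] maps (tag id at i-1, tag id at i) to counts, e.g.
    # (lex["tags"]["DT"], lex["tags"]["NN"]) -> 1
    return counts
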
def test_copying(self):
    # Check that counters are copyable, deepcopyable, picklable, and
    # have a repr/eval round-trip
    words = Counter('which witch had which witches wrist watch'.split())
    update_test = Counter()
    update_test.update(words)
    for i, dup in enumerate([
        words.copy(),
        copy.copy(words),
        copy.deepcopy(words),
        pickle.loads(pickle.dumps(words, 0)),
        pickle.loads(pickle.dumps(words, 1)),
        pickle.loads(pickle.dumps(words, 2)),
        pickle.loads(pickle.dumps(words, -1)),
        cPickle.loads(cPickle.dumps(words, 0)),
        cPickle.loads(cPickle.dumps(words, 1)),
        cPickle.loads(cPickle.dumps(words, 2)),
        cPickle.loads(cPickle.dumps(words, -1)),
        eval(repr(words)),
        update_test,
        Counter(words),
    ]):
        msg = (i, dup, words)
        self.assertTrue(dup is not words)
        self.assertEqual(dup, words)
        self.assertEqual(len(dup), len(words))
        self.assertEqual(type(dup), type(words))

def test_conversions(self):
    # Convert to: set, list, dict
    s = 'she sells sea shells by the sea shore'
    self.assertEqual(sorted(Counter(s).elements()), sorted(s))
    self.assertEqual(sorted(Counter(s)), sorted(set(s)))
    self.assertEqual(dict(Counter(s)), dict(Counter(s).items()))
    self.assertEqual(set(Counter(s)), set(s))

def results(self):
    if self._error is not None:
        return {"error": self._error}

    values = self._data.get()
    if len(values[0, :, 0]) < 3:
        return {"error": ProcessingError.JOB_TOO_SHORT}

    rates = numpy.diff(values[:, :, 1]) / numpy.diff(values[:, :, 0])

    if len(self._hostdata) > 64:
        # Compute min, max & median data and only save the host data
        # for these hosts
        sortarr = numpy.argsort(rates.T, axis=1)
        retdata = {
            "min": self.collatedata(sortarr[:, 0], rates),
            "max": self.collatedata(sortarr[:, -1], rates),
            "med": self.collatedata(sortarr[:, sortarr.shape[1] / 2], rates),
            "times": values[0, 1:, 0].tolist(),
            "hosts": {}
        }
        uniqhosts = Counter(sortarr[:, 0])
        uniqhosts.update(sortarr[:, -1])
        uniqhosts.update(sortarr[:, sortarr.shape[1] / 2])
        includelist = uniqhosts.keys()
    else:
        # Save data for all hosts
        retdata = {
            "times": values[0, 1:, 0].tolist(),
            "hosts": {}
        }
        includelist = self._hostdata.keys()

    for hostidx in includelist:
        retdata['hosts'][str(hostidx)] = {}
        retdata['hosts'][str(hostidx)]['all'] = rates[hostidx, :].tolist()
        retdata['hosts'][str(hostidx)]['dev'] = {}
        for devid in self._hostdevnames[hostidx].iterkeys():
            dpnts = len(values[hostidx, :, 0])
            retdata['hosts'][str(hostidx)]['dev'][devid] = (
                numpy.diff(self._hostdata[hostidx][:dpnts, int(devid)]) /
                numpy.diff(values[hostidx, :, 0])).tolist()
        retdata['hosts'][str(hostidx)]['names'] = self._hostdevnames[hostidx]

    return retdata

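# A minimal sketch of the argsort pattern shared by the results() methods in
# this file, on toy data (the shapes here are assumptions: rows are hosts,
# columns are time steps). Sorting the transposed array means that, per time
# step, sortarr[:, 0] indexes the host with the lowest value, sortarr[:, -1]
# the highest, and the middle column an approximate median.
def _minmaxmed_example():
    rates = numpy.array([[1.0, 9.0],
                         [5.0, 5.0],
                         [9.0, 1.0]])            # 3 hosts x 2 time steps
    sortarr = numpy.argsort(rates.T, axis=1)
    lo = sortarr[:, 0]                           # [0, 2]
    hi = sortarr[:, -1]                          # [2, 0]
    med = sortarr[:, sortarr.shape[1] // 2]      # [1, 1]
    # A Counter-based union of the three index sets gives the include list.
    hosts_to_keep = Counter(lo)
    hosts_to_keep.update(hi)
    hosts_to_keep.update(med)
    return sorted(hosts_to_keep)                 # [0, 1, 2]
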
def test_subtract(self):
    c = Counter(a=-5, b=0, c=5, d=10, e=15, g=40)
    c.subtract(a=1, b=2, c=-3, d=10, e=20, f=30, h=-50)
    self.assertEqual(
        c, Counter(a=-6, b=-2, c=8, d=0, e=-5, f=-30, g=40, h=50))
    c = Counter(a=-5, b=0, c=5, d=10, e=15, g=40)
    c.subtract(Counter(a=1, b=2, c=-3, d=10, e=20, f=30, h=-50))
    self.assertEqual(
        c, Counter(a=-6, b=-2, c=8, d=0, e=-5, f=-30, g=40, h=50))
    c = Counter('aaabbcd')
    c.subtract('aaaabbcce')
    self.assertEqual(c, Counter(a=-1, b=0, c=-1, d=1, e=-1))

def results(self):
    values = self._data.get()

    if len(self._hostdata) > 64:
        # Compute min, max & median data and only save the host data
        # for these hosts
        memdata = values[:, :, 1]
        sortarr = numpy.argsort(memdata.T, axis=1)
        retdata = {
            "min": self.collatedata(sortarr[:, 0], memdata),
            "max": self.collatedata(sortarr[:, -1], memdata),
            "med": self.collatedata(sortarr[:, sortarr.shape[1] / 2], memdata),
            "times": values[0, :, 0].tolist(),
            "hosts": {}
        }
        uniqhosts = Counter(sortarr[:, 0])
        uniqhosts.update(sortarr[:, -1])
        uniqhosts.update(sortarr[:, sortarr.shape[1] / 2])
        includelist = uniqhosts.keys()
    else:
        # Save data for all hosts
        retdata = {
            "times": values[0, :, 0].tolist(),
            "hosts": {}
        }
        includelist = self._hostdata.keys()

    for hostidx in includelist:
        retdata['hosts'][str(hostidx)] = {}
        retdata['hosts'][str(hostidx)]['all'] = values[hostidx, :, 1].tolist()
        retdata['hosts'][str(hostidx)]['dev'] = {}
        for devid in self._hostdevnames[hostidx].iterkeys():
            dpnts = len(values[hostidx, :, 0])
            retdata['hosts'][str(hostidx)]['dev'][devid] = (
                self._hostdata[hostidx][:dpnts, int(devid)].tolist())
        retdata['hosts'][str(hostidx)]['names'] = self._hostdevnames[hostidx]

    return retdata

def write_pseudo_ngrams(in_file, out_file, K):
    "Construct the counts for the p-ngram model."
    ngrams = Counter()
    with open(out_file, 'w') as f:
        for l in open(in_file):
            words = l.strip().split()
            words = ["<s>"] * K + words + ["</s>"] * (K + 1)
            for i in range(K, len(words) - K):
                # Collect the symmetric context outermost pair first, then
                # append the centre word, so every suffix of `total` is a
                # narrower symmetric window ending in the centre word.
                total = []
                for k in range(1, K + 1):
                    total.append(words[i - k])
                    total.append(words[i + k])
                total.reverse()
                total = total + [words[i]]
                for j in range(len(total)):
                    ngrams[tuple(total[j:])] += 1
        for l in ngrams:
            print >> f, " ".join(l), ngrams[l]

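# Worked trace for write_pseudo_ngrams above, with K = 1 and the input line
# "a b" (so words == ["<s>", "a", "b", "</s>", "</s>"]). At the centre word
# "a" the context is collected outermost-first and the centre appended,
# giving ("b", "<s>", "a"); every suffix of that tuple is then counted:
#   ("b", "<s>", "a"), ("<s>", "a"), ("a",)
# so the counts back off from the widest symmetric window down to the
# unigram.
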
def make_moments(lintext, vocounts, vocab, out_file, K, N):
    paircounts = [Counter() for k in range(K)]
    for (i, wrd) in enumerate(lintext[:len(lintext) - K]):
        for k in range(1, K + 1):
            paircounts[k - 1][(wrd, lintext[i + k])] += 1

    # print moments
    with open(out_file, 'w') as f:
        print >> f, N, K + 1
        for _ in range(K + 1):
            print >> f, len(vocab)
        for k in range(K):
            print >> f, len(paircounts[k].keys())
            for (a, b), count in paircounts[k].most_common():
                print >> f, vocab[a], vocab[b], count

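# Layout of the moments file written above, inferred from the print
# statements (not from any external spec): a header line "N K+1", then
# K + 1 lines each holding the vocabulary size, then for each offset k a
# line with the number of distinct pairs followed by one
# "id(left) id(right) count" row per pair, most frequent first.
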
def get_minmaxmed_data(self, j, indices, settings):
    d = numpy.zeros((len(j.hosts), len(self.times) - 1))
    for hostidx, host in enumerate(j.hosts.itervalues()):
        data = self.getvalues(settings['metric'], settings['formula'],
                              settings['interfaces'], j, host, indices)
        if data is None:
            return None
        d[hostidx, :] = data

    sortarr = numpy.argsort(d, axis=0)
    results = {
        "min": self.collatedata(sortarr[0, :], d),
        "max": self.collatedata(sortarr[-1, :], d),
        "med": self.collatedata(sortarr[sortarr.shape[0] / 2, :], d),
        "times": self.times[1:].tolist(),
        "hosts": {}
    }

    # Ensure head node is always in list
    uniqhosts = Counter([0])
    uniqhosts.update(sortarr[0, :])
    uniqhosts.update(sortarr[-1, :])
    uniqhosts.update(sortarr[sortarr.shape[0] / 2, :])

    for hostidx, host in enumerate(j.hosts.itervalues()):
        if hostidx not in uniqhosts:
            continue
        results['hosts'][str(hostidx)] = {}
        if ('devicebased' in settings) and settings['devicebased']:
            devicedata, devnames = self.getdevicevalues(
                settings['metric'], settings['formula'],
                settings['interfaces'], j, host, indices)
            results['hosts'][str(hostidx)]["dev"] = devicedata
            results['hosts'][str(hostidx)]["names"] = devnames
        results['hosts'][str(hostidx)]["all"] = d[hostidx].tolist()

    return results

def results(self):
    if len(self._hostdata) != self._job.nodecount:
        return {'error': ProcessingError.RAW_COUNTER_UNAVAILABLE}

    for hcount in self._hostcounts.itervalues():
        if hcount['missing'] > hcount['present']:
            return {'error': ProcessingError.CPUSET_UNKNOWN}

    values = self._data.get()

    if len(self._hostdata) > 64:
        # Compute min, max & median data and only save the host data
        # for these hosts
        memdata = values[:, :, 1]
        sortarr = numpy.argsort(memdata.T, axis=1)
        retdata = {
            "min": self.collatedata(sortarr[:, 0], memdata),
            "max": self.collatedata(sortarr[:, -1], memdata),
            "med": self.collatedata(sortarr[:, sortarr.shape[1] / 2], memdata),
            "times": values[0, :, 0].tolist(),
            "hosts": {}
        }
        uniqhosts = Counter(sortarr[:, 0])
        uniqhosts.update(sortarr[:, -1])
        uniqhosts.update(sortarr[:, sortarr.shape[1] / 2])
        includelist = uniqhosts.keys()
    else:
        # Save data for all hosts
        retdata = {"times": values[0, :, 0].tolist(), "hosts": {}}
        includelist = self._hostdata.keys()

    for hostidx in includelist:
        retdata['hosts'][str(hostidx)] = {}
        retdata['hosts'][str(hostidx)]['all'] = values[hostidx, :, 1].tolist()

    return retdata

def results(self):
    if len(self._hostdata) != self._job.nodecount:
        return {"error": ProcessingError.INSUFFICIENT_HOSTDATA}

    values = self._data.get()
    if len(values[0, :, 0]) < 3:
        return {"error": ProcessingError.JOB_TOO_SHORT}

    rates = numpy.diff(values[:, :, 1]) / numpy.diff(values[:, :, 0])

    if len(self._hostdata) > 64:
        # Compute min, max & median data and only save the host data
        # for these hosts
        sortarr = numpy.argsort(rates.T, axis=1)
        retdata = {
            "min": self.collatedata(sortarr[:, 0], rates),
            "max": self.collatedata(sortarr[:, -1], rates),
            "med": self.collatedata(sortarr[:, sortarr.shape[1] / 2], rates),
            "times": values[0, 1:, 0].tolist(),
            "hosts": {}
        }
        uniqhosts = Counter(sortarr[:, 0])
        uniqhosts.update(sortarr[:, -1])
        uniqhosts.update(sortarr[:, sortarr.shape[1] / 2])
        includelist = uniqhosts.keys()
    else:
        # Save data for all hosts
        retdata = {"times": values[0, 1:, 0].tolist(), "hosts": {}}
        includelist = self._hostdata.keys()

    for hostidx in includelist:
        retdata['hosts'][str(hostidx)] = {}
        retdata['hosts'][str(hostidx)]['all'] = rates[hostidx, :].tolist()

    return retdata

def make_vocab(data, val_data, test_data):
    train_counts = Counter()
    word_set = set()
    tag_set = set()
    train_counts.update(word for word, _ in data)

    all_words = data + val_data + test_data
    word_set.update(word for word, _ in all_words)
    tag_set.update(tag for _, tag in all_words)

    word_dict = {"<S>": 0}
    tag_dict = {"<T>": 0}
    for word in word_set:
        word_dict.setdefault(word, len(word_dict))
    for tag in tag_set:
        tag_dict.setdefault(tag, len(tag_dict))
    return {"tags": tag_dict, "words": word_dict, "word_counts": train_counts}

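# Usage sketch for make_vocab above (the corpus format is inferred from the
# code: each split is a list of (word, tag) pairs). Id 0 is reserved for the
# boundary symbols "<S>" and "<T>" that collect_counts falls back to.
def _make_vocab_example():
    train = [("the", "DT"), ("dog", "NN")]
    lex = make_vocab(train, [], [])
    assert lex["words"]["<S>"] == 0 and lex["tags"]["<T>"] == 0
    assert lex["word_counts"]["the"] == 1
    return lex
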
def main(arguments):
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('-K', '--K', help="Window size.", type=int)
    parser.add_argument('-t', '--train', help="Training file.", type=str)
    parser.add_argument('-v', '--valid', help="Validation file.", type=str)
    args = parser.parse_args(arguments)

    vocab_file = args.train + '_vocab_K' + str(args.K) + '.dat'
    train_moments_file = args.train + '_moments_K' + str(args.K) + '.dat'
    valid_moments_file = args.valid + '_moments_K' + str(args.K) + '.dat'
    train_text_file = args.train + '_text_K' + str(args.K) + '.dat'
    valid_text_file = args.valid + '_text_K' + str(args.K) + '.dat'
    pngram_file = args.train + '_pngram_K' + str(args.K) + '.dat'
    K = args.K

    txt, lintext, N = get_text(args.train, args.K)
    valid_txt, valid_lintext, valN = get_text(args.valid, args.K)

    vocounts = Counter()
    vocounts['<unk>'] = 0
    vocounts.update(lintext)
    vocab = dict([(w, i) for i, (w, _) in enumerate(vocounts.most_common())])

    with open(vocab_file, 'w') as f:
        for i, (w, c) in enumerate(vocounts.most_common()):
            print >> f, i, w, c

    make_moments(lintext, vocounts, vocab, train_moments_file, args.K, N)
    make_moments(valid_lintext, vocounts, vocab, valid_moments_file, args.K,
                 valN)
    write_words(txt, train_text_file, vocab, args.K)
    write_words(valid_txt, valid_text_file, vocab, args.K)
    write_pseudo_ngrams(args.train, pngram_file, args.K)

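# Hypothetical invocation of main() above (the script name is made up; the
# output-file naming comes from the code):
#   python preprocess.py -K 2 -t train.txt -v valid.txt
# which writes train.txt_vocab_K2.dat, train.txt_moments_K2.dat,
# valid.txt_moments_K2.dat, train.txt_text_K2.dat, valid.txt_text_K2.dat and
# train.txt_pngram_K2.dat.
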
def test_multiset_operations(self):
    # Verify that adding a zero counter will strip zeros and negatives
    c = Counter(a=10, b=-2, c=0) + Counter()
    self.assertEqual(dict(c), dict(a=10))

    elements = 'abcd'
    for i in range(1000):
        # test random pairs of multisets
        p = Counter(dict((elem, randrange(-2, 4)) for elem in elements))
        p.update(e=1, f=-1, g=0)
        q = Counter(dict((elem, randrange(-2, 4)) for elem in elements))
        q.update(h=1, i=-1, j=0)
        for counterop, numberop in [
            (Counter.__add__, lambda x, y: max(0, x + y)),
            (Counter.__sub__, lambda x, y: max(0, x - y)),
            (Counter.__or__, lambda x, y: max(0, x, y)),
            (Counter.__and__, lambda x, y: max(0, min(x, y))),
        ]:
            result = counterop(p, q)
            for x in elements:
                self.assertEqual(numberop(p[x], q[x]), result[x],
                                 (counterop, x, p, q))
            # verify that results exclude non-positive counts
            self.assertTrue(all(x > 0 for x in result.values()))

    elements = 'abcdef'
    for i in range(100):
        # verify that random multisets with no repeats are exactly like sets
        p = Counter(dict((elem, randrange(0, 2)) for elem in elements))
        q = Counter(dict((elem, randrange(0, 2)) for elem in elements))
        for counterop, setop in [
            (Counter.__sub__, set.__sub__),
            (Counter.__or__, set.__or__),
            (Counter.__and__, set.__and__),
        ]:
            counter_result = counterop(p, q)
            set_result = setop(set(p.elements()), set(q.elements()))
            self.assertEqual(counter_result, dict.fromkeys(set_result, 1))

def test_basics(self):
    c = Counter('abcaba')
    self.assertEqual(c, Counter({'a': 3, 'b': 2, 'c': 1}))
    self.assertEqual(c, Counter(a=3, b=2, c=1))
    self.assertIsInstance(c, dict)
    self.assertIsInstance(c, Mapping)
    self.assertTrue(issubclass(Counter, dict))
    self.assertTrue(issubclass(Counter, Mapping))
    self.assertEqual(len(c), 3)
    self.assertEqual(sum(c.values()), 6)
    self.assertEqual(sorted(c.values()), [1, 2, 3])
    self.assertEqual(sorted(c.keys()), ['a', 'b', 'c'])
    self.assertEqual(sorted(c), ['a', 'b', 'c'])
    self.assertEqual(sorted(c.items()), [('a', 3), ('b', 2), ('c', 1)])
    self.assertEqual(c['b'], 2)
    self.assertEqual(c['z'], 0)
    with test_support.check_py3k_warnings():
        self.assertEqual(c.has_key('c'), True)
        self.assertEqual(c.has_key('z'), False)
    self.assertEqual(c.__contains__('c'), True)
    self.assertEqual(c.__contains__('z'), False)
    self.assertEqual(c.get('b', 10), 2)
    self.assertEqual(c.get('z', 10), 10)
    self.assertEqual(c, dict(a=3, b=2, c=1))
    self.assertEqual(repr(c), "Counter({'a': 3, 'b': 2, 'c': 1})")
    self.assertEqual(c.most_common(), [('a', 3), ('b', 2), ('c', 1)])
    for i in range(5):
        self.assertEqual(c.most_common(i),
                         [('a', 3), ('b', 2), ('c', 1)][:i])
    self.assertEqual(''.join(sorted(c.elements())), 'aaabbc')
    c['a'] += 1         # increment an existing value
    c['b'] -= 2         # sub existing value to zero
    del c['c']          # remove an entry
    del c['c']          # make sure that del doesn't raise KeyError
    c['d'] -= 2         # sub from a missing value
    c['e'] = -5         # directly assign a missing value
    c['f'] += 4         # add to a missing value
    self.assertEqual(c, dict(a=4, b=0, d=-2, e=-5, f=4))
    self.assertEqual(''.join(sorted(c.elements())), 'aaaaffff')
    self.assertEqual(c.pop('f'), 4)
    self.assertNotIn('f', c)
    for i in range(3):
        elem, cnt = c.popitem()
        self.assertNotIn(elem, c)
    c.clear()
    self.assertEqual(c, {})
    self.assertEqual(repr(c), 'Counter()')
    self.assertRaises(NotImplementedError, Counter.fromkeys, 'abc')
    self.assertRaises(TypeError, hash, c)
    c.update(dict(a=5, b=3))
    c.update(c=1)
    c.update(Counter('a' * 50 + 'b' * 30))
    c.update()          # test case with no args
    c.__init__('a' * 500 + 'b' * 300)
    c.__init__('cdc')
    c.__init__()
    self.assertEqual(c, dict(a=555, b=333, c=3, d=1))
    self.assertEqual(c.setdefault('d', 5), 1)
    self.assertEqual(c['d'], 1)
    self.assertEqual(c.setdefault('e', 5), 5)
    self.assertEqual(c['e'], 5)

def test_invariant_for_the_in_operator(self):
    c = Counter(a=10, b=-2, c=0)
    for elem in c:
        self.assertTrue(elem in c)
        self.assertIn(elem, c)