def hash_code(s, h):
    """Map each word in s to its h-th hash bucket after stemming."""
    output = []
    for w in s:
        h_v = hashing.hash_code(stemmer.stem(w))[h] % _SKETCH_BUCKET_SIZE
        output.append(h_v)
    return output
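
# A minimal, self-contained sketch of what hash_code() computes, with Python's
# built-in hash() standing in for hashing.hash_code() and stemming omitted;
# the helper name, bucket size, and demo words are illustrative only.
def _demo_hash_code(words, bucket_size=500):
    # each word maps to one bucket index in [0, bucket_size)
    return [hash(w) % bucket_size for w in words]
# e.g. _demo_hash_code(['earthquake', 'tremor']) returns two bucket indices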

def pairs(mat, words):
    """Print word pairs whose sketch entry exceeds a fixed threshold."""
    n = mat.shape[1]
    for w1 in words:
        for w2 in words:
            h1 = hashing.hash_code(w1)[0] % n
            h2 = hashing.hash_code(w2)[0] % n
            if h1 > h2:  # the sketch is symmetric; scan the upper triangle only
                continue
            v = mat[h1, h2]
            if v > 0.005:
                print w1, w2, v
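
# pairs() relies on the symmetry of the second-moment sketch to skip half the
# matrix; a self-contained numpy illustration of the same upper-triangle scan
# (the helper name and threshold are illustrative):
def _demo_upper_triangle(mat, threshold=0.005):
    import numpy as np
    rows, cols = np.triu_indices(mat.shape[0])
    return [(i, j, mat[i, j]) for i, j in zip(rows, cols) if mat[i, j] > threshold]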

def collision(mat, words):
    """Plot how many words land in each hash bucket and print the bucket table."""
    n = mat.shape[1]
    counts = {}
    table = {}
    for i in xrange(_SKETCH_BUCKET_SIZE):
        counts[i] = 0
        table[i] = []
    for w in words:
        bucket = hashing.hash_code(w)[0] % n
        counts[bucket] += 1
        table[bucket].append(w)
    # iterate buckets in index order; plain dict iteration order is not guaranteed
    plt.plot([counts[i] for i in xrange(_SKETCH_BUCKET_SIZE)])
    plt.show()
    print table
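
# For intuition, a self-contained simulation of the per-bucket counts that
# collision() plots, again with hash() as a stand-in for hashing.hash_code():
def _demo_collision_counts(words, bucket_size=500):
    counts = [0] * bucket_size
    for w in words:
        counts[hash(w) % bucket_size] += 1
    return counts  # index = bucket, value = number of words hashed there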

def simplified_ex(_fstr, _sketch_status=None, direct=False):
    """Recover topics from a sketch snapshot, loaded from a gzipped pickle
    file (_fstr) or passed in directly (_sketch_status)."""
    if _fstr:
        _f = gzip.open(_fstr, 'rb')
        sketch_status = cpickle.load(_f)
        _f.close()
    else:
        sketch_status = _sketch_status

    _t = datetime.datetime.utcfromtimestamp(sketch_status[0])
    _words = sketch_status[1]
    _m2 = sketch_status[2]
    _m3 = sketch_status[3]

    #######################
    # for debugging: print the words that hash into the heaviest diagonal bucket
    mat = _m2[0]
    x = []
    for i in xrange(_SKETCH_BUCKET_SIZE):
        x.append(mat[i, i])
    bucket_id = np.argmax(np.array(x))
    for _w in _words:
        w = stemmer.stem(_w)
        if hashing.hash_code(w)[0] % _SKETCH_BUCKET_SIZE == bucket_id:
            print 'significant', _w
    #######################

    H = fast_hashing.HASH_NUMBER
    K = int(config.get('sketch', 'num_topics'))  # 15

    infer_results = map(
        lambda _h: solver.solve(_m2[_h], _m3[_h], _SKETCH_BUCKET_SIZE, K),
        range(H))

    if direct:
        return infer_results

    print 'Inference finished.'  # debugging

    transactions = []
    topics_group = []
    for h in xrange(H):
        topics = dict()
        a, r, v = infer_results[h]
        a_max = max(np.array(a).real)
        print a_max
        for k in xrange(K):
            s = set()
            topic = set()
            prob = remove_negative_terms(v[:, k])

            # filtering
            if a[k].real < 0.1 * a_max:  # 1.0
                continue
            if entropy(prob) > 6.0:
                continue

            _ranks = dict()
            for _w in _words:
                w = stemmer.stem(_w)
                p = prob[hashing.hash_code(w)[h] % _SKETCH_BUCKET_SIZE]
                _ranks[w] = p
                if p >= 0.0100:
                    s.add(w)
                if p >= 0.0075:
                    topic.add(w)

            # cap each transaction at the _top_n best-ranked words
            _tops = sorted(_ranks.keys(), key=lambda x: _ranks[x], reverse=True)
            _top_n = 15
            if len(s) > _top_n:
                transactions.append(apriori.Transaction(set(_tops[:_top_n]), h, k))
            else:
                transactions.append(apriori.Transaction(s, h, k))
            topics[k] = topic
            print h, k, a[k].real, map(
                lambda w_, hv: (w_, hv, _ranks[w_]), s, hash_code(s, h))  # for debugging
        topics_group.append(topics)

    print 'starting apriori.'  # debugging

    output = apriori.apriori(transactions, 4)

    _result = dict()
    _result['time'] = _t
    _result['topics'] = list()

    print _t
    for ws in output:
        # disabled filter: if support_distance(ws.support) > 5: continue
        eigens = np.array(map(lambda item: infer_results[item[0]][0][item[1]].real,
                              ws.support.iteritems()))
        _result['topics'].append(
            (connect_words(recover(ws.words, _words)),
             connect_words(recover(join(map(
                 lambda item: topics_group[item[0]][item[1]],
                 ws.support.iteritems())), _words)),
             np.max(eigens),
             np.median(eigens)))

    if _fstr:
        out_file = open('E:/experiment/results/' + _fstr.split('/')[-1], 'wb')
        cpickle.dump(_result, out_file)
        out_file.close()
    else:
        return _result
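
# Assumed usage of simplified_ex(); the `snapshot` variable is hypothetical.
# With _fstr set, the result dict is pickled to disk; with _sketch_status plus
# direct=True, the raw per-hash-function inference results are returned
# before the apriori stage:
#
#   result = simplified_ex(None, _sketch_status=snapshot)
#   print result['time'], len(result['topics'])
#   raw = simplified_ex(None, _sketch_status=snapshot, direct=True)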

def ex(_fstr):
    """Run topic recovery on a sketch snapshot file and print the results."""
    _f = gzip.open(_fstr, 'rb')
    sketch_status = cpickle.load(_f)
    _f.close()

    _t = datetime.datetime.utcfromtimestamp(sketch_status[0])
    _words = sketch_status[1]
    _m2 = sketch_status[2]
    _m3 = sketch_status[3]

    #######################
    # for debugging: print the words that hash into the heaviest diagonal bucket
    mat = _m2[0]
    x = []
    for i in xrange(_SKETCH_BUCKET_SIZE):
        x.append(mat[i, i])
    bucket_id = np.argmax(np.array(x))
    for _w in _words:
        w = stemmer.stem(_w)
        if hashing.hash_code(w)[0] % _SKETCH_BUCKET_SIZE == bucket_id:
            print _w
    #######################

    H = 5  # assumes fast_hashing.HASH_NUMBER == 5
    K = 15

    t = time.time()
    infer_results = map(
        lambda _h: solver.solve(_m2[_h], _m3[_h], _SKETCH_BUCKET_SIZE, K),
        range(fast_hashing.HASH_NUMBER))
    print 't0 = ' + str(time.time() - t)
    t = time.time()

    transactions = []
    topics_group = []
    for h in xrange(H):
        topics = dict()
        a, r, v = infer_results[h]
        for k in xrange(K):
            s = set()
            topic = set()
            prob = remove_negative_terms(v[:, k])

            # filtering
            if a[k].real < 1.0:
                continue
            if entropy(prob) > 6.0:
                continue

            for _w in _words:
                w = stemmer.stem(_w)
                p = prob[hashing.hash_code(w)[h] % _SKETCH_BUCKET_SIZE]
                if p >= 0.0250:
                    s.add(w)
                if p >= 0.0150:
                    topic.add(w)

            transactions.append(apriori.Transaction(s, h, k))
            topics[k] = topic
            print h, k, a[k].real, map(lambda w_, hv: (w_, hv), s, hash_code(s, h))  # for debugging
        topics_group.append(topics)

    # variants with apriori minimum support 3 and 5 were also tried here
    output = apriori.apriori(transactions, 4)
    for ws in output:
        print '['
        print ws.support, support_distance(ws.support)
        eigens = np.array(map(lambda item: infer_results[item[0]][0][item[1]].real,
                              ws.support.iteritems()))
        print connect_words(recover(ws.words, _words)), np.max(eigens)
        print connect_words(recover(join(map(
            lambda item: topics_group[item[0]][item[1]],
            ws.support.iteritems())), _words)), np.max(eigens), np.median(eigens)
        print ']'
        print '-------------------------------'

    print 't1 = ' + str(time.time() - t)
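
# The entropy(prob) > 6.0 filter above discards near-uniform topic
# distributions over the sketch buckets. Assuming entropy() is the
# natural-log Shannon entropy (an assumption; 6.0 is close to ln of a few
# hundred buckets), a minimal version looks like:
def _demo_entropy(prob):
    import numpy as np
    p = np.asarray(prob, dtype=float)
    p = p[p > 0]  # ignore zero-probability buckets
    return float(-(p * np.log(p)).sum())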

def ex5():
    _f = gzip.open(
        '/Users/weixie/Downloads/topicsketch_old/topicsketch_cut/20140120_12_33_22',
        'rb')
    sketch_status = cpickle.load(_f)
    _f.close()

    _t = datetime.datetime.utcfromtimestamp(sketch_status[0])
    _words = sketch_status[1]
    _m2 = sketch_status[2]
    _m3 = sketch_status[3]

    #######################
    # for debugging
    mat = _m2[0]
    x = []
    for i in xrange(_SKETCH_BUCKET_SIZE):
        x.append(mat[i, i])
    bucket_id = np.argmax(np.array(x))
    for _w in _words:
        w = stemmer.stem(_w)
        if hashing.hash_code(w)[0] % _SKETCH_BUCKET_SIZE == bucket_id:
            print _w
    #######################

    H = 5
    K = 10

    t = time.time()
    infer_results = map(
        lambda _h: solver.solve(_m2[_h], _m3[_h], _SKETCH_BUCKET_SIZE, K),
        range(fast_hashing.HASH_NUMBER))
    print 't0 = ' + str(time.time() - t)
    t = time.time()

    candidates = []
    more_candidates = []
    for h in xrange(H):
        a, r, v = infer_results[h]
        candidate = []
        more_candidate = []
        for k in xrange(K):
            s = set()
            more_s = set()
            prob = remove_negative_terms(v[:, k])

            # filtering
            if a[k].real < 1.0:
                continue
            if entropy(prob) > 6.0:
                continue

            for _w in _words:
                w = stemmer.stem(_w)
                p = prob[hashing.hash_code(w)[h] % _SKETCH_BUCKET_SIZE]
                if p >= 0.025:
                    s.add(_w)
                if p >= 0.015:
                    more_s.add(_w)

            candidate.append(s)
            more_candidate.append(more_s)
        candidates.append(candidate)
        more_candidates.append(more_candidate)

    for h in xrange(H):
        print '------------------------------'
        for k in xrange(len(candidates[h])):
            print candidates[h][k]
    print '------------------------------'

    index = choose(candidates)

    for h in xrange(H):
        a, r, v = infer_results[h]
        plt.plot(v[:, h].real)
        plt.show()

    for h in xrange(H):
        print candidates[h][index[h]]

    topic_words = more_candidates[0][index[0]]
    for h in xrange(1, H):
        topic_words = topic_words.intersection(more_candidates[h][index[h]])

    output = ''
    for w in topic_words:
        output = output + w + ','
    print output
    print 't1 = ' + str(time.time() - t)
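
# The final step of ex5() is a set intersection across the H hash functions;
# a minimal illustration with made-up candidate sets:
#
#   sets = [set(['quake', 'tremor', 'city']),
#           set(['quake', 'tremor']),
#           set(['quake', 'tremor', 'news'])]
#   reduce(lambda x, y: x.intersection(y), sets)  # -> set(['quake', 'tremor'])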

def ex4():
    _f = gzip.open(
        '/Users/weixie/Downloads/topicsketch_old/topicsketch_cut/20140128_21_52_28',
        'rb')
    sketch_status = cpickle.load(_f)
    _f.close()

    _t = datetime.datetime.utcfromtimestamp(sketch_status[0])
    _words = sketch_status[1]
    _m2 = sketch_status[2]
    _m3 = sketch_status[3]

    H = 5
    K = 50

    t = time.time()
    infer_results = map(
        lambda _h: solver.solve(_m2[_h], _m3[_h], _SKETCH_BUCKET_SIZE, K),
        range(fast_hashing.HASH_NUMBER))
    print 't0 = ' + str(time.time() - t)
    t = time.time()

    candidates = []
    for h in xrange(H):
        a, r, v = infer_results[h]
        candidate = []
        for k in xrange(K):
            s = set()
            prob = remove_negative_terms(v[:, k])

            # filtering
            if a[k].real < 1.0:
                continue
            if entropy(prob) > 6.0:
                continue

            for _w in _words:
                w = stemmer.stem(_w)
                p = prob[hashing.hash_code(w)[h] % _SKETCH_BUCKET_SIZE]
                if p > 0.01:
                    s.add(_w)
            candidate.append(s)
        candidates.append(candidate)

    for h in xrange(H):
        print '------------------------------'
        for k in xrange(len(candidates[h])):
            print candidates[h][k]
    print '------------------------------'

    topic_words = candidates[0][-1]
    for h in xrange(1, H):
        topic_words = topic_words.union(candidates[h][-1])

    output = ''
    for w in topic_words:
        support = 0
        for h in xrange(H):
            if w in candidates[h][-1]:
                support += 1
        if support >= H - 1:
            output = output + w + ','
    print output
    print 't1 = ' + str(time.time() - t)
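
# ex4() keeps a word if it appears in at least H - 1 of the H per-hash
# candidate sets, a softer vote than the intersection in ex5(); a
# self-contained sketch of that vote (helper name is illustrative):
def _demo_vote(candidate_sets, min_support):
    union = set().union(*candidate_sets)
    return set(w for w in union
               if sum(1 for s in candidate_sets if w in s) >= min_support)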

def ex2():
    _f = gzip.open(
        '/Users/weixie/Downloads/topicsketch_old/topicsketch_cut/20140120_12_33_22',
        'rb')
    sketch_status = cpickle.load(_f)
    _f.close()

    _t = datetime.datetime.utcfromtimestamp(sketch_status[0])
    _words = sketch_status[1]
    _m2 = sketch_status[2]
    _m3 = sketch_status[3]

    # other debugging options tried here: plt.matshow of a sketch sub-block;
    # per-hash solver.solve runs with eigenvalues sorted by np.abs;
    # a sorted dump of the diagonal x
    h = 0
    K = 10

    mat = _m2[h]
    x = []
    for i in xrange(_SKETCH_BUCKET_SIZE):
        x.append(mat[i, i])
    plt.plot(x)
    plt.show()
    index = np.argmax(np.array(x))
    print index
    for _w in _words:
        w = stemmer.stem(_w)
        if hashing.hash_code(w)[h] % _SKETCH_BUCKET_SIZE == index:
            print _w

    a, r, v = solver.solve(_m2[h], _m3[h], _SKETCH_BUCKET_SIZE, K)
    print a
    print r
    print v[index, :]

    # complex eigenvalues are not orderable, so sort by real part
    sorted_a = sorted(a, key=lambda x: x.real, reverse=True)
    for _k in xrange(K):
        k = a.index(sorted_a[_k])
        prob = remove_negative_terms(v[:, k])
        print k, sorted_a[_k]
        print 'entropy', k, entropy(prob)
        plt.plot(prob)
        plt.show()
        for _w in _words:
            w = stemmer.stem(_w)
            p = prob[hashing.hash_code(w)[h] % _SKETCH_BUCKET_SIZE]
            if p > 0.025:
                print _w, p
        print '########################################'
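
# Complex eigenvalues have no total order in Python, which is why ex2() sorts
# them by real part; a minimal illustration:
#
#   sorted([1 + 2j, 3 + 0j, 2 - 1j], key=lambda z: z.real, reverse=True)
#   # -> [(3+0j), (2-1j), (1+2j)]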