def to_latex_table_old(time, sig, topic):
    max_count = 15

    output_string = ''
    output_string += '\\hline\n'
    output_string += (str(time) + '&')
    output_string += (str(sig) + '&')

    key_words = set(map(lambda x: stemmer.stem(x), topic[0].split(',')))
    #key_words = topic[0].split(',')
    words = set(map(lambda x: stemmer.stem(x), topic[1].split(',')))
    #words = topic[1].split(',')

    count = 0
    for word in key_words:
        if len(word) > 0:
            output_string += ('\\textbf{' + word + '},')
            count += 1
            if count >= max_count:
                break
    for word in words:
        if word not in key_words:
            if len(word) > 0:
                output_string += (word + ',')
                count += 1
                if count >= max_count:
                    break

    output_string = output_string[:-1] + '\\\\'

    return output_string.replace(',', ' ')

def to_latex_table(time, sig, topics):
    max_count = 15

    # filtering: keep only topics with more than two stemmed key words
    topics = filter(
        lambda topic: len(
            set(map(lambda x: stemmer.stem(x), topic[0].split(',')))) > 2,
        topics)
    # sort by the median strength (topic[3]), descending
    topics = sorted(topics, key=lambda topic: topic[3], reverse=True)

    if len(topics) == 0:
        return None

    num_row = len(topics)

    output_string = ''
    output_string += '\\cline{1-3}\n'
    output_string += ('\\multirow{' + str(num_row) + '}{*}{'
                      + str(time).split(' ')[0] + '}&')
    output_string += ('\\multirow{' + str(num_row) + '}{*}{' + str(sig) + '}&')

    first_time = True
    for topic in topics:
        key_words = set(map(lambda x: stemmer.stem(x), topic[0].split(',')))
        #key_words = topic[0].split(',')
        if len(key_words) <= 2:  # the set includes ''
            continue
        #words = set(map(lambda x: stemmer.stem(x), topic[1].split(',')))
        words = topic[1].split(',')

        if not first_time:
            output_string += '\n\\cline{3-3}\n'
            output_string += '&&'
        else:
            first_time = False

        count = 0
        for word in key_words:
            if len(word) > 0:
                output_string += ('\\textbf{' + word + '},')
                count += 1
                if count >= max_count:
                    break
        for word in words:
            if word not in key_words:
                if len(word) > 0:
                    output_string += (word + ',')
                    count += 1
                    if count >= max_count:
                        break

        output_string = output_string[:-1] + '\\\\'

    return output_string.replace(',', ' ')

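# A minimal, hypothetical usage sketch (not part of the pipeline). It assumes each topic
# is a tuple of (comma-separated key words, comma-separated words, max strength,
# median strength), which matches the layout `simplified_ex` writes into _result['topics'].
# The words and values below are made up for illustration only.
def _example_to_latex_table():
    topics = [('earthquake,jakarta,felt',
               'earthquake,jakarta,felt,magnitude,tremor', 3.2, 2.7)]
    row = to_latex_table('2014-01-20 12:33:22', 0.9, topics)
    # `row` is one \multirow block of the three-column table, or None if no topic survives filtering
    print row
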
def hash_code(s, h):
    # hash every stemmed word in s with the h-th hash function into sketch buckets
    output = []
    for w in s:
        h_v = hashing.hash_code(stemmer.stem(w))[h] % _SKETCH_BUCKET_SIZE
        output.append(h_v)
    return output

def recover(s, words):
    # recover the original surface forms whose stems appear in s,
    # keeping only words observed at least 5 times
    output = set()
    for word in words:
        if stemmer.stem(word) in s:
            if words[word] >= 5:  #!!!
                output.add(word)
    return output

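# A minimal, hypothetical usage sketch: `recover` maps a set of stems back to the original
# surface forms whose observed count (the values in `words`) reaches the cut-off of 5.
# The vocabulary counts below are made up; the exact stems depend on the stemmer in use.
def _example_recover():
    vocab_counts = {'protests': 12, 'protesting': 3, 'flooding': 8}
    stems = set([stemmer.stem('protests'), stemmer.stem('flooding')])
    # expected: set(['protests', 'flooding']); 'protesting' shares a stem but is below the cut-off
    print recover(stems, vocab_counts)
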
def process(self, _ptweet):
    self.timestamp = _ptweet.timestamp

    tokens = _ptweet.tokens
    # stemming
    tokens = map(lambda x: stemmer.stem(x), tokens)

    if len(tokens) < 3:
        return None

    set_of_tokens = set()
    for token in tokens:
        set_of_tokens.add(token)

    # score every unordered pair of distinct tokens in the tweet
    result_list = list()
    for token1 in set_of_tokens:
        for token2 in set_of_tokens:
            if ',' in token1 or ',' in token2:
                continue
            if token1 >= token2:
                continue
            list_of_tokens = [token1, token2]
            list_of_tokens.sort()
            token = list_of_tokens[0] + ',' + list_of_tokens[1]
            count, ewma, ewmavar, sig = self.sig_scorers.get(
                token, self.timestamp).observe(self.timestamp, 1.0)
            if sig > _SIGNI_THRESHOLD:
                result_list.append((count, ewma, ewmavar, sig, token))

    if len(result_list) > 0:
        # collect the individual tokens of all significant pairs
        tokens = set()
        for result in result_list:
            token = result[4]
            kws = token.split(',')
            for kw in kws:
                tokens.add(kw)
        m_sig = max(map(lambda x: x[3], result_list))
        #print 'SIG', m_sig, '#' + '#'.join(tokens) + '#'  #!!! do not display
        return _ptweet.datetime(), 0, 0, 0, m_sig, tokens, _ptweet

    return None

def simplified_ex(_fstr, _sketch_status=None, direct=False):
    if _fstr:
        _f = gzip.open(_fstr, 'rb')
        sketch_status = cpickle.load(_f)
        _f.close()
    else:
        sketch_status = _sketch_status

    _t = datetime.datetime.utcfromtimestamp(sketch_status[0])
    _words = sketch_status[1]
    _m2 = sketch_status[2]
    _m3 = sketch_status[3]

    #######################
    mat = _m2[0]
    x = []  # for debugging
    for i in xrange(_SKETCH_BUCKET_SIZE):
        x.append(mat[i, i])
    id = np.argmax(np.array(x))
    for _w in _words:
        w = stemmer.stem(_w)
        if hashing.hash_code(w)[0] % _SKETCH_BUCKET_SIZE == id:
            print 'significant', _w
    #######################

    H = fast_hashing.HASH_NUMBER
    K = eval(config.get('sketch', 'num_topics'))  #15

    infer_results = map(
        lambda _h: solver.solve(_m2[_h], _m3[_h], _SKETCH_BUCKET_SIZE, K),
        range(H))

    if direct:
        return infer_results

    ### debugging
    print 'Inference finished.'
    ############

    transactions = []
    topics_group = []

    for h in xrange(H):
        topics = dict()
        a, r, v = infer_results[h]
        a_max = max(np.array(a).real)
        print a_max
        for k in xrange(K):
            s = set()
            topic = set()
            prob = v[:, k]
            prob = remove_negative_terms(prob)

            # filtering
            if a[k].real < 0.1 * a_max:  #1.0:
                continue
            if entropy(prob) > 6.0:
                continue

            _ranks = dict()
            for _w in _words:
                w = stemmer.stem(_w)
                p = prob[hashing.hash_code(w)[h] % _SKETCH_BUCKET_SIZE]
                _ranks[w] = p
                if p >= 0.0100:
                    s.add(w)
                if p >= 0.0075:
                    topic.add(w)

            _tops = sorted(_ranks.keys(), key=lambda x: _ranks[x], reverse=True)
            _top_n = 15
            if len(s) > _top_n:
                transactions.append(
                    apriori.Transaction(set(_tops[:_top_n]), h, k))
                #print _top_n
            else:
                transactions.append(apriori.Transaction(s, h, k))
                #print len(s)

            topics[k] = topic

            print h, k, a[k].real, map(
                lambda w, h: (w, h, _ranks[w]), s, hash_code(s, h))  # for debugging

        topics_group.append(topics)

    ### debugging
    print 'starting apriori.'
    #############

    output = apriori.apriori(transactions, 4)

    _result = dict()
    _result['time'] = _t
    _result['topics'] = list()

    print _t
    for ws in output:
        '''
        if support_distance(ws.support) > 5:
            continue'''
        _result['topics'].append(
            (connect_words(recover(ws.words, _words)),
             connect_words(recover(join(map(lambda item: topics_group[item[0]][item[1]],
                                            ws.support.iteritems())), _words)),
             np.max(np.array(map(lambda item: infer_results[item[0]][0][item[1]].real,
                                 ws.support.iteritems()))),
             np.median(np.array(map(lambda item: infer_results[item[0]][0][item[1]].real,
                                    ws.support.iteritems())))))

    if _fstr:
        out_file = open('E:/experiment/results/' + _fstr.split('/')[-1], 'wb')
        cpk.dump(_result, out_file)
        out_file.close()
    else:
        return _result

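# A minimal, hypothetical usage sketch: `simplified_ex` either loads a gzipped, pickled
# sketch status from `_fstr` or takes one already in memory via `_sketch_status`, and
# (unless direct=True) returns a dict with the window's datetime and the recovered topics.
# `current_sketch_status` is a placeholder argument supplied by the caller.
def _example_simplified_ex(current_sketch_status):
    result = simplified_ex(None, _sketch_status=current_sketch_status, direct=False)
    print result['time']
    for key_words, words, max_strength, median_strength in result['topics']:
        print key_words, '|', words, max_strength, median_strength
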
def ex(_fstr):
    _f = gzip.open(_fstr, 'rb')
    sketch_status = cpickle.load(_f)
    _f.close()

    _t = datetime.datetime.utcfromtimestamp(sketch_status[0])
    _words = sketch_status[1]
    _m2 = sketch_status[2]
    _m3 = sketch_status[3]

    #######################
    mat = _m2[0]
    x = []  # for debugging
    for i in xrange(_SKETCH_BUCKET_SIZE):
        x.append(mat[i, i])
    id = np.argmax(np.array(x))
    for _w in _words:
        w = stemmer.stem(_w)
        if hashing.hash_code(w)[0] % _SKETCH_BUCKET_SIZE == id:
            print _w
    #######################

    H = 5
    K = 15

    t = time.time()
    infer_results = map(
        lambda _h: solver.solve(_m2[_h], _m3[_h], _SKETCH_BUCKET_SIZE, K),
        range(fast_hashing.HASH_NUMBER))
    print 't0 = ' + str(time.time() - t)

    t = time.time()
    transactions = []
    topics_group = []

    for h in xrange(H):
        topics = dict()
        a, r, v = infer_results[h]
        for k in xrange(K):
            s = set()
            topic = set()
            prob = v[:, k]
            prob = remove_negative_terms(prob)

            # filtering
            if a[k].real < 1.0:
                continue
            if entropy(prob) > 6.0:
                continue

            for _w in _words:
                w = stemmer.stem(_w)
                p = prob[hashing.hash_code(w)[h] % _SKETCH_BUCKET_SIZE]
                if p >= 0.0250:
                    s.add(w)
                if p >= 0.0150:
                    topic.add(w)

            transactions.append(apriori.Transaction(s, h, k))
            topics[k] = topic

            print h, k, a[k].real, map(lambda w, h: (w, h), s, hash_code(s, h))  # for debugging

        topics_group.append(topics)

    '''
    output = apriori.apriori(transactions, 3)

    for ws in output:
        print connect_words(recover(ws.words, _words)), \
            np.median(np.array(map(lambda item: infer_results[item[0]][0][item[1]].real,
                                   ws.support.iteritems())))
        print '-------------------------------'
    '''

    output = apriori.apriori(transactions, 4)

    for ws in output:
        print '['
        print ws.support, support_distance(ws.support)
        print connect_words(recover(ws.words, _words)), np.max(
            np.array(
                map(lambda item: infer_results[item[0]][0][item[1]].real,
                    ws.support.iteritems())))
        print connect_words(recover(join(map(lambda item: topics_group[item[0]][item[1]],
                                             ws.support.iteritems())), _words)), \
            np.max(np.array(map(lambda item: infer_results[item[0]][0][item[1]].real,
                                ws.support.iteritems()))), \
            np.median(np.array(map(lambda item: infer_results[item[0]][0][item[1]].real,
                                   ws.support.iteritems())))
        print ']'
        print '-------------------------------'

    '''
    output = apriori.apriori(transactions, 5)

    for ws in output:
        print '['
        print connect_words(recover(ws.words, _words)), \
            np.median(np.array(map(lambda item: infer_results[item[0]][0][item[1]].real,
                                   ws.support.iteritems())))
        print connect_words(recover(join(map(lambda item: topics_group[item[0]][item[1]],
                                             ws.support.iteritems())), _words)), \
            np.median(np.array(map(lambda item: infer_results[item[0]][0][item[1]].real,
                                   ws.support.iteritems())))
        print ']'
        print '-------------------------------'
    '''

    print 't1 = ' + str(time.time() - t)

def ex5():
    _f = gzip.open(
        '/Users/weixie/Downloads/topicsketch_old/topicsketch_cut/20140120_12_33_22',
        'rb')
    sketch_status = cpickle.load(_f)
    _f.close()

    _t = datetime.datetime.utcfromtimestamp(sketch_status[0])
    _words = sketch_status[1]
    _m2 = sketch_status[2]
    _m3 = sketch_status[3]

    #######################
    mat = _m2[0]
    x = []  # for debugging
    for i in xrange(_SKETCH_BUCKET_SIZE):
        x.append(mat[i, i])
    id = np.argmax(np.array(x))
    for _w in _words:
        w = stemmer.stem(_w)
        if hashing.hash_code(w)[0] % _SKETCH_BUCKET_SIZE == id:
            print _w
    #######################

    H = 5
    K = 10

    t = time.time()
    infer_results = map(
        lambda _h: solver.solve(_m2[_h], _m3[_h], _SKETCH_BUCKET_SIZE, K),
        range(fast_hashing.HASH_NUMBER))
    print 't0 = ' + str(time.time() - t)

    t = time.time()
    candidates = []
    more_candidates = []

    for h in xrange(H):
        a, r, v = infer_results[h]
        candidate = []
        more_candidate = []
        for k in xrange(K):
            s = set()
            more_s = set()
            prob = v[:, k]
            prob = remove_negative_terms(prob)

            # filtering
            if a[k].real < 1.0:
                continue
            if entropy(prob) > 6.0:
                continue

            for _w in _words:
                w = stemmer.stem(_w)
                p = prob[hashing.hash_code(w)[h] % _SKETCH_BUCKET_SIZE]
                if p >= 0.025:
                    s.add(_w)
                if p >= 0.015:
                    more_s.add(_w)

            candidate.append(s)
            more_candidate.append(more_s)

        candidates.append(candidate)
        more_candidates.append(more_candidate)

    for h in xrange(H):
        print '------------------------------'
        for k in xrange(len(candidates[h])):
            print candidates[h][k]
    print '------------------------------'

    index = choose(candidates)

    for h in xrange(H):
        a, r, v = infer_results[h]
        plt.plot(v[:, h].real)
        plt.show()

    for h in xrange(H):
        print candidates[h][index[h]]

    topic_words = more_candidates[0][index[0]]
    for h in xrange(1, H):
        topic_words = topic_words.intersection(more_candidates[h][index[h]])

    output = ''
    for w in topic_words:
        output = output + w + ','
    print output

    print 't1 = ' + str(time.time() - t)

def ex4():
    _f = gzip.open(
        '/Users/weixie/Downloads/topicsketch_old/topicsketch_cut/20140128_21_52_28',
        'rb')
    sketch_status = cpickle.load(_f)
    _f.close()

    _t = datetime.datetime.utcfromtimestamp(sketch_status[0])
    _words = sketch_status[1]
    _m2 = sketch_status[2]
    _m3 = sketch_status[3]

    H = 5
    K = 50

    t = time.time()
    infer_results = map(
        lambda _h: solver.solve(_m2[_h], _m3[_h], _SKETCH_BUCKET_SIZE, K),
        range(fast_hashing.HASH_NUMBER))
    print 't0 = ' + str(time.time() - t)

    t = time.time()
    candidates = []

    for h in xrange(H):
        a, r, v = infer_results[h]
        candidate = []
        for k in xrange(K):
            s = set()
            prob = v[:, k]
            prob = remove_negative_terms(prob)

            # filtering
            if a[k].real < 1.0:
                continue
            if entropy(prob) > 6.0:
                continue

            for _w in _words:
                w = stemmer.stem(_w)
                p = prob[hashing.hash_code(w)[h] % _SKETCH_BUCKET_SIZE]
                if p > 0.01:
                    s.add(_w)

            candidate.append(s)

        candidates.append(candidate)

    for h in xrange(H):
        print '------------------------------'
        for k in xrange(len(candidates[h])):
            print candidates[h][k]
    print '------------------------------'

    topic_words = candidates[0][-1]
    for h in xrange(1, H):
        topic_words = topic_words.union(candidates[h][-1])

    output = ''
    for w in topic_words:
        support = 0
        for h in xrange(H):
            if w in candidates[h][-1]:
                support += 1
        if support >= H - 1:
            output = output + w + ','
    print output

    print 't1 = ' + str(time.time() - t)

def ex2():
    _f = gzip.open(
        '/Users/weixie/Downloads/topicsketch_old/topicsketch_cut/20140120_12_33_22',
        'rb')
    sketch_status = cpickle.load(_f)
    _f.close()

    _t = datetime.datetime.utcfromtimestamp(sketch_status[0])
    _words = sketch_status[1]
    _m2 = sketch_status[2]
    _m3 = sketch_status[3]

    '''
    plt.matshow(numpy.absolute(m.toarray()[2400:2500, 2400:2500]),
                fignum=None, cmap=plt.cm.gray)
    plt.colorbar()
    plt.show()
    '''

    '''
    for h in xrange(5):
        a, r, v = solver.solve(_m2[h], _m3[h], _SKETCH_BUCKET_SIZE, 5)
        print sorted(a, key=lambda x: np.abs(x))
    #infer_results = map(lambda _h: solver.solve(_m2[_h], _m3[_h], _SKETCH_BUCKET_SIZE, 5),
    #                    range(fast_hashing.HASH_NUMBER))
    '''

    h = 0
    K = 10

    mat = _m2[h]
    x = []
    for i in xrange(_SKETCH_BUCKET_SIZE):
        x.append(mat[i, i])
    plt.plot(x)
    plt.show()

    index = np.argmax(np.array(x))
    print index
    for _w in _words:
        w = stemmer.stem(_w)
        if hashing.hash_code(w)[h] % _SKETCH_BUCKET_SIZE == index:
            print _w

    '''
    for y in sorted(x):
        print x.index(y), y
    '''

    a, r, v = solver.solve(_m2[h], _m3[h], _SKETCH_BUCKET_SIZE, K)
    print a
    print r
    print v[index, :]

    sorted_a = sorted(a, reverse=True)
    #k = a.index(max(a, key=lambda x: x.real))
    for _k in xrange(K):
        k = a.index(sorted_a[_k])
        prob = v[:, k]
        prob = remove_negative_terms(prob)

        print k, sorted_a[_k]
        print 'entropy', k, entropy(prob)
        plt.plot(prob)
        plt.show()

        for _w in _words:
            w = stemmer.stem(_w)
            p = prob[hashing.hash_code(w)[h] % _SKETCH_BUCKET_SIZE]
            if p > 0.025:
                print _w, p

        print '########################################'

def to_html_table(time, sig, topics):
    max_count = 9

    # sort by the median strength (topic[3]), descending
    topics = sorted(topics, key=lambda topic: topic[3], reverse=True)

    if len(topics) == 0:
        return None

    num_row = 0
    output_string = ''

    first_time = True
    for topic in topics:
        key_words = set(map(lambda x: stemmer.stem(x), topic[0].split(',')))
        original_key_words = topic[0].split(',')
        '''
        if len(original_key_words) <= 1:  # the split includes ''
            continue'''
        words = set(map(lambda x: stemmer.stem(x), topic[1].split(',')))
        original_words = topic[1].split(',')
        if len(original_words) <= 2:  # the split includes ''
            continue

        if not first_time:
            output_string += '<tr>\n'
        else:
            first_time = False

        output_string += '<td>\n'
        output_string += html_link(original_words, time, max_count)

        count = 0
        for word in original_key_words:
            if len(word) > 0:
                output_string += (html_bold(word) + ' ')
                count += 1
                if count >= max_count:
                    break
        for word in original_words:
            if word not in key_words:
                if len(word) > 0:
                    output_string += (word + ' ')
                    count += 1
                    if count >= max_count:
                        break

        #output_string = output_string[:-1] + '</a>' + str(topic[3]) + '</td>\n'
        output_string = output_string[:-1] + '</a>' + '</td>\n'
        output_string += '</tr>\n'
        num_row += 1

    if num_row == 0:
        return None

    output_string = ('<td rowspan="' + str(num_row) + '">' + str(time)
                     + '</td>\n' + output_string)
    output_string = '<tr>\n' + output_string

    return output_string