class SparseSmootherContainer():
    _THRESHOLD_FOR_CLEANING = eval(config.get('sketch', 'threshold_for_cleaning'))
    _CAPACITY_FOR_CLEANING = eval(config.get('sketch', 'capacity_for_cleaning'))

    def __init__(self):
        self.container = {}
        self.is_xsmoother = None
        if config.get('sketch', 'smoother') == 'XEWMASmoother':
            print 'Using XEWMASmoother.'
            self.is_xsmoother = True
        if config.get('sketch', 'smoother') == 'EWMASmoother':
            print 'Using EWMASmoother.'
            self.is_xsmoother = False

    def close(self):
        pass

    def _clean(self, _timestamp):
        to_be_cleaned_up = []
        for key, value in self.container.iteritems():
            tp = value.get(_timestamp)
            if not tp:
                print _timestamp, value.timestamp
                print 'stream item seems out of time order!'
                continue
            t, v, a = tp
            if v <= self._THRESHOLD_FOR_CLEANING:  # check v
                to_be_cleaned_up.append(key)
        print 'cleaning', len(to_be_cleaned_up), 'items...'
        for key in to_be_cleaned_up:
            self.container.pop(key)

    def get(self, _id, _timestamp):
        # check for cleaning
        if len(self.container) > self._CAPACITY_FOR_CLEANING:
            self._clean(_timestamp)
        # return
        if _id in self.container:
            return self.container[_id]
        else:
            if self.is_xsmoother:
                _smoother = fast_smoother.XEWMASmoother()
            else:
                _smoother = fast_smoother.EWMASmoother()
            self.container[_id] = _smoother
            return _smoother
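For reference, a minimal driver for SparseSmootherContainer, assuming the config keys used above are present. The observe() call is a hypothetical smoother update method, since the fast_smoother update API is not shown in this snippet.

# Illustrative only: exercises SparseSmootherContainer.get(); observe() is a
# hypothetical update call, not part of the API shown above.
container = SparseSmootherContainer()
for word, ts in [('haze', 1370000000), ('psi', 1370000060), ('haze', 1370000120)]:
    smoother = container.get(word, ts)   # created lazily on first access
    # smoother.observe(ts, 1.0)          # hypothetical update call
print len(container.container)           # 2 distinct smoothers so far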
def analyse_topics(self, _probs):
    words = set()
    for term in self.active_terms:
        for word in term[1]:
            words.add(word)
    print "size of words:", len(words)

    high_prob_words = []
    for _word in words:
        word = stemmer.stem(_word)
        hash_code = np.array(fast_hashing.hash_code(word)) % _SKETCH_BUCKET_SIZE
        min_prob_list = []
        for h in range(fast_hashing.HASH_NUMBER):
            prob = _probs[h][hash_code[h]]
            min_prob_list.append(prob)
        min_prob_list.sort()
        min_prob = min_prob_list[1]  # !!! second smallest across hash rows, not the minimum
        if min_prob >= _PROBABILITY_THRESHOLD:
            high_prob_words.append((word, min_prob))

    # rescale so the retained probabilities sum to one
    s_prob = sum([p for w, p in high_prob_words])
    high_prob_words = [(w, p / s_prob) for w, p in high_prob_words]

    high_prob_words.sort(key=lambda x: x[1], reverse=True)

    # top 20
    high_prob_words = high_prob_words[:20]

    post_res = postprocessor.process(high_prob_words, self.active_terms)

    if eval(config.get('output', 'debug_info')):
        self.output.write('high_prob_words\n')
        self.output.write(str(high_prob_words))  # debugging
        self.output.write('\npost_res\n')
        self.output.write(str(post_res))  # debugging
        self.output.write('\n')

    flag, word_level_results, _ = post_res

    if flag:
        event = dict()
        event['detection_time'] = str(datetime.utcfromtimestamp(self.timestamp))
        event_words = list()
        for prob_word, word_flag in zip(high_prob_words, word_level_results):
            _word = prob_word[0]
            if word_flag:
                event_words.append(_word)
        event['key_words'] = event_words

        self.output.write(json.dumps(event))
        self.output.write('\n')
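The lookup above follows a count-min-sketch style pattern: hash each word with several independent hash functions, read one cell per hash row, and aggregate across rows (here, the second-smallest value, to damp a single colliding cell). A minimal, self-contained sketch of that pattern is below; the toy hash functions are illustrative stand-ins for fast_hashing.hash_code, not the real ones.

# Illustrative multi-hash lookup, mirroring the pattern used in analyse_topics.
import numpy as np

HASH_NUMBER = 5
BUCKET_SIZE = 211

def toy_hash_codes(word):
    # One salted hash per row; a toy stand-in for fast_hashing.hash_code(word).
    return [hash((h, word)) % BUCKET_SIZE for h in range(HASH_NUMBER)]

def lookup(probs, word):
    # probs: HASH_NUMBER x BUCKET_SIZE array of per-row probability estimates.
    codes = toy_hash_codes(word)
    values = sorted(probs[h][codes[h]] for h in range(HASH_NUMBER))
    return values[1]  # second smallest, as in analyse_topics

probs = np.random.rand(HASH_NUMBER, BUCKET_SIZE) / BUCKET_SIZE
print lookup(probs, 'earthquake')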
def test(s):
    _uz = eval(exp_config.get('sketch', 'unit_size'))
    print 'set unit size ' + str(fast_smoother.set_unit_size(_uz))

    if exp_config.get('sketch', 'smoother') == 'XEWMASmoother':
        _wz = eval(exp_config.get('sketch', 'window_size'))
        print 'set XEWMASmoother window ' + str(
            fast_smoother.XEWMASmoother.set_window_size(_wz))
    if exp_config.get('sketch', 'smoother') == 'EWMASmoother':
        _wz1 = eval(exp_config.get('sketch', 'window_size1'))
        _wz2 = eval(exp_config.get('sketch', 'window_size2'))
        print 'set EWMASmoother window ' + str(
            fast_smoother.EWMASmoother.set_window_size(_wz1, _wz2))

    sketch = None
    if exp_config.get('sketch', 'type') == 'topicsketch':
        sketch = tps.TopicSketch()
    if exp_config.get('sketch', 'type') == 'topicsketchplus':
        sketch = tpsp.TopicSketchPlus()

    for item in s:
        sketch.process(item)

    if exp_config.get('sketch', 'type') == 'topicsketchplus':
        print 'a', sketch.a

    '''
    m = sketch.plot_sketch('m2', 'a').toarray()
    m = m[50:56, 50:56]
    print m
    p1 = np.array([[0.2, 0.1, 0.0, 0.7, 0.0, 0.0]])
    p1 = p1.T
    p2 = np.array([[0.0, 0.0, 0.0, 0.05, 0.25, 0.7]])
    p2 = p2.T
    #m_ = 18.5 * np.dot(p2, p2.T) + 13.5 * np.dot(p1, p1.T)  # 11.6791335802, 0.284346678468
    m_ = 11.6791335802 * np.dot(p2, p2.T) + 0.284346678468 * np.dot(p1, p1.T)
    print m_
    '''

    _sketch_status = sketch.get_sketch()

    infer_result = examine.simplified_ex(None, _sketch_status, True)[0]

    a = infer_result[0]
    print a
    a = map(lambda x: x.real, a)
    _id = a.index(max(a))
    print 'id', _id

    # debugging
    #print infer_result[2][:_N_words, 0]
    #print infer_result[2][:_N_words, 1]
    ###############

    return infer_result[2][:_N_words, _id]
from scipy.sparse import dok_matrix
from scipy.sparse import csr_matrix

import numpy as np
import json

import fast_hashing
import fast_smoother
import solver
import stemmer
import postprocessor
import experiment.exp_config as config

_SKETCH_BUCKET_SIZE = eval(config.get('sketch', 'sketch_bucket_size'))
_NUM_TOPICS = eval(config.get('sketch', 'num_topics'))
_PROBABILITY_THRESHOLD = eval(config.get('sketch', 'probability_threshold'))
if _PROBABILITY_THRESHOLD == 0.:
    _PROBABILITY_THRESHOLD = 1. / _SKETCH_BUCKET_SIZE
_ACTIVE_WINDOW_SIZE = eval(config.get('sketch', 'active_window_size'))


class SparseSmootherContainer():
    _THRESHOLD_FOR_CLEANING = eval(config.get('sketch', 'threshold_for_cleaning'))
    _CAPACITY_FOR_CLEANING = eval(config.get('sketch', 'capacity_for_cleaning'))

    def __init__(self):
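The module-level constants above are read with eval() from an INI-style config via experiment.exp_config. A hypothetical [sketch] section consistent with the keys read here might look like the following; the values are illustrative only, not taken from the source.

# Hypothetical exp_config fragment matching the keys read above (values made up).
_EXAMPLE_SKETCH_SECTION = """
[sketch]
sketch_bucket_size = 10007
num_topics = 5
probability_threshold = 0.
active_window_size = 30
threshold_for_cleaning = 0.0001
capacity_for_cleaning = 100000
"""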
# Imports assumed by the patterns and the deque below (re, punctuation, deque)
# are added here so the snippet stands alone.
import re
from string import punctuation
from collections import deque

import stream
import experiment.exp_config as config
import clean_wb

_PUN_PATTERN = re.compile('^[' + punctuation + ']+$')
_SPACE_PATTERN = re.compile('^\s+')
_HTTP_PATTERN = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

_FILTERED_USERS = set([
    '@girlposts', '@FreddyAmazin', '@ChelseaFC', '@ComedyPosts',
    '@13elieveSG', '@ComedyTruth', '@ComedyPics'
])

_ACTIVE_WINDOW_SIZE = eval(config.get('sketch', 'active_window_size'))


class ActiveTermMaintainer:
    def __init__(self):
        self.active_terms = deque([])

    def add(self, item):
        self.active_terms.append(item)
        while len(self.active_terms) > 0:
            term = self.active_terms[0]
            if term.timestamp < item.timestamp - _ACTIVE_WINDOW_SIZE * 60:
                self.active_terms.popleft()
            else:
                break
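A short usage sketch for ActiveTermMaintainer: items only need a .timestamp attribute in seconds, and anything older than the active window (in minutes) is evicted when a new item arrives. The Term namedtuple is a hypothetical stand-in for the real stream item type.

# Illustrative only: Term is a made-up item type with a .timestamp attribute.
from collections import namedtuple

Term = namedtuple('Term', ['timestamp', 'words'])

maintainer = ActiveTermMaintainer()
maintainer.add(Term(timestamp=0, words=['haze', 'singapore']))
maintainer.add(Term(timestamp=_ACTIVE_WINDOW_SIZE * 60 + 1, words=['rain']))
print len(maintainer.active_terms)  # 1: the first term fell out of the window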
__author__ = 'Wei Xie'
__email__ = '*****@*****.**'
__affiliation__ = 'Pinnacle Lab for Analytics, Singapore Management University'
__website__ = 'http://mysmu.edu/phdis2012/wei.xie.2012'

from nltk.stem.lancaster import *
from nltk.stem.snowball import *
from nltk.stem.porter import *

import experiment.exp_config as config

stemmer = None
if config.get('pre_process', 'stemmer') == 'Snowball':
    stemmer = SnowballStemmer("english")
if config.get('pre_process', 'stemmer') == 'Porter':
    stemmer = PorterStemmer()
if config.get('pre_process', 'stemmer') == 'Lancaster':
    stemmer = LancasterStemmer()


def stem(word):
    if stemmer is None:
        return word
    try:
        ret = stemmer.stem(word)
        ret = str(ret)
    except:
        # assumption: fall back to the raw word if stemming or str() conversion fails
        return word
    return ret
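A quick usage check for stem(). The outputs below assume the config selects the Snowball or Porter stemmer; with no stemmer configured, stem() returns the word unchanged.

# Example calls (outputs shown for an English Snowball/Porter stemmer).
print stem('running')   # -> 'run'
print stem('cities')    # -> 'citi'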
__author__ = 'Wei Xie'
__email__ = '*****@*****.**'
__affiliation__ = 'Living Analytics Research Centre, Singapore Management University'
__website__ = 'http://mysmu.edu/phdis2012/wei.xie.2012'

import experiment.exp_config as config

_THRESHOLD_FOR_SIMILARITY = eval(
    config.get('post_process', 'threshold_for_similarity'))
_THRESHOLD_TOPIC_LEVEL_NUMBER_RELATED_USERS = eval(
    config.get('post_process', 'topic_number_related_users'))
_THRESHOLD_TOPIC_LEVEL_NUMBER_RELATED_TWEETS = eval(
    config.get('post_process', 'topic_number_related_tweets'))
_THRESHOLD_WORD_LEVEL_NUMBER_RELATED_USERS = eval(
    config.get('post_process', 'word_number_related_users'))
_THRESHOLD_WORD_LEVEL_NUMBER_RELATED_TWEETS = eval(
    config.get('post_process', 'word_number_related_tweets'))


def similarity(set_words, tokens):
    s = 0.
    for token in tokens:
        if token in set_words:
            s += 1
    return s


def process(
        high_prob_words,
from scipy.sparse import dok_matrix
from scipy.sparse import csr_matrix

import numpy as np

import fast_hashing
import solver
import fast_smoother
import stemmer
import postprocessor
import experiment.exp_config as config
import experiment.event_output as event_output

_SKETCH_BUCKET_SIZE = eval(config.get('sketch', 'sketch_bucket_size'))
_NUM_TOPICS = eval(config.get('sketch', 'num_topics'))
_PROBABILITY_THRESHOLD = eval(config.get('sketch', 'probability_threshold'))
_ACTIVE_WINDOW_SIZE = eval(config.get('sketch', 'active_window_size'))
_CUT_TIMESTAMP = eval(config.get('sketch', 'cut_timestamp'))
_MAX_NUMBER_WORDS = eval(config.get('sketch', 'max_number_words'))


class SparseSmootherContainer():
    _THRESHOLD_FOR_CLEANING = eval(config.get('sketch', 'threshold_for_cleaning'))
    _CAPACITY_FOR_CLEANING = eval(config.get('sketch', 'capacity_for_cleaning'))
def simplified_ex(_fstr, _sketch_status=None, direct=False):
    if _fstr:
        _f = gzip.open(_fstr, 'rb')
        sketch_status = cpickle.load(_f)
        _f.close()
    else:
        sketch_status = _sketch_status

    _t = datetime.datetime.utcfromtimestamp(sketch_status[0])
    _words = sketch_status[1]
    _m2 = sketch_status[2]
    _m3 = sketch_status[3]

    #######################
    mat = _m2[0]
    x = []  # for debugging
    for i in xrange(_SKETCH_BUCKET_SIZE):
        x.append(mat[i, i])
    id = np.argmax(np.array(x))
    for _w in _words:
        w = stemmer.stem(_w)
        if hashing.hash_code(w)[0] % _SKETCH_BUCKET_SIZE == id:
            print 'significant', _w
    #######################

    H = fast_hashing.HASH_NUMBER
    K = eval(config.get('sketch', 'num_topics'))  # 15

    infer_results = map(
        lambda _h: solver.solve(_m2[_h], _m3[_h], _SKETCH_BUCKET_SIZE, K),
        range(H))

    if direct:
        return infer_results

    ### debugging
    print 'Inference finished.'
    ############

    transactions = []
    topics_group = []

    for h in xrange(H):
        topics = dict()
        a, r, v = infer_results[h]
        a_max = max(np.array(a).real)
        print a_max
        for k in xrange(K):
            s = set()
            topic = set()
            prob = v[:, k]
            prob = remove_negative_terms(prob)

            # filtering
            if a[k].real < 0.1 * a_max:  # 1.0:
                continue
            if entropy(prob) > 6.0:
                continue

            _ranks = dict()

            for _w in _words:
                w = stemmer.stem(_w)
                p = prob[hashing.hash_code(w)[h] % _SKETCH_BUCKET_SIZE]
                _ranks[w] = p
                if p >= 0.0100:
                    s.add(w)
                if p >= 0.0075:
                    topic.add(w)

            _tops = sorted(_ranks.keys(), key=lambda x: _ranks[x], reverse=True)
            _top_n = 15
            if len(s) > _top_n:
                transactions.append(
                    apriori.Transaction(set(_tops[:_top_n]), h, k))
                #print _top_n
            else:
                transactions.append(apriori.Transaction(s, h, k))
                #print len(s)

            topics[k] = topic

            print h, k, a[k].real, map(lambda w, h: (w, h, _ranks[w]), s, hash_code(s, h))  # for debugging

        topics_group.append(topics)

    ### debugging
    print 'starting apriori.'
    #############

    output = apriori.apriori(transactions, 4)

    _result = dict()
    _result['time'] = _t
    _result['topics'] = list()

    print _t

    for ws in output:
        '''
        if support_distance(ws.support) > 5:
            continue'''
        _result['topics'].append((connect_words(recover(ws.words, _words)),
                                  connect_words(recover(join(map(lambda item: topics_group[item[0]][item[1]], ws.support.iteritems())), _words)),
                                  np.max(np.array(map(lambda item: infer_results[item[0]][0][item[1]].real, ws.support.iteritems()))),
                                  np.median(np.array(map(lambda item: infer_results[item[0]][0][item[1]].real, ws.support.iteritems())))))

    if _fstr:
        out_file = open('E:/experiment/results/' + _fstr.split('/')[-1], 'wb')
        cpickle.dump(_result, out_file)
        out_file.close()
    else:
        return _result
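remove_negative_terms and entropy are helpers defined elsewhere in the codebase. A plausible reading, sketched below under stated assumptions, is to clamp spurious negative components of the recovered topic-probability vector, renormalize, and then drop topics whose Shannon entropy is too high (too flat to be a coherent topic). The actual implementations and log base in the source may differ.

# Hedged sketch of the two helpers used in the filtering step above.
import numpy as np

def remove_negative_terms(prob):
    p = np.asarray(prob).real.copy()
    p[p < 0.] = 0.                    # clamp negative components from the solver
    s = p.sum()
    return p / s if s > 0. else p

def entropy(prob):
    p = np.asarray(prob)
    p = p[p > 0.]
    return float(-(p * np.log(p)).sum())   # Shannon entropy in nats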
__author__ = 'Wei Xie'
__email__ = '*****@*****.**'
__affiliation__ = 'Pinnacle Lab for Analytics, Singapore Management University'
__website__ = 'http://mysmu.edu/phdis2012/wei.xie.2012'

import numpy
import datetime
import gzip
import cPickle as cpickle

import topic_sketch.fast_hashing as hashing

import matplotlib.pyplot as plt
import numpy as np

import experiment.exp_config as config

_SKETCH_BUCKET_SIZE = eval(config.get('sketch', 'sketch_bucket_size'))


def pairs(mat, words):
    shape = mat.shape
    n = shape[1]
    for w1 in words:
        for w2 in words:
            hashcode = numpy.array(hashing.hash_code(w1)) % n
            h1 = hashcode[0]
            hashcode = numpy.array(hashing.hash_code(w2)) % n
            h2 = hashcode[0]
            if h1 > h2:
__author__ = 'Wei Xie'
__email__ = '*****@*****.**'
__affiliation__ = 'Pinnacle Lab for Analytics, Singapore Management University'
__website__ = 'http://mysmu.edu/phdis2012/wei.xie.2012'

import experiment.exp_config as config

_THRESHOLD_FOR_SIMILARITY = eval(config.get('post_process', 'threshold_for_similarity'))
_THRESHOLD_TOPIC_LEVEL_NUMBER_RELATED_USERS = eval(config.get('post_process', 'topic_number_related_users'))
_THRESHOLD_TOPIC_LEVEL_NUMBER_RELATED_TWEETS = eval(config.get('post_process', 'topic_number_related_tweets'))
_THRESHOLD_WORD_LEVEL_NUMBER_RELATED_USERS = eval(config.get('post_process', 'word_number_related_users'))
_THRESHOLD_WORD_LEVEL_NUMBER_RELATED_TWEETS = eval(config.get('post_process', 'word_number_related_tweets'))


def similarity(set_words, tokens):
    s = 0.
    for token in tokens:
        if token in set_words:
            s += 1
    return s


def process(high_prob_words, active_terms):
    # high_prob_words and active_terms from topic_sketch
    set_words = set()
    for prob_word in high_prob_words:
        set_words.add(prob_word[0])

    # for topic level
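A quick illustration of the overlap-count similarity above; the word set and tokens are made up for the example, and the thresholding comment only gestures at how _THRESHOLD_FOR_SIMILARITY is presumably applied inside process().

# similarity() counts how many tokens of a tweet fall in the event word set.
event_words = set(['haze', 'psi', 'singapore'])
tweet_tokens = ['the', 'haze', 'in', 'singapore', 'is', 'bad']
score = similarity(event_words, tweet_tokens)   # 2.0
# A tweet would presumably count as related when score >= _THRESHOLD_FOR_SIMILARITY
# (illustrative; the exact rule lives in process()).
print score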