Example #1
    def __init__(self):
        self.container = {}

        self.is_xsmoother = None

        if config.get('sketch', 'smoother') == 'XEWMASmoother':
            print 'Using XEWMASmoother.'
            self.is_xsmoother = True

        if config.get('sketch', 'smoother') == 'EWMASmoother':
            print 'Using EWMASmoother.'
            self.is_xsmoother = False
Example #2
class SparseSmootherContainer():
    _THRESHOLD_FOR_CLEANING = eval(config.get('sketch', 'threshold_for_cleaning'))
    _CAPACITY_FOR_CLEANING = eval(config.get('sketch', 'capacity_for_cleaning'))

    def __init__(self):
        self.container = {}

        self.is_xsmoother = None

        if config.get('sketch', 'smoother') == 'XEWMASmoother':
            print 'Using XEWMASmoother.'
            self.is_xsmoother = True

        if config.get('sketch', 'smoother') == 'EWMASmoother':
            print 'Using EWMASmoother.'
            self.is_xsmoother = False

    def close(self):
        pass

    def _clean(self, _timestamp):
        to_be_cleaned_up = []
        for key, value in self.container.iteritems():
            tp = value.get(_timestamp)
            if not tp:
                print _timestamp, value.timestamp
                print 'stream item seems out of time order!'
                continue
            t, v, a = tp
            if v <= self._THRESHOLD_FOR_CLEANING:  # smoothed value has decayed below threshold
                to_be_cleaned_up.append(key)

        print 'cleaning', len(to_be_cleaned_up), 'items...'
        for key in to_be_cleaned_up:
            self.container.pop(key)

    def get(self, _id, _timestamp):
        # check for cleaning
        if len(self.container) > self._CAPACITY_FOR_CLEANING:
            self._clean(_timestamp)

        # return
        if _id in self.container:
            return self.container[_id]
        else:

            if self.is_xsmoother:
                _smoother = fast_smoother.XEWMASmoother()
            else:
                _smoother = fast_smoother.EWMASmoother()

            self.container[_id] = _smoother
            return _smoother
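
A minimal usage sketch for the container above. The update(timestamp, value) call on the returned smoother is an assumption for illustration; fast_smoother's actual API is not shown in these snippets.

container = SparseSmootherContainer()
for ts, word_id, count in [(1000, 'fire', 1.0), (1060, 'fire', 2.0)]:
    smoother = container.get(word_id, ts)  # created on first access, reused afterwards
    smoother.update(ts, count)             # hypothetical smoother method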
Example #3
    def analyse_topics(self, _probs):
        words = set()
        for term in self.active_terms:
            for word in term[1]:
                words.add(word)
        print "size of words:", len(words)

        high_prob_words = []
        for _word in words:
            word = stemmer.stem(_word)
            hash_code = np.array(fast_hashing.hash_code(word)) % _SKETCH_BUCKET_SIZE
            min_prob_list = []
            for h in range(fast_hashing.HASH_NUMBER):
                prob = _probs[h][hash_code[h]]
                min_prob_list.append(prob)

            min_prob_list.sort()
            min_prob = min_prob_list[1]  # second-smallest estimate across hash rows, not the true min
            if min_prob >= _PROBABILITY_THRESHOLD:
                high_prob_words.append((word, min_prob))

        # rescale
        s_prob = sum([p for w, p in high_prob_words])
        high_prob_words = [(w, p/s_prob) for w, p in high_prob_words]

        high_prob_words.sort(key=lambda x: x[1], reverse=True)

        # top 20
        high_prob_words = high_prob_words[:20]

        post_res = postprocessor.process(high_prob_words, self.active_terms)

        if eval(config.get('output', 'debug_info')):
            self.output.write('high_prob_words\n')
            self.output.write(str(high_prob_words)) #debugging
            self.output.write('\npost_res\n')
            self.output.write(str(post_res)) #debugging
            self.output.write('\n')

        flag, word_level_results, _ = post_res
        if flag:
            event = dict()
            event['detection_time'] = str(datetime.utcfromtimestamp(self.timestamp))
            event_words = list()
            for prob_word, word_flag in zip(high_prob_words, word_level_results):
                _word = prob_word[0]
                if word_flag:
                    event_words.append(_word)

            event['key_words'] = event_words

            self.output.write(json.dumps(event))
            self.output.write('\n')
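
The per-word lookup above is a count-min-sketch-style query: the stemmed word is hashed into every hash row and the bucket estimates are combined. Below is a standalone sketch of that step; the constants and the hash_code callable are stand-ins, and taking the second-smallest estimate mirrors min_prob_list[1] above.

import numpy as np

_HASHES = 5          # stand-in for fast_hashing.HASH_NUMBER
_BUCKETS = 2 ** 10   # stand-in for _SKETCH_BUCKET_SIZE

def word_probability(word, probs, hash_code):
    # hash_code(word) is assumed to return _HASHES independent integers
    codes = np.array(hash_code(word)) % _BUCKETS
    estimates = sorted(probs[h][codes[h]] for h in range(_HASHES))
    return estimates[1]  # second-smallest, a conservative minimum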
Example #4
def test(s):
    _uz = eval(exp_config.get('sketch', 'unit_size'))
    print 'set unit size ' + str(fast_smoother.set_unit_size(_uz))

    if exp_config.get('sketch', 'smoother') == 'XEWMASmoother':
        _wz = eval(exp_config.get('sketch', 'window_size'))
        print 'set XEWMASmoother window ' + str(
            fast_smoother.XEWMASmoother.set_window_size(_wz))

    if exp_config.get('sketch', 'smoother') == 'EWMASmoother':
        _wz1 = eval(exp_config.get('sketch', 'window_size1'))
        _wz2 = eval(exp_config.get('sketch', 'window_size2'))
        print 'set EWMASmoother window ' + str(
            fast_smoother.EWMASmoother.set_window_size(_wz1, _wz2))

    sketch = None
    if exp_config.get('sketch', 'type') == 'topicsketch':
        sketch = tps.TopicSketch()
    if exp_config.get('sketch', 'type') == 'topicsketchplus':
        sketch = tpsp.TopicSketchPlus()

    for item in s:
        sketch.process(item)

    if exp_config.get('sketch', 'type') == 'topicsketchplus':
        print 'a', sketch.a
    '''
    m = sketch.plot_sketch('m2', 'a').toarray()

    m = m[50:56, 50:56]

    print m


    p1 = np.array([[0.2, 0.1, 0.0, 0.7, 0.0, 0.0]])
    p1 = p1.T
    p2 = np.array([[0.0, 0.0, 0.0, 0.05, 0.25, 0.7]])
    p2 = p2.T

    #m_ = 18.5 * np.dot(p2, p2.T) + 13.5 * np.dot(p1, p1.T)# 11.6791335802, 0.284346678468
    m_ = 11.6791335802 * np.dot(p2, p2.T) + 0.284346678468 * np.dot(p1, p1.T)

    print m_'''

    _sketch_status = sketch.get_sketch()

    infer_result = examine.simplified_ex(None, _sketch_status, True)[0]

    a = infer_result[0]
    print a
    a = map(lambda x: x.real, a)

    _id = a.index(max(a))
    print 'id', _id

    #debugging
    #print infer_result[2][:_N_words, 0]
    #print infer_result[2][:_N_words, 1]
    ###############

    return infer_result[2][:_N_words, _id]
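
For reference, the topic-selection step at the end of test() in isolation, with made-up weights (the real values come from solver.solve via examine.simplified_ex):

a = [complex(2.0, 0.1), complex(5.5, 0.0), complex(1.2, -0.1)]  # fabricated weights
a = map(lambda x: x.real, a)  # Python 2: map returns a list
_id = a.index(max(a))
print 'id', _id  # -> id 1, the dominant topic column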
Example #5
from scipy.sparse import dok_matrix
from scipy.sparse import csr_matrix
import numpy as np
import json

import fast_hashing
import fast_smoother
import solver
import stemmer
import postprocessor

import experiment.exp_config as config


_SKETCH_BUCKET_SIZE = eval(config.get('sketch', 'sketch_bucket_size'))

_NUM_TOPICS = eval(config.get('sketch', 'num_topics'))

_PROBABILITY_THRESHOLD = eval(config.get('sketch', 'probability_threshold'))
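# A configured threshold of 0. falls back to the uniform bucket probability
# 1/_SKETCH_BUCKET_SIZE, i.e. no bucket is favored a priori.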
if _PROBABILITY_THRESHOLD == 0.:
    _PROBABILITY_THRESHOLD = 1./_SKETCH_BUCKET_SIZE

_ACTIVE_WINDOW_SIZE = eval(config.get('sketch', 'active_window_size'))


class SparseSmootherContainer():
    _THRESHOLD_FOR_CLEANING = eval(config.get('sketch', 'threshold_for_cleaning'))
    _CAPACITY_FOR_CLEANING = eval(config.get('sketch', 'capacity_for_cleaning'))

    def __init__(self):
Example #6
# stdlib imports required by the patterns and deque used below
import re
from collections import deque
from string import punctuation

import stream
import experiment.exp_config as config
import clean_wb

_PUN_PATTERN = re.compile('^[' + re.escape(punctuation) + ']+$')

_SPACE_PATTERN = re.compile(r'^\s+')

_HTTP_PATTERN = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

_FILTERED_USERS = set([
    '@girlposts', '@FreddyAmazin', '@ChelseaFC', '@ComedyPosts', '@13elieveSG',
    '@ComedyTruth', '@ComedyPics'
])

_ACTIVE_WINDOW_SIZE = eval(config.get('sketch', 'active_window_size'))


class ActiveTermMaintainer:
    def __init__(self):
        self.active_terms = deque([])

    def add(self, item):
        self.active_terms.append(item)

        while len(self.active_terms) > 0:
            term = self.active_terms[0]
            if term.timestamp < item.timestamp - _ACTIVE_WINDOW_SIZE * 60:
                self.active_terms.popleft()
            else:
                break
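
A small sketch of the sliding window above; items only need a timestamp attribute in epoch seconds, which is all add() inspects (the stub class below is illustrative, not the real item type).

class _Stub(object):
    def __init__(self, timestamp):
        self.timestamp = timestamp

atm = ActiveTermMaintainer()
atm.add(_Stub(0))
atm.add(_Stub(_ACTIVE_WINDOW_SIZE * 60 + 1))  # pushes the first item out of the window
print len(atm.active_terms)  # -> 1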
Example #7
__author__ = 'Wei Xie'
__email__ = '*****@*****.**'
__affiliation__ = 'Pinnacle Lab for Analytics, Singapore Management University'
__website__ = 'http://mysmu.edu/phdis2012/wei.xie.2012'

from nltk.stem.lancaster import *
from nltk.stem.snowball import *
from nltk.stem.porter import *

import experiment.exp_config as config

stemmer = None

if config.get('pre_process', 'stemmer') == 'Snowball':
    stemmer = SnowballStemmer("english")

if config.get('pre_process', 'stemmer') == 'Porter':
    stemmer = PorterStemmer()

if config.get('pre_process', 'stemmer') == 'Lancaster':
    stemmer = LancasterStemmer()


def stem(word):
    if stemmer is None:
        return word

    try:
        ret = stemmer.stem(word)
        ret = str(ret)
    except:
Example #8
__author__ = 'Wei Xie'
__email__ = '*****@*****.**'
__affiliation__ = 'Living Analytics Research Centre, Singapore Management University'
__website__ = 'http://mysmu.edu/phdis2012/wei.xie.2012'

import experiment.exp_config as config

_THRESHOLD_FOR_SIMILARITY = eval(
    config.get('post_process', 'threshold_for_similarity'))
_THRESHOLD_TOPIC_LEVEL_NUMBER_RELATED_USERS = eval(
    config.get('post_process', 'topic_number_related_users'))
_THRESHOLD_TOPIC_LEVEL_NUMBER_RELATED_TWEETS = eval(
    config.get('post_process', 'topic_number_related_tweets'))
_THRESHOLD_WORD_LEVEL_NUMBER_RELATED_USERS = eval(
    config.get('post_process', 'word_number_related_users'))
_THRESHOLD_WORD_LEVEL_NUMBER_RELATED_TWEETS = eval(
    config.get('post_process', 'word_number_related_tweets'))


def similarity(set_words, tokens):
    s = 0.

    for token in tokens:
        if token in set_words:
            s += 1

    return s
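
For example, the overlap count treats repeated tokens individually:

print similarity(set(['fire', 'haze']), ['big', 'fire', 'fire'])  # -> 2.0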


def process(
        high_prob_words,
Example #9
from scipy.sparse import dok_matrix
from scipy.sparse import csr_matrix
import numpy as np

import fast_hashing
import solver
import fast_smoother
import stemmer
import postprocessor

import experiment.exp_config as config

import experiment.event_output as event_output


_SKETCH_BUCKET_SIZE = eval(config.get('sketch', 'sketch_bucket_size'))

_NUM_TOPICS = eval(config.get('sketch', 'num_topics'))

_PROBABILITY_THRESHOLD = eval(config.get('sketch', 'probability_threshold'))

_ACTIVE_WINDOW_SIZE = eval(config.get('sketch', 'active_window_size'))

_CUT_TIMESTAMP = eval(config.get('sketch', 'cut_timestamp'))

_MAX_NUMBER_WORDS = eval(config.get('sketch', 'max_number_words'))


class SparseSmootherContainer():
    _THRESHOLD_FOR_CLEANING = eval(config.get('sketch', 'threshold_for_cleaning'))
    _CAPACITY_FOR_CLEANING = eval(config.get('sketch', 'capacity_for_cleaning'))
Example #10
def simplified_ex(_fstr, _sketch_status=None, direct=False):
    if _fstr:
        _f = gzip.open(_fstr, 'rb')
        sketch_status = cpickle.load(_f)
        _f.close()
    else:
        sketch_status = _sketch_status

    _t = datetime.datetime.utcfromtimestamp(sketch_status[0])
    _words = sketch_status[1]
    _m2 = sketch_status[2]
    _m3 = sketch_status[3]

    #######################
    mat = _m2[0]
    x = []  # for debugging
    for i in xrange(_SKETCH_BUCKET_SIZE):
        x.append(mat[i, i])

    bucket_id = np.argmax(np.array(x))
    for _w in _words:
        w = stemmer.stem(_w)
        if hashing.hash_code(w)[0] % _SKETCH_BUCKET_SIZE == bucket_id:
            print 'significant', _w
    #######################

    H = fast_hashing.HASH_NUMBER
    K = eval(config.get('sketch', 'num_topics'))  # e.g. 15

    infer_results = map(
        lambda _h: solver.solve(_m2[_h], _m3[_h], _SKETCH_BUCKET_SIZE, K),
        range(H))

    if direct:
        return infer_results

    ### debugging
    print 'Inference finished.'
    ############

    transactions = []
    topics_group = []
    for h in xrange(H):
        topics = dict()
        a, r, v = infer_results[h]
        a_max = max(np.array(a).real)
        print a_max
        for k in xrange(K):
            s = set()
            topic = set()
            prob = v[:, k]

            prob = remove_negative_terms(prob)

            # filtering
            if a[k].real < 0.1 * a_max:  # drop topics much weaker than the strongest
                continue
            if entropy(prob) > 6.0:
                continue

            _ranks = dict()
            for _w in _words:
                w = stemmer.stem(_w)
                p = prob[hashing.hash_code(w)[h] % _SKETCH_BUCKET_SIZE]
                _ranks[w] = p
                if p >= 0.0100:
                    s.add(w)
                if p >= 0.0075:
                    topic.add(w)

            _tops = sorted(_ranks.keys(),
                           key=lambda x: _ranks[x],
                           reverse=True)
            _top_n = 15
            if len(s) > _top_n:
                transactions.append(
                    apriori.Transaction(set(_tops[:_top_n]), h, k))
                #print _top_n
            else:
                transactions.append(apriori.Transaction(s, h, k))
                #print len(s)

            topics[k] = topic

            print h, k, a[k].real, map(lambda w, hc: (w, hc, _ranks[w]), s,
                                       hash_code(s, h))  # for debugging

        topics_group.append(topics)

    ### debugging
    print 'starting apriori.'
    #############

    output = apriori.apriori(transactions, 4)
    _result = dict()
    _result['time'] = _t
    _result['topics'] = list()

    print _t
    for ws in output:
        '''
        if support_distance(ws.support) > 5:
            continue'''

        _supports = list(ws.support.iteritems())
        _weights = np.array([infer_results[h][0][k].real for h, k in _supports])
        _result['topics'].append(
            (connect_words(recover(ws.words, _words)),
             connect_words(recover(join([topics_group[h][k] for h, k in _supports]), _words)),
             np.max(_weights), np.median(_weights)))

    if _fstr:
        out_file = open('E:/experiment/results/' + _fstr.split('/')[-1], 'wb')
        cpickle.dump(_result, out_file)
        out_file.close()
    else:
        return _result
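
A hedged invocation sketch: with _fstr left empty and direct=False, the function returns the result dict instead of pickling it to disk. sketch_status below is a placeholder for a (timestamp, words, m2, m3) tuple, matching the unpacking at the top of the function.

result = simplified_ex(None, _sketch_status=sketch_status, direct=False)
print result['time']
for topic in result['topics']:
    print topic[0]  # connected key words of one detected topic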
Example #11
__author__ = 'Wei Xie'
__email__ = '*****@*****.**'
__affiliation__ = 'Pinnacle Lab for Analytics, Singapore Management University'
__website__ = 'http://mysmu.edu/phdis2012/wei.xie.2012'

import numpy
import datetime
import gzip
import cPickle as cpickle
import topic_sketch.fast_hashing as hashing
import matplotlib.pyplot as plt
import numpy as np
import experiment.exp_config as config

_SKETCH_BUCKET_SIZE = eval(config.get('sketch', 'sketch_bucket_size'))


def pairs(mat, words):
    shape = mat.shape
    n = shape[1]

    for w1 in words:
        for w2 in words:

            hashcode = numpy.array(hashing.hash_code(w1)) % n
            h1 = hashcode[0]

            hashcode = numpy.array(hashing.hash_code(w2)) % n
            h2 = hashcode[0]

            if h1 > h2:
Example #12
__author__ = 'Wei Xie'
__email__ = '*****@*****.**'
__affiliation__ = 'Pinnacle Lab for Analytics, Singapore Management University'
__website__ = 'http://mysmu.edu/phdis2012/wei.xie.2012'

import experiment.exp_config as config

_THRESHOLD_FOR_SIMILARITY = eval(config.get('post_process', 'threshold_for_similarity'))
_THRESHOLD_TOPIC_LEVEL_NUMBER_RELATED_USERS = eval(config.get('post_process', 'topic_number_related_users'))
_THRESHOLD_TOPIC_LEVEL_NUMBER_RELATED_TWEETS = eval(config.get('post_process', 'topic_number_related_tweets'))
_THRESHOLD_WORD_LEVEL_NUMBER_RELATED_USERS = eval(config.get('post_process', 'word_number_related_users'))
_THRESHOLD_WORD_LEVEL_NUMBER_RELATED_TWEETS = eval(config.get('post_process', 'word_number_related_tweets'))


def similarity(set_words, tokens):
    s = 0.

    for token in tokens:
        if token in set_words:
            s += 1

    return s


def process(high_prob_words, active_terms):  # high_prob_words and active_terms from topic_sketch

    set_words = set()
    for prob_word in high_prob_words:
        set_words.add(prob_word[0])

    # for topic level