            self.bigram_index[None, bigram[1]].append(bigram)
        tweet['trigrams'] = set(bigrams.filtered_trigrams(toks))
        for trigram in tweet['trigrams']:
            self.model.add(trigram)
            self.index[trigram].append(tweet)
        #self.tweets_by_text.append(tweet)
        #for ngram in set(bigrams.multi_ngrams(toks, n_and_up=3)):
        #    pass

    def fill_from_tweet_iter(self, tweet_iter):
        for tweet in tweet_iter:
            self.add_tweet(tweet)


if __name__ == '__main__':
    import cPickle as pickle
    import search

    q = sys.argv[1]
    smoothing = sys.argv[2]
    bg_model = lang_model.TokyoLM(readonly=True)
    lc = LinkedCorpus()
    tweet_iter = search.cleaned_results(q, pages=2,
                                        key_fn=search.user_and_text_identity,
                                        save=None, load=None)
    lc.fill_from_tweet_iter(tweet_iter)
    for ratio, ngram in lc.model.compare_with_bg_model(
            bg_model, 3, min_count=3, smoothing_algorithm=smoothing):
        print "%s\t%s" % ('_'.join(ngram), ratio)
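# Usage sketch (the query value is illustrative, not from the original source):
# the __main__ block above expects a query string and a smoothing-algorithm
# name on the command line. Assuming this module is saved as linkedcorpus.py
# (the name the web UI imports it under), an invocation might look like:
#
#   python linkedcorpus.py "earthquake" lidstone
#
# It prints one tab-separated line per reported n-gram: the '_'-joined n-gram
# followed by its ratio against the background model.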
def the_app(environ, start_response):
    global_init()
    status = '200 OK'
    opts = Opts(environ,
        opt('q', default=''),
        opt('pages', default=2),
        opt('split', default=0),
        opt('simple', default=0),
        opt('max_topics', default=40),
        opt('ncol', default=3),
        opt('save', default=False),
        opt('load', default=False),
        opt('smoothing', default='lidstone'),
        opt('single_query', default=0),
        opt('format', default='dev'),
    )
    print "OPTIONS %s" % (opts,)
    response_headers = [('Content-type', 'text/html')]
    start_response(status, response_headers)

    if opts.single_query:
        # the requery: fetch tweets for a single already-chosen topic
        opts2 = Opts(environ,
            opt('q', str),
            opt('topic_label', str),
            opt('exclude', default=''))
        opts2.exclude = [int(x) for x in opts2.exclude.split()]
        for x in single_query(**opts2):
            yield x
        return

    # Fetch (or reload) tweets for the query and build the linked corpus.
    lc = linkedcorpus.LinkedCorpus()
    tweets_file = 'saved_tweets/save_%s_tweets' % opts.q
    tweet_iter = search.cleaned_results(opts.q, pages=opts.pages,
        key_fn=search.user_and_text_identity,
        save=tweets_file if opts.save else None,
        load=tweets_file if opts.load else None)
    tweet_iter = deduper.merge_multitweets(tweet_iter)
    lc.fill_from_tweet_iter(tweet_iter)
    q_toks = bigrams.tokenize_and_clean(opts.q, True)

    # Extract topics against the background language model, then group
    # near-duplicate tweets within each topic and clean up the topic list.
    res = ranking.extract_topics(lc, background_model, **opts)
    groups_by_tweet_id = deduper.dedupe(lc)
    for topic in res.topics:
        deduper.groupify_topic(topic, groups_by_tweet_id)
    ranking.late_topic_clean(res, max_topics=opts.max_topics)
    ranking.gather_leftover_tweets(res, lc)
    if res.topics and res.topics[-1].groups is None:
        deduper.groupify_topic(res.topics[-1], groups_by_tweet_id)

    # Render per-group tweet HTML and sort each topic's groups newest-first.
    for topic in res.topics:
        topic.tweet_ids = util.myjoin([tw['id'] for tw in topic.tweets])
        for group in topic.groups:
            group.head_html = nice_tweet(group.head, q_toks, topic.label_ngrams)
            group.rest_htmls = [nice_tweet(t, q_toks, topic.label_ngrams)
                                for t in group.rest]
    for topic in res.topics:
        topic.groups.sort(key=lambda g: g.head['created_at'], reverse=True)

    if lc.tweets_by_id:
        earliest = min(tw['created_at'] for tw in lc.tweets_by_id.itervalues())
        time_since_earliest = nice_timedelta(datetime.utcnow() - earliest)
    else:
        time_since_earliest = None

    if opts.format == 'pickle':
        # pickle.dumps(res) is 800k with dump/load = 100ms/60ms;
        # the trimmed json-like version is 150k with dump/load = 5ms/2ms.
        yield pickle.dumps(res)
        return

    if opts.format == 'json':
        topic_info = dict((t.label, {
            'label': t.label,
            'nice_label': nice_label(t.label),
            'tweet_ids': t.tweet_ids,
            'groups': [{'head_html': g.head_html, 'rest_htmls': g.rest_htmls}
                       for g in t.groups],
            'query_refinement': ranking.query_refinement(opts.q, t),
        }) for t in res.topics)
        topic_list = [t.label for t in res.topics]
        results = {
            'topic_list': topic_list,
            'topic_info': topic_info,
            'time_since_earliest': time_since_earliest,
        }
        yield simplejson.dumps(results)
        return

    if opts.format != 'dev':
        raise Exception("bad format")

    # Dev format: emit the interactive HTML page directly.
    for topic in res.topics:
        topic.tweets_html = topic_group_html(topic.groups)
    bigass_topic_dict = dict((t.label, dict(
        label=t.label,
        tweets_html=t.tweets_html,
        tweet_ids=t.tweet_ids,
    )) for t in res.topics)

    yield page_header()
    yield form_area(opts)
    yield "<table><tr>"
    yield "<th>topics"
    if lc.tweets_by_id:
        earliest = min(tw['created_at'] for tw in lc.tweets_by_id.itervalues())
        #latest = max(tw['created_at'] for tw in lc.tweets_by_id.itervalues())
        s = "for %d tweets" % len(lc.tweets_by_id)
        s += " over the last %s" % nice_timedelta(datetime.utcnow() - earliest)
        yield " <small>%s</small>" % s
    yield "<th>tweets"
    yield "<tr><td valign=top id=topic_list>"
    topic_labels = [
        '''<span class="topic_label" onclick="topic_click(this)" topic_label="%s" >%s</span><small> %d, %d</small><br>''' % (
            cgi.escape(topic.label), topic.label,
            topic.group_count, topic.tweet_count)
        for topic in res.topics]
    for x in table_byrow(topic_labels, ncol=opts.ncol):
        yield x
    yield "<td valign=top>"
    yield "<div id=tweets>"
    yield "click on a topic on the left please"
    yield "</div>"
    yield "<div id=tweets_more>"
    yield "</div>"
    yield "</table>"
    yield "<script>"
    yield "topics = "
    yield simplejson.dumps(bigass_topic_dict)
    yield ";"
    yield "load_default_topic();"
    yield "</script>"
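# Serving sketch (an assumption, not part of the original file): the_app is a
# generator-style WSGI application, so it can be tried locally with the
# standard library's wsgiref server. The port and this __main__ guard are
# illustrative; the project's real entry point and deployment setup are not
# shown in this section.
if __name__ == '__main__':
    from wsgiref.simple_server import make_server
    httpd = make_server('', 8080, the_app)  # hypothetical host/port
    print "serving the_app on http://localhost:8080/"
    httpd.serve_forever()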