def rank_and_filter1(linkedcorpus, background_model, q, smoothing, n, **bla):
    # Score candidate ngrams against the background model, then filter out
    # ngrams that carry no information beyond the query itself.
    q_toks = bigrams.tokenize_and_clean(q, alignments=False)
    q_toks = map(tok_norm, q_toks)
    q_toks_set = set(q_toks)
    # Query tokens shouldn't count as stopwords for the trailing-stopword test below.
    stopwords = bigrams.stopwords - q_toks_set
    for ratio, ngram in linkedcorpus.model.compare_with_bg_model(
            background_model, n, min_count=3, smoothing_algorithm=smoothing):
        norm_ngram = [tok_norm(t) for t in ngram]
        if set(norm_ngram) <= q_toks_set:
            #print "reject query-subsumed", norm_ngram
            continue
        #if len(linkedcorpus.index[ngram]) <= 2: continue
        if len(norm_ngram) > 1 and norm_ngram[-1] in stopwords:
            #print "reject effective n-1gram", norm_ngram
            continue
        topic_label = " ".join(ngram)
        tweets = linkedcorpus.index[ngram]
        yield common.Topic(ngram=ngram, label=topic_label,
                           tweets=tweets, ratio=ratio)
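# A minimal usage sketch (not part of the pipeline): take_top is a
# hypothetical helper shown only for illustration, and it assumes
# compare_with_bg_model yields (ratio, ngram) pairs best-first.
def take_top(linkedcorpus, background_model, q, k=10):
    import itertools
    topics = rank_and_filter1(linkedcorpus, background_model, q,
                              smoothing='lidstone', n=2)
    return list(itertools.islice(topics, k))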
toks.append("!END") ngram = list(ngram) K = len(ngram) matching_positions = [ i for i in range(len(toks) - K) if ngram == toks[i:(i + K)] ] starts, ends = [], [] for pos in matching_positions: starts.append(toks.alignments[pos]) ends.append(toks.alignments[pos + K - 1] + len(toks[pos + K - 1])) return starts, ends # def test_overlap(): if __name__ == '__main__': import bigrams for orig in [ "Oracle to buy Sun? What's going to happen to MySQL? JRuby? Glassfish? Postgres seems like a no brainer", "# RT @rickwatson twittersphere blowing up over oracle buying sun, and with it #mysql. LAMP just became LAPP (replaced with postgres)", ]: toks = bigrams.tokenize_and_clean(orig) print highlight(toks, { ('to', 'buy'): ("[[", "]]"), ('buy', 'sun'): ("{{", "}}") }) print highlight(toks, { ('blowing', ): ("[[", "]]"), ('blowing', 'up'): ("{{", "}}") })
        ret += ngrams_and_tags[ngram][1]
    return "".join(ret)

def simple_highlight(toks, ngram, start="<b>", end="</b>"):
    return highlight(toks, {ngram: (start, end)})

def compute_highlight_alignments(toks, ngram):
    # Return character offsets (starts, ends) in the original text for every
    # occurrence of ngram in toks, via the tokenizer's alignment info.
    toks = copy(toks)
    toks.append("!END")  # sentinel so the window below can reach the last real token
    ngram = list(ngram)
    K = len(ngram)
    matching_positions = [i for i in range(len(toks) - K) if ngram == toks[i:(i + K)]]
    starts, ends = [], []
    for pos in matching_positions:
        starts.append(toks.alignments[pos])
        ends.append(toks.alignments[pos + K - 1] + len(toks[pos + K - 1]))
    return starts, ends

# def test_overlap():

if __name__ == '__main__':
    import bigrams
    for orig in [
        "Oracle to buy Sun? What's going to happen to MySQL? JRuby? Glassfish? Postgres seems like a no brainer",
        "# RT @rickwatson twittersphere blowing up over oracle buying sun, and with it #mysql. LAMP just became LAPP (replaced with postgres)",
    ]:
        toks = bigrams.tokenize_and_clean(orig)
        print highlight(toks, {('to', 'buy'): ("[[", "]]"), ('buy', 'sun'): ("{{", "}}")})
        print highlight(toks, {('blowing',): ("[[", "]]"), ('blowing', 'up'): ("{{", "}}")})
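# A minimal sketch of the alignment contract compute_highlight_alignments
# relies on: toks acts like a list of token strings, and toks.alignments[i]
# is the character offset of token i in the original text.  AlignedToks is a
# stand-in for the real tokenizer output, for illustration only.
class AlignedToks(list):
    def __init__(self, text):
        list.__init__(self, text.split())
        self.alignments = []
        pos = 0
        for tok in self:
            pos = text.index(tok, pos)
            self.alignments.append(pos)
            pos += len(tok)

def _demo_alignments():
    toks = AlignedToks("oracle to buy sun")
    print compute_highlight_alignments(toks, ('to', 'buy'))
    # -> ([7], [13]); "oracle to buy sun"[7:13] == "to buy"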
def the_app(environ, start_response):
    global_init()
    status = '200 OK'
    opts = Opts(environ,
        opt('q', default=''),
        opt('pages', default=2),
        opt('split', default=0),
        opt('simple', default=0),
        opt('max_topics', default=40),
        opt('ncol', default=3),
        opt('save', default=False),
        opt('load', default=False),
        opt('smoothing', default='lidstone'),
        opt('single_query', default=0),
        opt('format', default='dev'),
    )
    print "OPTIONS %s" % (opts,)
    response_headers = [('Content-type', 'text/html')]
    start_response(status, response_headers)

    if opts.single_query:
        # the requery
        opts2 = Opts(environ,
            opt('q', str),
            opt('topic_label', str),
            opt('exclude', default=''))
        opts2.exclude = [int(x) for x in opts2.exclude.split()]
        for x in single_query(**opts2):
            yield x
        return

    lc = linkedcorpus.LinkedCorpus()
    tweets_file = 'saved_tweets/save_%s_tweets' % opts.q
    tweet_iter = search.cleaned_results(opts.q,
        pages=opts.pages,
        key_fn=search.user_and_text_identity,
        save=tweets_file if opts.save else None,
        load=tweets_file if opts.load else None)
    tweet_iter = deduper.merge_multitweets(tweet_iter)
    lc.fill_from_tweet_iter(tweet_iter)
    q_toks = bigrams.tokenize_and_clean(opts.q, True)

    res = ranking.extract_topics(lc, background_model, **opts)
    groups_by_tweet_id = deduper.dedupe(lc)
    for topic in res.topics:
        deduper.groupify_topic(topic, groups_by_tweet_id)
    ranking.late_topic_clean(res, max_topics=opts.max_topics)
    ranking.gather_leftover_tweets(res, lc)
    if res.topics and res.topics[-1].groups is None:
        deduper.groupify_topic(res.topics[-1], groups_by_tweet_id)

    for topic in res.topics:
        topic.tweet_ids = util.myjoin([tw['id'] for tw in topic.tweets])
        for group in topic.groups:
            group.head_html = nice_tweet(group.head, q_toks, topic.label_ngrams)
            group.rest_htmls = [nice_tweet(t, q_toks, topic.label_ngrams) for t in group.rest]
    for topic in res.topics:
        topic.groups.sort(key=lambda g: g.head['created_at'], reverse=True)

    if lc.tweets_by_id:
        earliest = min(tw['created_at'] for tw in lc.tweets_by_id.itervalues())
        time_since_earliest = nice_timedelta(datetime.utcnow() - earliest)
    else:
        time_since_earliest = None

    if opts.format == 'pickle':
        # pickle.dumps(res) is 800k with dump/load = 100ms/60ms;
        # the trimmed json-like version is 150k with dump/load = 5ms/2ms.
        yield pickle.dumps(res)
        return
    if opts.format == 'json':
        topic_info = dict((t.label, {
            'label': t.label,
            'nice_label': nice_label(t.label),
            'tweet_ids': t.tweet_ids,
            'groups': [{'head_html': g.head_html, 'rest_htmls': g.rest_htmls} for g in t.groups],
            'query_refinement': ranking.query_refinement(opts.q, t),
        }) for t in res.topics)
        topic_list = [t.label for t in res.topics]
        results = {
            'topic_list': topic_list,
            'topic_info': topic_info,
            'time_since_earliest': time_since_earliest,
        }
        yield simplejson.dumps(results)
        return
    if opts.format != 'dev':
        raise Exception("bad format")

    for topic in res.topics:
        topic.tweets_html = topic_group_html(topic.groups)
    bigass_topic_dict = dict((t.label, dict(
        label=t.label,
        tweets_html=t.tweets_html,
        tweet_ids=t.tweet_ids,
    )) for t in res.topics)

    yield page_header()
    yield form_area(opts)
    yield "<table><tr>"
    yield "<th>topics"
    if lc.tweets_by_id:
        earliest = min(tw['created_at'] for tw in lc.tweets_by_id.itervalues())
        #latest = max(tw['created_at'] for tw in lc.tweets_by_id.itervalues())
        s = "for %d tweets" % len(lc.tweets_by_id)
        s += " over the last %s" % nice_timedelta(datetime.utcnow() - earliest)
        yield " <small>%s</small>" % s
    yield "<th>tweets"
    yield "<tr><td valign=top id=topic_list>"
    topic_labels = ['''<span class="topic_label" onclick="topic_click(this)"
            topic_label="%s"
            >%s</span><small> %d, %d</small><br>''' % (
            cgi.escape(topic.label), topic.label, topic.group_count, topic.tweet_count)
        for topic in res.topics]
    for x in table_byrow(topic_labels, ncol=opts.ncol):
        yield x
    yield "<td valign=top>"
    yield "<div id=tweets>"
    yield "click on a topic on the left please"
    yield "</div>"
    yield "<div id=tweets_more>"
    yield "</div>"
    yield "</table>"
    yield "<script>"
    yield "topics = "
    yield simplejson.dumps(bigass_topic_dict)
    yield ";"
    yield "load_default_topic();"
    yield "</script>"
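# A minimal way to serve the_app locally, assuming the WSGI entry point above
# is the whole interface; the port is an arbitrary choice for development.
def run_dev_server(port=8888):
    from wsgiref.simple_server import make_server
    httpd = make_server('', port, the_app)
    print "serving on http://localhost:%d/" % port
    httpd.serve_forever()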