Exemple #1
0
def rank_and_filter1(linkedcorpus, background_model, q, smoothing, n, **bla):
  """Yield a common.Topic for every candidate n-gram that survives the filters.

  Candidates are the (ratio, ngram) pairs produced by
  linkedcorpus.model.compare_with_bg_model (called with min_count=3), taken
  in the order that call produces them.  After normalising tokens with
  tok_norm, a candidate is dropped when:
    * every one of its tokens already occurs in the normalised query q, or
    * it has more than one token and its last token is a stopword that is
      not itself a query term.

  Extra keyword arguments (**bla) are accepted and ignored.
  """
  query_terms = set(map(tok_norm, bigrams.tokenize_and_clean(q, alignments=False)))
  # Stopwords that appear in the query are not treated as stopwords below.
  effective_stopwords = bigrams.stopwords - query_terms
  candidates = linkedcorpus.model.compare_with_bg_model(
      background_model, n, min_count=3, smoothing_algorithm=smoothing)
  for ratio, ngram in candidates:
    normalized = [tok_norm(tok) for tok in ngram]
    covered_by_query = set(normalized) <= query_terms
    if covered_by_query:
      continue
    # A multi-token n-gram ending in a stopword is effectively an (n-1)-gram.
    trailing_stopword = len(normalized) > 1 and normalized[-1] in effective_stopwords
    if trailing_stopword:
      continue
    yield common.Topic(
        ngram=ngram,
        label=" ".join(ngram),
        tweets=linkedcorpus.index[ngram],
        ratio=ratio)
Exemple #2
0
def rank_and_filter1(linkedcorpus, background_model, q, smoothing, n, **bla):
    """Generate common.Topic objects for n-grams worth showing for query q.

    Iterates the (ratio, ngram) pairs returned by
    linkedcorpus.model.compare_with_bg_model(background_model, n,
    min_count=3, smoothing_algorithm=smoothing) and skips an n-gram when,
    after tok_norm normalisation of its tokens:
      * all of its tokens are already part of the normalised query, or
      * it is longer than one token and ends in a stopword that the query
        does not contain.

    Each surviving n-gram is yielded as a Topic carrying the n-gram, a
    space-joined label, the tweets from linkedcorpus.index and the ratio.
    Additional keyword arguments (**bla) are ignored.
    """
    raw_query_toks = bigrams.tokenize_and_clean(q, alignments=False)
    query_vocab = set(tok_norm(t) for t in raw_query_toks)
    non_query_stopwords = bigrams.stopwords - query_vocab
    ranked = linkedcorpus.model.compare_with_bg_model(
        background_model, n, min_count=3, smoothing_algorithm=smoothing)
    for ratio, ngram in ranked:
        normed = [tok_norm(t) for t in ngram]
        # Reject n-grams that add nothing beyond the query itself.
        if query_vocab.issuperset(normed):
            continue
        # Reject n-grams whose final token is only a stopword.
        if len(normed) > 1 and normed[-1] in non_query_stopwords:
            continue
        label = " ".join(ngram)
        matching_tweets = linkedcorpus.index[ngram]
        yield common.Topic(ngram=ngram, label=label,
                           tweets=matching_tweets, ratio=ratio)
    toks.append("!END")
    ngram = list(ngram)
    K = len(ngram)
    matching_positions = [
        i for i in range(len(toks) - K) if ngram == toks[i:(i + K)]
    ]
    starts, ends = [], []
    for pos in matching_positions:
        starts.append(toks.alignments[pos])
        ends.append(toks.alignments[pos + K - 1] + len(toks[pos + K - 1]))
    return starts, ends


# def test_overlap():

if __name__ == '__main__':
    # Manual smoke test: tokenize two sample tweets and print each one
    # highlighted with two different ngram -> (open, close) tag mappings.
    import bigrams
    sample_texts = [
        "Oracle to buy Sun? What's going to happen to MySQL? JRuby? Glassfish? Postgres seems like a no brainer",
        "# RT @rickwatson twittersphere blowing up over oracle buying sun, and with it #mysql. LAMP just became LAPP (replaced with postgres)",
    ]
    for orig in sample_texts:
        toks = bigrams.tokenize_and_clean(orig)
        # Overlapping bigrams.
        buy_tags = {
            ('to', 'buy'): ("[[", "]]"),
            ('buy', 'sun'): ("{{", "}}")
        }
        print(highlight(toks, buy_tags))
        # A unigram nested inside a bigram.
        blowing_tags = {
            ('blowing', ): ("[[", "]]"),
            ('blowing', 'up'): ("{{", "}}")
        }
        print(highlight(toks, blowing_tags))
Exemple #4
0
      ret += ngrams_and_tags[ngram][1]
  return "".join(ret)

def simple_highlight(toks, ngram, start="<b>", end="</b>"):
  """Highlight a single ngram in toks, wrapping matches in start/end.

  Convenience wrapper around highlight() for the one-ngram case; the tags
  default to an HTML bold pair.
  """
  tag_pair = (start, end)
  single_ngram_tags = {ngram: tag_pair}
  return highlight(toks, single_ngram_tags)

def compute_highlight_alignments(toks, ngram):
  """Find every occurrence of ngram in toks and return its start/end offsets.

  Args:
    toks: a list-like token sequence that also carries an ``alignments``
      attribute indexed in parallel with the tokens.  ``alignments[i]`` is
      used as the start offset of token i (presumably a character offset
      into the original text -- confirm against the tokenizer).
    ngram: an iterable of tokens to search for as a contiguous run.

  Returns:
    (starts, ends): two parallel lists.  For each match, the start is the
    alignment of the first matched token and the end is the alignment of
    the last matched token plus that token's length.  Matches are reported
    in left-to-right order and may overlap.  An empty ngram matches
    nothing and yields ([], []).

  The input sequence is not modified.
  """
  ngram = list(ngram)
  K = len(ngram)
  if K == 0:
    # An empty ngram would otherwise "match" at every position and index
    # alignments[-1] / past the end, producing bogus spans or an IndexError.
    return [], []
  starts, ends = [], []
  # The last window that can hold K tokens starts at len(toks) - K.
  for pos in range(len(toks) - K + 1):
    if ngram == toks[pos:(pos + K)]:
      last = pos + K - 1
      starts.append(toks.alignments[pos])
      ends.append(toks.alignments[last] + len(toks[last]))
  return starts, ends

# def test_overlap():
  

if __name__=='__main__':
  # Ad-hoc demo: run highlight() over two sample tweets, once with a pair of
  # overlapping bigrams and once with a unigram nested inside a bigram.
  import bigrams
  demo_tweets = [
      "Oracle to buy Sun? What's going to happen to MySQL? JRuby? Glassfish? Postgres seems like a no brainer",
      "# RT @rickwatson twittersphere blowing up over oracle buying sun, and with it #mysql. LAMP just became LAPP (replaced with postgres)",
      ]
  square, curly = ("[[","]]"), ("{{","}}")
  for orig in demo_tweets:
    toks = bigrams.tokenize_and_clean(orig)
    overlapping = {('to','buy'): square, ('buy','sun'): curly}
    print(highlight(toks, overlapping))
    nested = {('blowing',): square, ('blowing','up'): curly}
    print(highlight(toks, nested))
Exemple #5
0
def the_app(environ, start_response):
    """WSGI-style application: search tweets for a query and render topics.

    This is a generator.  It parses request options from ``environ``, calls
    ``start_response`` once with ``200 OK`` and a ``text/html`` content
    type, and then yields the response body in chunks.

    Behaviour by option:
      * ``single_query`` truthy: delegate to ``single_query`` with the
        ``q`` / ``topic_label`` / ``exclude`` options and stop.
      * ``format == 'pickle'``: yield ``pickle.dumps`` of the result object.
      * ``format == 'json'``: yield one JSON document with ``topic_list``,
        ``topic_info`` and ``time_since_earliest``.
      * ``format == 'dev'`` (default): yield an HTML page with a topic
        table and an inline script holding the per-topic tweet HTML.

    Raises:
        Exception: ``"bad format"`` for any other ``format`` value.  Since
            this is a generator, it is raised during iteration, after
            ``start_response`` has already been called.
    """
    global_init()
    status = '200 OK'

    opts = Opts(
        environ,
        opt('q', default=''),
        opt('pages', default=2),
        opt('split', default=0),
        opt('simple', default=0),
        opt('max_topics', default=40),
        opt('ncol', default=3),
        opt('save', default=False),
        opt('load', default=False),
        opt('smoothing', default='lidstone'),
        opt('single_query', default=0),
        opt('format', default='dev'),
    )

    print("OPTIONS %s" % (opts, ))

    # NOTE(review): headers are sent before ``format`` is inspected, so the
    # json and pickle responses are also labelled text/html.
    response_headers = [('Content-type', 'text/html')]
    start_response(status, response_headers)

    if opts.single_query:
        # the requery
        opts2 = Opts(environ, opt('q', str), opt('topic_label', str),
                     opt('exclude', default=''))
        # ``exclude`` arrives as a whitespace-separated string of integers.
        opts2.exclude = [int(x) for x in opts2.exclude.split()]
        for x in single_query(**opts2):
            yield x
        return

    lc = linkedcorpus.LinkedCorpus()
    # NOTE(review): opts.q is request-supplied and is interpolated into a
    # filesystem path without sanitising.  The path is only handed to the
    # search layer when save/load is requested -- confirm those options are
    # not reachable by untrusted clients.
    tweets_file = 'saved_tweets/save_%s_tweets' % opts.q
    tweet_iter = search.cleaned_results(
        opts.q,
        pages=opts.pages,
        key_fn=search.user_and_text_identity,
        save=tweets_file if opts.save else None,
        load=tweets_file if opts.load else None)
    tweet_iter = deduper.merge_multitweets(tweet_iter)
    lc.fill_from_tweet_iter(tweet_iter)
    q_toks = bigrams.tokenize_and_clean(opts.q, True)
    # ``background_model`` is a module-level name (presumably prepared by
    # global_init() -- confirm).
    res = ranking.extract_topics(lc, background_model, **opts)
    groups_by_tweet_id = deduper.dedupe(lc)
    for topic in res.topics:
        deduper.groupify_topic(topic, groups_by_tweet_id)
    ranking.late_topic_clean(res, max_topics=opts.max_topics)
    ranking.gather_leftover_tweets(res, lc)
    # The last topic may not have been grouped yet (presumably the leftover
    # topic added just above); group it so every topic has ``groups``.
    if res.topics and res.topics[-1].groups is None:
        deduper.groupify_topic(res.topics[-1], groups_by_tweet_id)
    for topic in res.topics:
        topic.tweet_ids = util.myjoin([tw['id'] for tw in topic.tweets])
        for group in topic.groups:
            group.head_html = nice_tweet(group.head, q_toks,
                                         topic.label_ngrams)
            group.rest_htmls = [
                nice_tweet(t, q_toks, topic.label_ngrams) for t in group.rest
            ]
    # Newest group first within each topic.
    for topic in res.topics:
        topic.groups.sort(key=lambda g: g.head['created_at'], reverse=True)
    if lc.tweets_by_id:
        earliest = min(tw['created_at'] for tw in lc.tweets_by_id.itervalues())
        time_since_earliest = nice_timedelta(datetime.utcnow() - earliest)
    else:
        time_since_earliest = None

    if opts.format == 'pickle':
        # pickle.dumps(res) is 800k with dump/load = 100ms/60ms
        # trimmed json-like version is 150k with dump/load = 5ms/2ms.
        yield pickle.dumps(res)
        return
    if opts.format == 'json':
        topic_info = dict((t.label, {
            'label':
            t.label,
            'nice_label':
            nice_label(t.label),
            'tweet_ids':
            t.tweet_ids,
            'groups': [{
                'head_html': g.head_html,
                'rest_htmls': g.rest_htmls
            } for g in t.groups],
            'query_refinement':
            ranking.query_refinement(opts.q, t),
        }) for t in res.topics)
        topic_list = [t.label for t in res.topics]
        results = {
            'topic_list': topic_list,
            'topic_info': topic_info,
            'time_since_earliest': time_since_earliest,
        }
        yield simplejson.dumps(results)
        return
    if opts.format != 'dev': raise Exception("bad format")

    for topic in res.topics:
        topic.tweets_html = topic_group_html(topic.groups)
    bigass_topic_dict = dict((t.label,
                              dict(
                                  label=t.label,
                                  tweets_html=t.tweets_html,
                                  tweet_ids=t.tweet_ids,
                              )) for t in res.topics)

    yield page_header()
    yield form_area(opts)
    yield "<table><tr>"
    yield "<th>topics"
    if lc.tweets_by_id:
        # Reuse the time span computed above instead of rescanning every
        # tweet for the earliest timestamp a second time.
        s = "for %d tweets" % len(lc.tweets_by_id)
        s += " over the last %s" % time_since_earliest
        yield " <small>%s</small>" % s

    yield "<th>tweets"
    yield "<tr><td valign=top id=topic_list>"

    # Topic labels are built from tweet text, so escape them both inside
    # the double-quoted attribute (quote=True also escapes '"') and in the
    # element body; previously the body was emitted unescaped.
    topic_labels = [
        '''<span class="topic_label" onclick="topic_click(this)" topic_label="%s"
  >%s</span><small>&nbsp;%d,&thinsp;%d</small><br>''' % (
            cgi.escape(topic.label, quote=True), cgi.escape(topic.label),
            topic.group_count, topic.tweet_count)
        for topic in res.topics
    ]
    for x in table_byrow(topic_labels, ncol=opts.ncol):
        yield x

    yield "<td valign=top>"
    yield "<div id=tweets>"
    yield "click on a topic on the left please"
    yield "</div>"
    yield "<div id=tweets_more>"
    yield "</div>"
    yield "</table>"
    yield "<script>"

    yield "topics = "
    yield simplejson.dumps(bigass_topic_dict)
    yield ";"
    yield "load_default_topic();"
    yield "</script>"
Exemple #6
0
def the_app(environ, start_response):
  """WSGI-style generator that serves the tweet topic browser.

  Reads request options from ``environ`` via Opts, calls ``start_response``
  exactly once ('200 OK', Content-type text/html) and then yields the body
  piece by piece.

  Request modes:
    * single_query set: yield whatever ``single_query`` produces for the
      q / topic_label / exclude options, then stop.
    * format 'pickle': yield the pickled result object.
    * format 'json': yield a JSON object with the keys 'topic_list',
      'topic_info' and 'time_since_earliest'.
    * format 'dev' (the default): yield an HTML page listing the topics,
      followed by a script block holding the per-topic tweet HTML.

  Raises:
    Exception("bad format") for any other format value; because this is a
    generator it is raised while iterating, after start_response was called.
  """
  global_init()
  status = '200 OK'

  opts = Opts(environ,
      opt('q', default=''),
      opt('pages', default=2),
      opt('split', default=0),
      opt('simple', default=0),
      opt('max_topics', default=40),
      opt('ncol', default=3),
      opt('save', default=False),
      opt('load', default=False),
      opt('smoothing', default='lidstone'),
      opt('single_query', default=0),
      opt('format', default='dev'),
      )

  print("OPTIONS %s" % (opts,))

  # NOTE(review): the content type is fixed before the format is examined,
  # so json and pickle output is also sent as text/html.
  response_headers = [('Content-type','text/html')]
  start_response(status, response_headers)

  if opts.single_query:
    # the requery
    opts2 = Opts(environ, opt('q',str), opt('topic_label',str), opt('exclude',default=''))
    # 'exclude' is a whitespace-separated list of integers.
    opts2.exclude = [int(x) for x in opts2.exclude.split()]
    for x in single_query(**opts2):
      yield x
    return

  lc = linkedcorpus.LinkedCorpus()
  # NOTE(review): the query string comes from the request and goes into a
  # file path unsanitised; it is only passed on when save/load is set.
  # Confirm save/load cannot be triggered by untrusted clients.
  tweets_file = 'saved_tweets/save_%s_tweets' % opts.q
  tweet_iter = search.cleaned_results(opts.q,
      pages = opts.pages,
      key_fn = search.user_and_text_identity,
      save = tweets_file if opts.save else None,
      load = tweets_file if opts.load else None
  )
  tweet_iter = deduper.merge_multitweets(tweet_iter)
  lc.fill_from_tweet_iter(tweet_iter)
  q_toks = bigrams.tokenize_and_clean(opts.q, True)
  # background_model is a module-level name (presumably set up by
  # global_init() -- confirm).
  res = ranking.extract_topics(lc, background_model, **opts)
  groups_by_tweet_id = deduper.dedupe(lc)
  for topic in res.topics:
    deduper.groupify_topic(topic, groups_by_tweet_id)
  ranking.late_topic_clean(res, max_topics=opts.max_topics)
  ranking.gather_leftover_tweets(res, lc)
  # The final topic can still be ungrouped at this point (presumably the
  # leftover topic just gathered); make sure it has groups too.
  if res.topics and res.topics[-1].groups is None:
    deduper.groupify_topic(res.topics[-1], groups_by_tweet_id)
  for topic in res.topics:
    topic.tweet_ids = util.myjoin([tw['id'] for tw in topic.tweets])
    for group in topic.groups:
      group.head_html = nice_tweet(group.head, q_toks, topic.label_ngrams)
      group.rest_htmls = [nice_tweet(t,q_toks,topic.label_ngrams) for t in group.rest]
  # Most recent group first.
  for topic in res.topics:
    topic.groups.sort(key=lambda g: g.head['created_at'], reverse=True)
  if lc.tweets_by_id:
    earliest = min(tw['created_at'] for tw in lc.tweets_by_id.itervalues())
    time_since_earliest = nice_timedelta(datetime.utcnow() - earliest)
  else:
    time_since_earliest = None

  if opts.format == 'pickle':
    # pickle.dumps(res) is 800k with dump/load = 100ms/60ms
    # trimmed json-like version is 150k with dump/load = 5ms/2ms.
    yield pickle.dumps(res)
    return
  if opts.format == 'json':
    topic_info = dict( (t.label,
       {
         'label': t.label,
         'nice_label': nice_label(t.label),
         'tweet_ids': t.tweet_ids,
         'groups': [{'head_html':g.head_html, 'rest_htmls':g.rest_htmls} for g in t.groups],
         'query_refinement': ranking.query_refinement(opts.q, t),
       })
        for t in res.topics)
    topic_list = [t.label for t in res.topics]
    results = {'topic_list':topic_list, 'topic_info': topic_info, 'time_since_earliest': time_since_earliest,}
    yield simplejson.dumps(results)
    return
  if opts.format != 'dev': raise Exception("bad format")

  for topic in res.topics:
    topic.tweets_html = topic_group_html(topic.groups)
  bigass_topic_dict = dict((t.label, dict(
    label= t.label,
    tweets_html= t.tweets_html,
    tweet_ids= t.tweet_ids,
  )) for t in res.topics)

  yield page_header()
  yield form_area(opts)
  yield "<table><tr>"
  yield "<th>topics"
  if lc.tweets_by_id:
    # The time span was already computed above; reuse it rather than
    # scanning all tweets for the earliest timestamp again.
    s=  "for %d tweets" % len(lc.tweets_by_id)
    s+= " over the last %s" % time_since_earliest
    yield " <small>%s</small>" % s

  yield "<th>tweets"
  yield "<tr><td valign=top id=topic_list>"

  # Labels derive from tweet text: escape them for the double-quoted
  # attribute (quote=True covers '"') and for the element body, which used
  # to be written out raw.
  topic_labels = ['''<span class="topic_label" onclick="topic_click(this)" topic_label="%s"
  >%s</span><small>&nbsp;%d,&thinsp;%d</small><br>''' % (
    cgi.escape(topic.label, quote=True), cgi.escape(topic.label),
    topic.group_count, topic.tweet_count )
                  for topic in res.topics]
  for x in table_byrow(topic_labels, ncol=opts.ncol):
    yield x

  yield "<td valign=top>"
  yield "<div id=tweets>"
  yield "click on a topic on the left please"
  yield "</div>"
  yield "<div id=tweets_more>"
  yield "</div>"
  yield "</table>"
  yield "<script>"

  yield "topics = "
  yield simplejson.dumps(bigass_topic_dict)
  yield ";"
  yield "load_default_topic();"
  yield "</script>"