def choose_multi_label(labels, lang_model):
    """Pick the multi-label (a tuple of n-grams) used to represent ``labels``.

    The label chosen by ``util.argmax`` with ``len`` as the scorer
    (presumably the longest one) decides the strategy:

    * more than 3 items: the trigram of that label selected by
      ``util.argmax`` under ``lang_model.lidstone``, wrapped in a 1-tuple;
    * exactly 3 items: that label itself, wrapped in a 1-tuple;
    * 2 items or fewer: the top-ranked combination of labels whose
      flattened length is at most 3.

    :param labels: collection of n-gram labels (each one supports ``len``).
    :param lang_model: object exposing ``lidstone(ngram)`` as a score.
    :returns: a tuple of n-grams.
    """
    longest = util.argmax(labels, scorer=len)
    size = len(longest)

    if size > 3:
        return (util.argmax(bigrams.trigrams(longest), lang_model.lidstone),)
    if size == 3:
        return (longest,)

    # From here on size <= 2; len() cannot produce any other value.
    #
    # Candidate pool: every label on its own, plus whatever
    # bigrams.bigrams / bigrams.trigrams build over the label sequence.
    # NOTE: this is a crude candidate set; all possible skip n-grams over
    # the labels (O(N^2) of them?) would be preferable.
    singles = [(tuple(label),) for label in labels]
    pool = singles + bigrams.bigrams(labels) + bigrams.trigrams(labels)
    assert pool
    pool = [cand for cand in pool if len(util.flatten(cand)) <= 3]

    def rank_key(ngrams):
        # Summing the sub-label scores behaved oddly, so candidates are
        # compared lexicographically on their scores, best first. The
        # zero padding is added *before* sorting, so zeros are ordered
        # together with the real scores.
        padded = [lang_model.lidstone(ng) for ng in ngrams]
        padded += [0] * (3 - len(padded))
        return sorted(padded, reverse=True)

    return sorted(pool, key=rank_key, reverse=True)[0]
def _nice_tweet(tweet, q_toks, topic_ngrams):
    """Render one tweet as an HTML fragment: highlighted text, then authors.

    :param tweet: mapping with a ``'toks'`` entry; it is also read for
        ``'from_user'`` and ``'id'`` unless it carries an ``'orig_tweets'``
        list, in which case those sub-tweets supply the author links.
    :param q_toks: tokens passed to ``bigrams.bigrams`` /
        ``bigrams.unigrams`` (presumably the query tokens) to build the
        query n-grams to highlight.
    :param topic_ngrams: n-grams highlighted with the ``topic_hl`` class.
    :returns: the HTML string.
    """
    topic_wrap = ('<span class="topic_hl">', '</span>')
    query_wrap = ('<span class="q_hl">', '</span>')

    hl_spec = dict.fromkeys(topic_ngrams, topic_wrap)

    bigram_set = set(bigrams.bigrams(q_toks))
    unigram_set = set(bigrams.unigrams(q_toks))
    for gram in list(bigram_set) + list(unigram_set):
        if len(gram) == 1:
            token = gram[0]
            # Single tokens are skipped when they are super-stopwords or
            # already appear inside one of the topic n-grams.
            if token in bigrams.super_stopwords:
                continue
            if any(token in ng for ng in topic_ngrams):
                continue
        elif len(gram) >= 2:
            # Longer grams are skipped when kmp.isSubseq matches them
            # against a topic n-gram.
            if any(kmp.isSubseq(gram, ng) for ng in topic_ngrams):
                continue
        hl_spec[gram] = query_wrap

    body = highlighter.highlight(tweet['toks'], hl_spec)
    body = linkify(body, klass='t')
    body = At.gsub(
        body,
        r'<a class="at" target="_blank" href="http://twitter.com/\2">@\2</a>')

    html = '<span class="text">' + body + "</span>" + " " + '<span class="authors">'

    if 'orig_tweets' in tweet:
        authors = tweet['orig_tweets']
        html += "%d authors:" % len(authors)
    else:
        authors = (tweet,)

    for sub in authors:
        screen_name = sub['from_user']
        status_url = "http://twitter.com/%s/status/%s" % (screen_name, sub['id'])
        html += " "
        # NOTE: it is unclear why both values have to go through
        # util.stringify (apparently an encode step) before interpolation.
        html += '<a class="m" target="_blank" href="%s">%s</a>' % (
            util.stringify(status_url), util.stringify(screen_name))

    html += '</span>'
    return html
def choose_multi_label(labels, lang_model):
    """Select a multi-label, i.e. a tuple of n-grams, for a group of labels.

    ``util.argmax(labels, scorer=len)`` picks a reference label
    (presumably the longest). Depending on its length the result is:

    * length <= 2: the best combination of labels, built from single
      labels plus ``bigrams.bigrams(labels)`` and
      ``bigrams.trigrams(labels)``, limited to a flattened length of 3;
    * length == 3: a 1-tuple holding the reference label;
    * length > 3: a 1-tuple holding the trigram of the reference label
      that ``util.argmax`` selects using ``lang_model.lidstone``.

    :param labels: collection of n-gram labels.
    :param lang_model: provides ``lidstone(ngram)`` scores.
    :returns: a tuple of n-grams.
    """
    def sort_key(multilabel):
        """Scores of the sub-labels, zero-padded to three, best first."""
        values = [lang_model.lidstone(part) for part in multilabel]
        missing = 3 - len(values)
        # A sum of the scores was too odd a criterion; a lexicographic
        # comparison of the descending scores is used instead. Padding
        # happens before the sort so zeros are ranked with real scores.
        return sorted(values + [0] * missing, reverse=True)

    top_label = util.argmax(labels, scorer=len)
    length = len(top_label)

    if length <= 2:
        # A rather crude option set; ideally this would be every skip
        # n-gram over the labels (O(N^2) of them?).
        options = [(tuple(lbl),) for lbl in labels]
        options = options + bigrams.bigrams(labels) + bigrams.trigrams(labels)
        assert options
        options = [opt for opt in options if len(util.flatten(opt)) <= 3]
        options.sort(key=sort_key, reverse=True)
        return options[0]

    if length == 3:
        return (top_label,)

    # length > 3: fall back to a single trigram of the reference label.
    return (util.argmax(bigrams.trigrams(top_label), lang_model.lidstone),)
def _nice_tweet(tweet, q_toks, topic_ngrams):
    """Build the HTML snippet for a tweet: highlighted text plus author links.

    :param tweet: mapping providing ``'toks'``; author links come from its
        ``'orig_tweets'`` entries when that key is present, otherwise from
        the tweet's own ``'from_user'`` and ``'id'``.
    :param q_toks: tokens from which query unigrams and bigrams are
        derived (presumably the search query).
    :param topic_ngrams: n-grams wrapped in ``topic_hl`` spans.
    :returns: the HTML string.
    """
    topic_markup = ('<span class="topic_hl">', '</span>')
    query_markup = ('<span class="q_hl">', '</span>')

    def is_suppressed(gram):
        """True when a query gram must not receive its own highlight."""
        if len(gram) == 1:
            word = gram[0]
            # stopwords, or words already contained in a topic n-gram
            return (word in bigrams.super_stopwords
                    or any(word in ng for ng in topic_ngrams))
        if len(gram) >= 2:
            return any(kmp.isSubseq(gram, ng) for ng in topic_ngrams)
        return False

    hl_spec = dict((ng, topic_markup) for ng in topic_ngrams)
    grams = list(set(bigrams.bigrams(q_toks))) + list(set(bigrams.unigrams(q_toks)))
    for gram in grams:
        if not is_suppressed(gram):
            hl_spec[gram] = query_markup

    marked = highlighter.highlight(tweet['toks'], hl_spec)
    marked = linkify(marked, klass='t')
    at_link = r'<a class="at" target="_blank" href="http://twitter.com/\2">@\2</a>'
    marked = At.gsub(marked, at_link)

    out = '<span class="text">' + marked
    out = out + "</span>" + " "
    out = out + '<span class="authors">'

    has_originals = 'orig_tweets' in tweet
    if has_originals:
        out += "%d authors:" % len(tweet['orig_tweets'])
    sources = tweet['orig_tweets'] if has_originals else (tweet,)

    link_template = '<a class="m" target="_blank" href="%s">%s</a>'
    for source in sources:
        author = source['from_user']
        permalink = "http://twitter.com/%s/status/%s" % (author, source['id'])
        out += " "
        # NOTE: the need to pass these through util.stringify (seemingly
        # an encode step) before formatting is not understood.
        out += link_template % (util.stringify(permalink), util.stringify(author))

    return out + '</span>'