import re
import pickle

import twokenize
from twokenize import regex_or

# Tokenization is cached per tweet id so repeated analyses skip re-tokenizing.
# This fragment assumes a surrounding loop that defines `tweet` and a
# pickle-backed string-to-string mapping `tok_cache`.
if tweet['id'] in tok_cache:
    #print "CACHE HIT %s" % tweet['text']
    toks = pickle.loads(tok_cache[tweet['id']])
else:
    #print "NEW ANALYSIS %s" % tweet['text']
    toks = tokenize_and_clean(tweet['text'], alignments=True)
    tok_cache[tweet['id']] = pickle.dumps(toks)
tweet['toks'] = toks

mycompile = lambda pat: re.compile(pat, re.UNICODE)

# Junk tokens are a more aggressive cleaning assumption than usual.
JunkTok = mycompile(r'''^[^a-zA-Z0-9_@]+$''')

# Don't make n-grams across phrase boundary markers.
PhraseBoundaryTok = regex_or(r'''[.,“"'?!:;|-]+''', twokenize.Entity)
PhraseBoundaryTok = mycompile('^' + PhraseBoundaryTok + '$')
EdgePunctTok = mycompile('^' + twokenize.EdgePunct + '+$')

def tokenize_and_clean(msg, alignments):
    if alignments:
        toks = twokenize.tokenize(msg)
    else:
        toks = twokenize.simple_tokenize(msg)
    for i in range(len(toks)):
        toks[i] = toks[i].lower()
    inds = range(len(toks))
    #if len(inds) < len(toks): print "dropping junk", sorted(list(toks[i] for i in (set(range(len(toks)))-set(inds))))
    if alignments:
        return toks.subset(inds)
    else:
        # Non-alignment path: return the kept tokens as a plain list.
        return [toks[i] for i in inds]
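# Rough usage sketch (not part of the original code): `tok_cache` can be any
# persistent str-to-str mapping; a shelve file is one plausible backing store.
# The cache filename and example tweet below are illustrative only.
#
# import shelve
# tok_cache = shelve.open('tok_cache.db')
# tweet = {'id': '123', 'text': u'Good morning!! :) #coffee'}
# if tweet['id'] in tok_cache:
#     toks = pickle.loads(tok_cache[tweet['id']])
# else:
#     toks = tokenize_and_clean(tweet['text'], alignments=True)
#     tok_cache[tweet['id']] = pickle.dumps(toks)
# print toks    # lowercased tokens; junk filtering is left to the caller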
    # Tail of a vocabulary-merging routine: fold each word's counts into the
    # counts of its replacement form.
    new_word_counts = defaultdict(int)
    new_doc_counts = defaultdict(int)
    for w in vocab:
        r = replacements.get(w, w)
        new_vocab.add(r)
        new_word_counts[r] += word_counts[w]
        new_doc_counts[r] += doc_counts[w]
    return new_vocab, new_word_counts, new_doc_counts, replacements

cur_user_words = []
last_username = None

Punct = twokenize.regex_or(twokenize.PunctChars, twokenize.Entity, twokenize.EdgePunct, r'[\*]')
Punct_RE = re.compile('^(%s)+$' % Punct, re.I | re.U)

def get_tokens(text):
    toks = twokenize.tokenize(text.lower())
    # Drop @-mentions and retweet markers.
    toks = [t for t in toks if not t.startswith('@') and t != 'rt']
    # Collapse pure-punctuation tokens to a single placeholder, but keep emoticons.
    toks = ["-PUNCT-" if Punct_RE.search(t) and not emoticons.Emoticon_RE.search(t) else t
            for t in toks]
    # toks = [t.replace("#","") for t in toks]
    return toks

# for line in sys.stdin:
#     # print get_tokens(line)
#     print (" ".join(get_tokens(line))).encode('utf-8')
#     # print "\n" + "\n".join(get_tokens(line))
# sys.exit(0)
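# Quick sanity check (hypothetical input, mirroring the commented-out stdin
# loop above): @-mentions and 'rt' are dropped, bare punctuation collapses to
# -PUNCT-, and emoticons pass through untouched.
#
# example = u"RT @user: great game !! :)"
# print " ".join(get_tokens(example)).encode('utf-8')
# # roughly: great game -PUNCT- :)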
import re
import twokenize as tw   # assumed alias: the code below refers to tw.*

# Define our own slightly more limited version of the emoticon regex,
# adapted from twokenize.py.

# Things we need to deal with separately.
our_bfLeft = u"(0|[oO]|[vV]|\\$|[tT]|[xX]|;|@|\\^|\\*)".encode('utf-8')
our_basicface = "(?:" + our_bfLeft + tw.bfCenter + ")|" + tw.s3 + "|" + tw.s4 + "|" + tw.s5

pattern = tw.regex_or(
    # myleott: Standard version  :) :( :] :D :P
    "(?:>|&gt;)?" + tw.regex_or(tw.normalEyes, tw.wink) + tw.regex_or(tw.noseArea, "[Oo]") +
        tw.regex_or(tw.tongue + r"(?=\W|$|RT|rt|Rt)",
                    tw.otherMouths + r"(?=\W|$|RT|rt|Rt)",
                    tw.sadMouths, tw.happyMouths),

    # myleott: reversed version (: D:  use positive lookbehind to remove "(word):"
    # myleott: because eyes on the right side is more ambiguous with the standard usage of : ;
    tw.regex_or("(?<=(?: ))", "(?<=(?:^))") +
        tw.regex_or(tw.sadMouths, tw.happyMouths, tw.otherMouths) +
        tw.noseArea + tw.regex_or(tw.normalEyes, tw.wink) + "(?:<|&lt;)?",

    our_basicface,

    # myleott: o.O and O.o are two of the biggest sources of differences
    # between this and the Java version. One little hack won't hurt...
    tw.oOEmote
)

pattern = unicode(pattern).decode('utf-8')
reg = re.compile(pattern, re.UNICODE)
# print reg

def get_emoticon_count(text):
    """
    Get an approximate number of emoticons contained in the input text.