def add_tweet(self, tweet):
    """Register ``tweet`` and post it under each of its filtered n-grams.

    Effects, in the order they happen:

    * ``self.tweets_by_id[tweet['id']]`` is set to the tweet.
    * ``self.model.info['big_n']`` grows by the number of tokens in
      ``tweet['toks']``.
    * The de-duplicated results of ``bigrams.filtered_unigrams``,
      ``bigrams.filtered_bigrams`` and ``bigrams.filtered_trigrams`` are
      stored on the tweet as ``tweet['unigrams']``, ``tweet['bigrams']`` and
      ``tweet['trigrams']``.
    * Every n-gram in those sets is passed to ``self.model.add`` and the
      tweet is appended to ``self.index[ngram]``.
    * Every bigram is also appended to ``self.bigram_index`` under the keys
      ``(first, None)`` and ``(None, second)``.

    NOTE(review): ``self.index`` and ``self.bigram_index`` are indexed without
    a prior membership check, so they presumably create missing entries on
    access (e.g. ``defaultdict(list)``) -- confirm against the constructor.
    """
    self.tweets_by_id[tweet['id']] = tweet
    tokens = tweet['toks']
    self.model.info['big_n'] += len(tokens)

    def post(ngram):
        # Shared step for all n-gram orders: count it, then list this tweet
        # under it.
        self.model.add(ngram)
        self.index[ngram].append(tweet)

    unigram_set = set(bigrams.filtered_unigrams(tokens))
    tweet['unigrams'] = unigram_set
    for unigram in unigram_set:
        post(unigram)

    bigram_set = set(bigrams.filtered_bigrams(tokens))
    tweet['bigrams'] = bigram_set
    for bigram in bigram_set:
        post(bigram)
        # Half-open keys: the bigram is filed once by its first element and
        # once by its second.
        self.bigram_index[(bigram[0], None)].append(bigram)
        self.bigram_index[(None, bigram[1])].append(bigram)

    tweet['trigrams'] = set(bigrams.filtered_trigrams(tokens))
    for trigram in tweet['trigrams']:
        post(trigram)
# Feature extraction filter (Python 2 script).
#
# Reads tab-separated records from stdin.  The last field of each record is
# taken as the tweet text; it is tokenized, normalised and turned into a
# space-separated list of unigram and bigram features.  Each record is written
# back to stdout with its leading fields unchanged and the last field replaced
# by that feature list.
import sys
sys.path.insert(0, '/usr2/corpora/tweets/tweetmotif')
import twokenize, util, bigrams
util.fix_stdio()
from sane_re import *

# Matches an apostrophe, one space, then one of s/t/m, where the whole thing
# is delimited by a space or a string boundary on each side (e.g. "it ' s").
AposFix = _R(r"( |^)(' [stm])( |$)")

for line in sys.stdin:
    # Remove the line terminator only when one is present.  The previous
    # unconditional ``line[:-1]`` also removed the final character of a last
    # line that had no terminator.  ``splitlines()`` returns [""] exactly when
    # the one-character string is a line boundary, so terminated lines are
    # handled as before.
    if line[-1:].splitlines() == [""]:
        line = line[:-1]

    parts = util.unicodify(line).split("\t")
    text = parts[-1]

    toked = " ".join(twokenize.simple_tokenize(text))

    # Re-attach the detached suffix by deleting the space inside the middle
    # group while keeping both delimiters.
    # NOTE(review): assumes m[i] is capture group i of the sane_re match and
    # that gsub replaces every match -- confirm against sane_re.
    featstr = AposFix.gsub(toked, lambda m: m[1] + m[2].replace(" ", "") + m[3])
    norm_toks = featstr.lower().split()

    # Unigram features use the first element of each filtered unigram; bigram
    # features join their elements with "_".
    feats = [ug[0] for ug in bigrams.filtered_unigrams(norm_toks)]
    feats += ["_".join(ng) for ng in bigrams.filtered_bigrams(norm_toks)]

    # When the record has a single field, parts[:-1] is empty and the output
    # line starts with a tab.  The single parenthesised argument prints the
    # same value under the Python 2 print statement.
    print("\t".join(parts[:-1]) + "\t" + util.unicodify(" ".join(feats)))
# Feature extraction filter (Python 2 script).
#
# Reads tab-separated records from stdin.  The last field of each record is
# taken as the tweet text; it is tokenized, normalised and turned into a
# space-separated list of unigram and bigram features.  Each record is written
# back to stdout with its leading fields unchanged and the last field replaced
# by that feature list.
import sys
sys.path.insert(0,'/usr2/corpora/tweets/tweetmotif')
import twokenize,util,bigrams
util.fix_stdio()
from sane_re import *

# Matches an apostrophe, one space, then one of s/t/m, where the whole thing
# is delimited by a space or a string boundary on each side (e.g. "it ' s").
AposFix = _R(r"( |^)(' [stm])( |$)")

for line in sys.stdin:
    # Remove the line terminator only when one is present.  The previous
    # unconditional ``line[:-1]`` also removed the final character of a last
    # line that had no terminator.  ``splitlines()`` returns [""] exactly when
    # the one-character string is a line boundary, so terminated lines are
    # handled as before.
    if line[-1:].splitlines() == [""]:
        line = line[:-1]

    parts = util.unicodify(line).split("\t")
    text = parts[-1]

    toked = " ".join(twokenize.simple_tokenize(text))

    # Re-attach the detached suffix by deleting the space inside the middle
    # group while keeping both delimiters.
    # NOTE(review): assumes m[i] is capture group i of the sane_re match and
    # that gsub replaces every match -- confirm against sane_re.
    featstr = AposFix.gsub(toked, lambda m: m[1]+m[2].replace(" ","")+m[3])
    norm_toks = featstr.lower().split()

    # Unigram features use the first element of each filtered unigram; bigram
    # features join their elements with "_".
    feats = [ug[0] for ug in bigrams.filtered_unigrams(norm_toks)]
    feats += ["_".join(ng) for ng in bigrams.filtered_bigrams(norm_toks)]

    # When the record has a single field, parts[:-1] is empty and the output
    # line starts with a tab.  The single parenthesised argument prints the
    # same value under the Python 2 print statement.
    print("\t".join(parts[:-1]) + "\t" + util.unicodify(" ".join(feats)))