def top(self, query, num):
    tok = Tokenizer()
    tok.tokenize(query)
    acc = Counter()
    for token in tok._counter:
        dict_entry = self.dict_entry(token)
        print token, dict_entry
        if not dict_entry:
            continue
        post_entries = self.post_entries(dict_entry[1], dict_entry[2])
        # Adjust weights so that we don't open files multiple times in
        # cases of repeated tokens (e.g. "dog cat dog cat dog").
        for docid in post_entries:
            post_entries[docid] *= tok._counter[token]
        acc.update(post_entries)
    return acc.most_common(num), len(acc)
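The weight adjustment in the inner loop is just a shortcut: instead of folding the same token's postings into the accumulator once per occurrence in the query, they are folded in once, scaled by the token's query count. A minimal sketch of that equivalence using plain Counters (the posting scores and query count below are made up for illustration):

from collections import Counter

# Hypothetical posting scores for a single token, appearing three
# times in the query.
postings = Counter({'doc1': 2, 'doc7': 5})
query_count = 3

# Naive version: update the accumulator once per occurrence,
# which would mean reading the posting file three times.
naive = Counter()
for _ in range(query_count):
    naive.update(postings)

# Shortcut used in top(): scale the scores once, update once.
scaled = Counter({docid: score * query_count
                  for docid, score in postings.items()})
fast = Counter()
fast.update(scaled)

assert naive == fast  # both give {'doc1': 6, 'doc7': 15}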
import sys

from tok import Tokenizer

tok = Tokenizer()
for review in sys.stdin:
    print ' '.join(tok.tokenize(review))