def process(self, session, data):
    # Input should be:
    #   {txt: {'text': txt, 'occurences': 1, 'proxLoc': []}}
    # Only the 'text' value can be pushed through the analyzer.
    # Output has the same shape as tokenizer output (e.g. ready to be merged).
    if not data:
        return data
    kw = {}
    # Peek at one item to learn whether proximity locations are present,
    # then put it back.
    first = data.popitem()
    prox = 'proxLoc' in first[1]
    data[first[0]] = first[1]
    if isinstance(data, dict):
        for k in data.keys():
            rdr = lucene.StringReader(data[k]['text'])
            res = self.analyzer.tokenStream('data', rdr)
            # Offset information is also available from the tokens.
            toks = [t.term() for t in res]
            kw[k] = {'text': toks, 'occurences': 1}
            if prox:
                kw[k]['proxLoc'] = data[k]['proxLoc']
    return kw
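A minimal usage sketch for process(), assuming the method lives on a class whose constructor sets self.analyzer to a PyLucene analyzer. The class name LuceneTokenizer, the lucene.initVM() call style, and the sample values are assumptions; the surrounding class is not shown in this excerpt.

import lucene
lucene.initVM()  # the JCC VM must be started before any Lucene call (assumed init style)

tkn = LuceneTokenizer()  # hypothetical class owning process() and self.analyzer

# One entry per raw string, in the shape the comment above describes.
data = {'the quick fox': {'text': 'the quick fox',
                          'occurences': 1,
                          'proxLoc': [0]}}
kw = tkn.process(None, data)  # session is not used inside process(), so None suffices
# kw['the quick fox'] -> {'text': [...analyzed terms...],
#                         'occurences': 1, 'proxLoc': [0]}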
def process_string(self, session, data):
    # Tokenize a raw string, returning (term, start offset) pairs.
    rdr = lucene.StringReader(data)
    toks = self.tokenizer(rdr)
    return [(t.term(), t.startOffset()) for t in toks]
def process_string(self, session, data):
    # Tokenize a raw string, returning only the term texts.
    rdr = lucene.StringReader(data)
    toks = self.tokenizer(rdr)
    return [t.term() for t in toks]
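For contrast, a hedged sketch of the two process_string() variants side by side. The instance names offset_tkn and plain_tkn are hypothetical, and the shown results assume a whitespace-style tokenizer behind self.tokenizer.

offset_tkn.process_string(None, 'quick brown fox')
# offset-bearing variant: (term, start offset) pairs
# -> [('quick', 0), ('brown', 6), ('fox', 12)]

plain_tkn.process_string(None, 'quick brown fox')
# plain variant: term texts only
# -> ['quick', 'brown', 'fox']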