class UserCatalog(Persistent):
    """Catalog of portal members, searchable by free text and by
    registration date."""

    implements(IUserCatalog)

    def __init__(self):
        # Full-text index over username / fullname / email.
        self._index = TextIndex()
        # Field index over the day-truncated registration date.
        self._regdate = FieldIndex()
        # memberid -> metadata dict, as returned by search().
        self._metadata = IOBTree()

    def index(self, user):
        """Index (or re-index) *user*.

        Silently returns when the membership tool has no record for
        the user's id.
        """
        ints = getUtility(IIntIds)
        site = getSite()
        mtool = getToolByName(site, 'portal_membership')
        memberdata = mtool.getMemberById(user.getId())
        if memberdata is None:
            return
        memberid = ints.register(memberdata)
        text = "%s %s %s" % (memberdata.getUserName(),
                             memberdata.getProperty('fullname'),
                             memberdata.getProperty('email'))
        # Truncate the registration date to day precision so whole-day
        # queries against the field index match.
        regdate = memberdata.getProperty('registrationdate')
        regdate = datetime.strptime(regdate.strftime("%Y-%m-%d"), "%Y-%m-%d")
        self._index.index_doc(memberid, text)
        self._regdate.index_doc(memberid, regdate)
        self._metadata[memberid] = {
            'username': memberdata.getUserName(),
            'fullname': memberdata.getProperty('fullname'),
            'email': memberdata.getProperty('email'),
            'registrationdate': memberdata.getProperty('registrationdate'),
        }

    def unindex(self, member):
        """Remove *member* from both indexes.

        NOTE(review): this uses IIntIds.register() rather than getId(),
        which would mint a fresh id for a never-registered member —
        confirm that is intended.
        """
        ints = getUtility(IIntIds)
        memberid = ints.register(member)
        self._index.unindex_doc(memberid)
        self._regdate.unindex_doc(memberid)

    def search(self, searchstring='', regdate=None):
        """Return metadata dicts for members matching the criteria.

        With both *searchstring* and *regdate* supplied, only members
        matching BOTH are returned.  (Bug fix: the original computed
        the intersection and then discarded it by rebinding
        ``memberids = res2``, so the text filter was ignored whenever
        a date was given.)
        """
        text_ids = self._index.apply(searchstring).keys() if searchstring else None
        date_ids = self._regdate.apply(regdate) if regdate else None
        if text_ids is not None and date_ids is not None:
            # Intersection of the two result sets.
            memberids = [mid for mid in text_ids if mid in date_ids]
        elif date_ids is not None:
            memberids = date_ids
        elif text_ids is not None:
            memberids = text_ids
        else:
            memberids = []
        return [self._metadata[mid] for mid in memberids]
class TfIdfIndex(object):
    """TF-IDF (cosine-similarity) text index mapping arbitrary,
    hashable record ids onto indexed documents."""

    def __init__(self, field, stop_words=None):
        # stop_words defaults to None (not a mutable []) to avoid the
        # shared-default-argument pitfall; None means "no stop words".
        self.field = field
        splitter = Splitter()
        stop_word_remover = CustomStopWordRemover(stop_words or [])
        operator_escaper = OperatorEscaper()
        lexicon = Lexicon(splitter, stop_word_remover, operator_escaper)
        self._index = TextIndex(lexicon)
        # Swap in a cosine index so byValue() scores are similarities.
        self._index.index = CosineIndex(self._index.lexicon)
        # internal int doc id -> caller's record id
        self._i_to_id = {}
        self._parseTerms = self._index.lexicon.parseTerms

    def _hash(self, x):
        """Fold an arbitrary hashable id into a signed 31-bit int,
        since the text index requires integer document ids."""
        i = hash(x)
        return int(math.copysign(i % (2**31), i))

    def index(self, record_id, doc):
        """Index *doc* under *record_id*."""
        i = self._hash(record_id)
        self._i_to_id[i] = record_id
        self._index.index_doc(i, doc)

    def unindex(self, record_id):
        """Remove *record_id* and its document from the index."""
        i = self._hash(record_id)
        del self._i_to_id[i]
        self._index.unindex_doc(i)

    def search(self, doc, threshold=0):
        """Return record ids whose documents score above *threshold*
        against *doc*, as an OR query over its terms."""
        doc = self._stringify(doc)
        query_list = self._parseTerms(doc)
        # Quote bare OR/AND tokens so they are treated as search terms,
        # not as query operators.
        query_list = ['"%s"' % (term,)
                      if term.upper() in ('OR', 'AND') else term
                      for term in query_list]
        query = ' OR '.join(query_list)
        if query:
            results = self._index.apply(query).byValue(threshold)
        else:
            results = []
        return [self._i_to_id[k] for _, k in results]

    def _stringify(self, doc):
        """Join a sequence of strings into one query string, replacing
        intra-string whitespace with underscores.

        Bug fix: the original misplaced a parenthesis —
        ``u'_'.join(each.split() for each in doc)`` — joining a
        generator of *lists*, which raised TypeError for any non-empty
        input and silently returned *doc* unchanged.
        """
        if isinstance(doc, str):
            # Strings pass through untouched (matches the original's
            # effective behavior for string input).
            return doc
        try:
            doc = u' '.join(u'_'.join(each.split()) for each in doc)
        except TypeError:
            # Non-iterable input: pass through unchanged.
            pass
        return doc

    def canopy(self, token_vector, threshold):
        """Greedy canopy clustering: pop an unprocessed center, assign
        every record scoring above *threshold* to it, repeat until all
        ids are consumed.  Returns {record_id: (center_id,)}."""
        canopies = {}
        corpus_ids = set(token_vector.keys())
        while corpus_ids:
            center_id = corpus_ids.pop()
            center_vector = token_vector[center_id]
            self.unindex(center_id)
            if not center_vector:
                continue
            candidates = set(self.search(center_vector, threshold))
            corpus_ids.difference_update(candidates)
            for candidate_id in candidates:
                canopies[candidate_id] = (center_id,)
                self.unindex(candidate_id)
            if candidates:
                canopies[center_id] = (center_id,)
        return canopies
# NOTE(review): this is a duplicate definition of TfIdfIndex (another copy
# appears earlier in this file); at import time the later definition wins.
# Consider removing one of them.
class TfIdfIndex(object):
    """TF-IDF (cosine-similarity) text index mapping arbitrary,
    hashable record ids onto indexed documents."""

    def __init__(self, field, stop_words=None):
        # stop_words defaults to None (not a mutable []) to avoid the
        # shared-default-argument pitfall; None means "no stop words".
        self.field = field
        splitter = Splitter()
        stop_word_remover = CustomStopWordRemover(stop_words or [])
        operator_escaper = OperatorEscaper()
        lexicon = Lexicon(splitter, stop_word_remover, operator_escaper)
        self._index = TextIndex(lexicon)
        # Swap in a cosine index so byValue() scores are similarities.
        self._index.index = CosineIndex(self._index.lexicon)
        # internal int doc id -> caller's record id
        self._i_to_id = {}
        self._parseTerms = self._index.lexicon.parseTerms

    def _hash(self, x):
        """Fold an arbitrary hashable id into a signed 31-bit int,
        since the text index requires integer document ids."""
        i = hash(x)
        return int(math.copysign(i % (2**31), i))

    def index(self, record_id, doc):
        """Index *doc* under *record_id*."""
        i = self._hash(record_id)
        self._i_to_id[i] = record_id
        self._index.index_doc(i, doc)

    def unindex(self, record_id):
        """Remove *record_id* and its document from the index."""
        i = self._hash(record_id)
        del self._i_to_id[i]
        self._index.unindex_doc(i)

    def search(self, doc, threshold=0):
        """Return record ids whose documents score above *threshold*
        against *doc*, as an OR query over its terms."""
        doc = self._stringify(doc)
        query_list = self._parseTerms(doc)
        # Quote bare OR/AND tokens so they are treated as search terms,
        # not as query operators.
        query_list = ['"%s"' % (term,)
                      if term.upper() in ('OR', 'AND') else term
                      for term in query_list]
        query = ' OR '.join(query_list)
        if query:
            results = self._index.apply(query).byValue(threshold)
        else:
            results = []
        return [self._i_to_id[k] for _, k in results]

    def _stringify(self, doc):
        """Join a sequence of strings into one query string, replacing
        intra-string whitespace with underscores.

        Bug fix: the original misplaced a parenthesis —
        ``u'_'.join(each.split() for each in doc)`` — joining a
        generator of *lists*, which raised TypeError for any non-empty
        input and silently returned *doc* unchanged.
        """
        if isinstance(doc, str):
            # Strings pass through untouched (matches the original's
            # effective behavior for string input).
            return doc
        try:
            doc = u' '.join(u'_'.join(each.split()) for each in doc)
        except TypeError:
            # Non-iterable input: pass through unchanged.
            pass
        return doc

    def canopy(self, token_vector, threshold):
        """Greedy canopy clustering: pop an unprocessed center, assign
        every record scoring above *threshold* to it, repeat until all
        ids are consumed.  Returns {record_id: (center_id,)}."""
        canopies = {}
        corpus_ids = set(token_vector.keys())
        while corpus_ids:
            center_id = corpus_ids.pop()
            center_vector = token_vector[center_id]
            self.unindex(center_id)
            if not center_vector:
                continue
            candidates = set(self.search(center_vector, threshold))
            corpus_ids.difference_update(candidates)
            for candidate_id in candidates:
                canopies[candidate_id] = (center_id,)
                self.unindex(candidate_id)
            if candidates:
                canopies[center_id] = (center_id,)
        return canopies