from datetime import datetime

from persistent import Persistent
from BTrees.IOBTree import IOBTree
from zope.component import getUtility
from zope.component.hooks import getSite
from zope.interface import implements
from zope.index.field.index import FieldIndex
from zope.index.text.textindex import TextIndex
from zope.intid.interfaces import IIntIds

from Products.CMFCore.utils import getToolByName

# IUserCatalog is the catalog's interface, defined elsewhere in the package.


class UserCatalog(Persistent):
    implements(IUserCatalog)

    def __init__(self):
        self._index = TextIndex()
        self._regdate = FieldIndex()
        self._metadata = IOBTree()

    def index(self, user):
        ints = getUtility(IIntIds)
        site = getSite()
        mtool = getToolByName(site, 'portal_membership')
        memberdata = mtool.getMemberById(user.getId())
        if memberdata is None:
            return
        memberid = ints.register(memberdata)
        text = "%s %s %s" % (memberdata.getUserName(),
                             memberdata.getProperty('fullname'),
                             memberdata.getProperty('email'))
        # Truncate the registration date to day resolution so the
        # FieldIndex sorts and ranges on calendar days.
        regdate = memberdata.getProperty('registrationdate')
        regdate = datetime.strptime(regdate.strftime("%Y-%m-%d"), "%Y-%m-%d")
        self._index.index_doc(memberid, text)
        self._regdate.index_doc(memberid, regdate)
        self._metadata[memberid] = {
            'username': memberdata.getUserName(),
            'fullname': memberdata.getProperty('fullname'),
            'email': memberdata.getProperty('email'),
            'registrationdate': memberdata.getProperty('registrationdate'),
        }

    def unindex(self, member):
        ints = getUtility(IIntIds)
        memberid = ints.register(member)
        self._index.unindex_doc(memberid)
        self._regdate.unindex_doc(memberid)

    def search(self, searchstring='', regdate=None):
        if searchstring:
            res = self._index.apply(searchstring).keys()
        else:
            res = []
        if regdate:
            res2 = self._regdate.apply(regdate)
            if searchstring:
                # Keep only ids matching both the text query and the
                # registration-date query.
                memberids = [e for e in res if e in res2]
            else:
                memberids = res2
        else:
            memberids = res
        return [self._metadata[k] for k in memberids]
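# A minimal usage sketch for the catalog above, assuming it has been
# registered as a local utility providing IUserCatalog; the subscriber
# wiring and the date range shown are illustrative, not part of the
# original code.
def on_member_created(member, event):
    # Keep the catalog in sync when a member is created.
    catalog = getUtility(IUserCatalog)
    catalog.index(member)

catalog = getUtility(IUserCatalog)
# The FieldIndex accepts a (min, max) tuple for range queries, so the
# text and registration-date filters can be combined.
hits = catalog.search(searchstring='john',
                      regdate=(datetime(2020, 1, 1), datetime(2020, 12, 31)))
for metadata in hits:
    print metadata['username'], metadata['email']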
def tfIdfBlock(self, data, field):
    '''Creates TF/IDF canopy of a given set of data'''

    class CustomStopWordRemover(object):
        stop_words = self.stop_words[field].copy()

        def process(self, lst):
            return [w for w in lst if w not in self.stop_words]

    index = TextIndex(Lexicon(Splitter(), CustomStopWordRemover()))
    # Swap in a cosine-similarity index so documents are scored by
    # TF/IDF weight rather than the default ranking.
    index.index = CosineIndex(index.lexicon)

    index_to_id = {}
    base_tokens = {}

    for i, (record_id, doc) in enumerate(data, 1):
        index_to_id[i] = record_id
        base_tokens[i] = doc
        index.index_doc(i, doc)

    canopies = (tfidf._createCanopies(index, base_tokens, threshold, field)
                for threshold in self.tfidf_fields[field])

    for key, index_canopy in canopies:
        # Translate the index's internal integer ids back to record ids.
        id_canopy = dict((index_to_id[k], index_to_id[v])
                         for k, v in index_canopy.iteritems())
        self.canopies[key] = defaultdict(str, id_canopy)
def tfIdfBlock(self, data_1, data_2, field):
    '''Creates TF/IDF canopy of a given set of data'''
    splitter = Splitter()
    stop_word_remover = CustomStopWordRemover(self.stop_words[field])

    # One index per threshold predicate; all of them share the same
    # lexicon pipeline, so the pipeline and stringify grabbed here are
    # interchangeable across predicates.
    indices = {}
    for predicate in self.tfidf_fields[field]:
        indices[predicate] = TextIndex(Lexicon(splitter, stop_word_remover))
        indices[predicate].index = CosineIndex(indices[predicate].lexicon)
        pipeline = indices[predicate].lexicon._pipeline
        stringify = predicate.stringify

    index_to_id = {}
    base_tokens = {}

    i = 1
    for record_id, doc in data_1:
        doc = stringify(doc)
        index_to_id[i] = record_id
        # Push the document through the lexicon pipeline by hand and
        # keep its tokens as an OR query for the canopy search.
        last = [doc]
        for each in pipeline:
            last = each.process(last)
        base_tokens[i] = ' OR '.join(last)
        i += 1

    for record_id, doc in data_2:
        doc = stringify(doc)
        index_to_id[i] = record_id
        for index in indices.values():
            index.index_doc(i, doc)
        i += 1

    for predicate in self.tfidf_fields[field]:
        logger.info("Canopy: %s", str(predicate))
        canopy = tfidf.makeCanopy(indices[predicate], base_tokens,
                                  predicate.threshold)
        predicate.canopy = dict((index_to_id[k], index_to_id[v])
                                for k, v in canopy.iteritems())
def tfIdfBlock(self, data_1, data_2, field):
    '''Creates TF/IDF canopy of a given set of data'''

    class CustomStopWordRemover(object):
        stop_words = self.stop_words[field].copy()

        def process(self, lst):
            return [w for w in lst if w not in self.stop_words]

    splitter = Splitter()
    index = TextIndex(Lexicon(splitter, CustomStopWordRemover()))
    index.index = CosineIndex(index.lexicon)

    index_to_id = {}
    base_tokens = {}

    i = 1
    for record_id, doc in data_1:
        index_to_id[i] = record_id
        base_tokens[i] = splitter.process([doc])
        i += 1

    for record_id, doc in data_2:
        index_to_id[i] = record_id
        index.index_doc(i, doc)
        i += 1

    canopies = [tfidf._createCanopies(index, base_tokens, threshold, field)
                for threshold in self.tfidf_fields[field]]

    for key, index_canopy in canopies:
        id_canopy = dict((index_to_id[k], index_to_id[v])
                         for k, v in index_canopy.iteritems())
        self.canopies[key] = defaultdict(str, id_canopy)
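# A standalone sketch of the canopy search that the tfIdfBlock variants
# above rely on, using only zope.index; the document texts and the 0.5
# threshold are made up for illustration.
from zope.index.text.textindex import TextIndex
from zope.index.text.lexicon import Lexicon, Splitter
from zope.index.text.cosineindex import CosineIndex

index = TextIndex(Lexicon(Splitter()))
index.index = CosineIndex(index.lexicon)

docs = {1: 'red brick house', 2: 'red house', 3: 'blue ranch'}
for doc_id, text in docs.items():
    index.index_doc(doc_id, text)

# byValue(0.5) keeps only hits whose normalized cosine score clears the
# threshold; those hits form document 1's canopy.
results = index.apply('red OR brick OR house').byValue(0.5)
for score, doc_id in results:
    print score, docs[doc_id]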
def stopWords(data):
    index = TextIndex(Lexicon(Splitter()))

    for i, (_, doc) in enumerate(data, 1):
        index.index_doc(i, doc)

    # Pair each word with the number of documents it appears in.
    doc_freq = [(len(index.index._wordinfo[wid]), word)
                for word, wid in index.lexicon.items()]
    doc_freq.sort(reverse=True)

    N = float(index.index.documentCount())
    # A word counts as a stop word if it appears in more than 1000
    # documents or more than 5% of the corpus, whichever is larger.
    threshold = int(max(1000, N * 0.05))

    stop_words = set([])
    for frequency, word in doc_freq:
        if frequency > threshold:
            stop_words.add(word)
        else:
            # doc_freq is sorted descending, so no later word can qualify.
            break

    return stop_words
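# Hypothetical input for stopWords(): (record_id, text) pairs. Note the
# max(1000, ...) floor means a word must appear in over 1000 documents
# before anything is returned, so small samples yield an empty set.
data = [
    (1, 'main street springfield'),
    (2, 'oak street springfield'),
    (3, 'elm street shelbyville'),
]
common = stopWords(data)   # set([]) here; this corpus is far too small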
import math

from zope.index.text.textindex import TextIndex
from zope.index.text.lexicon import Lexicon, Splitter
from zope.index.text.cosineindex import CosineIndex

# CustomStopWordRemover and OperatorEscaper are lexicon pipeline
# elements defined elsewhere in this module.


class TfIdfIndex(object):
    def __init__(self, field, stop_words=[]):
        self.field = field
        splitter = Splitter()
        stop_word_remover = CustomStopWordRemover(stop_words)
        operator_escaper = OperatorEscaper()
        lexicon = Lexicon(splitter, stop_word_remover, operator_escaper)

        self._index = TextIndex(lexicon)
        self._index.index = CosineIndex(self._index.lexicon)

        self._i_to_id = {}
        self._parseTerms = self._index.lexicon.parseTerms

    def _hash(self, x):
        # Fold arbitrary record ids into the signed 32-bit range the
        # integer-keyed index expects.
        i = hash(x)
        return int(math.copysign(i % (2 ** 31), i))

    def index(self, record_id, doc):
        i = self._hash(record_id)
        self._i_to_id[i] = record_id
        self._index.index_doc(i, doc)

    def unindex(self, record_id):
        i = self._hash(record_id)
        del self._i_to_id[i]
        self._index.unindex_doc(i)

    def search(self, doc, threshold=0):
        doc = self._stringify(doc)
        query_list = self._parseTerms(doc)
        # Quote bare OR/AND tokens so the query parser treats them as
        # terms rather than boolean operators.
        query_list = ['"%s"' % (term,) if term.upper() in ('OR', 'AND')
                      else term
                      for term in query_list]
        query = ' OR '.join(query_list)

        if query:
            results = self._index.apply(query).byValue(threshold)
        else:
            results = []

        return [self._i_to_id[k] for _, k in results]

    def _stringify(self, doc):
        # doc is expected to be a collection of strings; multi-word
        # entries are joined with underscores so they survive splitting.
        try:
            doc = u' '.join(u'_'.join(each.split()) for each in doc)
        except TypeError:
            pass
        return doc

    def canopy(self, token_vector, threshold):
        canopies = {}
        corpus_ids = set(token_vector.keys())

        while corpus_ids:
            center_id = corpus_ids.pop()
            center_vector = token_vector[center_id]

            # Remove the center so it cannot be claimed by a later canopy.
            self.unindex(center_id)

            if not center_vector:
                continue

            candidates = set(self.search(center_vector, threshold))
            corpus_ids.difference_update(candidates)

            for candidate_id in candidates:
                canopies[candidate_id] = (center_id,)
                self.unindex(candidate_id)

            if candidates:
                canopies[center_id] = (center_id,)

        return canopies
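# A minimal sketch of driving TfIdfIndex; the records and threshold are
# invented, and note that canopy() is destructive: every record is
# unindexed as it is assigned to a canopy center.
records = {
    'a1': ('main', 'street'),
    'a2': ('main', 'st'),
    'b1': ('oak', 'avenue'),
}

idx = TfIdfIndex('address', stop_words=[])
for record_id, tokens in records.items():
    idx.index(record_id, ' '.join(tokens))

# Maps each record id to a one-tuple holding its canopy center's id.
assignments = idx.canopy(records, threshold=0.5)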