Example #1
0
class UserCatalog(Persistent):
    """Persistent catalog of portal members.

    Members are indexed under an integer id obtained from the ``IIntIds``
    utility. Three structures are kept in step:

    * ``_index``    -- ``TextIndex`` over "username fullname email"
    * ``_regdate``  -- ``FieldIndex`` over the registration day
    * ``_metadata`` -- ``IOBTree`` mapping the integer id to a dict with the
      keys ``username``, ``fullname``, ``email`` and ``registrationdate``
    """

    implements(IUserCatalog)

    def __init__(self):
        self._index = TextIndex()
        self._regdate = FieldIndex()
        self._metadata = IOBTree()

    def index(self, user):
        """Index the member data belonging to ``user``.

        The member data is looked up through ``portal_membership`` using
        ``user.getId()``. Nothing happens when the tool returns ``None``.
        """
        ints = getUtility(IIntIds)
        site = getSite()
        mtool = getToolByName(site, 'portal_membership')
        memberdata = mtool.getMemberById(user.getId())
        if memberdata is None:
            return
        memberid = ints.register(memberdata)

        # Read each value once and reuse it for both the indexes and the
        # stored metadata.
        username = memberdata.getUserName()
        fullname = memberdata.getProperty('fullname')
        email = memberdata.getProperty('email')
        registered = memberdata.getProperty('registrationdate')

        text = "%s %s %s" % (username, fullname, email)
        # Round-tripping through "%Y-%m-%d" drops the time of day, so the
        # field index holds a plain ``datetime`` at midnight of that day.
        regday = datetime.strptime(registered.strftime("%Y-%m-%d"),
                                   "%Y-%m-%d")

        self._index.index_doc(memberid, text)
        self._regdate.index_doc(memberid, regday)
        self._metadata[memberid] = {
            'username': username,
            'fullname': fullname,
            'email': email,
            'registrationdate': registered,
            }

    def unindex(self, member):
        """Remove ``member`` from both indexes and drop its metadata."""
        ints = getUtility(IIntIds)
        # NOTE(review): ``register`` is kept from the original code to obtain
        # the id; presumably it returns the existing id for an object that is
        # already registered -- confirm against the IIntIds implementation.
        # Also note that ``index`` registers the *memberdata* object while
        # this method registers ``member`` itself; verify callers pass the
        # same object.
        memberid = ints.register(member)
        self._index.unindex_doc(memberid)
        self._regdate.unindex_doc(memberid)
        # Keep the metadata store in step with the indexes so entries of
        # removed members do not pile up.
        if memberid in self._metadata:
            del self._metadata[memberid]

    def search(self, searchstring='', regdate=None):
        """Return the metadata dicts of the members matching the criteria.

        :param searchstring: query handed to ``TextIndex.apply``; ignored
            when empty.
        :param regdate: query handed to ``FieldIndex.apply``; ignored when
            falsy. (Presumably a ``(min, max)`` pair -- confirm against the
            FieldIndex in use.)
        :returns: list of metadata dicts. With both criteria given only
            members matching *both* are returned; with neither the list is
            empty.
        """
        if searchstring:
            text_hits = self._index.apply(searchstring).keys()
        else:
            text_hits = []

        if regdate:
            date_hits = self._regdate.apply(regdate)
            if searchstring:
                # Intersect the two result sets, keeping the order of the
                # text hits.
                memberids = [mid for mid in text_hits if mid in date_hits]
            else:
                memberids = date_hits
        else:
            memberids = text_hits

        return [self._metadata[mid] for mid in memberids]
Example #2
0
class TfIdfIndex(object):
    """Text index over record documents, queried with OR-joined terms.

    Wraps a ``TextIndex`` whose ``index`` attribute is replaced by a
    ``CosineIndex`` built on the same lexicon. Record ids are translated to
    integer document ids with :meth:`_hash`; ``_i_to_id`` maps those integers
    back to the original record ids.
    """

    def __init__(self, field, stop_words=None):
        """Build the lexicon pipeline and the underlying index.

        :param field: stored as ``self.field``; not used further by this
            class.
        :param stop_words: words handed to ``CustomStopWordRemover``.
            ``None`` (the default) means an empty list.
        """
        # Use a fresh list per instance instead of one mutable default
        # object shared by every call.
        if stop_words is None:
            stop_words = []

        self.field = field

        splitter = Splitter()
        stop_word_remover = CustomStopWordRemover(stop_words)
        operator_escaper = OperatorEscaper()
        lexicon = Lexicon(splitter, stop_word_remover, operator_escaper)

        self._index = TextIndex(lexicon)
        self._index.index = CosineIndex(self._index.lexicon)

        self._i_to_id = {}
        self._parseTerms = self._index.lexicon.parseTerms

    def _hash(self, x):
        """Map ``x`` to an integer strictly between -2**31 and 2**31.

        The magnitude is ``hash(x) % 2**31`` and the sign is that of
        ``hash(x)``. Distinct record ids can map to the same integer; the
        one indexed last then replaces the earlier entry in ``_i_to_id``.
        """
        i = hash(x)
        return int(math.copysign(i % (2**31), i))

    def index(self, record_id, doc):
        """Add ``doc`` to the index under ``record_id``."""
        i = self._hash(record_id)
        self._i_to_id[i] = record_id

        self._index.index_doc(i, doc)

    def unindex(self, record_id):
        """Remove ``record_id`` from the index.

        :raises KeyError: if ``record_id`` is not currently indexed.
        """
        i = self._hash(record_id)
        del self._i_to_id[i]
        self._index.unindex_doc(i)

    def search(self, doc, threshold=0):
        """Return the record ids of indexed documents matching ``doc``.

        The terms parsed from ``doc`` are joined with ``OR``; terms that are
        themselves the words OR/AND are double-quoted so the query parser
        does not read them as operators.

        :param doc: document to use as the query.
        :param threshold: passed to ``byValue`` on the result of
            ``apply`` (presumably a minimum score -- confirm against the
            BTrees ``byValue`` documentation).
        :returns: list of record ids; empty when no query terms remain.
        """
        doc = self._stringify(doc)
        query_list = self._parseTerms(doc)
        query_list = [
            '"%s"' % (term, ) if term.upper() in ('OR', 'AND') else term
            for term in query_list
        ]
        query = ' OR '.join(query_list)

        if query:
            results = self._index.apply(query).byValue(threshold)
        else:
            results = []

        return [self._i_to_id[k] for _, k in results]

    def _stringify(self, doc):
        """Return ``doc`` prepared for term parsing.

        NOTE(review): as written, the inner ``join`` receives the *lists*
        produced by ``each.split()``, so for any non-empty ``doc`` made of
        strings it raises ``TypeError`` and ``doc`` is returned unchanged;
        only an empty ``doc`` yields ``u''``. The closing parenthesis looks
        misplaced (likely meant ``u'_'.join(each.split()) for each in doc``),
        but ``index()`` does not stringify its input, so changing this alone
        would make queries tokenize differently from indexed documents.
        Behaviour is therefore left as is -- confirm the intent before
        fixing.
        """
        try:
            doc = u' '.join(u'_'.join(each.split() for each in doc))
        except TypeError:
            pass

        return doc

    def canopy(self, token_vector, threshold):
        """Group the records of ``token_vector`` into canopies.

        Repeatedly takes an arbitrary remaining record as a center, removes
        it from the index, and searches the index with the center's vector.
        Every hit is assigned to that center and removed from the index as
        well. A center gets an entry of its own only when it had at least
        one hit. Centers with an empty vector are unindexed and skipped.

        This consumes the index: every id in ``token_vector`` (and every
        hit) is unindexed along the way.

        :param token_vector: mapping of record id to that record's document.
        :param threshold: forwarded to :meth:`search`.
        :returns: dict mapping record id to the 1-tuple ``(center_id,)``.
        """
        canopies = {}
        corpus_ids = set(token_vector.keys())

        while corpus_ids:
            center_id = corpus_ids.pop()
            center_vector = token_vector[center_id]

            # Take the center out first so it cannot match itself.
            self.unindex(center_id)

            if not center_vector:
                continue

            candidates = set(self.search(center_vector, threshold))

            corpus_ids.difference_update(candidates)

            for candidate_id in candidates:
                canopies[candidate_id] = (center_id, )
                self.unindex(candidate_id)

            if candidates:
                canopies[center_id] = (center_id, )

        return canopies
Example #3
0
class TfIdfIndex(object):
    """Text index over record documents, queried with OR-joined terms.

    Wraps a ``TextIndex`` whose ``index`` attribute is replaced by a
    ``CosineIndex`` built on the same lexicon. Record ids are translated to
    integer document ids with :meth:`_hash`; ``_i_to_id`` maps those integers
    back to the original record ids.
    """

    def __init__(self, field, stop_words=None):
        """Build the lexicon pipeline and the underlying index.

        :param field: stored as ``self.field``; not used further by this
            class.
        :param stop_words: words handed to ``CustomStopWordRemover``.
            ``None`` (the default) means an empty list.
        """
        # Use a fresh list per instance instead of one mutable default
        # object shared by every call.
        if stop_words is None:
            stop_words = []

        self.field = field

        splitter = Splitter()
        stop_word_remover = CustomStopWordRemover(stop_words)
        operator_escaper = OperatorEscaper()
        lexicon = Lexicon(splitter, stop_word_remover, operator_escaper)

        self._index = TextIndex(lexicon)
        self._index.index = CosineIndex(self._index.lexicon)

        self._i_to_id = {}
        self._parseTerms = self._index.lexicon.parseTerms

    def _hash(self, x):
        """Map ``x`` to an integer strictly between -2**31 and 2**31.

        The magnitude is ``hash(x) % 2**31`` and the sign is that of
        ``hash(x)``. Distinct record ids can map to the same integer; the
        one indexed last then replaces the earlier entry in ``_i_to_id``.
        """
        i = hash(x)
        return int(math.copysign(i % (2**31), i))

    def index(self, record_id, doc):
        """Add ``doc`` to the index under ``record_id``."""
        i = self._hash(record_id)
        self._i_to_id[i] = record_id

        self._index.index_doc(i, doc)

    def unindex(self, record_id):
        """Remove ``record_id`` from the index.

        :raises KeyError: if ``record_id`` is not currently indexed.
        """
        i = self._hash(record_id)
        del self._i_to_id[i]
        self._index.unindex_doc(i)

    def search(self, doc, threshold=0):
        """Return the record ids of indexed documents matching ``doc``.

        The terms parsed from ``doc`` are joined with ``OR``; terms that are
        themselves the words OR/AND are double-quoted so the query parser
        does not read them as operators.

        :param doc: document to use as the query.
        :param threshold: passed to ``byValue`` on the result of
            ``apply`` (presumably a minimum score -- confirm against the
            BTrees ``byValue`` documentation).
        :returns: list of record ids; empty when no query terms remain.
        """
        doc = self._stringify(doc)
        query_list = self._parseTerms(doc)
        query_list = [
            '"%s"' % (term,) if term.upper() in ('OR', 'AND') else term
            for term in query_list]
        query = ' OR '.join(query_list)

        if query:
            results = self._index.apply(query).byValue(threshold)
        else:
            results = []

        return [self._i_to_id[k]
                for _, k in results]

    def _stringify(self, doc):
        """Return ``doc`` prepared for term parsing.

        NOTE(review): as written, the inner ``join`` receives the *lists*
        produced by ``each.split()``, so for any non-empty ``doc`` made of
        strings it raises ``TypeError`` and ``doc`` is returned unchanged;
        only an empty ``doc`` yields ``u''``. The closing parenthesis looks
        misplaced (likely meant ``u'_'.join(each.split()) for each in doc``),
        but ``index()`` does not stringify its input, so changing this alone
        would make queries tokenize differently from indexed documents.
        Behaviour is therefore left as is -- confirm the intent before
        fixing.
        """
        try:
            doc = u' '.join(u'_'.join(each.split() for each in doc))
        except TypeError:
            pass

        return doc

    def canopy(self, token_vector, threshold):
        """Group the records of ``token_vector`` into canopies.

        Repeatedly takes an arbitrary remaining record as a center, removes
        it from the index, and searches the index with the center's vector.
        Every hit is assigned to that center and removed from the index as
        well. A center gets an entry of its own only when it had at least
        one hit. Centers with an empty vector are unindexed and skipped.

        This consumes the index: every id in ``token_vector`` (and every
        hit) is unindexed along the way.

        :param token_vector: mapping of record id to that record's document.
        :param threshold: forwarded to :meth:`search`.
        :returns: dict mapping record id to the 1-tuple ``(center_id,)``.
        """
        canopies = {}
        corpus_ids = set(token_vector.keys())

        while corpus_ids:
            center_id = corpus_ids.pop()
            center_vector = token_vector[center_id]

            # Take the center out first so it cannot match itself.
            self.unindex(center_id)

            if not center_vector:
                continue

            candidates = set(self.search(center_vector, threshold))

            corpus_ids.difference_update(candidates)

            for candidate_id in candidates:
                canopies[candidate_id] = (center_id,)
                self.unindex(candidate_id)

            if candidates:
                canopies[center_id] = (center_id,)

        return canopies