from datetime import datetime

from persistent import Persistent
from BTrees.IOBTree import IOBTree
from zope.component import getUtility
from zope.component.hooks import getSite
from zope.interface import implements
from zope.index.field.index import FieldIndex
from zope.index.text.textindex import TextIndex
from zope.intid.interfaces import IIntIds

from Products.CMFCore.utils import getToolByName

# IUserCatalog is the catalog's interface, defined elsewhere in the package.


class UserCatalog(Persistent):
    implements(IUserCatalog)

    def __init__(self):
        self._index = TextIndex()
        self._regdate = FieldIndex()
        self._metadata = IOBTree()

    def index(self, user):
        ints = getUtility(IIntIds)
        site = getSite()
        mtool = getToolByName(site, 'portal_membership')
        memberdata = mtool.getMemberById(user.getId())
        if memberdata is None:
            return
        memberid = ints.register(memberdata)
        text = "%s %s %s" % (memberdata.getUserName(),
                             memberdata.getProperty('fullname'),
                             memberdata.getProperty('email'))
        # Truncate the registration date to day resolution so the
        # FieldIndex sorts and ranges on calendar days.
        regdate = memberdata.getProperty('registrationdate')
        regdate = datetime.strptime(regdate.strftime("%Y-%m-%d"), "%Y-%m-%d")
        self._index.index_doc(memberid, text)
        self._regdate.index_doc(memberid, regdate)
        self._metadata[memberid] = {
            'username': memberdata.getUserName(),
            'fullname': memberdata.getProperty('fullname'),
            'email': memberdata.getProperty('email'),
            'registrationdate': memberdata.getProperty('registrationdate'),
        }

    def unindex(self, member):
        ints = getUtility(IIntIds)
        memberid = ints.register(member)
        self._index.unindex_doc(memberid)
        self._regdate.unindex_doc(memberid)

    def search(self, searchstring='', regdate=None):
        if searchstring:
            res = self._index.apply(searchstring).keys()
        else:
            res = []
        if regdate:
            res2 = self._regdate.apply(regdate)
            if searchstring:
                # Keep only ids matching both the text query and the
                # registration-date query.
                memberids = [e for e in res if e in res2]
            else:
                memberids = res2
        else:
            memberids = res
        return [self._metadata[k] for k in memberids]
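# A minimal usage sketch for the catalog above, assuming it has been
# registered as a local utility providing IUserCatalog; the subscriber
# wiring and the date range shown are illustrative, not part of the
# original code.
def on_member_created(member, event):
    # Keep the catalog in sync when a member is created.
    catalog = getUtility(IUserCatalog)
    catalog.index(member)

catalog = getUtility(IUserCatalog)
# The FieldIndex accepts a (min, max) tuple for range queries, so the
# text and registration-date filters can be combined.
hits = catalog.search(searchstring='john',
                      regdate=(datetime(2020, 1, 1), datetime(2020, 12, 31)))
for metadata in hits:
    print metadata['username'], metadata['email']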
def tfIdfBlock(self, data, field):
    '''Creates TF/IDF canopy of a given set of data'''

    class CustomStopWordRemover(object):
        stop_words = self.stop_words[field].copy()

        def process(self, lst):
            return [w for w in lst if w not in self.stop_words]

    index = TextIndex(Lexicon(Splitter(), CustomStopWordRemover()))
    # Swap in a cosine-similarity index so documents are scored by
    # TF/IDF weight rather than the default ranking.
    index.index = CosineIndex(index.lexicon)

    index_to_id = {}
    base_tokens = {}

    for i, (record_id, doc) in enumerate(data, 1):
        index_to_id[i] = record_id
        base_tokens[i] = doc
        index.index_doc(i, doc)

    canopies = (tfidf._createCanopies(index, base_tokens, threshold, field)
                for threshold in self.tfidf_fields[field])

    for key, index_canopy in canopies:
        # Translate the index's internal integer ids back to record ids.
        id_canopy = dict((index_to_id[k], index_to_id[v])
                         for k, v in index_canopy.iteritems())
        self.canopies[key] = defaultdict(str, id_canopy)
def tfIdfBlock(self, data_1, data_2, field):
    '''Creates TF/IDF canopy of a given set of data'''
    splitter = Splitter()
    stop_word_remover = CustomStopWordRemover(self.stop_words[field])

    # One index per threshold predicate; all of them share the same
    # lexicon pipeline, so the pipeline and stringify grabbed here are
    # interchangeable across predicates.
    indices = {}
    for predicate in self.tfidf_fields[field]:
        indices[predicate] = TextIndex(Lexicon(splitter, stop_word_remover))
        indices[predicate].index = CosineIndex(indices[predicate].lexicon)
        pipeline = indices[predicate].lexicon._pipeline
        stringify = predicate.stringify

    index_to_id = {}
    base_tokens = {}

    i = 1
    for record_id, doc in data_1:
        doc = stringify(doc)
        index_to_id[i] = record_id
        # Push the document through the lexicon pipeline by hand and
        # keep its tokens as an OR query for the canopy search.
        last = [doc]
        for each in pipeline:
            last = each.process(last)
        base_tokens[i] = ' OR '.join(last)
        i += 1

    for record_id, doc in data_2:
        doc = stringify(doc)
        index_to_id[i] = record_id
        for index in indices.values():
            index.index_doc(i, doc)
        i += 1

    for predicate in self.tfidf_fields[field]:
        logger.info("Canopy: %s", str(predicate))
        canopy = tfidf.makeCanopy(indices[predicate], base_tokens,
                                  predicate.threshold)
        predicate.canopy = dict((index_to_id[k], index_to_id[v])
                                for k, v in canopy.iteritems())
def tfIdfBlock(self, data_1, data_2, field):
    '''Creates TF/IDF canopy of a given set of data'''

    class CustomStopWordRemover(object):
        stop_words = self.stop_words[field].copy()

        def process(self, lst):
            return [w for w in lst if w not in self.stop_words]

    splitter = Splitter()
    index = TextIndex(Lexicon(splitter, CustomStopWordRemover()))
    index.index = CosineIndex(index.lexicon)

    index_to_id = {}
    base_tokens = {}

    i = 1
    for record_id, doc in data_1:
        index_to_id[i] = record_id
        base_tokens[i] = splitter.process([doc])
        i += 1

    for record_id, doc in data_2:
        index_to_id[i] = record_id
        index.index_doc(i, doc)
        i += 1

    canopies = [tfidf._createCanopies(index, base_tokens, threshold, field)
                for threshold in self.tfidf_fields[field]]

    for key, index_canopy in canopies:
        id_canopy = dict((index_to_id[k], index_to_id[v])
                         for k, v in index_canopy.iteritems())
        self.canopies[key] = defaultdict(str, id_canopy)
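# A standalone sketch of the canopy search that the tfIdfBlock variants
# above rely on, using only zope.index; the document texts and the 0.5
# threshold are made up for illustration.
from zope.index.text.textindex import TextIndex
from zope.index.text.lexicon import Lexicon, Splitter
from zope.index.text.cosineindex import CosineIndex

index = TextIndex(Lexicon(Splitter()))
index.index = CosineIndex(index.lexicon)

docs = {1: 'red brick house', 2: 'red house', 3: 'blue ranch'}
for doc_id, text in docs.items():
    index.index_doc(doc_id, text)

# byValue(0.5) keeps only hits whose normalized cosine score clears the
# threshold; those hits form document 1's canopy.
results = index.apply('red OR brick OR house').byValue(0.5)
for score, doc_id in results:
    print score, docs[doc_id]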
def stopWords(data):
    index = TextIndex(Lexicon(Splitter()))

    for i, (_, doc) in enumerate(data, 1):
        index.index_doc(i, doc)

    # Pair each word with the number of documents it appears in.
    doc_freq = [(len(index.index._wordinfo[wid]), word)
                for word, wid in index.lexicon.items()]
    doc_freq.sort(reverse=True)

    N = float(index.index.documentCount())
    # A word counts as a stop word if it appears in more than 1000
    # documents or more than 5% of the corpus, whichever is larger.
    threshold = int(max(1000, N * 0.05))

    stop_words = set([])
    for frequency, word in doc_freq:
        if frequency > threshold:
            stop_words.add(word)
        else:
            # doc_freq is sorted descending, so no later word can qualify.
            break

    return stop_words
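# Hypothetical input for stopWords(): (record_id, text) pairs. Note the
# max(1000, ...) floor means a word must appear in over 1000 documents
# before anything is returned, so small samples yield an empty set.
data = [
    (1, 'main street springfield'),
    (2, 'oak street springfield'),
    (3, 'elm street shelbyville'),
]
common = stopWords(data)   # set([]) here; this corpus is far too small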
import math

from zope.index.text.textindex import TextIndex
from zope.index.text.lexicon import Lexicon, Splitter
from zope.index.text.cosineindex import CosineIndex

# CustomStopWordRemover and OperatorEscaper are lexicon pipeline
# elements defined elsewhere in this module.


class TfIdfIndex(object):
    def __init__(self, field, stop_words=[]):
        self.field = field
        splitter = Splitter()
        stop_word_remover = CustomStopWordRemover(stop_words)
        operator_escaper = OperatorEscaper()
        lexicon = Lexicon(splitter, stop_word_remover, operator_escaper)

        self._index = TextIndex(lexicon)
        self._index.index = CosineIndex(self._index.lexicon)

        self._i_to_id = {}
        self._parseTerms = self._index.lexicon.parseTerms

    def _hash(self, x):
        # Fold arbitrary record ids into the signed 32-bit range the
        # integer-keyed index expects.
        i = hash(x)
        return int(math.copysign(i % (2 ** 31), i))

    def index(self, record_id, doc):
        i = self._hash(record_id)
        self._i_to_id[i] = record_id
        self._index.index_doc(i, doc)

    def unindex(self, record_id):
        i = self._hash(record_id)
        del self._i_to_id[i]
        self._index.unindex_doc(i)

    def search(self, doc, threshold=0):
        doc = self._stringify(doc)
        query_list = self._parseTerms(doc)
        # Quote bare OR/AND tokens so the query parser treats them as
        # terms rather than boolean operators.
        query_list = ['"%s"' % (term,) if term.upper() in ('OR', 'AND')
                      else term
                      for term in query_list]
        query = ' OR '.join(query_list)

        if query:
            results = self._index.apply(query).byValue(threshold)
        else:
            results = []

        return [self._i_to_id[k] for _, k in results]

    def _stringify(self, doc):
        # doc is expected to be a collection of strings; multi-word
        # entries are joined with underscores so they survive splitting.
        try:
            doc = u' '.join(u'_'.join(each.split()) for each in doc)
        except TypeError:
            pass
        return doc

    def canopy(self, token_vector, threshold):
        canopies = {}
        corpus_ids = set(token_vector.keys())

        while corpus_ids:
            center_id = corpus_ids.pop()
            center_vector = token_vector[center_id]

            # Remove the center so it cannot be claimed by a later canopy.
            self.unindex(center_id)

            if not center_vector:
                continue

            candidates = set(self.search(center_vector, threshold))
            corpus_ids.difference_update(candidates)

            for candidate_id in candidates:
                canopies[candidate_id] = (center_id,)
                self.unindex(candidate_id)

            if candidates:
                canopies[center_id] = (center_id,)

        return canopies
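# A minimal sketch of driving TfIdfIndex; the records and threshold are
# invented, and note that canopy() is destructive: every record is
# unindexed as it is assigned to a canopy center.
records = {
    'a1': ('main', 'street'),
    'a2': ('main', 'st'),
    'b1': ('oak', 'avenue'),
}

idx = TfIdfIndex('address', stop_words=[])
for record_id, tokens in records.items():
    idx.index(record_id, ' '.join(tokens))

# Maps each record id to a one-tuple holding its canopy center's id.
assignments = idx.canopy(records, threshold=0.5)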