Ejemplo n.º 1
0
    def getPositions(self, docid, wordid):
        """Return the positions at which ``wordid`` occurs in a document.

        The word-id string stored for ``docid`` is decoded and scanned in
        order; a position is the zero-based index of a word id within that
        decoded sequence.

        Arguments:
            docid  -- key of the document in ``self._doc2wid``.
            wordid -- the word id to look for.

        Returns an ``IITreeSet`` containing every matching position; the set
        is empty when the word id does not occur in the document.
        """
        # The stored value is a wrapper object; .get() yields the encoded
        # word-id string itself.
        encoded_document = self._doc2wid[docid].get()

        positions = IITreeSet()
        for pos, wid in enumerate(decode(encoded_document)):
            if wid == wordid:
                positions.insert(pos)
        return positions
Ejemplo n.º 2
0
    def getPositions(self, docid, wordid):
        """Return the positions of ``wordid`` inside the document ``docid``.

        The encoded word-id string stored for the document is decoded and
        walked from the start; positions are zero-based indexes into the
        decoded word-id sequence.

        Arguments:
            docid  -- key of the document in ``self._doc2wid``.
            wordid -- the word id whose occurrences are wanted.

        Returns an ``IITreeSet`` of all positions where the decoded word id
        equals ``wordid`` (empty if there is no occurrence).
        """
        # Unwrap the stored object to get the encoded word-id string.
        encoded_document = self._doc2wid[docid].get()

        positions = IITreeSet()
        for pos, wid in enumerate(decode(encoded_document)):
            if wid == wordid:
                positions.insert(pos)
        return positions
Ejemplo n.º 3
0
    def hasContigousWordids(self, docid, wordids):
        """Tell whether ``wordids`` occur contiguously in a document.

        *The trick* to perform a phrase search is that the string-encoded
        word ids can be searched with ``find()``.  A raw substring hit is not
        sufficient, though: the hit may end in the middle of a longer encoded
        word.  A hit only counts when it is followed by the end of the
        document or by a byte that starts a new word (a byte value above 127,
        i.e. with the high bit set).  The search therefore continues past
        every rejected hit instead of stopping at the first occurrence.

        Arguments:
            docid   -- key of the document in ``self._doc2wid``.
            wordids -- the word ids forming the phrase, in order.

        Returns ``True`` if a properly terminated occurrence exists,
        ``False`` otherwise.
        """
        encoded_wids = encode(wordids)
        encoded_wids_len = len(encoded_wids)
        encoded_document = self._doc2wid[docid].get()
        encoded_document_len = len(encoded_document)

        # Search with a start offset rather than slicing the document, so no
        # copy of the remaining text is made for every retry.
        start = encoded_document.find(encoded_wids)
        while start != -1:
            end = start + encoded_wids_len

            if end >= encoded_document_len:
                # The hit runs up to the very end of the document.
                return True

            # Slice (instead of index) so ord() always receives a
            # length-one string, for both str and bytes documents.
            if ord(encoded_document[end:end + 1]) > 127:
                # The next byte starts a new word -> a *real* hit.
                return True

            # The hit ended inside a longer word; retry one byte further on.
            start = encoded_document.find(encoded_wids, start + 1)

        return False
Ejemplo n.º 4
0
    def hasContigousWordids(self, docid, wordids):
        """Check whether the phrase ``wordids`` appears in document ``docid``.

        Phrase search relies on the fact that the string-encoded word ids can
        be located with ``find()``.  A plain substring match may however stop
        in the middle of a longer encoded word, so a match is accepted only
        when what follows it is either the end of the document or a byte
        that begins a new word (byte value above 127, i.e. high bit set).
        Rejected matches do not end the search; it resumes one byte later.

        Arguments:
            docid   -- key of the document in ``self._doc2wid``.
            wordids -- the word ids of the phrase, in order.

        Returns ``True`` when an accepted match is found, else ``False``.
        """
        encoded_wids = encode(wordids)
        encoded_wids_len = len(encoded_wids)
        encoded_document = self._doc2wid[docid].get()
        encoded_document_len = len(encoded_document)

        # find() with an explicit start offset avoids re-slicing (copying)
        # the tail of the document on every iteration.
        hit = encoded_document.find(encoded_wids)
        while hit != -1:
            after = hit + encoded_wids_len

            if after >= encoded_document_len:
                # Match is flush with the end of the document.
                return True

            # Slicing keeps ord() working for both str and bytes input.
            if ord(encoded_document[after:after + 1]) > 127:
                # Following byte opens a new word -> genuine match.
                return True

            # Match ended inside a longer word; keep looking.
            hit = encoded_document.find(encoded_wids, hit + 1)

        return False
Ejemplo n.º 5
0
    def insertDocument(self, docid, widlist):
        """Index (or re-index) a document under ``docid``.

        Stores the encoded word-id list for the document, registers the
        docid with every word id in ``widlist`` and, when the document was
        indexed before with different content, unregisters it from word ids
        that no longer occur.  The number of entries in ``widlist`` is
        recorded in ``self._docweight``.

        Arguments:
            docid   -- key identifying the document.
            widlist -- the word ids of the document, in document order.
        """
        # A docid that is not stored yet increases the document counter.
        if docid not in self._doc2wid:
            self._length.change(1)

        enc_widlist = encode(widlist)
        old_enc_widlist = self._doc2wid.get(docid)
        if old_enc_widlist is not None:
            old_enc_widlist = old_enc_widlist.get()  # unwrap _PS instance

        # Only rewrite the stored word list when it actually changed, and
        # work out which word ids dropped out of the document.
        removed_wordids = []
        if old_enc_widlist != enc_widlist:
            self._doc2wid[docid] = _PS(enc_widlist)
            if old_enc_widlist is not None:
                old_widlist = IISet(decode(old_enc_widlist))
                removed_wordids = difference(old_widlist, IISet(widlist))

        tree = self._wid2doc
        count = 0
        for wid in widlist:
            count += 1
            docids = tree.get(wid)  # single lookup per word id
            if docids is None:
                tree[wid] = DocidList([docid])
            elif docid not in docids:
                # Membership is tested before inserting, as in the original
                # code -- presumably to avoid touching an unchanged list.
                docids.insert(docid)

        for wid in removed_wordids:
            docids = tree.get(wid)
            if docids is not None:
                try:
                    docids.remove(docid)
                except KeyError:
                    # Best effort: the docid was not registered for this
                    # word id, so there is nothing to remove.
                    pass

        self._docweight[docid] = count
Ejemplo n.º 6
0
    def insertDocument(self, docid, widlist):
        """Add a document to the storage, or update an already stored one.

        The encoded form of ``widlist`` is saved under ``docid``; the docid
        is added to the docid list of each word id in ``widlist``; and if
        the document had been stored before with a different word list, the
        docid is removed from the word ids that disappeared.  Finally the
        number of entries in ``widlist`` is written to ``self._docweight``.

        Arguments:
            docid   -- key identifying the document.
            widlist -- the word ids of the document, in document order.
        """
        # First time this docid is seen: bump the document counter.
        if docid not in self._doc2wid:
            self._length.change(1)

        enc_widlist = encode(widlist)
        old_enc_widlist = self._doc2wid.get(docid)
        if old_enc_widlist is not None:
            old_enc_widlist = old_enc_widlist.get()  # unwrap _PS instance

        # Replace the stored word list only on change and collect the word
        # ids that were in the old version but are not in the new one.
        removed_wordids = []
        if old_enc_widlist != enc_widlist:
            self._doc2wid[docid] = _PS(enc_widlist)
            if old_enc_widlist is not None:
                old_widlist = IISet(decode(old_enc_widlist))
                removed_wordids = difference(old_widlist, IISet(widlist))

        tree = self._wid2doc
        count = 0
        for wid in widlist:
            count += 1
            docids = tree.get(wid)  # one lookup instead of has_key + []
            if docids is None:
                tree[wid] = DocidList([docid])
            elif docid not in docids:
                # Kept from the original: insert only when missing --
                # presumably so an unchanged list is not modified.
                docids.insert(docid)

        for wid in removed_wordids:
            docids = tree.get(wid)
            if docids is not None:
                try:
                    docids.remove(docid)
                except KeyError:
                    # Best effort: docid was not listed for this word id.
                    pass

        self._docweight[docid] = count