def getPositions(self, docid, wordid):
    """Return the positions at which ``wordid`` occurs in a document.

    The encoded wid string stored for ``docid`` in ``self._doc2wid`` is
    decoded and scanned in order; every index at which the decoded wid
    equals ``wordid`` is collected.

    :param docid: key of the document in ``self._doc2wid``.
    :param wordid: the word id to look for.
    :returns: an ``IITreeSet`` holding the zero-based positions of
        ``wordid`` within the decoded wid sequence (empty if the word
        does not occur).

    An unknown ``docid`` propagates whatever the ``self._doc2wid`` lookup
    raises.
    """
    # NOTE: the previous implementation also computed encode((wordid,))
    # here but never used the result; that dead call has been dropped.
    encoded_document = self._doc2wid[docid].get()

    positions = IITreeSet()
    for pos, wid in enumerate(decode(encoded_document)):
        if wid == wordid:
            positions.insert(pos)
    return positions
def getPositions(self, docid, wordid):
    """Return the positions at which ``wordid`` occurs in a document.

    The encoded wid string stored for ``docid`` in ``self._doc2wid`` is
    decoded and scanned in order; every index at which the decoded wid
    equals ``wordid`` is collected.

    :param docid: key of the document in ``self._doc2wid``.
    :param wordid: the word id to look for.
    :returns: an ``IITreeSet`` holding the zero-based positions of
        ``wordid`` within the decoded wid sequence (empty if the word
        does not occur).

    An unknown ``docid`` propagates whatever the ``self._doc2wid`` lookup
    raises.
    """
    # NOTE: the previous implementation also computed encode((wordid,))
    # here but never used the result; that dead call has been dropped.
    encoded_document = self._doc2wid[docid].get()

    positions = IITreeSet()
    for pos, wid in enumerate(decode(encoded_document)):
        if wid == wordid:
            positions.insert(pos)
    return positions
def hasContigousWordids(self, docid, wordids):
    """Return True if ``wordids`` occur as a contiguous run in a document.

    *The trick* to perform a phrase search is that the string-encoded wids
    can be searched with ``find()``.  A raw substring hit is not
    sufficient, however: the match may end in the middle of an encoded
    word.  A hit only counts when it is followed either by the end of the
    encoded document or by a byte whose value is > 127, which marks the
    start of a new word.  The search therefore continues past rejected
    hits until an acceptable one is found or the document is exhausted.

    :param docid: key of the document in ``self._doc2wid``.
    :param wordids: sequence of word ids forming the phrase.
    :returns: ``True`` if an acceptable occurrence exists, else ``False``.
    """
    encoded_wids = encode(wordids)
    encoded_wids_len = len(encoded_wids)
    encoded_document = self._doc2wid[docid].get()
    encoded_document_len = len(encoded_document)

    offset = 0
    while True:
        # Search in place from ``offset``; slicing the document first
        # would copy the whole remainder on every iteration.
        pos = encoded_document.find(encoded_wids, offset)
        if pos == -1:
            # No further candidate in the rest of the document.
            return False

        end = pos + encoded_wids_len
        if end >= encoded_document_len:
            # The hit reaches the very end of the encoded document.
            return True

        # A one-element slice (rather than an index) keeps ord() working
        # whether indexing the document yields a character or an int.
        if ord(encoded_document[end:end + 1]) > 127:
            # Next byte starts a new word -> we *really* found the phrase.
            return True

        # The hit ended inside a longer word: retry one byte further on.
        offset = pos + 1
def hasContigousWordids(self, docid, wordids):
    """Return True if ``wordids`` occur as a contiguous run in a document.

    *The trick* to perform a phrase search is that the string-encoded wids
    can be searched with ``find()``.  A raw substring hit is not
    sufficient, however: the match may end in the middle of an encoded
    word.  A hit only counts when it is followed either by the end of the
    encoded document or by a byte whose value is > 127, which marks the
    start of a new word.  The search therefore continues past rejected
    hits until an acceptable one is found or the document is exhausted.

    :param docid: key of the document in ``self._doc2wid``.
    :param wordids: sequence of word ids forming the phrase.
    :returns: ``True`` if an acceptable occurrence exists, else ``False``.
    """
    encoded_wids = encode(wordids)
    encoded_wids_len = len(encoded_wids)
    encoded_document = self._doc2wid[docid].get()
    encoded_document_len = len(encoded_document)

    offset = 0
    while True:
        # Search in place from ``offset``; slicing the document first
        # would copy the whole remainder on every iteration.
        pos = encoded_document.find(encoded_wids, offset)
        if pos == -1:
            # No further candidate in the rest of the document.
            return False

        end = pos + encoded_wids_len
        if end >= encoded_document_len:
            # The hit reaches the very end of the encoded document.
            return True

        # A one-element slice (rather than an index) keeps ord() working
        # whether indexing the document yields a character or an int.
        if ord(encoded_document[end:end + 1]) > 127:
            # Next byte starts a new word -> we *really* found the phrase.
            return True

        # The hit ended inside a longer word: retry one byte further on.
        offset = pos + 1
def insertDocument(self, docid, widlist):
    """Index (or re-index) the document ``docid`` with the wids in ``widlist``.

    Steps performed:

    * bump ``self._length`` when ``docid`` is not yet in ``self._doc2wid``;
    * store the encoded wid list for the document when it differs from
      the one already stored;
    * add ``docid`` to the posting list of every wid in ``widlist``;
    * remove ``docid`` from the posting lists of wids that were in the
      previously stored list but are no longer present;
    * record the number of wids in ``self._docweight[docid]``.
    """
    # A docid not seen before increases the document counter.
    if not self._doc2wid.has_key(docid):
        self._length.change(1)

    new_encoded = encode(widlist)
    previous_encoded = self._doc2wid.get(docid)
    if previous_encoded is not None:
        previous_encoded = previous_encoded.get()  # unwrap _PS instance

    # Wids of the old version that dropped out of the document.
    stale_wids = []
    if previous_encoded != new_encoded:
        self._doc2wid[docid] = _PS(new_encoded)
        if previous_encoded is not None:
            previous_wids = IISet(decode(previous_encoded))
            stale_wids = difference(previous_wids, IISet(widlist))

    wid2doc = self._wid2doc
    has_wid = wid2doc.has_key

    # ``total`` ends up as the number of entries in widlist (0 if empty).
    total = 0
    for total, wid in enumerate(widlist, 1):
        if has_wid(wid):
            docids = wid2doc[wid]
            if docid not in docids:
                docids.insert(docid)
        else:
            wid2doc[wid] = DocidList([docid])

    for wid in stale_wids:
        if not has_wid(wid):
            continue
        try:
            wid2doc[wid].remove(docid)
        except KeyError:
            # docid was not registered for this wid; nothing to undo.
            pass

    self._docweight[docid] = total
def insertDocument(self, docid, widlist):
    """Index (or re-index) the document ``docid`` with the wids in ``widlist``.

    Steps performed:

    * bump ``self._length`` when ``docid`` is not yet in ``self._doc2wid``;
    * store the encoded wid list for the document when it differs from
      the one already stored;
    * add ``docid`` to the posting list of every wid in ``widlist``;
    * remove ``docid`` from the posting lists of wids that were in the
      previously stored list but are no longer present;
    * record the number of wids in ``self._docweight[docid]``.
    """
    # A docid not seen before increases the document counter.
    if not self._doc2wid.has_key(docid):
        self._length.change(1)

    new_encoded = encode(widlist)
    previous_encoded = self._doc2wid.get(docid)
    if previous_encoded is not None:
        previous_encoded = previous_encoded.get()  # unwrap _PS instance

    # Wids of the old version that dropped out of the document.
    stale_wids = []
    if previous_encoded != new_encoded:
        self._doc2wid[docid] = _PS(new_encoded)
        if previous_encoded is not None:
            previous_wids = IISet(decode(previous_encoded))
            stale_wids = difference(previous_wids, IISet(widlist))

    wid2doc = self._wid2doc
    has_wid = wid2doc.has_key

    # ``total`` ends up as the number of entries in widlist (0 if empty).
    total = 0
    for total, wid in enumerate(widlist, 1):
        if has_wid(wid):
            docids = wid2doc[wid]
            if docid not in docids:
                docids.insert(docid)
        else:
            wid2doc[wid] = DocidList([docid])

    for wid in stale_wids:
        if not has_wid(wid):
            continue
        try:
            wid2doc[wid].remove(docid)
        except KeyError:
            # docid was not registered for this wid; nothing to undo.
            pass

    self._docweight[docid] = total