    def __init__(self, directory):
        self._directory = directory

        # open the key shelve
        # Format:
        #   ( matrix, docsets )
        #   matrix = TermSimilitudeMatrix
        #   docsets = FrozenStringList
        keyfilename = os.path.join(directory, "compindex.key.bz2")
        fh = CompressedFile(keyfilename, "rb")
        matrix, docsets = cPickle.load(fh)
        fh.close()

        matrix = TermSimilitudeMatrix.unpickle(matrix)
        docsets = FrozenStringList.unpickle(docsets)

        self.matrix, self.docsets = matrix, docsets

        # see how many id files we have
        filenames = []
        for fn in os.listdir(directory):
            if fn.startswith("compindex-") and fn.endswith(".ids.bz2"):
                filenames.append(fn)
        self.idfiles_count = len(filenames)
    def _get_ids_shelve(self, cual):
        '''Return the ids index.'''
        fname = os.path.join(self._directory, "compindex-%02d.ids.bz2" % cual)
        fh = CompressedFile(fname, "rb")
        idx = cPickle.load(fh)
        fh.close()
        return idx
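
CompressedFile is used throughout these examples but never defined in them. A minimal stand-in, assuming the index code only needs a file-like object with read/write/tell/close over a bz2-compressed file (the project's real class may add more), could be:

import bz2

def CompressedFile(filename, mode):
    # hypothetical stand-in: bz2.BZ2File already provides the read/write/
    # tell/close behaviour the index code relies on
    return bz2.BZ2File(filename, mode)
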
Example #3
def decomp(fname):
    """Decompress a file and return a dict."""
    fh = CompressedFile(fname, "rb")
    # encoding needed for compatibility w/ py2 cPickle
    idx = pickle.load(fh, encoding="latin1")
    fh.close()
    return idx
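
This reader-side helper is Python 3 code; the encoding="latin1" argument lets it load buckets pickled by Python 2's cPickle, because latin-1 maps every byte value to a code point and old byte strings survive the load. A tiny self-check, assuming the bz2-backed CompressedFile stand-in sketched above and an illustrative path and payload:

import bz2
import pickle

# write a bucket with pickle protocol 2, as the indexer does ...
sample = {0: "Portada", 1: "Argentina"}
with bz2.BZ2File("/tmp/compindex-00.ids.bz2", "wb") as fh:
    pickle.dump(sample, fh, 2)

# ... and read it back through decomp()
assert decomp("/tmp/compindex-00.ids.bz2") == sample
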
Example #4
    def create(cls, directory, source):
        '''Creates the index in the directory.

        The "source" generates (key, value) pairs to store in the index.  The
        key must be a string; the value can be any hashable Python object.

        Returns the quantity of pairs indexed.
        '''
        ids_shelf = {}
        key_shelf = {}
        ids_cnter = 0
        tmp_reverse_id = {}
        indexed_counter = 0

        # fill them
        for key, value in source:
            indexed_counter += 1

            # process key
            if not isinstance(key, basestring):
                raise TypeError("The key must be string or unicode")

            # docid -> info final
            if value in tmp_reverse_id:
                docid = tmp_reverse_id[value]
            else:
                docid = ids_cnter
                tmp_reverse_id[value] = docid
                ids_cnter += 1
            ids_shelf[docid] = value

            # keys -> docid
            key_shelf.setdefault(key, set()).add(docid)

        # save key
        keyfilename = os.path.join(directory, "easyindex.key.bz2")
        fh = CompressedFile(keyfilename, "wb")
        cPickle.dump(key_shelf, fh, 2)
        fh.close()

        # split ids_shelf in N dicts of about ~5k entries
        N = int(round(len(ids_shelf) / 5000.0))
        if not N:
            N = 1
        all_idshelves = [{} for i in range(N)]
        for k, v in ids_shelf.iteritems():
            cual = utiles.coherent_hash(k) % N
            all_idshelves[cual][k] = v

        # save dict where corresponds
        for cual, shelf in enumerate(all_idshelves):
            fname = "easyindex-%03d.ids.bz2" % cual
            idsfilename = os.path.join(directory, fname)
            fh = CompressedFile(idsfilename, "wb")
            cPickle.dump(shelf, fh, 2)
            fh.close()

        return indexed_counter
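
utiles.coherent_hash is not shown in these snippets. The bucket split only works if hashing a docid gives the same result on every run, which the builtin hash() does not guarantee (it varies across Python versions and, with hash randomization, across processes). A hypothetical md5-based stand-in with that stability property (the real utiles helper may be implemented differently):

import hashlib

def coherent_hash(value):
    # hypothetical: derive a stable integer from the repr of the value so
    # the same docid always lands in the same bucket file
    digest = hashlib.md5(repr(value).encode("utf8")).hexdigest()
    return int(digest, 16)

With a stable hash, coherent_hash(k) % N selects the same .ids.bz2 bucket at indexing time and at lookup time.
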
Example #5
    def create(cls, directory, source):
        '''Creates the index in the directory.

        The "source" generates (key, value) pairs to store in the index.  The
        key must be a string; the value can be any hashable Python object.

        Returns the quantity of pairs indexed.
        '''
        ids_shelf = {}
        key_shelf = {}
        ids_cnter = 0
        tmp_reverse_id = {}
        indexed_counter = 0

        # fill them
        for key, value in source:
            indexed_counter += 1

            # process key
            if not isinstance(key, basestring):
                raise TypeError("The key must be string or unicode")

            # docid -> info final
            if value in tmp_reverse_id:
                docid = tmp_reverse_id[value]
            else:
                docid = ids_cnter
                tmp_reverse_id[value] = docid
                ids_cnter += 1
            ids_shelf[docid] = value

            # keys -> docid
            key_shelf.setdefault(key, set()).add(docid)

        # save key
        keyfilename = os.path.join(directory, "easyindex.key.bz2")
        fh = CompressedFile(keyfilename, "wb")
        cPickle.dump(key_shelf, fh, 2)
        fh.close()

        # split ids_shelf in N dicts of about ~5k entries
        N = int(round(len(ids_shelf) / 5000.0))
        if not N:
            N = 1
        all_idshelves = [{} for i in range(N)]
        for k, v in ids_shelf.iteritems():
            cual = utiles.coherent_hash(k) % N
            all_idshelves[cual][k] = v

        # save dict where corresponds
        for cual, shelf in enumerate(all_idshelves):
            fname = "easyindex-%03d.ids.bz2" % cual
            idsfilename = os.path.join(directory, fname)
            fh = CompressedFile(idsfilename, "wb")
            cPickle.dump(shelf, fh, 2)
            fh.close()

        return indexed_counter
Example #6
    def __init__(self, directory):
        self._directory = directory

        # open the key shelve
        keyfilename = os.path.join(directory, "easyindex.key.bz2")
        fh = CompressedFile(keyfilename, "rb")
        self.key_shelf = cPickle.load(fh)
        fh.close()

        # see how many id files we have
        filenames = []
        for fn in os.listdir(directory):
            if fn.startswith("easyindex-") and fn.endswith(".ids.bz2"):
                filenames.append(fn)
        self.idfiles_count = len(filenames)
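
Only the constructor is shown here; the lookup path is not part of this snippet. A hedged sketch of what a value lookup could look like for this class, assuming the same coherent_hash(docid) % N bucket rule that create() uses above (the method name and exact signature are illustrative, not the project's actual API):

    def values(self, key):
        '''Hypothetical sketch: return the stored values for a key.'''
        results = []
        for docid in self.key_shelf.get(key, ()):
            # same bucket rule as create(): stable hash of the docid,
            # modulo the number of id files found on disk
            cual = utiles.coherent_hash(docid) % self.idfiles_count
            fname = os.path.join(self._directory, "easyindex-%03d.ids.bz2" % cual)
            fh = CompressedFile(fname, "rb")
            idx = cPickle.load(fh)
            fh.close()
            results.append(idx[docid])
        return results
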
Example #7
    def __init__(self, directory):
        self._directory = directory

        # open the key shelve
        keyfilename = os.path.join(directory, "easyindex.key.bz2")
        fh = CompressedFile(keyfilename, "rb")
        self.key_shelf = cPickle.load(fh)
        fh.close()

        # see how many id files we have
        filenames = []
        for fn in os.listdir(directory):
            if fn.startswith("easyindex-") and fn.endswith(".ids.bz2"):
                filenames.append(fn)
        self.idfiles_count = len(filenames)
    def create(cls, directory, source):
        '''Creates the index in the directory.

        The "source" generates (key, value) pairs to store in the index.  The
        key must be a string; the value can be any hashable Python object.

        Returns the quantity of pairs indexed.
        '''
        ids_shelf = {}
        key_shelf = {}
        ids_cnter = 0
        tmp_reverse_id = {}
        indexed_counter = 0

        # fill them
        for key, value in source:
            indexed_counter += 1

            # process key
            if not isinstance(key, basestring):
                raise TypeError("The key must be string or unicode")
            if '\n' in key:
                raise ValueError("Key cannot contain newlines")

            # docid -> info final
            if value in tmp_reverse_id:
                docid = tmp_reverse_id[value]
            else:
                docid = ids_cnter
                tmp_reverse_id[value] = docid
                ids_shelf[docid] = value
                ids_cnter += 1

            # keys -> docid
            if key in key_shelf:
                bucket = key_shelf[key]
            else:
                # Let's use array: it's more compact in memory, and since the
                # caller can easily remove most repetitions, the duplicated
                # entries should add very little overhead.
                #
                # NOTE: right now, at most one repetition per property is sent
                # by cdpindex.py
                bucket = key_shelf[key] = array.array('l')
            bucket.append(docid)

        # prepare for serialization:
        # turn docsets into lists of delta-encoded integers (they're more compressible)
        print " Delta-encoding index buckets...",
        sys.stdout.flush()

        bucket_bytes = 0
        bucket_entries = 0
        bucket_maxentries = 0
        for key, docset in key_shelf.iteritems():
            key_shelf[key] = delta_encode(docset)
            bucket_entries += len(docset)
            bucket_bytes += len(key_shelf[key])
            bucket_maxentries = max(bucket_maxentries, len(docset))

            assert delta_decode(key_shelf[key]) == set(docset), \
                ("Delta-encoding error", docset)

        print "done"

        # print statistics

        print "  Index contains:"
        print "      ", len(key_shelf), "terms"
        print "      ", bucket_entries, "entries"
        print "      ", len(ids_shelf), "documents"
        print
        print "      ", len(key_shelf) // max(1,len(ids_shelf)), "terms on avg per documents"
        print
        print "  Bucket bytes", bucket_bytes
        print "  Bucket entries", bucket_entries
        print "  Bucket maximum size", bucket_maxentries
        print "  Avg bytes per entry", (float(bucket_bytes) / max(1,bucket_entries))

        # save key
        # Format:
        #   ( matrix, docsets )
        #   Putting all keys together makes them more compressible.
        #   Sorting them (skeys) further helps.
        #   Joining them in a single string avoids pickling overhead
        #       (50% average with so many small strings)
        #   And keeping them joined in memory (FrozenStringList) helps
        #   avoid referencing overhead.

        sitems = sorted([ (k.encode("utf8"),v)
                          for k,v in key_shelf.iteritems() ])
        assert all('\n' not in k for k,v in sitems), \
            "Terms cannot contain newlines"

        # free the big dict... eats up a lot
        del key_shelf

        print " Computing similitude matrix...",
        sys.stdout.flush()


        def progress_cb(p):
            print >> sys.stderr, "\r Computing similitude matrix...  %d%%\t" % int(p),
            sys.stderr.flush()

        matrix = TermSimilitudeMatrix(map(operator.itemgetter(0), sitems),
                progress_callback = progress_cb)
        docsets = FrozenStringList(map(operator.itemgetter(1), sitems))
        del sitems

        print "done"
        print " Saving:"

        keyfilename = os.path.join(directory, "compindex.key.bz2")
        fh = CompressedFile(keyfilename, "wb")
        cPickle.dump( (matrix.pickle(), docsets.pickle()), fh, 2)
        print "  Uncompressed keystore bytes", fh.tell()
        fh.close()

        fh = open(keyfilename, "rb")
        fh.seek(0,2)
        print "  Final keystore bytes", fh.tell()
        print
        fh.close()

        # split ids_shelf in N dicts of about ~16M pickled data each,
        # this helps get better compression ratios
        NB = sum( len(cPickle.dumps(item,2)) for item in ids_shelf.iteritems() )
        print "  Total docstore bytes", NB

        N = int((NB + DOCSTORE_BUCKET_SIZE/2) // DOCSTORE_BUCKET_SIZE)
        if not N:
            N = 1
        print "  Docstore buckets", N, "(", NB//N, " bytes per bucket)"
        all_idshelves = [{} for i in xrange(N)]
        for k, v in ids_shelf.iteritems():
            cual = k % N
            all_idshelves[cual][k] = v

        # save dict where corresponds
        docucomp = 0
        doccomp = 0
        for cual, shelf in enumerate(all_idshelves):
            fname = "compindex-%02d.ids.bz2" % cual
            idsfilename = os.path.join(directory, fname)
            fh = CompressedFile(idsfilename, "wb")
            cPickle.dump(shelf, fh, 2)
            docucomp += fh.tell()
            fh.close()

            fh = open(idsfilename, "rb")
            fh.seek(0,2)
            doccomp += fh.tell()
            fh.close()

        print "  Docstore uncompressed bytes", docucomp
        print "  Docstore compressed bytes", doccomp
        print

        return indexed_counter
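
delta_encode and delta_decode are also external to this snippet. The only contract the code above relies on is that encoding a docset produces a compact string and that decoding that string returns the original set of docids (which is exactly what the assert checks). A hypothetical pair honoring that contract, storing the sorted gaps as text (the real functions almost certainly use a denser binary encoding):

def delta_encode(docset):
    # hypothetical: sort the docids and keep only the gaps between
    # consecutive ones; small repeated gaps compress very well
    prev = 0
    deltas = []
    for docid in sorted(set(docset)):
        deltas.append(str(docid - prev))
        prev = docid
    return " ".join(deltas)


def delta_decode(encoded):
    # inverse of delta_encode: cumulative sum of the stored gaps
    docids = set()
    accum = 0
    for delta in encoded.split():
        accum += int(delta)
        docids.add(accum)
    return docids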