Example #1
 def _get_ids_shelve(self, cual):
     '''Return the ids index.'''
     fname = os.path.join(self._directory, "compindex-%02d.ids.bz2" % cual)
     fh = CompressedFile(fname, "rb")
     idx = cPickle.load(fh)
     fh.close()
     return idx
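The `cual` argument selects one of the numbered `compindex-*.ids.bz2` bucket files. Judging from the matching create() code (Example #37), docids are distributed over the buckets with `docid % N`, so a document lookup would presumably look like the sketch below; the method name `get_doc` is hypothetical.

 def get_doc(self, docid):
     '''Hypothetical lookup: load the bucket the docid was stored in.'''
     idx = self._get_ids_shelve(docid % self.idfiles_count)
     return idx[docid]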
Example #2
    def __init__(self, directory):
        self._directory = directory

        # open the key shelve
        # Format:
        #   ( matrix, docsets )
        #   matrix = TermSimilitudeMatrix
        #   docsets = FrozenStringList
        keyfilename = os.path.join(directory, "compindex.key.bz2")
        fh = CompressedFile(keyfilename, "rb")
        matrix, docsets = cPickle.load(fh)
        fh.close()

        matrix = TermSimilitudeMatrix.unpickle(matrix)
        docsets = FrozenStringList.unpickle(docsets)

        self.matrix, self.docsets = matrix, docsets

        # see how many id files we have
        idsfilename = os.path.join(directory, "compindex-*.ids.bz2")
        filenames = []
        for fn in os.listdir(directory):
            if fn.startswith("compindex-") and \
                fn.endswith(".ids.bz2"):
                filenames.append(fn)
        self.idfiles_count = len(filenames)
Example #3
File: easy_index.py Project: PyAr/CDPedia
    def __init__(self, directory):
        self._directory = directory

        # open the key shelve
        keyfilename = os.path.join(directory, "easyindex.key.bz2")
        fh = CompressedFile(keyfilename, "rb")
        self.key_shelf = cPickle.load(fh)
        fh.close()

        # see how many id files we have
        idsfilename = os.path.join(directory, "easyindex-*.ids.bz2")
        filenames = []
        for fn in os.listdir(directory):
            if fn.startswith("easyindex-") and \
                fn.endswith(".ids.bz2"):
                filenames.append(fn)
        self.idfiles_count = len(filenames)
Example #4
File: compresor.py Project: PyAr/CDPedia
 def __init__(self, fname, verbose=False, manager=None):
     if os.path.exists(fname):
         self.fh = CompressedFile(fname, "rb")
         self.header_size = struct.unpack("<l", self.fh.read(4))[0]
         header_bytes = self.fh.read(self.header_size)
         self.header = pickle.loads(header_bytes)
     else:
         # no need to define self.fh or self.header_size: they will never
         # be used because the item will never be found in the header
         self.header = {}
     self.verbose = verbose
     self.manager = manager
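Given the layout read above (a 4-byte little-endian header length, the pickled header, then the concatenated article bodies), pulling a single article back out of a block can be sketched as follows. This is a minimal standalone sketch, assuming CompressedFile is essentially bz2.BZ2File; the real class presumably exposes an equivalent accessor.

import bz2
import pickle
import struct

def read_article(block_path, filename):
    '''Return the raw bytes of one article, or the redirect target name.'''
    with bz2.BZ2File(block_path, "rb") as fh:
        header_size = struct.unpack("<l", fh.read(4))[0]
        header = pickle.loads(fh.read(header_size))
        item = header[filename]
        if not isinstance(item, tuple):
            return item                        # a redirect: the target page name
        offset, size = item
        fh.seek(4 + header_size + offset)      # data starts right after the header
        return fh.read(size)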
Example #5
File: easy_index.py Project: PyAr/CDPedia
    def create(cls, directory, source):
        '''Creates the index in the directory.

        The "source" generates pairs (key, value) to store in the index.  The
        key must be a string, the value can be any hashable Python object.

        It must return the quantity of pairs indexed.
        '''
        ids_shelf = {}
        key_shelf = {}
        ids_cnter = 0
        tmp_reverse_id = {}
        indexed_counter = 0

        # fill them
        for key, value in source:
            indexed_counter += 1

            # process key
            if not isinstance(key, basestring):
                raise TypeError("The key must be string or unicode")

            # docid -> final info
            if value in tmp_reverse_id:
                docid = tmp_reverse_id[value]
            else:
                docid = ids_cnter
                tmp_reverse_id[value] = docid
                ids_cnter += 1
            ids_shelf[docid] = value

            # keys -> docid
            key_shelf.setdefault(key, set()).add(docid)

        # save key
        keyfilename = os.path.join(directory, "easyindex.key.bz2")
        fh = CompressedFile(keyfilename, "wb")
        cPickle.dump(key_shelf, fh, 2)
        fh.close()

        # split ids_shelf in N dicts of about ~5k entries
        N = int(round(len(ids_shelf) / 5000.0))
        if not N:
            N = 1
        all_idshelves = [{} for i in range(N)]
        for k,v in ids_shelf.iteritems():
            cual = utiles.coherent_hash(k) % N
            all_idshelves[cual][k] = v

        # save dict where corresponds
        for cual, shelf in enumerate(all_idshelves):
            fname = "easyindex-%03d.ids.bz2" % cual
            idsfilename = os.path.join(directory, fname)
            fh = CompressedFile(idsfilename, "wb")
            cPickle.dump(shelf, fh, 2)
            fh.close()

        return indexed_counter
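A hypothetical way to drive this classmethod, assuming it is exposed on a class named Index (the actual class name in easy_index.py may differ). The source is any iterable of (key, value) pairs and the return value is the number of pairs indexed:

def source():
    yield u"portada", "article_0001.html"
    yield u"wikipedia", "article_0001.html"
    yield u"python", "article_0002.html"

indexed = Index.create("/tmp/easyindex", source())   # 3 pairs indexed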
Example #6
File: compresor.py Project: PyAr/CDPedia
    def crear(self, redirects, bloqNum, top_filenames, verbose=False):
        '''Generate the compressed block.'''
        if verbose:
            print "Processing block", bloqNum

        header = {}

        # Fill the header with the real files, using the page as the key
        # and the (position, size) pair as the value
        seek = 0
        for dir3, filename in top_filenames:
            fullName = path.join(config.DIR_PAGSLISTAS, dir3, filename)
            size = path.getsize(fullName)
            header[filename] = (seek, size)
            seek += size

        # Also put the redirects in the header, in this case pointing to
        # the name of the page being redirected to
        for orig, dest in redirects:
            header[orig] = dest

        headerBytes = pickle.dumps(header)
        if verbose:
            print "  files: %d   total seek: %d   header length: %d" % (
                                    len(top_filenames), seek, len(headerBytes))

        # open the file to compress
        nomfile = path.join(config.DIR_BLOQUES, "%08x.cdp" % bloqNum)
        if verbose:
            print "  writing to", nomfile
        f = CompressedFile(nomfile, "wb")

        # write the header length, then the header itself
        f.write(struct.pack("<l", len(headerBytes)))
        f.write(headerBytes)

        # write each of the articles
        for dir3, filename in top_filenames:
            fullName = path.join(config.DIR_PAGSLISTAS, dir3, filename)
            f.write(open(fullName, "rb").read())
Example #7
    def setup_function(self):

        self.data.representation = self.calc_mode

        #
        # Init arrays
        #

        # Prepare some array shapes
        gradient_params_shape = (len(self.gradient_param_names), )

        if self.data.is_map:
            # speed up calculation by adding links
            # as nominal flux doesn't depend on the (outgoing) flavour
            self.data.link_containers('nu', [
                'nue_cc', 'numu_cc', 'nutau_cc', 'nue_nc', 'numu_nc',
                'nutau_nc'
            ])

            self.data.link_containers('nubar', [
                'nuebar_cc', 'numubar_cc', 'nutaubar_cc', 'nuebar_nc',
                'numubar_nc', 'nutaubar_nc'
            ])

        # Loop over containers
        for container in self.data:

            # Define shapes for containers

            # TODO maybe include toggles for nutau (only needed if prompt
            # considered) and for nu+nubar (only needed if nu->nubar
            # oscillations included) for better speed/memory performance

            # [ N events, 2 flavors in flux, nu vs nubar ]
            # SDB - reduced flavours to 2 (nue, numu) since nutau flux not
            # stored in MCEq splines
            flux_container_shape = (container.size, 2)
            gradients_shape = tuple(
                list(flux_container_shape) + list(gradient_params_shape))

            container["nu_flux"] = np.full(flux_container_shape,
                                           np.NaN,
                                           dtype=FTYPE)
            container["gradients"] = np.full(gradients_shape,
                                             np.NaN,
                                             dtype=FTYPE)

        # Also create an array container to hold the gradient parameter values
        # Only want this once, e.g. not once per container
        self.gradient_params = np.empty(gradient_params_shape, dtype=FTYPE)

        #
        # Load MCEq splines
        #

        # Have splines for each Barr parameter, plus +/- versions of each
        # Barr parameter corresponding to mesons/antimesons.

        # For a given Barr parameter, the underlying dictionary has the following
        # keys: "dnumu", "dnumubar", "dnue", "dnuebar"

        # Units are changed to m^-2 in creates_splines.., rather than the cm^2
        # used as the unit of calculation in MCEq!

        # Note that all of this is done on the CPU, since the splines reside there.
        # The actual `compute_function` computation can be done on GPUs though.

        # Load the MCEq splines
        spline_file = find_resource(self.table_file)
        logging.info("Loading MCEq spline tables from : %s", spline_file)
        # Encoding is to support pickle files created with python v2
        self.spline_tables_dict = pickle.load(BZ2File(spline_file),
                                              encoding="latin1")

        # Ensure that the user is not loading an incompatible spline
        for bp in self.barr_param_names:
            bp_p = bp + '+'  # meson
            bp_m = bp + '-'  # antimeson
            assert bp_p in self.spline_tables_dict.keys(), (
                "Gradient parameter '%s' missing from table" % bp_p)
            assert bp_m in self.spline_tables_dict.keys(), (
                "Gradient parameter '%s' missing from table" % bp_m)

        # Loop over containers
        for container in self.data:

            # Grab containers here once to save time
            # TODO make spline generation script store splines directly in
            # terms of energy, not ln(energy)
            true_log_energy = np.log(container["true_energy"])
            true_abs_coszen = np.abs(container["true_coszen"])
            gradients = container["gradients"]
            nubar = container["nubar"]

            #
            # Flux gradients
            #

            # Evaluate splines to get the flux gradients w.r.t. the Barr parameter values
            # Need to correctly map nu/nubar and flavor to the output arrays

            # Loop over parameters
            for (
                    gradient_param_name,
                    gradient_param_idx,
            ) in self.gradient_param_indices.items():

                # nue(bar)
                self._eval_spline(
                    true_log_energy=true_log_energy,
                    true_abs_coszen=true_abs_coszen,
                    spline=self.spline_tables_dict[gradient_param_name]
                    ["dnue" if nubar > 0 else "dnuebar"],
                    out=gradients[:, 0, gradient_param_idx],
                )

                # numu(bar)
                self._eval_spline(
                    true_log_energy=true_log_energy,
                    true_abs_coszen=true_abs_coszen,
                    spline=self.spline_tables_dict[gradient_param_name]
                    ["dnumu" if nubar > 0 else "dnumubar"],
                    out=gradients[:, 1, gradient_param_idx],
                )

                # nutau(bar)
                # TODO include nutau flux in splines
                # SDB - there is no nutau flux in splines
                ## gradients[:, 2, gradient_param_idx].fill(0.0)

            # Tell the smart arrays we've changed the flux gradient values on the host
            container.mark_changed("gradients")

        # don't forget to un-link everything again
        self.data.unlink_containers()
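The encoding="latin1" argument noted above is the standard way to unpickle Python 2 era files under Python 3: it maps old 8-bit str objects byte for byte instead of failing to decode them as UTF-8. A minimal standalone sketch of the same pattern, with a hypothetical file name:

import pickle
from bz2 import BZ2File

with BZ2File("mceq_splines_py2.pkl.bz2", "rb") as fh:   # hypothetical path
    spline_tables = pickle.load(fh, encoding="latin1")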
Example #8
 def testReadLineMultiStream(self):
     self.createTempFile(streams=5)
     with BZ2File(self.filename) as bz2f:
         self.assertRaises(TypeError, bz2f.readline, None)
         for line in self.TEXT_LINES * 5:
             self.assertEqual(bz2f.readline(), line)
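A multi-stream .bz2 file is simply several complete bz2 streams written back to back, and BZ2File (Python 3.3+) reads transparently across the stream boundaries. A minimal sketch of building such a file, roughly what createTempFile(streams=5) is assumed to do:

import bz2

def write_multistream(path, data, streams=5):
    with open(path, "wb") as f:
        for _ in range(streams):
            f.write(bz2.compress(data))          # each call emits one full stream

write_multistream("multi.bz2", b"some text\n", streams=5)
with bz2.BZ2File("multi.bz2") as fh:
    assert fh.read() == b"some text\n" * 5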
Example #9
File: compresor.py Project: PyAr/CDPedia
class Comprimido(Bloque):
    """A block of articles.

    A block in which the whole file, header and data alike, goes to disk
    compressed with bz2.
    """

    def __init__(self, fname, verbose=False, manager=None):
        if os.path.exists(fname):
            self.fh = CompressedFile(fname, "rb")
            self.header_size = struct.unpack("<l", self.fh.read(4))[0]
            header_bytes = self.fh.read(self.header_size)
            self.header = pickle.loads(header_bytes)
        else:
            # no need to define self.fh or self.header_size: they will never
            # be used because the item will never be found in the header
            self.header = {}
        self.verbose = verbose
        self.manager = manager

    @classmethod
    def crear(self, redirects, bloqNum, top_filenames, verbose=False):
        '''Generate the compressed block.'''
        if verbose:
            print "Processing block", bloqNum

        header = {}

        # Fill the header with the real files, using the page as the key
        # and the (position, size) pair as the value
        seek = 0
        for dir3, filename in top_filenames:
            fullName = path.join(config.DIR_PAGSLISTAS, dir3, filename)
            size = path.getsize(fullName)
            header[filename] = (seek, size)
            seek += size

        # Also put the redirects in the header, in this case pointing to
        # the name of the page being redirected to
        for orig, dest in redirects:
            header[orig] = dest

        headerBytes = pickle.dumps(header)
        if verbose:
            print "  files: %d   total seek: %d   header length: %d" % (
                                    len(top_filenames), seek, len(headerBytes))

        # open the file to compress
        nomfile = path.join(config.DIR_BLOQUES, "%08x.cdp" % bloqNum)
        if verbose:
            print "  writing to", nomfile
        f = CompressedFile(nomfile, "wb")

        # write the header length, then the header itself
        f.write(struct.pack("<l", len(headerBytes)))
        f.write(headerBytes)

        # write each of the articles
        for dir3, filename in top_filenames:
            fullName = path.join(config.DIR_PAGSLISTAS, dir3, filename)
            f.write(open(fullName, "rb").read())
Example #10
 def testRead0(self):
     self.createTempFile()
     with BZ2File(self.filename) as bz2f:
         self.assertRaises(TypeError, bz2f.read, None)
         self.assertEqual(bz2f.read(0), b"")
Example #11
 def testReadMultiStream(self):
     self.createTempFile(streams=5)
     with BZ2File(self.filename) as bz2f:
         self.assertRaises(TypeError, bz2f.read, None)
         self.assertEqual(bz2f.read(), self.TEXT * 5)
Example #12
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
This program demonstrates reading a compressed pickle.
'''

from bz2 import BZ2File
from pickle import load
from pprint import pprint

bzip = BZ2File('temporeal_pickle.bz2','r')
livros = load(bzip)
bzip.close()
print 'temporeal_pickle.bz2 read'

print len(livros), 'books in the list'

print '_' * 70
print 'The first book:'
pprint(livros[0])

meio = len(livros)/2
print '_' * 70
print 'The middle book (#%s):' % meio
pprint(livros[meio])

print '_' * 70
print 'The last book:'
pprint(livros[-1])
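The counterpart of this script, producing the compressed pickle in the first place, uses the same two modules; the book list below is hypothetical example data:

from bz2 import BZ2File
from pickle import dump

livros = [{'titulo': 'Dom Casmurro'}, {'titulo': 'Quincas Borba'}]

bzip = BZ2File('temporeal_pickle.bz2', 'w')
dump(livros, bzip)
bzip.close()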
Example #13
 def testSeekForwardBytesIO(self):
     with BytesIO(self.DATA) as bio:
         with BZ2File(bio) as bz2f:
             self.assertRaises(TypeError, bz2f.seek)
             bz2f.seek(150)
             self.assertEqual(bz2f.read(), self.TEXT[150:])
Example #14
 def testSeekForward(self):
     self.createTempFile()
     with BZ2File(self.filename) as bz2f:
         self.assertRaises(TypeError, bz2f.seek)
         bz2f.seek(150)
         self.assertEqual(bz2f.read(), self.TEXT[150:])
Example #15
 def testWriteNonDefaultCompressLevel(self):
     expected = bz2.compress(self.TEXT, compresslevel=5)
     with BZ2File(self.filename, "w", compresslevel=5) as bz2f:
         bz2f.write(self.TEXT)
     with open(self.filename, "rb") as f:
         self.assertEqual(f.read(), expected)
Example #16
 def testWrite(self):
     with BZ2File(self.filename, "w") as bz2f:
         self.assertRaises(TypeError, bz2f.write)
         bz2f.write(self.TEXT)
     with open(self.filename, 'rb') as f:
         self.assertEqual(ext_decompress(f.read()), self.TEXT)
Example #17
 def testIteratorMultiStream(self):
     self.createTempFile(streams=5)
     with BZ2File(self.filename) as bz2f:
         self.assertEqual(list(iter(bz2f)), self.TEXT_LINES * 5)
Example #18
 def testIterator(self):
     self.createTempFile()
     with BZ2File(self.filename) as bz2f:
         self.assertEqual(list(iter(bz2f)), self.TEXT_LINES)
Example #19
 def testReadLines(self):
     self.createTempFile()
     with BZ2File(self.filename) as bz2f:
         self.assertRaises(TypeError, bz2f.readlines, None)
         self.assertEqual(bz2f.readlines(), self.TEXT_LINES)
Example #20
 def testOpenPathLikeFilename(self):
     filename = pathlib.Path(self.filename)
     with BZ2File(filename, "wb") as f:
         f.write(self.DATA)
     with BZ2File(filename, "rb") as f:
         self.assertEqual(f.read(), self.DATA)
Example #21
 def testReadBytesIO(self):
     with BytesIO(self.DATA) as bio:
         with BZ2File(bio) as bz2f:
             self.assertRaises(TypeError, bz2f.read, float())
             self.assertEqual(bz2f.read(), self.TEXT)
         self.assertFalse(bio.closed)
Example #22
 def testSeekForwardAcrossStreams(self):
     self.createTempFile(streams=2)
     with BZ2File(self.filename) as bz2f:
         self.assertRaises(TypeError, bz2f.seek)
         bz2f.seek(len(self.TEXT) + 150)
         self.assertEqual(bz2f.read(), self.TEXT[150:])
Example #23
 def testSeekBackwardsBytesIO(self):
     with BytesIO(self.DATA) as bio:
         with BZ2File(bio) as bz2f:
             bz2f.read(500)
             bz2f.seek(-150, 1)
             self.assertEqual(bz2f.read(), self.TEXT[500 - 150:])
Example #24
 def testSeekBackwards(self):
     self.createTempFile()
     with BZ2File(self.filename) as bz2f:
         bz2f.read(500)
         bz2f.seek(-150, 1)
         self.assertEqual(bz2f.read(), self.TEXT[500 - 150:])
Example #25
 def testReadBadFile(self):
     self.createTempFile(streams=0, suffix=self.BAD_DATA)
     with BZ2File(self.filename) as bz2f:
         self.assertRaises(OSError, bz2f.read)
Example #26
 def testSeekBackwardsFromEnd(self):
     self.createTempFile()
     with BZ2File(self.filename) as bz2f:
         bz2f.seek(-150, 2)
         self.assertEqual(bz2f.read(), self.TEXT[len(self.TEXT) - 150:])
Example #27
 def testReadMultiStreamTrailingJunk(self):
     self.createTempFile(streams=5, suffix=self.BAD_DATA)
     with BZ2File(self.filename) as bz2f:
         self.assertEqual(bz2f.read(), self.TEXT * 5)
Example #28
 def testSeekBackwardsFromEndAcrossStreams(self):
     self.createTempFile(streams=2)
     with BZ2File(self.filename) as bz2f:
         bz2f.seek(-1000, 2)
         self.assertEqual(bz2f.read(), (self.TEXT * 2)[-1000:])
Example #29
 def testRead100(self):
     self.createTempFile()
     with BZ2File(self.filename) as bz2f:
         self.assertEqual(bz2f.read(100), self.TEXT[:100])
Example #30
    def _from_file(clazz, filename, header_only=False, strict=False):
        """
        :param filename: name of the file to read from
        :type filename: string
        :param header_only: read the header only and return (num_verts, num_edges)
        :param strict: if True, also fail when fewer edges/vertices than declared are read
        :rtype: Graph
        :return: imported hypergraph
        """
        num_edges = None
        num_verts = None
        is_dimacs = False
        stream = None
        graph = clazz()
        try:
            mtype = mimetypes.guess_type(filename)[1]
            if mtype is None:
                stream = open(filename, 'r')
            elif mtype == 'bzip2':
                stream = BZ2File(filename, 'r')
            elif mtype == 'gz' or mtype == 'gzip':
                stream = gzip.open(filename, 'r')
            elif mtype == 'xz' and xz:
                stream = xz.open(filename, 'r')
            else:
                raise IOError('Unknown input type "%s" for file "%s"' %
                              (mtype, filename))
            nr = 0
            header_seen = False
            for line in stream:
                nr += 1
                line = line.split()
                if line == [] or line[0] in ('x', 'n'):
                    continue
                elif line[0] == 'p':
                    if header_seen:
                        logging.critical('L(%s). Duplicate header. Exiting.' %
                                         nr)
                        exit(3)
                    if len(line) > 4:
                        logging.critical(
                            'L(%s). Too many arguments. Exiting.' % nr)
                        exit(3)
                    is_dimacs = line[1] == 'edge'
                    is_formula = line[1] == 'cnf'
                    num_verts = int(line[2])
                    num_edges = int(line[3])
                    if header_only:
                        return num_verts, num_edges
                    if num_verts == 0:
                        logging.warning("Empty graph.")
                        return graph
                    header_seen = True
                elif line[0] != 'c' and (
                        is_dimacs or (line[0] != 'a' and line[0] != 'e')
                ):  #now also ignores forAll and Exists :P
                    if not header_seen:
                        logging.critical(
                            'L(%s). Lines before header. Exiting.' % nr)
                        exit(3)
                    try:
                        if is_dimacs:
                            graph.add_edge(int(line[1]), int(line[2]))
                        elif is_formula:
                            atoms = list(map(lambda x: abs(int(x)),
                                             line[0:-1]))
                            #print("formula{0}".format(atoms))
                            for i in atoms:
                                for j in atoms:
                                    if i < j:
                                        graph.add_edge(
                                            i, j
                                        )  #abs -> then it also works for qbf
                            num_edges += (len(atoms) *
                                          (len(atoms) - 1)) / 2 - 1
                        else:
                            graph.add_edge(int(line[0]), int(line[1]))
                        assert (0 not in graph.nodes())
                    except ValueError as e:
                        logging.critical('L(%s). Invalid integer. Exiting.' %
                                         nr)
                        logging.critical('Error was: %s' % e)
                        exit(3)
                    except IndexError as e:
                        logging.critical('L(%s). Incomplete edge. Exiting' %
                                         nr)
                        logging.critical('Error was: %s' % e)
                        exit(3)
                clazz._parsed_file_line(graph, line)
        finally:
            if stream:
                stream.close()

        if graph.number_of_edges() > num_edges:
            logging.error("Edges overmuch: read=%s expected=%s" %
                          (graph.number_of_edges(), num_edges))
            exit(3)
        if strict and graph.number_of_edges() < num_edges:
            logging.error("Edges missing: read=%s expected=%s" %
                          (graph.number_of_edges(), num_edges))
            exit(3)
        if graph.number_of_nodes() > num_verts:
            logging.error("Vertices overmuch: read=%s expected=%s" %
                          (graph.number_of_nodes(), num_verts))
            #print(graph.nodes())
            exit(3)
        if strict and graph.number_of_nodes() < num_verts:
            logging.error("Vertices missing: read=%s expected=%s" %
                          (graph.number_of_nodes(), num_verts))
            exit(3)
        return graph
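Hypothetical usage of this reader, assuming it is exposed as a classmethod on a class named Graph (the real entry point may differ). A DIMACS graph file has 'c' comment lines, a 'p edge <verts> <edges>' header, and one 'e <u> <v>' line per edge:

with open("toy.dimacs", "w") as f:
    f.write("c tiny example\n")
    f.write("p edge 3 2\n")
    f.write("e 1 2\n")
    f.write("e 2 3\n")

g = Graph._from_file("toy.dimacs")                               # full parse
verts, edges = Graph._from_file("toy.dimacs", header_only=True)  # header only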
Example #31
 def test_silently_closes(self):
     from bz2 import BZ2File
     self.create_broken_temp_file()
     BZ2File(self.temppath)
Example #32
 def testRead(self):
     self.createTempFile()
     with BZ2File(self.filename) as bz2f:
         self.assertRaises(TypeError, bz2f.read, float())
         self.assertEqual(bz2f.read(), self.TEXT)
Example #33
 def testSeekPostEndMultiStream(self):
     self.createTempFile(streams=5)
     with BZ2File(self.filename) as bz2f:
         bz2f.seek(150000)
         self.assertEqual(bz2f.tell(), len(self.TEXT) * 5)
         self.assertEqual(bz2f.read(), b"")
Example #34
 def test_read_broken_file(self):
     from bz2 import BZ2File
     self.create_broken_temp_file()
     bz2f = BZ2File(self.temppath)
     raises(EOFError, bz2f.read)
     del bz2f  # delete from this frame, which is captured in the traceback
Example #35
 def testSeekPreStartMultiStream(self):
     self.createTempFile(streams=2)
     with BZ2File(self.filename) as bz2f:
         bz2f.seek(-150)
         self.assertEqual(bz2f.tell(), 0)
         self.assertEqual(bz2f.read(), self.TEXT * 2)
Example #36
 def testOpenDel(self):
     self.createTempFile()
     for i in range(10000):
         o = BZ2File(self.filename)
         del o
Example #37
    def create(cls, directory, source):
        '''Creates the index in the directory.

        The "source" generates pairs (key, value) to store in the index.  The
        key must be a string, the value can be any hashable Python object.

        It must return the quantity of pairs indexed.
        '''
        ids_shelf = {}
        key_shelf = {}
        ids_cnter = 0
        tmp_reverse_id = {}
        indexed_counter = 0

        # fill them
        for key, value in source:
            indexed_counter += 1

            # process key
            if not isinstance(key, basestring):
                raise TypeError("The key must be string or unicode")
            if '\n' in key:
                raise ValueError("Key cannot contain newlines")

            # docid -> final info
            if value in tmp_reverse_id:
                docid = tmp_reverse_id[value]
            else:
                docid = ids_cnter
                tmp_reverse_id[value] = docid
                ids_shelf[docid] = value
                ids_cnter += 1

            # keys -> docid
            if key in key_shelf:
                bucket = key_shelf[key]
            else:
                # Let's use array: it's more compact in memory, and given that it
                # should be easy for the caller to remove most repetitions,
                # it should add only very little overhead
                #
                # NOTE: right now, at most one repetition per property is sent
                # by cdpindex.py
                bucket = key_shelf[key] = array.array('l')
            bucket.append(docid)

        # prepare for serialization:
        # turn docsets into lists of delta-encoded integers (they're more compressible)
        print " Delta-encoding index buckets...",
        sys.stdout.flush()

        bucket_bytes = 0
        bucket_entries = 0
        bucket_maxentries = 0
        for key, docset in key_shelf.iteritems():
            key_shelf[key] = delta_encode(docset)
            bucket_entries += len(docset)
            bucket_bytes += len(key_shelf[key])
            bucket_maxentries = max(bucket_maxentries, len(docset))

            assert delta_decode(key_shelf[key]) == set(docset), \
                ("Delta-encoding error", docset)

        print "done"

        # print statistics

        print "  Index contains:"
        print "      ", len(key_shelf), "terms"
        print "      ", bucket_entries, "entries"
        print "      ", len(ids_shelf), "documents"
        print
        print "      ", len(key_shelf) // max(1,len(ids_shelf)), "terms on average per document"
        print
        print "  Bucket bytes", bucket_bytes
        print "  Bucket entries", bucket_entries
        print "  Bucket maximum size", bucket_maxentries
        print "  Avg bytes per entry", (float(bucket_bytes) / max(1,bucket_entries))

        # save key
        # Format:
        #   ( matrix, docsets )
        #   Putting all keys together makes them more compressible.
        #   Sorting them (skeys) further helps.
        #   Joining them in a single string avoids pickling overhead
        #       (50% average with so many small strings)
        #   And keeping them joined in memory (FrozenStringList) helps
        #   avoid referencing overhead.

        sitems = sorted([ (k.encode("utf8"),v)
                          for k,v in key_shelf.iteritems() ])
        assert all('\n' not in k for k,v in sitems), \
            "Terms cannot contain newlines"

        # free the big dict... eats up a lot
        del key_shelf

        print " Computing similitude matrix...",
        sys.stdout.flush()


        def progress_cb(p):
            print >> sys.stderr, "\r Computing similitude matrix...  %d%%\t" % int(p),
            sys.stderr.flush()

        matrix = TermSimilitudeMatrix(map(operator.itemgetter(0), sitems),
                progress_callback = progress_cb)
        docsets = FrozenStringList(map(operator.itemgetter(1), sitems))
        del sitems

        print "done"
        print " Saving:"

        keyfilename = os.path.join(directory, "compindex.key.bz2")
        fh = CompressedFile(keyfilename, "wb")
        cPickle.dump( (matrix.pickle(), docsets.pickle()), fh, 2)
        print "  Uncompressed keystore bytes", fh.tell()
        fh.close()

        fh = open(keyfilename, "rb")
        fh.seek(0,2)
        print "  Final keystore bytes", fh.tell()
        print
        fh.close()

        # split ids_shelf in N dicts of about ~16M pickled data each,
        # this helps get better compression ratios
        NB = sum( len(cPickle.dumps(item,2)) for item in ids_shelf.iteritems() )
        print "  Total docstore bytes", NB

        N = int((NB + DOCSTORE_BUCKET_SIZE/2) // DOCSTORE_BUCKET_SIZE)
        if not N:
            N = 1
        print "  Docstore buckets", N, "(", NB//N, " bytes per bucket)"
        all_idshelves = [{} for i in xrange(N)]
        for k,v in ids_shelf.iteritems():
            cual = k % N
            all_idshelves[cual][k] = v

        # save dict where corresponds
        docucomp = 0
        doccomp = 0
        for cual, shelf in enumerate(all_idshelves):
            fname = "compindex-%02d.ids.bz2" % cual
            idsfilename = os.path.join(directory, fname)
            fh = CompressedFile(idsfilename, "wb")
            cPickle.dump(shelf, fh, 2)
            docucomp += fh.tell()
            fh.close()

            fh = open(idsfilename, "rb")
            fh.seek(0,2)
            doccomp += fh.tell()
            fh.close()

        print "  Docstore uncompressed bytes", docucomp
        print "  Docstore compressed bytes", doccomp
        print

        return indexed_counter
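The delta_encode/delta_decode helpers used above are project-specific and ultimately serialize each docset to a compact byte string; the sketch below only illustrates the underlying idea (store sorted gaps instead of absolute docids, which compresses much better than the raw values):

def delta_encode(docset):
    '''Sketch: absolute docids -> sorted list of gaps.'''
    prev = 0
    deltas = []
    for docid in sorted(docset):
        deltas.append(docid - prev)
        prev = docid
    return deltas

def delta_decode(deltas):
    '''Sketch: gaps -> set of absolute docids.'''
    total = 0
    docset = set()
    for d in deltas:
        total += d
        docset.add(total)
    return docset

assert delta_decode(delta_encode([10, 3, 7])) == set([3, 7, 10])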
Example #38
                        'datavalue'] and type(key_item['mainsnak']['datavalue']
                                              ['value']) is str else ''
                datavalue_type = key_item['mainsnak']['datavalue'][
                    'type'] if 'type' in key_item['mainsnak'][
                        'datavalue'] else ''

    sql = "insert into claim VALUES(%s,%s,%s,%s)"
    params = (qid, pid, datavalue_value, datavalue_type)
    cur.execute(sql, params)
    conn.commit()
    # if i%10000==0:
    print("line %s inserted into claim table ok" % i)


bz2_file_path = r'./latest-all.json.bz2'
bz2_file = BZ2File(bz2_file_path)


def main():
    i = 1
    count = 1
    for line in bz2_file:
        line_str = line.decode()
        if count < 2:
            print("Skipping line %s" % count)
            count += 1
            continue
        if len(line_str) > 2:
            json_object = json.loads(line_str[:-2])
            insert_entity(i, json_object)
            # insert_property(i, json_object)
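The snippet is truncated, but the pattern it relies on is worth spelling out: the Wikidata dump latest-all.json.bz2 is distributed as one huge JSON array with one entity per line, so each line (minus its trailing comma) can be decompressed and parsed independently without ever loading the whole file. A minimal sketch of that streaming loop:

import json
from bz2 import BZ2File

with BZ2File('./latest-all.json.bz2') as dump:
    for raw in dump:
        line = raw.decode().strip()
        if line in ('[', ']'):          # the array brackets sit on their own lines
            continue
        entity = json.loads(line.rstrip(','))
        # ... insert the entity into the database here ...
        break                           # remove to process the whole dump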