def _get_ids_shelve(self, cual):
    '''Return the ids index.'''
    fname = os.path.join(self._directory, "compindex-%02d.ids.bz2" % cual)
    fh = CompressedFile(fname, "rb")
    idx = cPickle.load(fh)
    fh.close()
    return idx
def __init__(self, directory):
    self._directory = directory

    # open the key shelve
    # Format:
    #   ( matrix, docsets )
    #   matrix = TermSimilitudeMatrix
    #   docsets = FrozenStringList
    keyfilename = os.path.join(directory, "compindex.key.bz2")
    fh = CompressedFile(keyfilename, "rb")
    matrix, docsets = cPickle.load(fh)
    fh.close()

    matrix = TermSimilitudeMatrix.unpickle(matrix)
    docsets = FrozenStringList.unpickle(docsets)
    self.matrix, self.docsets = matrix, docsets

    # see how many id files we have
    idsfilename = os.path.join(directory, "compindex-*.ids.bz2")
    filenames = []
    for fn in os.listdir(directory):
        if fn.startswith("compindex-") and fn.endswith(".ids.bz2"):
            filenames.append(fn)
    self.idfiles_count = len(filenames)
def __init__(self, directory):
    self._directory = directory

    # open the key shelve
    keyfilename = os.path.join(directory, "easyindex.key.bz2")
    fh = CompressedFile(keyfilename, "rb")
    self.key_shelf = cPickle.load(fh)
    fh.close()

    # see how many id files we have
    idsfilename = os.path.join(directory, "easyindex-*.ids.bz2")
    filenames = []
    for fn in os.listdir(directory):
        if fn.startswith("easyindex-") and fn.endswith(".ids.bz2"):
            filenames.append(fn)
    self.idfiles_count = len(filenames)
def create(cls, directory, source):
    '''Creates the index in the directory.

    The "source" generates pairs (key, value) to store in the index.  The
    key must be a string, the value can be any hashable Python object.

    It must return the quantity of pairs indexed.
    '''
    ids_shelf = {}
    key_shelf = {}
    ids_cnter = 0
    tmp_reverse_id = {}
    indexed_counter = 0

    # fill them
    for key, value in source:
        indexed_counter += 1

        # process key
        if not isinstance(key, basestring):
            raise TypeError("The key must be string or unicode")

        # docid -> final info
        if value in tmp_reverse_id:
            docid = tmp_reverse_id[value]
        else:
            docid = ids_cnter
            tmp_reverse_id[value] = docid
            ids_cnter += 1
        ids_shelf[docid] = value

        # keys -> docid
        key_shelf.setdefault(key, set()).add(docid)

    # save the keys
    keyfilename = os.path.join(directory, "easyindex.key.bz2")
    fh = CompressedFile(keyfilename, "wb")
    cPickle.dump(key_shelf, fh, 2)
    fh.close()

    # split ids_shelf in N dicts of about ~5k entries
    N = int(round(len(ids_shelf) / 5000.0))
    if not N:
        N = 1
    all_idshelves = [{} for i in range(N)]
    for k, v in ids_shelf.iteritems():
        cual = utiles.coherent_hash(k) % N
        all_idshelves[cual][k] = v

    # save each dict to its corresponding file
    for cual, shelf in enumerate(all_idshelves):
        fname = "easyindex-%03d.ids.bz2" % cual
        idsfilename = os.path.join(directory, fname)
        fh = CompressedFile(idsfilename, "wb")
        cPickle.dump(shelf, fh, 2)
        fh.close()

    return indexed_counter
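# A hypothetical lookup sketch (not from the original source): it assumes the
# easy index has a _get_ids_shelve() helper analogous to the compressed-index
# one above, i.e. one that unpickles "easyindex-%03d.ids.bz2" for a given
# shard.  key_shelf maps each key to a set of docids; each docid is resolved
# in the shard chosen with the same coherent_hash(docid) % N rule used at
# creation time.
def values(self, key):
    '''Yield the stored values for a key (illustrative sketch only).'''
    for docid in self.key_shelf.get(key, set()):
        shard = utiles.coherent_hash(docid) % self.idfiles_count
        idx = self._get_ids_shelve(shard)
        yield idx[docid]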
def setup_function(self):

    self.data.representation = self.calc_mode

    #
    # Init arrays
    #

    # Prepare some array shapes
    gradient_params_shape = (len(self.gradient_param_names),)

    if self.data.is_map:
        # speed up calculation by adding links
        # as nominal flux doesn't depend on the (outgoing) flavour
        self.data.link_containers('nu', [
            'nue_cc', 'numu_cc', 'nutau_cc',
            'nue_nc', 'numu_nc', 'nutau_nc'
        ])
        self.data.link_containers('nubar', [
            'nuebar_cc', 'numubar_cc', 'nutaubar_cc',
            'nuebar_nc', 'numubar_nc', 'nutaubar_nc'
        ])

    # Loop over containers
    for container in self.data:

        # Define shapes for containers

        # TODO maybe include toggles for nutau (only needed if prompt
        # considered) and for nu+nubar (only needed if nu->nubar
        # oscillations included) for better speed/memory performance

        # [ N events, 2 flavors in flux, nu vs nubar ]
        # SDB - reduced flavours to 2 (nue, numu) since nutau flux not
        # stored in MCEq splines
        flux_container_shape = (container.size, 2)
        gradients_shape = tuple(
            list(flux_container_shape) + list(gradient_params_shape))

        container["nu_flux"] = np.full(
            flux_container_shape, np.NaN, dtype=FTYPE)
        container["gradients"] = np.full(
            gradients_shape, np.NaN, dtype=FTYPE)

    # Also create an array container to hold the gradient parameter values
    # Only want this once, e.g. not once per container
    self.gradient_params = np.empty(gradient_params_shape, dtype=FTYPE)

    #
    # Load MCEq splines
    #

    # Have splines for each Barr parameter, plus +/- versions of each
    # Barr parameter corresponding to mesons/antimesons.

    # For a given Barr parameter, the underlying dictionary has the following
    # keywords: "dnumu", "dnumubar", "dnue", "dnuebar"

    # Units are changed to m^-2 in creates_splines.., rather than cm^2 which
    # is the unit of calculation in MCEq!!!!

    # Note that doing this all on CPUs, since the splines reside on the CPUs
    # The actual `compute_function` computation can be done on GPUs though

    # Load the MCEq splines
    spline_file = find_resource(self.table_file)
    logging.info("Loading MCEq spline tables from : %s", spline_file)
    # Encoding is to support pickle files created with python v2
    self.spline_tables_dict = pickle.load(
        BZ2File(spline_file), encoding="latin1")

    # Ensure that the user is not loading an incompatible spline
    for bp in self.barr_param_names:
        bp_p = bp + '+'  # meson
        bp_m = bp + '-'  # antimeson
        assert bp_p in self.spline_tables_dict.keys(), (
            "Gradient parameter '%s' missing from table" % bp_p)
        assert bp_m in self.spline_tables_dict.keys(), (
            "Gradient parameter '%s' missing from table" % bp_m)

    # Loop over containers
    for container in self.data:

        # Grab containers here once to save time
        # TODO make spline generation script store splines directly in
        # terms of energy, not ln(energy)
        true_log_energy = np.log(container["true_energy"])
        true_abs_coszen = np.abs(container["true_coszen"])
        gradients = container["gradients"]
        nubar = container["nubar"]

        #
        # Flux gradients
        #

        # Evaluate splines to get the flux gradients w.r.t. the Barr
        # parameter values.
        # Need to correctly map nu/nubar and flavor to the output arrays.

        # Loop over parameters
        for (
            gradient_param_name,
            gradient_param_idx,
        ) in self.gradient_param_indices.items():

            # nue(bar)
            self._eval_spline(
                true_log_energy=true_log_energy,
                true_abs_coszen=true_abs_coszen,
                spline=self.spline_tables_dict[gradient_param_name][
                    "dnue" if nubar > 0 else "dnuebar"],
                out=gradients[:, 0, gradient_param_idx],
            )

            # numu(bar)
            self._eval_spline(
                true_log_energy=true_log_energy,
                true_abs_coszen=true_abs_coszen,
                spline=self.spline_tables_dict[gradient_param_name][
                    "dnumu" if nubar > 0 else "dnumubar"],
                out=gradients[:, 1, gradient_param_idx],
            )

            # nutau(bar)
            # TODO include nutau flux in splines
            # SDB - there is no nutau flux in splines
            ## gradients[:, 2, gradient_param_idx].fill(0.0)

        # Tell the smart arrays we've changed the flux gradient values on the host
        container.mark_changed("gradients")

    # don't forget to un-link everything again
    self.data.unlink_containers()
def testReadLineMultiStream(self):
    self.createTempFile(streams=5)
    with BZ2File(self.filename) as bz2f:
        self.assertRaises(TypeError, bz2f.readline, None)
        for line in self.TEXT_LINES * 5:
            self.assertEqual(bz2f.readline(), line)
class Comprimido(Bloque):
    """A block of articles.

    This is a block in which the whole file, header and data alike,
    goes to disk compressed with bz2.
    """

    def __init__(self, fname, verbose=False, manager=None):
        if os.path.exists(fname):
            self.fh = CompressedFile(fname, "rb")
            self.header_size = struct.unpack("<l", self.fh.read(4))[0]
            header_bytes = self.fh.read(self.header_size)
            self.header = pickle.loads(header_bytes)
        else:
            # no need to define self.fh nor self.header_size, because they
            # will never be used: the item will never be in the header
            self.header = {}
        self.verbose = verbose
        self.manager = manager

    @classmethod
    def crear(cls, redirects, bloqNum, top_filenames, verbose=False):
        '''Generates the compressed block.'''
        if verbose:
            print "Processing block", bloqNum

        header = {}

        # Fill the header with the real files, using the page as key
        # and the position/size as value
        seek = 0
        for dir3, filename in top_filenames:
            fullName = path.join(config.DIR_PAGSLISTAS, dir3, filename)
            size = path.getsize(fullName)
            header[filename] = (seek, size)
            seek += size

        # Also put the redirects in the header, in this case pointing to
        # the name of the page being redirected to
        for orig, dest in redirects:
            header[orig] = dest

        headerBytes = pickle.dumps(header)
        if verbose:
            print "  files: %d  total seek: %d  header length: %d" % (
                len(top_filenames), seek, len(headerBytes))

        # open the file to compress
        nomfile = path.join(config.DIR_BLOQUES, "%08x.cdp" % bloqNum)
        if verbose:
            print "  writing to", nomfile
        f = CompressedFile(nomfile, "wb")

        # write the header length, and then the header itself
        f.write(struct.pack("<l", len(headerBytes)))
        f.write(headerBytes)

        # write each of the articles
        for dir3, filename in top_filenames:
            fullName = path.join(config.DIR_PAGSLISTAS, dir3, filename)
            f.write(open(fullName, "rb").read())
        f.close()
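# Hypothetical retrieval sketch (not part of the original class), assuming
# CompressedFile is seekable like BZ2File.  crear() lays the file out as a
# 4-byte little-endian header length, the pickled header, then the articles
# concatenated back to back, so an article's offset is relative to the end
# of the header.
def get_item(comprimido, filename):
    info = comprimido.header.get(filename)
    if info is None:
        return None
    if isinstance(info, tuple):          # real article: (seek, size)
        seek, size = info
        comprimido.fh.seek(4 + comprimido.header_size + seek)
        return comprimido.fh.read(size)
    return info                          # redirect: name of the target page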
def testRead0(self):
    self.createTempFile()
    with BZ2File(self.filename) as bz2f:
        self.assertRaises(TypeError, bz2f.read, None)
        self.assertEqual(bz2f.read(0), b"")

def testReadMultiStream(self):
    self.createTempFile(streams=5)
    with BZ2File(self.filename) as bz2f:
        self.assertRaises(TypeError, bz2f.read, None)
        self.assertEqual(bz2f.read(), self.TEXT * 5)
#!/usr/bin/env python
# -*- coding: utf-8 -*-

'''
This program demonstrates reading a compressed pickle.
'''

from bz2 import BZ2File
from pickle import load
from pprint import pprint

bzip = BZ2File('temporeal_pickle.bz2', 'r')
livros = load(bzip)
bzip.close()

print 'temporeal_pickle.bz2 read'
print len(livros), 'books in the list'

print '_' * 70
print 'The first book:'
pprint(livros[0])

meio = len(livros) / 2
print '_' * 70
print 'The middle book (#%s):' % meio
pprint(livros[meio])

print '_' * 70
print 'The last book:'
pprint(livros[-1])
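# A minimal companion sketch showing how such a compressed pickle could have
# been produced; the book list here is a made-up placeholder, not the real
# data.
from bz2 import BZ2File
from pickle import dump, HIGHEST_PROTOCOL

livros = [{'titulo': 'Example Book', 'autor': 'Someone'}]

bzip = BZ2File('temporeal_pickle.bz2', 'w')
dump(livros, bzip, HIGHEST_PROTOCOL)
bzip.close()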
def testSeekForwardBytesIO(self):
    with BytesIO(self.DATA) as bio:
        with BZ2File(bio) as bz2f:
            self.assertRaises(TypeError, bz2f.seek)
            bz2f.seek(150)
            self.assertEqual(bz2f.read(), self.TEXT[150:])

def testSeekForward(self):
    self.createTempFile()
    with BZ2File(self.filename) as bz2f:
        self.assertRaises(TypeError, bz2f.seek)
        bz2f.seek(150)
        self.assertEqual(bz2f.read(), self.TEXT[150:])

def testWriteNonDefaultCompressLevel(self):
    expected = bz2.compress(self.TEXT, compresslevel=5)
    with BZ2File(self.filename, "w", compresslevel=5) as bz2f:
        bz2f.write(self.TEXT)
    with open(self.filename, "rb") as f:
        self.assertEqual(f.read(), expected)

def testWrite(self):
    with BZ2File(self.filename, "w") as bz2f:
        self.assertRaises(TypeError, bz2f.write)
        bz2f.write(self.TEXT)
    with open(self.filename, 'rb') as f:
        self.assertEqual(ext_decompress(f.read()), self.TEXT)

def testIteratorMultiStream(self):
    self.createTempFile(streams=5)
    with BZ2File(self.filename) as bz2f:
        self.assertEqual(list(iter(bz2f)), self.TEXT_LINES * 5)

def testIterator(self):
    self.createTempFile()
    with BZ2File(self.filename) as bz2f:
        self.assertEqual(list(iter(bz2f)), self.TEXT_LINES)

def testReadLines(self):
    self.createTempFile()
    with BZ2File(self.filename) as bz2f:
        self.assertRaises(TypeError, bz2f.readlines, None)
        self.assertEqual(bz2f.readlines(), self.TEXT_LINES)

def testOpenPathLikeFilename(self):
    filename = pathlib.Path(self.filename)
    with BZ2File(filename, "wb") as f:
        f.write(self.DATA)
    with BZ2File(filename, "rb") as f:
        self.assertEqual(f.read(), self.DATA)

def testReadBytesIO(self):
    with BytesIO(self.DATA) as bio:
        with BZ2File(bio) as bz2f:
            self.assertRaises(TypeError, bz2f.read, float())
            self.assertEqual(bz2f.read(), self.TEXT)
        self.assertFalse(bio.closed)

def testSeekForwardAcrossStreams(self):
    self.createTempFile(streams=2)
    with BZ2File(self.filename) as bz2f:
        self.assertRaises(TypeError, bz2f.seek)
        bz2f.seek(len(self.TEXT) + 150)
        self.assertEqual(bz2f.read(), self.TEXT[150:])

def testSeekBackwardsBytesIO(self):
    with BytesIO(self.DATA) as bio:
        with BZ2File(bio) as bz2f:
            bz2f.read(500)
            bz2f.seek(-150, 1)
            self.assertEqual(bz2f.read(), self.TEXT[500 - 150:])

def testSeekBackwards(self):
    self.createTempFile()
    with BZ2File(self.filename) as bz2f:
        bz2f.read(500)
        bz2f.seek(-150, 1)
        self.assertEqual(bz2f.read(), self.TEXT[500 - 150:])

def testReadBadFile(self):
    self.createTempFile(streams=0, suffix=self.BAD_DATA)
    with BZ2File(self.filename) as bz2f:
        self.assertRaises(OSError, bz2f.read)

def testSeekBackwardsFromEnd(self):
    self.createTempFile()
    with BZ2File(self.filename) as bz2f:
        bz2f.seek(-150, 2)
        self.assertEqual(bz2f.read(), self.TEXT[len(self.TEXT) - 150:])

def testReadMultiStreamTrailingJunk(self):
    self.createTempFile(streams=5, suffix=self.BAD_DATA)
    with BZ2File(self.filename) as bz2f:
        self.assertEqual(bz2f.read(), self.TEXT * 5)

def testSeekBackwardsFromEndAcrossStreams(self):
    self.createTempFile(streams=2)
    with BZ2File(self.filename) as bz2f:
        bz2f.seek(-1000, 2)
        self.assertEqual(bz2f.read(), (self.TEXT * 2)[-1000:])

def testRead100(self):
    self.createTempFile()
    with BZ2File(self.filename) as bz2f:
        self.assertEqual(bz2f.read(100), self.TEXT[:100])
def _from_file(clazz, filename, header_only=False, strict=False):
    """
    :param filename: name of the file to read from
    :type filename: string
    :param header_only: read header only
    :rtype: Graph
    :return: imported hypergraph
    """
    num_edges = None
    num_verts = None
    is_dimacs = False
    stream = None
    graph = clazz()
    try:
        mtype = mimetypes.guess_type(filename)[1]
        if mtype is None:
            stream = open(filename, 'r')
        elif mtype == 'bzip2':
            stream = BZ2File(filename, 'r')
        elif mtype == 'gz' or mtype == 'gzip':
            stream = gzip.open(filename, 'r')
        elif mtype == 'xz' and xz:
            stream = xz.open(filename, 'r')
        else:
            raise IOError('Unknown input type "%s" for file "%s"' %
                          (mtype, filename))

        nr = 0
        header_seen = False
        for line in stream:
            nr += 1
            line = line.split()
            if line == [] or line[0] in ('x', 'n'):
                continue
            elif line[0] == 'p':
                if header_seen:
                    logging.critical('L(%s). Duplicate header. Exiting.' % nr)
                    exit(3)
                if len(line) > 4:
                    logging.critical('L(%s). Too many arguments. Exiting.' % nr)
                    exit(3)
                is_dimacs = line[1] == 'edge'
                is_formula = line[1] == 'cnf'
                num_verts = int(line[2])
                num_edges = int(line[3])
                if header_only:
                    return num_verts, num_edges
                if num_verts == 0:
                    logging.warning("Empty graph.")
                    return graph
                header_seen = True
            elif line[0] != 'c' and (
                    is_dimacs or (line[0] != 'a' and line[0] != 'e')):
                # now also ignores forAll and Exists :P
                if not header_seen:
                    logging.critical('L(%s). Lines before header. Exiting.' % nr)
                    exit(3)
                try:
                    if is_dimacs:
                        graph.add_edge(int(line[1]), int(line[2]))
                    elif is_formula:
                        atoms = list(map(lambda x: abs(int(x)), line[0:-1]))
                        # print("formula{0}".format(atoms))
                        for i in atoms:
                            for j in atoms:
                                if i < j:
                                    # abs -> then it also works for qbf
                                    graph.add_edge(i, j)
                        num_edges += (len(atoms) * (len(atoms) - 1)) / 2 - 1
                    else:
                        graph.add_edge(int(line[0]), int(line[1]))
                    assert (0 not in graph.nodes())
                except ValueError as e:
                    logging.critical('L(%s). Invalid integer. Exiting.' % nr)
                    logging.critical('Error was: %s' % e)
                    exit(3)
                except IndexError as e:
                    logging.critical('L(%s). Incomplete edge. Exiting' % nr)
                    logging.critical('Error was: %s' % e)
                    exit(3)
                clazz._parsed_file_line(graph, line)
    finally:
        if stream:
            stream.close()

    if graph.number_of_edges() > num_edges:
        logging.error("Edges overmuch: read=%s expected=%s" %
                      (graph.number_of_edges(), num_edges))
        exit(3)
    if strict and graph.number_of_edges() < num_edges:
        logging.error("Edges missing: read=%s expected=%s" %
                      (graph.number_of_edges(), num_edges))
        exit(3)
    if graph.number_of_nodes() > num_verts:
        logging.error("Vertices overmuch: read=%s expected=%s" %
                      (graph.number_of_nodes(), num_verts))
        # print(graph.nodes())
        exit(3)
    if strict and graph.number_of_nodes() < num_verts:
        logging.error("Vertices missing: read=%s expected=%s" %
                      (graph.number_of_nodes(), num_verts))
        exit(3)
    return graph
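# Quick illustration of the mimetypes-based dispatch above: guess_type()
# returns (type, encoding), and it is the encoding slot ('bzip2', 'gzip',
# 'xz' or None) that selects the decompressor.  File names are examples only.
import mimetypes

print(mimetypes.guess_type("instance.dimacs.bz2")[1])  # 'bzip2'
print(mimetypes.guess_type("instance.dimacs.gz")[1])   # 'gzip'
print(mimetypes.guess_type("instance.dimacs")[1])      # None -> plain open()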
def test_silently_closes(self):
    from bz2 import BZ2File
    self.create_broken_temp_file()
    BZ2File(self.temppath)

def testRead(self):
    self.createTempFile()
    with BZ2File(self.filename) as bz2f:
        self.assertRaises(TypeError, bz2f.read, float())
        self.assertEqual(bz2f.read(), self.TEXT)

def testSeekPostEndMultiStream(self):
    self.createTempFile(streams=5)
    with BZ2File(self.filename) as bz2f:
        bz2f.seek(150000)
        self.assertEqual(bz2f.tell(), len(self.TEXT) * 5)
        self.assertEqual(bz2f.read(), b"")

def test_read_broken_file(self):
    from bz2 import BZ2File
    self.create_broken_temp_file()
    bz2f = BZ2File(self.temppath)
    raises(EOFError, bz2f.read)
    del bz2f  # delete from this frame, which is captured in the traceback

def testSeekPreStartMultiStream(self):
    self.createTempFile(streams=2)
    with BZ2File(self.filename) as bz2f:
        bz2f.seek(-150)
        self.assertEqual(bz2f.tell(), 0)
        self.assertEqual(bz2f.read(), self.TEXT * 2)

def testOpenDel(self):
    self.createTempFile()
    for i in range(10000):
        o = BZ2File(self.filename)
        del o
def create(cls, directory, source):
    '''Creates the index in the directory.

    The "source" generates pairs (key, value) to store in the index.  The
    key must be a string, the value can be any hashable Python object.

    It must return the quantity of pairs indexed.
    '''
    ids_shelf = {}
    key_shelf = {}
    ids_cnter = 0
    tmp_reverse_id = {}
    indexed_counter = 0

    # fill them
    for key, value in source:
        indexed_counter += 1

        # process key
        if not isinstance(key, basestring):
            raise TypeError("The key must be string or unicode")
        if '\n' in key:
            raise ValueError("Key cannot contain newlines")

        # docid -> final info
        if value in tmp_reverse_id:
            docid = tmp_reverse_id[value]
        else:
            docid = ids_cnter
            tmp_reverse_id[value] = docid
            ids_shelf[docid] = value
            ids_cnter += 1

        # keys -> docid
        if key in key_shelf:
            bucket = key_shelf[key]
        else:
            # Let's use array: it's more compact in memory, and given that it
            # should be easy for the caller to remove most repetitions, it
            # should only add very little overhead
            #
            # NOTE: right now, at most one repetition per property is sent
            # by cdpindex.py
            bucket = key_shelf[key] = array.array('l')
        bucket.append(docid)

    # prepare for serialization:
    # turn docsets into lists of delta-encoded integers (they're more compressible)
    print " Delta-encoding index buckets...",
    sys.stdout.flush()
    bucket_bytes = 0
    bucket_entries = 0
    bucket_maxentries = 0
    for key, docset in key_shelf.iteritems():
        key_shelf[key] = delta_encode(docset)
        bucket_entries += len(docset)
        bucket_bytes += len(key_shelf[key])
        bucket_maxentries = max(bucket_maxentries, len(docset))
        assert delta_decode(key_shelf[key]) == set(docset), \
            ("Delta-encoding error", docset)
    print "done"

    # print statistics
    print " Index contains:"
    print "    ", len(key_shelf), "terms"
    print "    ", bucket_entries, "entries"
    print "    ", len(ids_shelf), "documents"
    print
    print "    ", len(key_shelf) // max(1, len(ids_shelf)), "terms on avg per document"
    print
    print " Bucket bytes", bucket_bytes
    print " Bucket entries", bucket_entries
    print " Bucket maximum size", bucket_maxentries
    print " Avg bytes per entry", (float(bucket_bytes) / max(1, bucket_entries))

    # save the keys
    # Format:
    #   ( matrix, docsets )
    # Putting all keys together makes them more compressible.
    # Sorting them (skeys) further helps.
    # Joining them in a single string avoids pickling overhead
    # (50% average with so many small strings).
    # And keeping them joined in memory (FrozenStringList) helps
    # avoid referencing overhead.
    sitems = sorted([(k.encode("utf8"), v) for k, v in key_shelf.iteritems()])
    assert all('\n' not in k for k, v in sitems), \
        "Terms cannot contain newlines"

    # free the big dict... eats up a lot
    del key_shelf

    print " Computing similitude matrix...",
    sys.stdout.flush()

    def progress_cb(p):
        print >> sys.stderr, "\r Computing similitude matrix... %d%%\t" % int(p),
        sys.stderr.flush()

    matrix = TermSimilitudeMatrix(map(operator.itemgetter(0), sitems),
                                  progress_callback=progress_cb)
    docsets = FrozenStringList(map(operator.itemgetter(1), sitems))
    del sitems
    print "done"

    print " Saving:"
    keyfilename = os.path.join(directory, "compindex.key.bz2")
    fh = CompressedFile(keyfilename, "wb")
    cPickle.dump((matrix.pickle(), docsets.pickle()), fh, 2)
    print "   Uncompressed keystore bytes", fh.tell()
    fh.close()

    fh = open(keyfilename, "rb")
    fh.seek(0, 2)
    print "   Final keystore bytes", fh.tell()
    print
    fh.close()

    # split ids_shelf in N dicts of about ~16M pickled data each,
    # this helps get better compression ratios
    NB = sum(len(cPickle.dumps(item, 2)) for item in ids_shelf.iteritems())
    print " Total docstore bytes", NB

    N = int((NB + DOCSTORE_BUCKET_SIZE / 2) // DOCSTORE_BUCKET_SIZE)
    if not N:
        N = 1
    print " Docstore buckets", N, "(", NB // N, " bytes per bucket)"
    all_idshelves = [{} for i in xrange(N)]
    for k, v in ids_shelf.iteritems():
        cual = k % N
        all_idshelves[cual][k] = v

    # save each dict to its corresponding file
    docucomp = 0
    doccomp = 0
    for cual, shelf in enumerate(all_idshelves):
        fname = "compindex-%02d.ids.bz2" % cual
        idsfilename = os.path.join(directory, fname)
        fh = CompressedFile(idsfilename, "wb")
        cPickle.dump(shelf, fh, 2)
        docucomp += fh.tell()
        fh.close()

        fh = open(idsfilename, "rb")
        fh.seek(0, 2)
        doccomp += fh.tell()
        fh.close()

    print " Docstore uncompressed bytes", docucomp
    print " Docstore compressed bytes", doccomp
    print

    return indexed_counter
                'datavalue'] and type(key_item['mainsnak']['datavalue']['value']) is str else ''
            datavalue_type = key_item['mainsnak']['datavalue']['type'] \
                if 'type' in key_item['mainsnak']['datavalue'] else ''
            sql = "insert into claim VALUES(%s,%s,%s,%s)"
            params = (qid, pid, datavalue_value, datavalue_type)
            cur.execute(sql, params)
    conn.commit()
    # if i%10000==0:
    print("the %s th line insert claim table ok" % i)


bz2_file_path = r'./latest-all.json.bz2'
bz2_file = BZ2File(bz2_file_path)


def main():
    i = 1
    count = 1
    for line in bz2_file:
        line_str = line.decode()
        if count < 2:
            print("Skipping line %s" % count)
            count += 1
            continue
        if len(line_str) > 2:
            json_object = json.loads(line_str[:-2])
            insert_entity(i, json_object)
            # insert_property(i, json_object)