def append_entry(cls, config_reader, filepath, oldbinsha, newbinsha, message):
    """Append a new log entry to the revlog at filepath.

    :param config_reader: configuration reader of the repository - used to obtain
        user information. May be None
    :param filepath: full path to the log file
    :param oldbinsha: binary sha of the previous commit
    :param newbinsha: binary sha of the current commit
    :param message: message describing the change to the reference
    :return: RefLogEntry object which was appended to the log
    :note: As we are append-only, concurrent access is not a problem as we
        do not interfere with readers."""
    if len(oldbinsha) != 20 or len(newbinsha) != 20:
        raise ValueError("Shas need to be given in binary format")
    # END handle sha type
    assure_directory_exists(filepath, is_file=True)
    entry = RefLogEntry((bin_to_hex(oldbinsha), bin_to_hex(newbinsha),
                         Actor.committer(config_reader),
                         (int(time.time()), time.altzone), message))

    lf = LockFile(filepath)
    lf._obtain_lock_or_raise()
    fd = open(filepath, 'a')
    try:
        fd.write(repr(entry))
    finally:
        fd.close()
        lf._release_lock()
    # END handle write operation

    return entry

def test_basics(self):
    gdb = self.rorepo

    # partial to complete - works with everything
    hexsha = bin_to_hex(gdb.partial_to_complete_sha_hex("0.1.6"))
    assert len(hexsha) == 40

    assert bin_to_hex(gdb.partial_to_complete_sha_hex(hexsha[:20])) == hexsha

    # fails with BadObject
    for invalid_rev in ("0000", "bad/ref", "super bad"):
        self.failUnlessRaises(BadObject, gdb.partial_to_complete_sha_hex, invalid_rev)

def test_base(self):
    gdb = GitCmdObjectDB(osp.join(self.rorepo.git_dir, 'objects'), self.rorepo.git)

    # partial to complete - works with everything
    hexsha = bin_to_hex(gdb.partial_to_complete_sha_hex("0.1.6"))
    assert len(hexsha) == 40

    assert bin_to_hex(gdb.partial_to_complete_sha_hex(hexsha[:20])) == hexsha

    # fails with BadObject
    for invalid_rev in ("0000", "bad/ref", "super bad"):
        self.assertRaises(BadObject, gdb.partial_to_complete_sha_hex, invalid_rev)

def append_entry(cls, config_reader: Union[Actor, 'GitConfigParser', 'SectionConstraint', None],
                 filepath: PathLike, oldbinsha: bytes, newbinsha: bytes, message: str,
                 write: bool = True) -> 'RefLogEntry':
    """Append a new log entry to the revlog at filepath.

    :param config_reader: configuration reader of the repository - used to obtain
        user information. May also be an Actor instance identifying the committer
        directly or None.
    :param filepath: full path to the log file
    :param oldbinsha: binary sha of the previous commit
    :param newbinsha: binary sha of the current commit
    :param message: message describing the change to the reference
    :param write: If True, the changes will be written right away.
        Otherwise the change will not be written
    :return: RefLogEntry object which was appended to the log
    :note: As we are append-only, concurrent access is not a problem as we
        do not interfere with readers."""
    if len(oldbinsha) != 20 or len(newbinsha) != 20:
        raise ValueError("Shas need to be given in binary format")
    # END handle sha type
    assure_directory_exists(filepath, is_file=True)
    first_line = message.split('\n')[0]
    if isinstance(config_reader, Actor):
        committer = config_reader  # mypy thinks this is Actor | Gitconfigparser, but why?
    else:
        committer = Actor.committer(config_reader)
    entry = RefLogEntry((
        bin_to_hex(oldbinsha).decode('ascii'),
        bin_to_hex(newbinsha).decode('ascii'),
        committer, (int(_time.time()), _time.altzone), first_line
    ))

    if write:
        lf = LockFile(filepath)
        lf._obtain_lock_or_raise()
        fd = open(filepath, 'ab')
        try:
            fd.write(entry.format().encode(defenc))
        finally:
            fd.close()
            lf._release_lock()
    # END handle write operation
    return entry

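# Usage sketch (illustrative, not from the source above): append_entry is a
# classmethod on RefLog, so with an existing `repo` (a git.Repo, assumed here)
# and two 20-byte binary shas it could be driven roughly like this. The root
# commit has no parent, so `parents[0]` is only for illustration.
#
# from git.refs.log import RefLog
#
# old = repo.head.commit.parents[0].binsha
# new = repo.head.commit.binsha
# entry = RefLog.append_entry(repo.config_reader(), RefLog.path(repo.head),
#                             old, new, "commit: example message")
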
def test_reading(self):
    gdb = PureGitODB(os.path.join(rorepo_dir(), 'objects'))

    # we have packs and loose objects, alternates doesn't necessarily exist
    assert 1 < len(gdb.databases()) < 4

    # access should be possible
    git_sha = hex_to_bin("5aebcd5cb3340fb31776941d7e4d518a712a8655")
    assert isinstance(gdb.info(git_sha), OInfo)
    assert isinstance(gdb.stream(git_sha), OStream)
    assert gdb.size() > 200
    sha_list = list(gdb.sha_iter())
    assert len(sha_list) == gdb.size()

    # This is actually a test for compound functionality, but it doesn't
    # have a separate test module
    # test partial shas
    # this one is uneven and quite short
    assert gdb.partial_to_complete_sha_hex('5aebcd') == hex_to_bin("5aebcd5cb3340fb31776941d7e4d518a712a8655")

    # mix even/uneven hexshas
    for i, binsha in enumerate(sha_list[:50]):
        assert gdb.partial_to_complete_sha_hex(bin_to_hex(binsha)[:8 - (i % 2)]) == binsha
    # END for each sha

    self.failUnlessRaises(BadObject, gdb.partial_to_complete_sha_hex, "0000")

def test_large_data_streaming(self, rwrepo):
    # TODO: This part overlaps with the same file in git.test.performance.test_stream
    # It should be shared if possible
    objects_path = rwrepo.db_path('')
    ldb = self.LooseODBCls(objects_path)

    for randomize in range(2):
        desc = (randomize and 'random ') or ''
        print >> sys.stderr, "Creating %s data ..." % desc
        st = time()
        size, stream = make_memory_file(self.large_data_size_bytes, randomize)
        elapsed = time() - st
        print >> sys.stderr, "Done (in %f s)" % elapsed

        # writing - due to the compression it will seem faster than it is
        st = time()
        binsha = ldb.store(IStream('blob', size, stream)).binsha
        elapsed_add = time() - st
        assert ldb.has_object(binsha)
        hexsha = bin_to_hex(binsha)
        db_file = os.path.join(objects_path, hexsha[:2], hexsha[2:])
        fsize_kib = os.path.getsize(db_file) / 1000

        size_kib = size / 1000
        print >> sys.stderr, "%s: Added %i KiB (filesize = %i KiB) of %s data to loose odb in %f s ( %f Write KiB / s)" % (
            self.LooseODBCls.__name__, size_kib, fsize_kib, desc, elapsed_add, size_kib / elapsed_add)

        # reading all at once
        st = time()
        ostream = ldb.stream(binsha)
        shadata = ostream.read()
        elapsed_readall = time() - st

        stream.seek(0)
        assert shadata == stream.getvalue()
        print >> sys.stderr, "%s: Read %i KiB of %s data at once from loose odb in %f s ( %f Read KiB / s)" % (
            self.LooseODBCls.__name__, size_kib, desc, elapsed_readall, size_kib / elapsed_readall)

        # reading in chunks of 512 KiB
        cs = 512 * 1000
        chunks = list()
        st = time()
        ostream = ldb.stream(binsha)
        while True:
            data = ostream.read(cs)
            chunks.append(data)
            if len(data) < cs:
                break
        # END read in chunks
        elapsed_readchunks = time() - st

        stream.seek(0)
        assert ''.join(chunks) == stream.getvalue()

        cs_kib = cs / 1000
        print >> sys.stderr, "%s: Read %i KiB of %s data in %i KiB chunks from loose odb in %f s ( %f Read KiB / s)" % (
            self.LooseODBCls.__name__, size_kib, desc, cs_kib, elapsed_readchunks, size_kib / elapsed_readchunks)

        # del db file so git has something to do
        os.remove(db_file)

def _assert_index_entries(self, entries, trees):
    index = IndexFile.from_tree(self.rorepo, *[self.rorepo.tree(bin_to_hex(t)) for t in trees])
    assert entries
    assert len(index.entries) == len(entries)
    for entry in entries:
        assert (entry.path, entry.stage) in index.entries

def short_to_long(odb, hexsha):
    """:return: long hexadecimal sha1 from the given less-than-40-byte hexsha,
        or None if no candidate could be found.
    :param hexsha: hexsha with fewer than 40 bytes"""
    try:
        return bin_to_hex(odb.partial_to_complete_sha_hex(hexsha))
    except BadObject:
        return None

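# Hypothetical call, assuming `repo` is an existing git.Repo: expand an
# abbreviated hexsha against the repository's object database, or get None.
#
# short_to_long(repo.odb, "5aebcd")
# # -> b'5aebcd5cb3340fb31776941d7e4d518a712a8655' (bin_to_hex yields bytes on Python 3)
# short_to_long(repo.odb, "ffffffff")
# # -> None (partial_to_complete_sha_hex raised BadObject)
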
def stream(self, sha):
    """For now, all lookup is done by git itself

    :note: As we don't know when the stream is actually read (and if it is
        stored for later use) we read the data right away and cache it.
        This has HUGE performance implications, both for memory and for
        reading/deserializing objects, but we have no other choice in order
        to make the database behaviour consistent with other implementations!"""
    hexsha, typename, size, data = self._git.get_object_data(bin_to_hex(sha))
    return OStream(hex_to_bin(hexsha), typename, size, StringIO(data))

def test_basics(self, path):
    ldb = PureLooseObjectODB(path)

    # write data
    self._assert_object_writing(ldb)
    self._assert_object_writing_async(ldb)

    # verify sha iteration and size
    shas = list(ldb.sha_iter())
    assert shas and len(shas[0]) == 20

    assert len(shas) == ldb.size()

    # verify find short object
    long_sha = bin_to_hex(shas[-1])
    for short_sha in (long_sha[:20], long_sha[:5]):
        assert bin_to_hex(ldb.partial_to_complete_sha_hex(short_sha)) == long_sha
    # END for each sha

    self.failUnlessRaises(BadObject, ldb.partial_to_complete_sha_hex, '0000')

def partial_to_complete_sha_hex(self, partial_hexsha):
    """:return: 20 byte binary sha1 string which matches the given name uniquely
    :param partial_hexsha: hexadecimal partial name
    :raise AmbiguousObjectName:
    :raise BadObject: """
    candidate = None
    for binsha in self.sha_iter():
        if bin_to_hex(binsha).startswith(partial_hexsha):
            # it can't ever find the same object twice
            if candidate is not None:
                raise AmbiguousObjectName(partial_hexsha)
            candidate = binsha
    # END for each object
    if candidate is None:
        raise BadObject(partial_hexsha)
    return candidate

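# Illustration only (the `db` name is assumed): resolving a prefix either
# yields the unique 20-byte binary sha, or raises as soon as a second match
# proves the prefix ambiguous.
#
# try:
#     binsha = db.partial_to_complete_sha_hex("5aeb")
# except AmbiguousObjectName:
#     pass  # more than one object starts with '5aeb' - need a longer prefix
# except BadObject:
#     pass  # no object with this prefix exists
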
def _map_loose_object(self, sha):
    """
    :return: memory map of that file to allow random read access
    :raise BadObject: if object could not be located"""
    db_path = self.db_path(self.object_path(bin_to_hex(sha)))
    try:
        return file_contents_ro_filepath(db_path, flags=self._fd_open_flags)
    except OSError as e:
        if e.errno != ENOENT:
            # try again without noatime
            try:
                return file_contents_ro_filepath(db_path)
            except OSError:
                raise BadObject(sha)
            # didn't work because of our flag, don't try it again
            self._fd_open_flags = 0
        else:
            raise BadObject(sha)

def hexsha(self):
    """:return: 40 byte hex version of our 20 byte binary sha"""
    return bin_to_hex(self.binsha)

def hexsha(self):
    """:return: 40 byte hex version of our 20 byte binary sha"""
    # b2a_hex produces bytes
    return bin_to_hex(self.binsha).decode('ascii')

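# Why the decode is needed: bin_to_hex wraps binascii.b2a_hex, which returns
# bytes on Python 3, so a str-valued property must decode. A quick check:
#
# import binascii
# binascii.b2a_hex(b'\x5a\xeb')                  # -> b'5aeb'
# binascii.b2a_hex(b'\x5a\xeb').decode('ascii')  # -> '5aeb'
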
def info(self, binsha: bytes) -> OInfo:
    hexsha, typename, size = self._git.get_object_header(bin_to_hex(binsha))
    return OInfo(hex_to_bin(hexsha), typename, size)

def __str__(self):
    """:return: string of our SHA1 as understood by all git commands"""
    return bin_to_hex(self.binsha)

def has_object(self, sha):
    try:
        self.readable_db_object_path(bin_to_hex(sha))
        return True
    except BadObject:
        return False

def test_rev_parse(self):
    rev_parse = self.rorepo.rev_parse

    # try special case: This one failed at some point, make sure it's fixed
    self.assertEqual(rev_parse("33ebe").hexsha, "33ebe7acec14b25c5f84f35a664803fcab2f7781")

    # start from reference
    num_resolved = 0

    for ref_no, ref in enumerate(Reference.iter_items(self.rorepo)):
        path_tokens = ref.path.split("/")
        for pt in range(len(path_tokens)):
            path_section = '/'.join(path_tokens[-(pt + 1):])
            try:
                obj = self._assert_rev_parse(path_section)
                self.assertEqual(obj.type, ref.object.type)
                num_resolved += 1
            except (BadName, BadObject):
                print("failed on %s" % path_section)
                # is fine, in case we have something like 112, which belongs to remotes/rname/merge-requests/112
                pass
            # END exception handling
        # END for each token
        if ref_no == 3 - 1:
            break
    # END for each reference
    assert num_resolved

    # it works with tags !
    tag = self._assert_rev_parse('0.1.4')
    self.assertEqual(tag.type, 'tag')

    # try full sha directly ( including type conversion )
    self.assertEqual(tag.object, rev_parse(tag.object.hexsha))
    self._assert_rev_parse_types(tag.object.hexsha, tag.object)

    # multiple tree types result in the same tree: HEAD^{tree}^{tree}:CHANGES
    rev = '0.1.4^{tree}^{tree}'
    self.assertEqual(rev_parse(rev), tag.object.tree)
    self.assertEqual(rev_parse(rev + ':CHANGES'), tag.object.tree['CHANGES'])

    # try to get parents from first revision - it should fail as no such revision
    # exists
    first_rev = "33ebe7acec14b25c5f84f35a664803fcab2f7781"
    commit = rev_parse(first_rev)
    self.assertEqual(len(commit.parents), 0)
    self.assertEqual(commit.hexsha, first_rev)
    self.failUnlessRaises(BadName, rev_parse, first_rev + "~")
    self.failUnlessRaises(BadName, rev_parse, first_rev + "^")

    # short SHA1
    commit2 = rev_parse(first_rev[:20])
    self.assertEqual(commit2, commit)
    commit2 = rev_parse(first_rev[:5])
    self.assertEqual(commit2, commit)

    # todo: dereference tag into a blob 0.1.7^{blob} - quite a special one
    # needs a tag which points to a blob

    # ref^0 returns commit being pointed to, same with ref~0, and ^{}
    tag = rev_parse('0.1.4')
    for token in (('~0', '^0', '^{}')):
        self.assertEqual(tag.object, rev_parse('0.1.4%s' % token))
    # END handle multiple tokens

    # try partial parsing
    max_items = 40
    for i, binsha in enumerate(self.rorepo.odb.sha_iter()):
        self.assertEqual(rev_parse(bin_to_hex(binsha)[:8 - (i % 2)].decode('ascii')).binsha, binsha)
        if i > max_items:
            # this is rather slow currently, as rev_parse returns an object
            # which requires accessing packs, it has some additional overhead
            break
    # END for each binsha in repo

    # missing closing brace commit^{tree
    self.failUnlessRaises(ValueError, rev_parse, '0.1.4^{tree')

    # missing starting brace
    self.failUnlessRaises(ValueError, rev_parse, '0.1.4^tree}')

    # REVLOG
    #######
    head = self.rorepo.head

    # need to specify a ref when using the @ syntax
    self.failUnlessRaises(BadObject, rev_parse, "%s@{0}" % head.commit.hexsha)

    # uses HEAD.ref by default
    self.assertEqual(rev_parse('@{0}'), head.commit)
    if not head.is_detached:
        refspec = '%s@{0}' % head.ref.name
        self.assertEqual(rev_parse(refspec), head.ref.commit)
        # all additional specs work as well
        self.assertEqual(rev_parse(refspec + "^{tree}"), head.commit.tree)
        self.assertEqual(rev_parse(refspec + ":CHANGES").type, 'blob')
    # END operate on non-detached head

    # position doesn't exist
    self.failUnlessRaises(IndexError, rev_parse, '@{10000}')

    # currently, nothing more is supported
    self.failUnlessRaises(NotImplementedError, rev_parse, "@{1 week ago}")

    # the last position
    assert rev_parse('@{1}') != head.commit

def stream(self, sha):
    """For now, all lookup is done by git itself"""
    hexsha, typename, size, stream = self._git.stream_object_data(bin_to_hex(sha))
    return OStream(hex_to_bin(hexsha), typename, size, stream)

def stream(self, binsha: bytes) -> OStream:
    """For now, all lookup is done by git itself"""
    hexsha, typename, size, stream = self._git.stream_object_data(bin_to_hex(binsha))
    return OStream(hex_to_bin(hexsha), typename, size, stream)

def partial_to_complete_sha(self, partial_binsha, hex_len):
    """Simple adaptor to feed into our implementation"""
    return self.partial_to_complete_sha_hex(bin_to_hex(partial_binsha)[:hex_len])

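# Why hex_len is passed separately (an inference, names assumed): an odd-length
# hex prefix such as '5ae' must be padded to whole bytes to exist in binary
# form, so the binary prefix alone over-specifies the query; hex_len trims the
# hex string back to the caller's original number of digits.
#
# partial_binsha = b'\x5a\xe0'  # '5ae' padded with a zero nibble to 2 bytes
# db.partial_to_complete_sha(partial_binsha, hex_len=3)  # searches for '5ae*'
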
def info(self, sha):
    hexsha, typename, size = self._git.get_object_header(bin_to_hex(sha))
    return OInfo(hex_to_bin(hexsha), typename, size)

def test_large_data_streaming(self, rwrepo):
    # TODO: This part overlaps with the same file in gitdb.test.performance.test_stream
    # It should be shared if possible
    ldb = LooseObjectDB(osp.join(rwrepo.git_dir, 'objects'))

    for randomize in range(2):
        desc = (randomize and 'random ') or ''
        print("Creating %s data ..." % desc, file=sys.stderr)
        st = time()
        size, stream = make_memory_file(self.large_data_size_bytes, randomize)
        elapsed = time() - st
        print("Done (in %f s)" % elapsed, file=sys.stderr)

        # writing - due to the compression it will seem faster than it is
        st = time()
        binsha = ldb.store(IStream('blob', size, stream)).binsha
        elapsed_add = time() - st
        assert ldb.has_object(binsha)
        db_file = ldb.readable_db_object_path(bin_to_hex(binsha))
        fsize_kib = osp.getsize(db_file) / 1000

        size_kib = size / 1000
        msg = "Added %i KiB (filesize = %i KiB) of %s data to loose odb in %f s ( %f Write KiB / s)"
        msg %= (size_kib, fsize_kib, desc, elapsed_add, size_kib / elapsed_add)
        print(msg, file=sys.stderr)

        # reading all at once
        st = time()
        ostream = ldb.stream(binsha)
        shadata = ostream.read()
        elapsed_readall = time() - st

        stream.seek(0)
        assert shadata == stream.getvalue()

        msg = "Read %i KiB of %s data at once from loose odb in %f s ( %f Read KiB / s)"
        msg %= (size_kib, desc, elapsed_readall, size_kib / elapsed_readall)
        print(msg, file=sys.stderr)

        # reading in chunks of 512 KiB
        cs = 512 * 1000
        chunks = []
        st = time()
        ostream = ldb.stream(binsha)
        while True:
            data = ostream.read(cs)
            chunks.append(data)
            if len(data) < cs:
                break
        # END read in chunks
        elapsed_readchunks = time() - st

        stream.seek(0)
        assert b''.join(chunks) == stream.getvalue()

        cs_kib = cs / 1000
        print("Read %i KiB of %s data in %i KiB chunks from loose odb in %f s ( %f Read KiB / s)"
              % (size_kib, desc, cs_kib, elapsed_readchunks, size_kib / elapsed_readchunks), file=sys.stderr)

        # del db file so git has something to do
        ostream = None
        import gc
        gc.collect()
        os.remove(db_file)

        # VS. CGIT
        ##########
        # CGIT ! Can using the cgit programs be faster ?
        proc = rwrepo.git.hash_object('-w', '--stdin', as_process=True, istream=subprocess.PIPE)

        # write file - pump everything in at once to be as fast as possible
        data = stream.getvalue()  # cache it
        st = time()
        proc.stdin.write(data)
        proc.stdin.close()
        gitsha = proc.stdout.read().strip()
        proc.wait()
        gelapsed_add = time() - st
        del(data)
        assert gitsha == bin_to_hex(binsha)  # we do it the same way, right ?

        # as it's the same sha, we reuse our path
        fsize_kib = osp.getsize(db_file) / 1000
        msg = "Added %i KiB (filesize = %i KiB) of %s data using git-hash-object in %f s ( %f Write KiB / s)"
        msg %= (size_kib, fsize_kib, desc, gelapsed_add, size_kib / gelapsed_add)
        print(msg, file=sys.stderr)

        # compare ...
        print("Git-Python is %f %% faster than git when adding big %s files"
              % (100.0 - (elapsed_add / gelapsed_add) * 100, desc), file=sys.stderr)

        # read all
        st = time()
        hexsha, typename, size, data = rwrepo.git.get_object_data(gitsha)  # @UnusedVariable
        gelapsed_readall = time() - st
        print("Read %i KiB of %s data at once using git-cat-file in %f s ( %f Read KiB / s)"
              % (size_kib, desc, gelapsed_readall, size_kib / gelapsed_readall), file=sys.stderr)

        # compare
        print("Git-Python is %f %% faster than git when reading big %s files"
              % (100.0 - (elapsed_readall / gelapsed_readall) * 100, desc), file=sys.stderr)

        # read chunks
        st = time()
        hexsha, typename, size, stream = rwrepo.git.stream_object_data(gitsha)  # @UnusedVariable
        while True:
            data = stream.read(cs)
            if len(data) < cs:
                break
        # END read stream
        gelapsed_readchunks = time() - st
        msg = "Read %i KiB of %s data in %i KiB chunks from git-cat-file in %f s ( %f Read KiB / s)"
        msg %= (size_kib, desc, cs_kib, gelapsed_readchunks, size_kib / gelapsed_readchunks)
        print(msg, file=sys.stderr)

        # compare
        print("Git-Python is %f %% faster than git when reading big %s files in chunks"
              % (100.0 - (elapsed_readchunks / gelapsed_readchunks) * 100, desc), file=sys.stderr)
