def test_tree_merge(self):
    logging.debug('Testing tree merge')
    reset_storage()
    setup_test_dirs()
    tb_A = tree_blob()
    tb_B = tree_blob()
    tb_C = tree_blob()  #common ancestor
    tb_A.create_tree(key, '../resource/test_dir_1/root')
    tb_B.create_tree(key, '../resource/test_dir_2/root')
    tb_C.create_tree(key, '../resource/test_dir_3/root')
    tb_merge = tree_blob.merge(tb_A, tb_B, tb_C)
    print tb_merge
def restore_directory(self, key, working_directory, storage_directory, commit_hash):
    '''load a whole directory as an initial commit'''
    logging.info('working_directory: %s, storage_directory: %s, commit_hash: %s',
                 working_directory, storage_directory, commit_hash)
    #@TODO: Write to a temp directory first and then move it into the working
    #directory? That would leave the user very little chance of seeing partial files.
    cb = commit_blob()
    cb.load(key, storage_directory, commit_hash)
    #restore tree folder structure
    tb = tree_blob()
    tb.load(key, storage_directory, cb.tree_hash)
    tb.build_tree(key, storage_directory)
    logging.debug('tree to restore:\n%s', tb)
    tb.write_folders(working_directory)
    #restore files
    for path, node in tb.walk():
        if node.node_type != 'file':
            continue
        fb = file_blob()
        fb.load(key, storage_directory, node.hash_hex)
        full_file_path = os.path.join(working_directory, path)
        f = open(full_file_path, 'wb')
        f.write(fb.apply_delta(key, storage_directory))
        f.close()
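#Example (sketch): restoring a previously stored commit into a fresh working
#directory. This assumes restore_directory lives on a local_blob_manager
#instance with a no-argument constructor and that key is the same symmetric
#key used when the commit was created; the paths below are placeholders.
def _example_restore(key, commit_hash):
    manager = local_blob_manager()
    manager.restore_directory(key, '../sandbox/restored_root',
                              '../sandbox/storage', commit_hash)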
def commit_directory(self, key, working_directory, storage_directory, user_name,
                     commit_msg, parent_commit_hash=None, other_parent_commit_hash=None):
    """Traverse working directory and store all file blobs"""
    logging.info('working_directory: %s, storage_directory: %s, user_name: %s, commit_msg: %s, '
                 'parent_commit_hash: %s, other_parent_commit_hash: %s',
                 working_directory, storage_directory, user_name, commit_msg,
                 parent_commit_hash, other_parent_commit_hash)
    file_list, mod_times, last_commit_hash = local_blob_manager.read_commit_meta(working_directory)
    if parent_commit_hash is None and last_commit_hash is not None:
        parent_commit_hash = last_commit_hash
    #create and store tree blob
    tb = tree_blob()
    #TODO: fix this up so it diffs with parent tree blobs
    tb.create_tree(key, working_directory)
    #tb_tree_text = tb.create_tree_text(key, working_directory)
    #tb.build_tree(key, storage_directory)
    tb_tree_text_bytes = bytearray(tb.serialize_tree(), 'utf-8')
    tb.compute_delta(key, tb_tree_text_bytes, None)
    tb.storage_directory = storage_directory
    logging.debug('tree to commit:\n%s', tb)
    tb.store(key, storage_directory)
    tb_hash = tb.my_hash
    #create and store commit blob
    cb = commit_blob()
    cb.create(key, user_name, commit_msg, tb_hash, parent_commit_hash, other_parent_commit_hash)
    cb.display()
    cb.store(key, storage_directory)
    commit_hash = cb.my_hash
    self.store_file_blobs(key, commit_hash, parent_commit_hash, storage_directory, working_directory)
    local_blob_manager.write_commit_meta(working_directory, commit_hash)
    return commit_hash
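#Example (sketch): creating commits from a working directory. Assumes a
#local_blob_manager instance with a no-argument constructor and a key of the
#form expected by the blob classes; the storage path, user name, and messages
#are made up. The second call can leave parent_commit_hash as None because the
#commit meta written above records the last commit hash.
def _example_commit(key):
    manager = local_blob_manager()
    first = manager.commit_directory(key, '../resource/test_dir_1/root',
                                     '../sandbox/storage', 'alice', 'initial commit')
    second = manager.commit_directory(key, '../resource/test_dir_1/root',
                                      '../sandbox/storage', 'alice', 'follow-up commit')
    return first, second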
def commit_directory(self, key, working_directory, storage_directory, user_name,
                     commit_msg, parent_commit_hash=None, other_parent_commit_hash=None):
    """Traverse working directory and store all file blobs"""
    logging.info('working_directory: %s, storage_directory: %s, user_name: %s, commit_msg: %s, '
                 'parent_commit_hash: %s, other_parent_commit_hash: %s',
                 working_directory, storage_directory, user_name, commit_msg,
                 parent_commit_hash, other_parent_commit_hash)
    #create and store tree blob
    tb = tree_blob()
    tb_tree_text = tb.create_tree_text(key, working_directory)
    tb.compute_delta(key, tb_tree_text, None)
    tb.storage_directory = storage_directory
    tb.display()
    tb.store(key, storage_directory)
    tb_hash = tb.my_hash
    #create and store commit blob
    cb = commit_blob()
    cb.create(key, user_name, commit_msg, tb_hash, parent_commit_hash, other_parent_commit_hash)
    cb.display()
    cb.store(key, storage_directory)
    commit_hash = cb.my_hash
    self.store_file_blobs(key, commit_hash, parent_commit_hash, storage_directory, working_directory)
    return commit_hash
def restore_directory(self, key, working_directory, storage_directory, commit_hash):
    '''load a whole directory as an initial commit'''
    logging.info('working_directory: %s, storage_directory: %s, commit_hash: %s',
                 working_directory, storage_directory, commit_hash)
    cb = commit_blob()
    cb.load(key, storage_directory, commit_hash)
    #restore tree folder structure
    tb = tree_blob()
    tb.load(key, storage_directory, cb.tree_hash)
    tb.display()
    file_listing = tb.write_directory_structure(key, storage_directory, working_directory)
    #restore files
    for (path, file_name, hash_hex, file_size) in file_listing:
        fb = file_blob()
        fb.load(key, storage_directory, hash_hex)
        #full_file_path = os.path.join(working_directory_path, path, file_name)
        full_file_path = working_directory + path + '/' + file_name
        f = open(full_file_path, 'wb')
        f.write(fb.apply_delta(key, storage_directory))
        f.close()
def blobs_to_restore_blob(self, key, storage_directory, file_name, parent_tree=True):
    """
    Returns a list of file names of any blobs needed to restore a blob. If
    None is returned then it should be possible to restore the given blob
    from local files. Will likely need to be called repeatedly on a commit
    because needed blobs also have dependent blobs which won't be known
    until the needed blobs are obtained.
    """
    if not os.path.exists(os.path.join(storage_directory, file_name)):
        return [file_name]
    #Check if this is a commit blob
    if file_name[0] == '_':
        #Get the tree. Parent commits are not needed to restore a commit, or its files.
        cb = commit_blob()
        cb.load(key, storage_directory, file_name[1:])
        return self.blobs_to_restore_blob(key, storage_directory, cb.tree_hash)
    fb = file_blob()
    fb.load(key, storage_directory, file_name)
    #check if this is a tree blob
    if fb.blob_type == 'tree':
        tb = tree_blob()  #TODO: can the file blob be cast instead of reloading?
        tb.load(key, storage_directory, file_name)
        if tb.parent_hash != '':
            #check for all parents of the tree
            needed_tree_parent_hash = self.blobs_to_restore_blob(
                key, storage_directory, tb.parent_hash, False)
            if needed_tree_parent_hash is not None:
                return needed_tree_parent_hash
        if not parent_tree:
            #only traverse the tree structure of a parent tree
            return None
        #check for all files in the tree structure
        file_hashes = tb.file_hashes(key, storage_directory)
        needed_files = []
        for h in file_hashes:
            temp_hash = self.blobs_to_restore_blob(key, storage_directory, h)
            if temp_hash is not None:
                needed_files.extend(temp_hash)
        if needed_files == []:
            return None
        return needed_files
    #fb is a file blob
    if fb.parent_hash == '':
        return None
    return self.blobs_to_restore_blob(key, storage_directory, fb.parent_hash)
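#Example (sketch): pulling down everything needed to restore a commit locally.
#blobs_to_restore_blob only reports the blobs it can see at call time, so the
#loop keeps asking until nothing more is needed (None). The leading '_' marks a
#commit blob, matching the check above; fetch_blob is a hypothetical callable
#that copies a named blob from a remote into storage_directory.
def _example_fetch_missing_blobs(manager, key, storage_directory, commit_hash, fetch_blob):
    needed = manager.blobs_to_restore_blob(key, storage_directory, '_' + commit_hash)
    while needed:
        for blob_name in needed:
            fetch_blob(blob_name, storage_directory)  #hypothetical remote transfer
        needed = manager.blobs_to_restore_blob(key, storage_directory, '_' + commit_hash)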
def store_file_blobs(self, key, commit_hash, parent_commit_hash, storage_directory, working_directory):
    logging.info('commit_hash: %s, parent_commit_hash: %s, storage_directory: %s, working_directory: %s',
                 commit_hash, parent_commit_hash, storage_directory, working_directory)
    #chop the root folder off working_directory
    working_directory, tail = os.path.split(working_directory)
    #load current commit, tree, and file info
    cb = commit_blob()
    cb.load(key, storage_directory, commit_hash)
    #it is okay to modify this tree blob; the one stored for the commit is already saved
    tb = tree_blob()
    tb.load(key, storage_directory, cb.tree_hash)
    tb.build_tree(key, storage_directory)
    logging.debug('tree to store: %s', tb)
    #remove files whose hashes already exist in the working directory or the storage directory
    hash_set = set()
    for root, dirs, files in os.walk(storage_directory):
        for f in files:
            hash_set.add(f)
    logging.debug('storage directory hashes: %s', hash_set)
    for path, node in tb.walk():
        logging.debug('checking: %s', path)
        if node.node_type != 'file':
            continue
        if node.hash_hex in hash_set:
            logging.debug('found hash match: %s', node.hash_hex)
            tb.rm_node(path, 'file')
        else:
            hash_set.add(node.hash_hex)
    if parent_commit_hash is None:
        #this is an initial commit
        for path, node in tb.walk():
            if node.node_type != 'file':
                continue
            full_path = os.path.join(working_directory, path)
            new_file = open(full_path, 'rb')
            fb = file_blob()
            fb.compute_delta(key, new_file.read())
            fb.store(key, storage_directory)
            new_file.close()
        return
    #load parent commit, tree, and file info
    pcb = commit_blob()
    pcb.load(key, storage_directory, parent_commit_hash)
    ptb = tree_blob()
    ptb.load(key, storage_directory, pcb.tree_hash)
    ptb.build_tree(key, storage_directory)
    logging.debug('performing differential commit using the following trees')
    logging.debug('parent tree: %s', ptb)
    logging.debug('child tree: %s', tb)
    #find files with the same name in the same path, compute deltas, and store as file blob diffs
    for path, node in tb.walk():
        if node.node_type != 'file':
            continue
        if not ptb.has_node(path, 'file'):
            continue
        p_node = ptb.get_node(path, 'file')
        logging.debug('Found files with matching paths and names. working: %s, parent: %s',
                      node.hash_hex, p_node.hash_hex)
        full_file_path = os.path.join(working_directory, path)
        new_file = open(full_file_path, 'rb')
        pfb = file_blob()
        pfb.load(key, storage_directory, p_node.hash_hex)
        fb = file_blob()
        fb.compute_delta(key, new_file.read(), pfb, storage_directory)
        fb.store(key, storage_directory)
        new_file.close()
        tb.rm_node(path, 'file')
    #TODO: re-implement the similar-file matching code commented out below
    """
    #Look for similar files between working and parent and compute diffs on those
    index = -1
    while (index + 1 < len(file_hashes)):  #cycle through all file records in working directory
        index += 1
        parent_index = -1
        while (parent_index + 1 < len(parent_file_hashes)):  #cycle through all file records in parent commit
            parent_index += 1
            #if file_names[index] != parent_file_names[parent_index]:
            #    break
            #must have similar file sizes
            percent_size_change = abs(file_sizes[index] - parent_file_sizes[index]) / file_sizes[index]
            if percent_size_change > 0.10:
                continue
            #must have similar byte sequences
            full_file_path = working_directory + file_folders[index] + '/' + file_names[index]
            new_file = open(full_file_path, 'rb')
            new_file_text = new_file.read()
            pfb = file_blob()
            pfb.load(key, storage_directory, parent_file_hashes[parent_index])
            pfb_text = pfb.apply_delta(key, storage_directory)
            s = difflib.SequenceMatcher(None, new_file_text, pfb_text)
            if s.real_quick_ratio() < 0.75:
                continue
            if s.quick_ratio() < 0.75:
                continue
            #If this line is reached the files are similar enough. Compute the diff and store.
            logging.debug('Found files with similar content. working: %s, parent: %s',
                          file_hashes[index], parent_file_hashes[parent_index])
            fb = file_blob()
            fb.compute_delta(key, new_file_text, pfb, storage_directory)
            fb.store(key, storage_directory)
            file_hashes.pop(index)
            file_names.pop(index)
            file_folders.pop(index)
            file_sizes.pop(index)
            index -= 1
            break
    """
    #store all remaining files as initial versions
    for path, node in tb.walk():
        if node.node_type != 'file':
            continue
        full_file_path = os.path.join(working_directory, path)
        new_file = open(full_file_path, 'rb')
        fb = file_blob()
        fb.compute_delta(key, new_file.read())
        fb.store(key, storage_directory)
        new_file.close()
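#Example (sketch): the dedup pass above works because blobs are content
#addressed, i.e. a stored blob's file name is the hex hash it was computed
#from, so a hash already present among the storage directory file names means
#the file does not need to be stored again. A minimal standalone illustration
#of that check, using SHA-256 as a stand-in for whatever hash file_blob
#actually uses:
def _example_is_already_stored(file_path, storage_directory):
    import hashlib  #local import so this sketch stays self-contained
    with open(file_path, 'rb') as f:
        hash_hex = hashlib.sha256(f.read()).hexdigest()
    #a blob named after this hash already exists, so skip storing the file
    return os.path.exists(os.path.join(storage_directory, hash_hex))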
def store_file_blobs(self, key, commit_hash, parent_commit_hash, storage_directory, working_directory):
    logging.info('commit_hash: %s, parent_commit_hash: %s, storage_directory: %s, working_directory: %s',
                 commit_hash, parent_commit_hash, storage_directory, working_directory)
    #chop the root folder off working_directory
    working_directory, tail = os.path.split(working_directory)
    #load current commit, tree, and file info
    cb = commit_blob()
    cb.load(key, storage_directory, commit_hash)
    tb = tree_blob()
    tb.load(key, storage_directory, cb.tree_hash)
    file_listing = tb.write_directory_structure(key, storage_directory, None, False)
    file_hashes = []
    file_names = []
    file_folders = []
    file_sizes = []
    for (path, file_name, hash_hex, file_size) in file_listing:
        file_hashes.append(hash_hex)
        file_names.append(file_name)
        file_folders.append(path)
        file_sizes.append(file_size)
    if parent_commit_hash is None:
        #this is an initial commit; store all files as initial versions
        index = -1
        while (index + 1 < len(file_hashes)):  #cycle through all file records in working directory
            index += 1
            full_file_path = working_directory + file_folders[index] + '/' + file_names[index]
            new_file = open(full_file_path, 'rb')
            fb = file_blob()
            fb.compute_delta(key, new_file.read())
            fb.store(key, storage_directory)
            new_file.close()
        return
    #load parent commit, tree, and file info
    pcb = commit_blob()
    pcb.load(key, storage_directory, parent_commit_hash)
    ptb = tree_blob()
    ptb.load(key, storage_directory, pcb.tree_hash)
    file_listing = ptb.write_directory_structure(key, storage_directory, None, False)
    parent_file_hashes = []
    parent_file_names = []
    parent_file_folders = []
    parent_file_sizes = []
    for (path, file_name, hash_hex, file_size) in file_listing:
        parent_file_hashes.append(hash_hex)
        parent_file_names.append(file_name)
        parent_file_folders.append(path)
        parent_file_sizes.append(file_size)
    #Find file blob matches and similar file blobs
    #remove duplicate hashes within the working directory
    index = -1
    while True:
        index += 1
        if index >= len(file_hashes) - 1:
            break
        if file_hashes[index] in file_hashes[index + 1:]:
            logging.debug('Found duplicate files within working directory. working: %s', file_hashes[index])
            file_hashes.pop(index)
            file_names.pop(index)
            file_folders.pop(index)
            file_sizes.pop(index)
            index -= 1
    #remove hashes already stored for the parent commit
    index = -1
    while True:
        index += 1
        if index >= len(file_hashes) - 1:
            break
        if file_hashes[index] in parent_file_hashes:
            logging.debug('Found duplicate file already stored. working: %s', file_hashes[index])
            file_hashes.pop(index)
            file_names.pop(index)
            file_folders.pop(index)
            file_sizes.pop(index)
            index -= 1
    #find files with the same name in the same path, compute deltas, and store as file blob diffs
    index = -1
    while (index + 1 < len(file_hashes)):  #cycle through all file records in working directory
        index += 1
        parent_index = -1
        while (parent_index + 1 < len(parent_file_hashes)):  #cycle through all file records in parent commit
            parent_index += 1
            if file_names[index] != parent_file_names[parent_index]:
                continue
            if file_folders[index] != parent_file_folders[parent_index]:
                continue
            #If this line is reached we found two files with the same name and path, but different hashes.
            #Compute the diff between these two files and store it.
            logging.debug('Found files with matching paths and names. working: %s, parent: %s',
                          file_hashes[index], parent_file_hashes[parent_index])
            full_file_path = working_directory + file_folders[index] + '/' + file_names[index]
            new_file = open(full_file_path, 'rb')
            pfb = file_blob()
            pfb.load(key, storage_directory, parent_file_hashes[parent_index])
            fb = file_blob()
            fb.compute_delta(key, new_file.read(), pfb, storage_directory)
            fb.store(key, storage_directory)
            new_file.close()
            file_hashes.pop(index)
            file_names.pop(index)
            file_folders.pop(index)
            file_sizes.pop(index)
            index -= 1
            break
    #Look for similar files between working and parent and compute diffs on those
    index = -1
    while (index + 1 < len(file_hashes)):  #cycle through all file records in working directory
        index += 1
        parent_index = -1
        while (parent_index + 1 < len(parent_file_hashes)):  #cycle through all file records in parent commit
            parent_index += 1
            #must have similar file sizes (float division, compared against the parent record being inspected)
            percent_size_change = abs(file_sizes[index] - parent_file_sizes[parent_index]) / float(file_sizes[index])
            if percent_size_change > 0.10:
                continue
            #must have similar byte sequences
            full_file_path = working_directory + file_folders[index] + '/' + file_names[index]
            new_file = open(full_file_path, 'rb')
            new_file_text = new_file.read()
            new_file.close()
            pfb = file_blob()
            pfb.load(key, storage_directory, parent_file_hashes[parent_index])
            pfb_text = pfb.apply_delta(key, storage_directory)
            s = difflib.SequenceMatcher(None, new_file_text, pfb_text)
            if s.real_quick_ratio() < 0.75:
                continue
            if s.quick_ratio() < 0.75:
                continue
            #If this line is reached the files are similar enough. Compute the diff and store.
            logging.debug('Found files with similar content. working: %s, parent: %s',
                          file_hashes[index], parent_file_hashes[parent_index])
            fb = file_blob()
            fb.compute_delta(key, new_file_text, pfb, storage_directory)
            fb.store(key, storage_directory)
            file_hashes.pop(index)
            file_names.pop(index)
            file_folders.pop(index)
            file_sizes.pop(index)
            index -= 1
            break
    #store all remaining files as initial versions
    index = -1
    while (index + 1 < len(file_hashes)):  #cycle through all file records in working directory
        index += 1
        full_file_path = working_directory + file_folders[index] + '/' + file_names[index]
        new_file = open(full_file_path, 'rb')
        fb = file_blob()
        fb.compute_delta(key, new_file.read())
        fb.store(key, storage_directory)
        new_file.close()
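#Example (sketch): the similarity gate above runs difflib.SequenceMatcher
#checks from cheapest to most expensive, so clearly dissimilar pairs are
#rejected before any full comparison. Note real_quick_ratio() and
#quick_ratio() are only upper bounds on the true ratio; the final ratio()
#call below is an optional exact check that store_file_blobs itself skips.
def _example_similar_enough(new_bytes, old_bytes, threshold=0.75):
    s = difflib.SequenceMatcher(None, new_bytes, old_bytes)
    if s.real_quick_ratio() < threshold:  #bound computed from lengths alone
        return False
    if s.quick_ratio() < threshold:  #bound computed from character counts
        return False
    return s.ratio() >= threshold  #full comparison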