def hash(self, filepath):
    tmp_filename = str(uuid.uuid4())
    total_size = 0
    with open(os.path.join("/tmp", tmp_filename), 'wb') as fw:
        with open(filepath, 'rb') as fr:
            # read file block by block
            buf = fr.read(self.BLOCKSIZE)
            buf_len = len(buf)
            total_size += buf_len
            while buf_len > 0:
                self.hash_algo.update(buf)
                fw.write(buf)
                buf = fr.read(self.BLOCKSIZE)
                buf_len = len(buf)
                total_size += buf_len

    # figure out the digest of the file
    digest = self.hash_algo.hexdigest()
    (directory, filename) = Helmspoint.digest_filepath(digest)

    # move the file into the object store
    obj_src = os.path.join("/tmp", tmp_filename)
    obj_dst = os.path.join(Helmspoint.REPO_OBJ_PATH, directory, filename)
    os.rename(obj_src, obj_dst)

    return digest, total_size
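All of these snippets lean on Helmspoint.digest_filepath to map a digest onto an object-store location. Its implementation isn't shown here, but a minimal sketch, assuming the usual git-style split of the first two hex characters into a fan-out subdirectory, could look like:

    @staticmethod
    def digest_filepath(digest):
        # Hypothetical sketch, not the actual Helmspoint code: the first two
        # hex characters name the subdirectory, the rest names the file.
        return digest[:2], digest[2:]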
def write(self, tree):
    tree_json = json.dumps(tree).encode('UTF-8')
    digest = hashlib.sha256(tree_json).hexdigest()
    (directory, filename) = Helmspoint.digest_filepath(digest)

    # write it to object directory
    dst_path = os.path.join(Helmspoint.REPO_OBJ_PATH, directory, filename)
    with open(dst_path, 'wb') as fw:
        fw.write(tree_json)

    return digest
def write(self):
    stage_data = self.initial_data()
    self.digest = hashlib.sha256(stage_data).hexdigest()
    (directory, filename) = Helmspoint.digest_filepath(self.digest)

    # write it to object directory
    dst_path = os.path.join(Helmspoint.REPO_OBJ_PATH, directory, filename)
    with open(dst_path, 'wb') as fw:
        fw.write(stage_data)

    return self.digest
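initial_data isn't shown here; since Stage.get (below) rehydrates the stored bytes with cloudpickle, it presumably serializes the stage's function the same way. A sketch under that assumption, where the attribute name func is a guess rather than something taken from the source:

    def initial_data(self):
        # Assumed counterpart to Stage.get(): serialize the stage's function
        # with cloudpickle so it can be hashed, stored, and later reloaded.
        return cloudpickle.dumps(self.func)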
def write(self):
    dag_data = self.build()
    dag_json = json.dumps(dag_data).encode('UTF-8')
    self.digest = hashlib.sha256(dag_json).hexdigest()
    (directory, filename) = Helmspoint.digest_filepath(self.digest)

    dst_path = os.path.join(Helmspoint.REPO_OBJ_PATH, directory, filename)
    with open(dst_path, 'wb') as fw:
        fw.write(dag_json)

    return self.digest
def run(self, arg_map, data_digests):
    # get the dag
    dag_json = Dag.get(self.digest)

    # get the stage
    func_link = next(link for link in dag_json['links'] if link['name'] == 'func')

    # deserialize the func
    stage_func = Stage.get(func_link['hash'])

    # use every parent dag hash to look up data
    parent_links = [link for link in dag_json['links'] if link['name'] == 'parent']
    parent_dag_digests = [link['hash'] for link in parent_links]

    print("arg_map %s" % arg_map)
    print("data_digests %s" % data_digests)
    print("parent_dag_digests %s" % parent_dag_digests)

    # build up data arguments to go into this stage
    arg_names = arg_map[dag_json['data']['name']]
    input_data = []
    for parent_dag_digest in parent_dag_digests:
        data_digest = data_digests[parent_dag_digest]
        (directory, filepath) = Helmspoint.digest_filepath(data_digest)
        datapath = os.path.join(Helmspoint.REPO_OBJ_PATH, directory, filepath)
        with open(datapath, 'r') as f:
            raw_data = f.read()
            json_data = json.loads(raw_data)
            input_data.append(json_data)

    # run it
    print("running stage: %s" % dag_json['data']['name'])
    parents_mapping = dict(zip(arg_names, input_data))
    output_data = stage_func(**parents_mapping)

    # hash the data and write it to disk
    pipe_result_path = os.path.join("datasource", "pipeline")
    os.makedirs(pipe_result_path, exist_ok=True)
    datapath = os.path.join(pipe_result_path, dag_json['data']['name'])
    with open(datapath, 'w') as f:
        json_data = json.dumps(output_data)
        f.write(json_data)
    (data_digest, data_size) = Blob().hash(datapath)

    print("----------")
    return data_digest
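run returns the digest of the stage's output blob, so a caller can thread results through the graph by recording each stage's digest before its children execute. A rough driver sketch (the topological-ordering helper is assumed, not from the source):

    data_digests = {}
    for dag in topo_sorted_dags:   # assumed: dag nodes visited parents-first
        data_digests[dag.digest] = dag.run(arg_map, data_digests)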
def write(self, commit):
    commit_json = json.dumps(commit).encode('UTF-8')
    digest = hashlib.sha256(commit_json).hexdigest()
    (directory, filename) = Helmspoint.digest_filepath(digest)

    # write it to object directory
    dst_path = os.path.join(Helmspoint.REPO_OBJ_PATH, directory, filename)
    with open(dst_path, 'wb') as fw:
        fw.write(commit_json)

    # write hash of new commit to refs/heads/master
    with open(os.path.join(Helmspoint.REPO_HEADS_PATH, "master"), 'w') as fw:
        fw.write(digest)

    return digest
def get(digest):
    (directory, filename) = Helmspoint.digest_filepath(digest)
    filepath = os.path.join(Helmspoint.REPO_OBJ_PATH, directory, filename)
    with open(filepath, 'rb') as f:
        raw = f.read()
    return cloudpickle.loads(raw)