def build_offline_time():
    '''collect the off-line indexing time of every data set into one file'''
    with PersistentDict("./local_push_time.json", format="json") as time_local_push:
        for data in NEW_DATA:
            print("processing", data)
            with PersistentDict(LOCAL_PUSH_INDEX + data + "_R.json",
                                flag="r") as residual:
                time_local_push[data] = residual["time"]
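# Illustrative sketch (not from the original module): read the timings that
# build_offline_time() wrote back from disk. It assumes only the same
# json-backed PersistentDict API used above and that the file already exists.
with PersistentDict("./local_push_time.json", flag="r", format="json") as times:
    for name, seconds in times.items():
        print(name, "off-line index built in", seconds, "seconds")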
def load_local_push_index(data_name):
    '''load the local push off-line index (estimates P and residuals R)
    for the given data set name'''
    print("loading local push index", data_name)
    P = PersistentDict(LOCAL_PUSH_INDEX + data_name + "_P.json",
                       flag="r", format="json")
    R = PersistentDict(LOCAL_PUSH_INDEX + data_name + "_R.json",
                       flag="r", format="json")
    lpi = dict()
    lpi["P"] = P
    lpi["R"] = R
    return lpi
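# Illustrative sketch (not from the original module): look up one SimRank
# estimate from a loaded index. "ca-GrQc" and n are placeholders; the key
# follows the a * n + b scheme used by local_push_simrank below. Depending on
# the PersistentDict implementation, integer keys may come back as strings
# after the JSON round-trip, hence the str(...).
lpi = load_local_push_index("ca-GrQc")  # placeholder data set name
a, b, n = 0, 5, 5242                    # pair (a, b) with a <= b; n = #nodes
print("simrank estimate:", lpi["P"].get(str(a * n + b), 0))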
def local_push_simrank(A, indegrees, epsilon=0.01, delta=0.01, r_max=None,
                       c=0.6, is_sync=False, data_name=None):
    '''
    The local push algorithm for all-pairs SimRank:
    push residuals until every residual is below the threshold.
    A: adjacency matrix, csr sparse matrix
    indegrees: in-degree of each node, 1darray
    r_max: the maximum allowed residual
    to-do: make it parallel
    '''
    n = A.shape[0]
    m = A.nnz
    d = m / n  # average degree
    if r_max is None:
        r_max = cal_rmax(d, epsilon, delta)
    print("threshold r", r_max)

    # encode a node pair as a single integer dict key
    @jit
    def get_key(a1, a2):
        return a1 * n + a2

    with PersistentDict(LOCAL_PUSH_INDEX + data_name + "_P.json",
                        flag="c", format="json") as estimate, \
            PersistentDict(LOCAL_PUSH_INDEX + data_name + "_R.json",
                           flag="c", format="json") as residual:
        estimate.clear()
        residual.clear()
        for i in range(0, n):
            residual[get_key(i, i)] = 1
        Q = set((i, i) for i in range(0, n))  # a set avoids duplicate pairs
        step = [0]
        current_sum = [0]  # the sum of current estimates

        @jit
        def push(a, b):
            """push the residual of the pair (a, b)"""
            k_ab = get_key(a, b)
            step[0] += 1
            r = residual.pop(k_ab, 0)  # consume and delete the current residual
            # make sure the key exists in the estimate before accumulating
            estimate.setdefault(k_ab, 0)
            estimate[k_ab] += r
            current_sum[0] += r
            out_a = A.indices[A.indptr[a]:A.indptr[a + 1]]
            out_b = A.indices[A.indptr[b]:A.indptr[b + 1]]
            for (oa, ob) in product(out_a, out_b):
                if oa == ob:
                    # don't push to singleton nodes
                    continue
                if oa > ob:
                    # keep the canonical order oa < ob; SimRank is symmetric,
                    # so only the upper-triangular pairs are stored
                    oa, ob = ob, oa
                k_oaob = get_key(oa, ob)
                indeg_a = indegrees[oa]
                indeg_b = indegrees[ob]
                total_in = indeg_a * indeg_b
                if total_in > 0:
                    inc = (c * r) / total_in
                    residual.setdefault(k_oaob, 0)
                    residual[k_oaob] += inc  # update the residual value
                    if residual[k_oaob] > r_max:
                        Q.add((oa, ob))

        t1 = time.time()
        while len(Q) > 0:
            i, j = Q.pop()
            push(i, j)
        t2 = time.time()

        # save the off-line index (including the build time) to disk
        print("total ", t2 - t1, "seconds")
        residual['time'] = t2 - t1
        print('origin sum', current_sum[0])
        # only upper-triangular pairs are stored: double the estimates and
        # subtract the diagonal, which would otherwise be counted twice
        print("sum", current_sum[0] * 2 - n)
    return True
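# Illustrative end-to-end sketch (not from the original module): run the local
# push on a toy 3-node directed cycle. Assumptions: rows of the csr matrix A
# list the neighbours that residuals are pushed to (whether that is the graph
# or its transpose depends on how the caller builds A); cal_rmax and
# LOCAL_PUSH_INDEX are defined elsewhere in this module; "toy" is a
# placeholder data_name.
import numpy as np
from scipy.sparse import csr_matrix

edges = [(0, 1), (1, 2), (2, 0)]
rows, cols = zip(*edges)
n = 3
A = csr_matrix((np.ones(len(edges)), (rows, cols)), shape=(n, n))
indegrees = np.asarray(A.sum(axis=0)).ravel()  # column sums = in-degrees
local_push_simrank(A, indegrees, epsilon=0.01, data_name="toy")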
            metaStr.write(line)
            metaStr.write("\n")

        if metaStr.pos > self._size:
            raise se.MetadataOverflowError()

        # Clear out previous data - it is a volume, not a file
        metaStr.write('\0' * (self._size - metaStr.pos))

        data = metaStr.getvalue()
        with fileUtils.DirectFile(self.metavol, "r+d") as f:
            f.seek(self._offset)
            f.write(data)


LvBasedSDMetadata = lambda vg, lv: DictValidator(
    PersistentDict(LvMetadataRW(vg, lv, 0, SD_METADATA_SIZE)),
    BLOCK_SD_MD_FIELDS)
TagBasedSDMetadata = lambda vg: DictValidator(
    PersistentDict(VGTagMetadataRW(vg)),
    BLOCK_SD_MD_FIELDS)


def selectMetadata(sdUUID):
    # Prefer the LV-based metadata if it holds any data; otherwise fall
    # back to the VG-tag based provider.
    mdProvider = LvBasedSDMetadata(sdUUID, sd.METADATA)
    if len(mdProvider) > 0:
        metadata = mdProvider
    else:
        metadata = TagBasedSDMetadata(sdUUID)
    return metadata
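# Illustrative usage (not from the original module): both providers expose the
# same dict-like interface, so callers never care which backend was selected.
# The UUID is a placeholder and sd.DMDK_VERSION is assumed to be one of the
# standard domain metadata keys.
metadata = selectMetadata("00000000-0000-0000-0000-000000000000")  # placeholder
print(metadata[sd.DMDK_VERSION])  # e.g. the domain format version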
                line = line.encode('utf-8')
            metadata[i] = line

        metadata = [i + '\n' for i in metadata]
        tmpFilePath = self._metafile + ".new"
        try:
            self._oop.writeLines(tmpFilePath, metadata)
        except IOError as e:
            if e.errno != errno.ESTALE:
                raise
            # Retry once on a stale NFS file handle
            self._oop.writeLines(tmpFilePath, metadata)
        self._oop.os.rename(tmpFilePath, self._metafile)


FileSDMetadata = lambda metafile: DictValidator(
    PersistentDict(FileMetadataRW(metafile)),
    FILE_SD_MD_FIELDS)


class FileStorageDomain(sd.StorageDomain):
    def __init__(self, domainPath):
        # Using glob might look like the simplest thing to do but it isn't
        # If one of the mounts is stuck it'll cause the entire glob to fail
        # and you wouldn't be able to access any domain
        self.log.debug("Reading domain in path %s", domainPath)
        self.mountpoint = os.path.dirname(domainPath)
        self.remotePath = os.path.basename(self.mountpoint)
        self.metafile = os.path.join(domainPath, sd.DOMAIN_META_DATA,
                                     sd.METADATA)
        sdUUID = os.path.basename(domainPath)
        validateFileSystemFeatures(sdUUID, self.mountpoint)
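# Illustrative usage (not from the original module): wrap a metadata file in
# the validated persistent dict. The path is a placeholder, and sd.DMDK_SDUUID
# is assumed to be one of the standard domain metadata keys.
md = FileSDMetadata("/rhev/data-center/mnt/example/dom_md/metadata")  # placeholder path
print(md[sd.DMDK_SDUUID])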
metaStr.write("\n") if metaStr.pos > self._size: raise se.MetadataOverflowError() # Clear out previous data - it is a volume, not a file metaStr.write('\0' * (self._size - metaStr.pos)) data = metaStr.getvalue() with fileUtils.DirectFile(self.metavol, "r+d") as f: f.seek(self._offset) f.write(data) LvBasedSDMetadata = lambda vg, lv: DictValidator( PersistentDict(LvMetadataRW(vg, lv, 0, SD_METADATA_SIZE)), BLOCK_SD_MD_FIELDS) TagBasedSDMetadata = lambda vg: DictValidator( PersistentDict(VGTagMetadataRW(vg)), BLOCK_SD_MD_FIELDS) def selectMetadata(sdUUID): mdProvider = LvBasedSDMetadata(sdUUID, sd.METADATA) if len(mdProvider) > 0: metadata = mdProvider else: metadata = TagBasedSDMetadata(sdUUID) return metadata def metadataValidity(vg):