def create(self):
    """Creates new database"""
    f = self.file
    head = util.read_meta(f, "hDatabases")
    #Generate ID
    ID = util.read_meta(f, "cDatabases") + 1
    util.write_meta(f, "cDatabases", ID)
    db = self.find_last()
    bdata = struct.pack("=I32sQQ", ID, self.name, db.addr, 0)
    f.seek(0, 2)
    self.addr = f.tell()
    #Write data
    f.write(bdata)
    #Bind tail
    util.write_meta(f, "tDatabases", self.addr)
    #Bind previous node ("next" is the 8-byte field at offset 44 of the node)
    db.next = self.addr
    f.seek(db.addr + 44)
    bdata = struct.pack("=Q", db.next)
    f.write(bdata)
    return Result(0, "Database created")
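# Editor's sketch (not part of the original code): each node written above uses
# struct format "=I32sQQ" -> 4 bytes id + 32 bytes name + 8 bytes prev + 8 bytes
# next = 52 bytes, so the "next" field sits at byte offset 4 + 32 + 8 = 44. That
# is why create() seeks to db.addr + 44 and writes a full 8-byte "=Q" value:
#
#     import struct
#     struct.calcsize("=I32sQQ")  # 52 -> full node size read by the methods below
#     struct.calcsize("=I32sQ")   # 44 -> byte offset of the "next" pointer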
def find_last(self):
    """ Returns last database """
    f = self.file
    # tDatabases holds the address of the last node (the list tail)
    tail = util.read_meta(f, "tDatabases")
    db = Database(f)
    f.seek(tail)
    db.addr = f.tell()
    bdata = f.read(52)
    db.id, db.name, db.prev, db.next = struct.unpack("=I32sQQ", bdata)
    db.name = util.trim(db.name)
    return db
def print_databases(self):
    """Show all databases"""
    f = self.file
    head = util.read_meta(f, "hDatabases")
    db = Database(f, "_default")
    f.seek(head)
    db.addr = f.tell()
    bdata = f.read(52)
    db.id, db.name, db.prev, db.next = struct.unpack("=I32sQQ", bdata)
    db.name = util.trim(db.name)
    print db
    while db.next != 0:
        f.seek(db.next)
        db.addr = f.tell()
        bdata = f.read(52)
        db.id, db.name, db.prev, db.next = struct.unpack("=I32sQQ", bdata)
        db.name = util.trim(db.name)
        print db
def exists(self):
    """Checks for database existence"""
    f = self.file
    head = util.read_meta(f, "hDatabases")
    db = Database(f, "_default")
    f.seek(head)
    db.addr = f.tell()
    bdata = f.read(52)
    db.id, db.name, db.prev, db.next = struct.unpack("=I32sQQ", bdata)
    db.name = util.trim(db.name)
    while db.next != 0:
        f.seek(db.next)
        db.addr = f.tell()
        bdata = f.read(52)
        db.id, db.name, db.prev, db.next = struct.unpack("=I32sQQ", bdata)
        db.name = util.trim(db.name)
        if db.name == self.name:
            return True
    return False
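# Editor's sketch: find_last(), print_databases() and exists() all repeat the
# same seek / read(52) / unpack sequence. A shared helper along these lines is
# one possible refactor (hypothetical; _read_node is not part of the original
# class):
#
#     def _read_node(self, f, addr):
#         """Read the 52-byte node at addr into a Database object."""
#         db = Database(f, "_default")
#         f.seek(addr)
#         db.addr = f.tell()
#         db.id, db.name, db.prev, db.next = struct.unpack("=I32sQQ", f.read(52))
#         db.name = util.trim(db.name)
#         return db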
_ = [os.remove(f) for f in files]
print('finish filtering')

#%%
## copy and clear useless data
data_path = os.path.join(
    'd:/usr-profiles/chuang/Desktop/Dev/textmining/2_imf_docs/1_use_xmls',
    'xml', '002')
dest_path = os.path.join(
    'd:/usr-profiles/chuang/Desktop/Dev/textmining/2_imf_docs/1_use_xmls',
    'process_search_docs', 'data', '002')
copy = False
dump = True
ids, meta = read_meta('staff_reports_meta.csv')

#%%
## copy xml to data folder
if copy:
    copy_files(data_path, dest_path)

#%%
## keep only staff reports
xmls = os.listdir(dest_path)
xmls = [f for f in xmls if get_ids(f)[1] in ids]

#%%
## dump xmls to pickle
if dump:
    doc_list = list()
from util import read_meta, read_vertex
import sys

dataset = 'web-Stanford.txt'
with open(dataset, 'r') as f:
    n_vertex, _ = read_meta(f)

path = sys.argv[1]
n = int(sys.argv[2])
# report progress roughly every 1% of samples; keep the interval >= 1 so the
# modulo below never divides by zero when n < 100
interval = max(n // 100, 1)

vertex = set()
count = 0
for i in range(n):
    if i % interval == 0:
        print("{}:\t {} / {}\t{}".format(
            i, len(vertex), n_vertex, len(vertex) / float(n_vertex)))
    fname = path + "/sample_{}.txt".format(i)
    with open(fname, 'r') as f:
        n_sampled, _ = read_meta(f)
        # only show largest CC
        if n_sampled < n_vertex * 0.1:
            count += 1
            print(count)
            continue
        v_sampled, _ = read_vertex(f, n_sampled)
        vertex = vertex.union(set(v_sampled))
def main(args):
    # print interval
    if args.method == "bfs":
        interval = 1
    else:
        interval = args.samples // 10
        if interval <= 0:
            interval = 1
    output = "{}_{}_{}.txt".format(args.output,
                                   args.shuffle and "shuffle" or "no_shuffle",
                                   args.percent)
    print(output)
    d = args.damping_factor

    # load full graph or meta information about full graph
    if args.method == "uniform":
        from sampler import uniform_sampling
        from util import load_full_graph
        from scipy.sparse.csr import csr_matrix as csr
        n_vertex, full_ind, full_val = load_full_graph(args.dataset,
                                                       sort_indices=False)
        full_row, full_col = zip(*full_ind)
        full_M = csr((full_val, (full_row, full_col)),
                     shape=(n_vertex, n_vertex))
    elif args.method == "edge":
        from util import read_meta, read_indices
        from sampler import edge_sampling
        with open(args.dataset) as f:
            n_vertex, n_edge = read_meta(f)
            full_indices = read_indices(f, n_edge)
        full_indices = np.array(full_indices)
    else:
        from util import read_meta, load_sampled_graph
        # read # of total vertex
        with open(args.dataset, 'r') as f:
            n_vertex, _ = read_meta(f)

    percent = int(args.percent) / 100.0

    # global page rank value array
    global_pr = np.array([[1.0 / n_vertex]] * n_vertex, dtype=np.float32)

    # define dataflow graph
    with tf.device('/device:GPU:0'):
        n_sampled = tf.placeholder(tf.int32)
        local_pr = tf.placeholder(tf.float32)
        indices = tf.placeholder(tf.int64)
        values = tf.placeholder(tf.float32)
        dense_shape = tf.placeholder(tf.int64)
        m = tf.SparseTensor(indices=indices, values=values,
                            dense_shape=dense_shape)
        new_pr = d * tf.sparse_tensor_dense_matmul(m, local_pr) \
            + tf.reduce_sum(local_pr) * (1 - d) / tf.to_float(n_sampled)

    sess = tf.Session(config=tf.ConfigProto(log_device_placement=False))

    # sample traversal order (materialized as a list so it can be shuffled)
    sample_order = list(range(args.samples))

    for epoch in range(args.epochs):
        print("epoch {} / {}".format(epoch, args.epochs))
        pr_buffer = global_pr.copy()

        # shuffle samples if needed
        if args.shuffle:
            random.shuffle(sample_order)

        count = 0
        for sample_idx in sample_order:
            print("epoch {} sample {} {}".format(epoch, count, sample_idx))

            # load or on-the-fly sample one subgraph
            if args.method == "uniform":
                ver, ind, val = uniform_sampling(full_M, n_vertex, percent,
                                                 sort=False)
            elif args.method == "edge":
                ver, ind, val = edge_sampling(full_indices, percent)
            else:
                ver, ind, val = load_sampled_graph(
                    "samples{}/sample_{}.txt".format(args.percent, sample_idx))
            nver = len(ver)

            # run one iteration
            pr_value = sess.run(new_pr, feed_dict={
                local_pr: pr_buffer[ver],
                indices: np.array(ind, np.int64),
                values: val,
                dense_shape: np.array([nver, nver], np.int64),
                n_sampled: nver
            })

            # scatter update local buffer
            pr_buffer[ver] = pr_value

            # write out current pr
            if args.method == "bfs" and count % interval == 0:
                dump(output, pr_buffer, epoch, count, sample_idx)
                print(np.sum(global_pr))

            # explicitly remove reference to release memory
            del ver, ind, val
            count += 1
        # endfor sample

        norm = np.abs(pr_buffer - global_pr) / global_pr
        norm = np.sort(np.reshape(norm, -1))
        norm99 = norm[int(0.99 * n_vertex)]
        norm50 = norm[int(0.5 * n_vertex)]
        dump("{}_{}".format(output, epoch), pr_buffer, epoch, count,
             norm50, norm99)
        global_pr = pr_buffer
        print("norm: p50 {}, p99 {}".format(norm50, norm99))
        if norm99 < 1e-4:
            break
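# Editor's sketch: the TensorFlow graph in main() evaluates one damped PageRank
# step on a sampled subgraph, new_pr = d * M . pr + (1 - d) * sum(pr) / n.
# A minimal NumPy illustration of the same update (standalone toy example;
# pagerank_step and the 2-node matrix below are assumptions, not project code):
#
#     import numpy as np
#
#     def pagerank_step(M, pr, d=0.85):
#         """One iteration: follow links with prob. d, teleport with prob. 1 - d."""
#         n = pr.shape[0]
#         return d * (M @ pr) + (1.0 - d) * pr.sum() / n
#
#     M = np.array([[0.0, 0.5],
#                   [1.0, 0.5]], dtype=np.float32)   # column-stochastic toy graph
#     pr = np.full((2, 1), 0.5, dtype=np.float32)    # uniform initial rank
#     pr = pagerank_step(M, pr)                      # -> [[0.2875], [0.7125]]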