Example #1
	def create(self):
		"""Creates new database"""

		f = self.file
		head = util.read_meta(f,"hDatabases")
		#Generate ID
		ID = util.read_meta(f,"cDatabases")+1
		util.write_meta(f,"cDatabases",ID)

		db = self.find_last()
		bdata = struct.pack("=I32sQQ",ID,self.name,db.addr,0)
		f.seek(0,2)
		self.addr = f.tell()

		#Write data 
		f.write(bdata)

		#Bind tail
		util.write_meta(f,"tDatabases",self.addr)

		#Bind previous node
		db.next = self.addr
		# 'next' lives at offset 4 (id) + 32 (name) + 8 (prev) = 44 within the record
		f.seek(db.addr+44)
		bdata = struct.pack("=Q",db.next)
		f.write(bdata)

		return Result(0,"Database created")
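create() leans on util.read_meta and util.write_meta, which are not shown in these examples. A purely illustrative sketch of what they could look like, assuming the file starts with a fixed table of named 8-byte slots (the slot offsets below are made up, not the real layout):

import struct

META_SLOTS = {"cDatabases": 0, "hDatabases": 8, "tDatabases": 16}  # hypothetical layout

def read_meta(f, key):
	f.seek(META_SLOTS[key])
	return struct.unpack("=Q", f.read(8))[0]

def write_meta(f, key, value):
	f.seek(META_SLOTS[key])
	f.write(struct.pack("=Q", value))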
Example #2
	def find_last(self):
		"""
			Returns last database
		"""
		f = self.file
		tail = util.read_meta(f,"tDatabases")

		db = Database(f)
		f.seek(tail)
		db.addr = f.tell()
		bdata = f.read(52)
		db.id, db.name, db.prev, db.next = struct.unpack("=I32sQQ",bdata)
		db.name = util.trim(db.name)
		return db			
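Each record is a fixed 52-byte node in a doubly linked list: a 4-byte id, a 32-byte name, and two 8-byte file offsets (prev and next). A quick sanity check of the layout used above:

import struct

assert struct.calcsize("=I32sQQ") == 52  # id (I) + name (32s) + prev (Q) + next (Q)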
Example #3
	def print_databases(self):
		"""Show all databases"""
		f = self.file
		head = util.read_meta(f,"hDatabases")

		db = Database(f,"_default")
		f.seek(head)
		db.addr = f.tell()
		bdata = f.read(52)
		db.id, db.name, db.prev, db.next = struct.unpack("=I32sQQ",bdata)
		db.name = util.trim(db.name)
		print(db)
		while db.next != 0:
			f.seek(db.next)
			db.addr = f.tell()
			bdata = f.read(52)
			db.id, db.name, db.prev, db.next = struct.unpack("=I32sQQ",bdata)
			db.name = util.trim(db.name)
			print(db)
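The same list walk is repeated in print_databases and in exists below. One possible refactor (a sketch only, using the same util helpers and record layout as above) is a generator that yields each record, so callers can simply iterate:

	def iter_databases(self):
		"""Yield every database record, starting from the list head (hypothetical helper)."""
		f = self.file
		addr = util.read_meta(f, "hDatabases")
		while addr != 0:
			db = Database(f, "_default")
			f.seek(addr)
			db.addr = addr
			db.id, db.name, db.prev, db.next = struct.unpack("=I32sQQ", f.read(52))
			db.name = util.trim(db.name)
			yield db
			addr = db.next

With this helper, print_databases reduces to a single loop: for db in self.iter_databases(): print(db).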
Example #4
	def exists(self):
		"""Checks for database existence"""

		f = self.file
		head = util.read_meta(f,"hDatabases")
		

		db = Database(f,"_default")
		f.seek(head)
		db.addr = f.tell()
		bdata = f.read(52)
		db.id, db.name, db.prev, db.next = struct.unpack("=I32sQQ",bdata)
		db.name = util.trim(db.name)

		# Walk the rest of the list; note the head record read above is never compared
		while db.next != 0:
			f.seek(db.next)
			db.addr = f.tell()
			bdata = f.read(52)
			db.id, db.name, db.prev, db.next = struct.unpack("=I32sQQ",bdata)
			db.name = util.trim(db.name)
			if db.name == self.name:
				return True
		return False
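A short usage sketch tying the methods together, assuming the Database(file, name) constructor used above and a hypothetical storage file name:

with open("storage.bin", "r+b") as f:
	db = Database(f, "inventory")
	if not db.exists():
		db.create()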
Example #5
    for f in files:
        os.remove(f)
    print('finish filtering')


#%%
## copy and clear useless data

data_path = os.path.join(
    'd:/usr-profiles/chuang/Desktop/Dev/textmining/2_imf_docs/1_use_xmls',
    'xml', '002')
dest_path = os.path.join(
    'd:/usr-profiles/chuang/Desktop/Dev/textmining/2_imf_docs/1_use_xmls',
    'process_search_docs', 'data', '002')
copy = False
dump = True
ids, meta = read_meta('staff_reports_meta.csv')

#%%
## copy xml to data folder
if copy:
    copy_files(data_path, dest_path)
#%%
## keep only staff reports
xmls = os.listdir(dest_path)
xmls = [f for f in xmls if get_ids(f)[1] in ids]

#%%
## dump xmls to pickle

if dump:
    doc_list = list()
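The dump cell is cut off in the source. A purely hypothetical sketch of how it might continue (the output file name and the decision to pickle the raw XML strings are assumptions, not the original code):

import pickle

if dump:
    doc_list = list()
    for fname in xmls:
        with open(os.path.join(dest_path, fname), encoding='utf-8') as fp:
            doc_list.append(fp.read())
    with open('staff_reports_docs.pkl', 'wb') as fp:
        pickle.dump(doc_list, fp)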
Example #6
from util import read_meta, read_vertex
import sys

dataset = 'web-Stanford.txt'

with open(dataset, 'r') as f:
    n_vertex, _ = read_meta(f)

path = sys.argv[1]
n = int(sys.argv[2])

interval = max(n // 100, 1)  # report progress roughly every 1% of samples

vertex = set()

count = 0
for i in range(n):
    if i % interval == 0:
        print("{}:\t {} / {}\t{}".format(i, len(vertex), n_vertex,
                                         len(vertex) / float(n_vertex)))
    fname = path + "/sample_{}.txt".format(i)
    with open(fname, 'r') as f:
        n_sampled, _ = read_meta(f)
        # only show largest CC
        if n_sampled < n_vertex * 0.1:
            count += 1
            print(count)
            continue
        v_sampled, _ = read_vertex(f, n_sampled)
    vertex = vertex.union(set(v_sampled))
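read_meta and read_vertex come from the project's util module, which is not shown here. A sketch of the interface the loop relies on, assuming each file begins with a plain "n_vertex n_edge" header followed by one vertex id per line (the actual file format is an assumption):

def read_meta(f):
    # first line: "<number of vertices> <number of edges>"
    n_vertex, n_edge = map(int, f.readline().split())
    return n_vertex, n_edge

def read_vertex(f, n_vertex):
    # next n_vertex lines: one vertex id per line (second return value unused above)
    return [int(f.readline()) for _ in range(n_vertex)], None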
Example #7
import random

import numpy as np
import tensorflow as tf


def main(args):
    # print interval
    if args.method == "bfs":
        interval = 1
    else:
        interval = max(args.samples // 10, 1)
    output = "{}_{}_{}.txt".format(args.output, args.shuffle and "shuffle"
                                   or "no_shuffle", args.percent)
    print(output)
    d = args.damping_factor

    # load full graph or meta information about full graph
    if args.method == "uniform":
        from sampler import uniform_sampling
        from util import load_full_graph
        from scipy.sparse import csr_matrix as csr
        n_vertex, full_ind, full_val = load_full_graph(args.dataset,
                                                       sort_indices=False)
        full_row, full_col = zip(*full_ind)
        full_M = csr((full_val, (full_row, full_col)),
                     shape=(n_vertex, n_vertex))
    elif args.method == "edge":
        from util import read_meta, read_indices
        from sampler import edge_sampling
        with open(args.dataset) as f:
            n_vertex, n_edge = read_meta(f)
            full_indices = read_indices(f, n_edge)
            full_indices = np.array(full_indices)
    else:
        from util import read_meta, load_sampled_graph
        # read # of total vertex
        with open(args.dataset, 'r') as f:
            n_vertex, _ = read_meta(f)

    percent = int(args.percent) / 100.0

    # global page rank value array
    global_pr = np.array([[1.0 / n_vertex]] * n_vertex, dtype=np.float32)

    # define dataflow graph
    with tf.device('/device:GPU:0'):
        n_sampled = tf.placeholder(tf.int32)
        local_pr = tf.placeholder(tf.float32)
        indices = tf.placeholder(tf.int64)
        values = tf.placeholder(tf.float32)
        dense_shape = tf.placeholder(tf.int64)
        m = tf.SparseTensor(indices=indices,
                            values=values,
                            dense_shape=dense_shape)
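        # One damped power-iteration step on the sampled subgraph:
        #   new_pr = d * M @ pr + (1 - d) * sum(pr) / n_sampled
        # i.e. the sparse matrix-vector product plus a uniform teleport term
        # whose mass is taken from the probability currently inside the sample.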
        new_pr = d * tf.sparse_tensor_dense_matmul(m, local_pr) \
                + tf.reduce_sum(local_pr) * (1 - d) / tf.to_float(n_sampled)
        sess = tf.Session(config=tf.ConfigProto(log_device_placement=False))

    # sample traversal order
    sample_order = list(range(args.samples))

    for epoch in range(args.epochs):
        print("epoch {} / {}".format(epoch, args.epochs))

        pr_buffer = global_pr.copy()

        # shuffle samples if needed
        if args.shuffle:
            random.shuffle(sample_order)

        count = 0
        for sample_idx in sample_order:
            print("epoch {} sample {} {}".format(epoch, count, sample_idx))

            # load or on-the-fly sample one subgraph
            if args.method == "uniform":
                ver, ind, val = uniform_sampling(full_M,
                                                 n_vertex,
                                                 percent,
                                                 sort=False)
            elif args.method == "edge":
                ver, ind, val = edge_sampling(full_indices, percent)
            else:
                ver, ind, val = load_sampled_graph(
                    "samples{}/sample_{}.txt".format(args.percent, sample_idx))

            nver = len(ver)
            # run one iteration
            pr_value = sess.run(new_pr,
                                feed_dict={
                                    local_pr: pr_buffer[ver],
                                    indices: np.array(ind, np.int64),
                                    values: val,
                                    dense_shape: np.array([nver, nver],
                                                          np.int64),
                                    n_sampled: nver
                                })

            # scatter update local buffer
            pr_buffer[ver] = pr_value

            # write out current pr
            if args.method == "bfs" and count % interval == 0:
                dump(output, pr_buffer, epoch, count, sample_idx)

            print(np.sum(global_pr))

            # explicitly remove reference to release memory
            del ver, ind, val

            count += 1

        # endfor sample

        norm = np.abs(pr_buffer - global_pr) / global_pr
        norm = np.sort(np.reshape(norm, -1))
        norm99 = norm[int(0.99 * n_vertex)]
        norm50 = norm[int(0.5 * n_vertex)]
        dump("{}_{}".format(output, epoch), pr_buffer, epoch, count, norm50,
             norm99)
        global_pr = pr_buffer
        print("norm: p50 {}, p99 {}".format(norm50, norm99))
        if norm99 < 1e-4:
            break
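main() expects an argparse-style namespace. A hypothetical entry point wiring up the attributes the function actually reads (argument names are taken from the code above; the defaults are guesses):

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset", default="web-Stanford.txt")
    parser.add_argument("--method", choices=["bfs", "uniform", "edge"], default="bfs")
    parser.add_argument("--samples", type=int, default=100)
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--percent", default="25")
    parser.add_argument("--damping_factor", type=float, default=0.85)
    parser.add_argument("--shuffle", action="store_true")
    parser.add_argument("--output", default="pagerank")
    main(parser.parse_args())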