import math
import os
import random
from multiprocessing import Pipe, Process

from scipy import stats

# Project-local modules; the import paths below are assumptions inferred from the
# names used in this file (Store, PWL / pwl, macro, GenFE, tbl_sql, delete_sql,
# count_size) and may need to be adjusted to the actual package layout.
import macro
import pwl
from pwl import PWL
from store import Store
from gen_fe import GenFE
from schema import tbl_sql, delete_sql, count_size


class GenDDR(object):
    ''' src is dynamic, and tgt is dynamic '''

    def __init__(self, db, dic, parent_conn, src_dic, ent_size, tgt_dic=None, mode='local'):
        '''
        db         : db name
        dic        : the relation dict of the config
        parent_conn: server conn
            1, rel data
            2, src nodes: n
            3, tgt nodes: m
        src_dic : src node config dict
        ent_size: the number of nodes of each entity
        tgt_dic : tgt node config dict;
                  if None, then tgt_dic == src_dic, e.g. user -> user
        mode : 'local' (default), dump edges locally
             : 'u-level', dump edges and nodes in the parent process
        '''
        self.parent_conn = parent_conn
        self.rel_dict = dic
        self.db_name = db
        self.src_dic = src_dic
        self.tgt_dic = tgt_dic
        self.mode = mode
        self.ent_size = ent_size
        self.db_handler = Store(db)
        self.db_handler.connect()
        # parameters of distributions
        # parameters of power law distribution: 1 - 2
        self.convert = False
        if isinstance(dic['in']['lambd'], list):
            self.tau = dic['out']['lambd']
            self.tau1 = dic['in']['lambd'][0]
            self.tau2 = dic['in']['lambd'][1]
        else:
            self.tau = dic['in']['lambd']
            self.tau1 = dic['out']['lambd'][0]
            self.tau2 = dic['out']['lambd'][1]
            self.convert = True
        # some statistical info
        self.in_degree_nums = []    # [i] : the number of nodes with in degree i
        self.out_degree_nums = []   # [i] : the number of nodes with out degree i
        self.node_in_degree = []    # [i] : the in degree of node i
        self.node_out_degree = []   # [i] : the out degree of node i
        self.in_degree_nodes = []   # [i] : the nodes with in degree i
        self.out_degree_nodes = []  # [i] : the nodes with out degree i
        # generate index and nums info
        self.src_l_index = 0
        self.src_r_index = 1
        self.tgt_l_index = 0
        self.tgt_r_index = 1
        self.src_fake_cnt = 0
        self.tgt_fake_cnt = 0
        # compress degree list, to accelerate
        self.step = 1  # TODO: read the compress step from the config
        self.pwl_instance = PWL(0, self.tau, self.tau1, self.tau2)
        # for community
        self.mu_parameter = 0.4  # threshold used to decide community membership
        self.ncomunty = 10       # community ids: 0, 1, ...
        self.comunty_nodes = [set() for _ in range(self.ncomunty)]  # community id -> set of nodes
        # node ids start from 1
        self.node_comunty_n_indegree = [[]]   # [i] : [d1, ..., dn], per-community in degrees
        self.node_comunty_n_outdegree = [[]]  # [i] : [d1, ..., dn], per-community out degrees
        self.node_comunty_ident = [set()]     # [i] : the communities node i belongs to
        # self.node_comunty_in_ident = [-1]   # the community node i belongs to, in degree
        # self.node_comunty_out_ident = [-1]  # ditto, out degree

    def gen_node(self, one_gen):
        if one_gen['type'] == 'gaussian':
            mu = one_gen['mu']
            si = one_gen['sigma']
            gen_n = int(random.gauss(mu, si))
        elif one_gen['type'] == 'uniform':
            mi = one_gen['min']
            mx = one_gen['max']
            gen_n = random.randint(mi, mx)
        else:
            raise ValueError('unknown stage type: %r' % one_gen['type'])
        return gen_n
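    # Example 'stage' entries consumed by gen_node (shape inferred from the code
    # above; the numbers are illustrative assumptions, not shipped defaults):
    #   {'type': 'gaussian', 'mu': 80, 'sigma': 10}  -> batch size ~ N(80, 10)
    #   {'type': 'uniform',  'min': 50, 'max': 100}  -> batch size ~ U[50, 100]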
    def start(self):
        ''' generate nodes and edges '''
        try:
            rel_name = self.rel_dict['alias']
            src_name = self.rel_dict['source']
            tgt_name = self.rel_dict['target']
            if self.tgt_dic is None:
                # e.g. user -> user
                # src_ent_gen = GenFE(self.src_dic, self.ent_size, self.db_name)
                stage_len = len(self.src_dic['stage'])
                stage_idx = 0
                while self.src_r_index < self.src_dic['ceiling']:
                    one_gen = self.src_dic['stage'][stage_idx]
                    stage_idx = (stage_idx + 1) % stage_len
                    gen_n = self.gen_node(one_gen)
                    # src_ent_gen.start(gen_n)
                    self.src_r_index += gen_n
                    self.parent_conn.send((rel_name, src_name, macro.ADD_NODE, self.src_r_index))
                    p1 = [self.src_l_index, self.src_r_index]
                    p2 = p1[:]
                    self.gen_data(p1, p2)
                    # res = self.gen_data(p1, p2)
                    # if res[0] == 1:
                    #     self.src_r_index += (res[1] + res[2])
                    # self.src_l_index = self.src_r_index
                self.parent_conn.send((rel_name, src_name, macro.FINISH_NODE, self.src_r_index))
            else:
                # e.g. user -> status
                # src_ent_gen = GenFE(self.src_dic, self.ent_size, self.db_name)
                # tgt_ent_gen = GenFE(self.tgt_dic, self.ent_size, self.db_name)
                src_stg_len = len(self.src_dic['stage'])
                tgt_stg_len = len(self.tgt_dic['stage'])
                src_stg_idx = 0
                tgt_stg_idx = 0
                # TODO: when should it stop
                while (self.src_r_index < self.src_dic['ceiling']
                        or self.tgt_r_index < self.tgt_dic['ceiling']):
                    one_src_gen = self.src_dic['stage'][src_stg_idx]
                    one_tgt_gen = self.tgt_dic['stage'][tgt_stg_idx]
                    src_stg_idx = (src_stg_idx + 1) % src_stg_len
                    tgt_stg_idx = (tgt_stg_idx + 1) % tgt_stg_len
                    gen_src_n = self.gen_node(one_src_gen)
                    gen_tgt_n = self.gen_node(one_tgt_gen)
                    # src_ent_gen.start(gen_src_n)
                    # tgt_ent_gen.start(gen_tgt_n)
                    self.src_r_index += gen_src_n
                    self.tgt_r_index += gen_tgt_n
                    # TODO: give index to parent process
                    self.parent_conn.send((rel_name, src_name, macro.ADD_NODE, self.src_r_index))
                    self.parent_conn.send((rel_name, tgt_name, macro.ADD_NODE, self.tgt_r_index))
                    p1 = [self.src_l_index, self.src_r_index]
                    p2 = [self.tgt_l_index, self.tgt_r_index]
                    self.gen_data(p1, p2)
                    # res = self.gen_data(p1, p2)
                    # if res[0] == 1:
                    #     self.src_r_index += res[1]
                    #     self.src_l_index = self.src_r_index
                    #     self.tgt_r_index += res[2]
                    #     self.tgt_l_index = self.tgt_r_index
                self.parent_conn.send((rel_name, src_name, macro.FINISH_NODE, self.src_r_index))
                self.parent_conn.send((rel_name, tgt_name, macro.FINISH_NODE, self.tgt_r_index))
        except Exception as e:
            raise e
        finally:
            self.parent_conn.send((rel_name, '', macro.FINISH_EDGE, ''))
            self.db_handler.close()
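    # Pipe protocol used by start()/gen_data() (mirrored in TestMain.main_proc):
    # every message is a 4-tuple (rel_name, ent_name or '', sign, payload), where
    # sign is macro.ADD_NODE / macro.FINISH_NODE (payload: current max node index),
    # macro.ADD_EDGE (payload: [(src_id, tgt_id), ...]), or macro.FINISH_EDGE.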
    def build_vector(self, a, b, mx_dgre, dlt_dgre, dgre_nodes):
        '''
        a, b      : generate interval [a, b)
        mx_dgre   : max degree
        dlt_dgre  : delta_in_degree or delta_out_degree
        dgre_nodes: in_degree_nodes or out_degree_nodes
        return: (0/1, extra_nodes, rtn_vector)
            0: no need to add edges; 1: the opposite
        '''
        extra_nodes = 0
        minus = 0
        for i in range(len(dlt_dgre)):
            if dlt_dgre[i] < 0:
                dlt_dgre[i] = 0
                minus += 1
            elif dlt_dgre[i] == 0:
                minus += 1
        if minus > (len(dlt_dgre) / 3):  # TODO: set parameter
            return (0, extra_nodes, [])
        # compute new nodes
        end_i = b
        new = [i for i in range(a, b)]
        random.shuffle(new)
        # compute compressed degree list
        cmprs_dgre = []
        ii = 0
        for i in range(int(math.floor(mx_dgre / self.step))):
            s = 0
            for j in range(i * self.step, i * self.step + self.step):
                s += dlt_dgre[j]
            cmprs_dgre.append(s)
            ii = i
        s = 0
        for j in range(ii * self.step + self.step, mx_dgre):
            s += dlt_dgre[j]
        if s != 0:
            cmprs_dgre.append(s)
        # compute rtn vector
        rtn_vector = []
        if len(dgre_nodes) == 0:
            index = 0
            repeat = 1
            for num in dlt_dgre:
                rtn_vector += [x for x in new[index:index + num] for j in range(repeat)]
                index += num
                repeat += 1
        else:
            len_comp = len(cmprs_dgre)
            len_inot = len(dgre_nodes)
            delta_len = len_comp - len_inot
            # crucial part: adjust the compressed degree list
            if delta_len <= 0:
                for i in range(len_comp - 2, -1, -1):
                    cmprs_dgre[i] += cmprs_dgre[i + 1]
            else:
                if delta_len >= len_inot:
                    for i in range(delta_len, len_comp):
                        cmprs_dgre[i - delta_len] += cmprs_dgre[i]
                else:
                    for i in range(len_comp - delta_len, len_comp):
                        for j in range(i - delta_len, -1, (-delta_len)):
                            cmprs_dgre[j] += cmprs_dgre[j + delta_len]
            # add content to rtn vector
            if delta_len <= 0:
                more = max(0, cmprs_dgre[0] - len(new))
                new += [i for i in range(end_i, end_i + more)]
                extra_nodes += more
                end_i += more
                # select nodes from new nodes
                new_i = 0
                rtn_vector += new[new_i:new_i + cmprs_dgre[0]] * self.step
                new_i += cmprs_dgre[0]
                # select nodes from existing nodes
                j = 0
                for i in range(1, len(cmprs_dgre)):
                    comp_size = cmprs_dgre[i] - len(dgre_nodes[j])
                    rtn_vector += dgre_nodes[j][:cmprs_dgre[i]] * self.step
                    if comp_size > 0:
                        more = max(comp_size - (end_i - new_i), 0)
                        new += [i for i in range(end_i, end_i + more)]
                        extra_nodes += more
                        end_i += more
                        rtn_vector += new[new_i:new_i + comp_size] * (self.step * (i + 1))
                        new_i += comp_size
                    j += 1
            else:
                more = max(0, sum(cmprs_dgre[:delta_len]) - len(new))
                new += [i for i in range(end_i, end_i + more)]
                extra_nodes += more
                end_i += more
                new_i = 0
                for i in range(delta_len):
                    rtn_vector += new[new_i:new_i + cmprs_dgre[i]] * (self.step * (i + 1))
                    new_i += cmprs_dgre[i]
                j = 0
                for i in range(delta_len, len(cmprs_dgre)):
                    comp_size = cmprs_dgre[i] - len(dgre_nodes[j])
                    rtn_vector += dgre_nodes[j][:cmprs_dgre[i]] * (self.step * delta_len)
                    if comp_size > 0:
                        more = max(comp_size - (end_i - new_i), 0)
                        new += [i for i in range(end_i, end_i + more)]
                        extra_nodes += more
                        end_i += more
                        rtn_vector += new[new_i:new_i + comp_size] * (self.step * (i + 1))
                        new_i += comp_size
                    j += 1
        return (1, extra_nodes, rtn_vector)
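    # Worked example for build_vector (a minimal sketch, step == 1 and no existing
    # degree buckets): with a == 0, b == 3 and dlt_dgre == [2, 1], meaning two
    # nodes should reach degree 1 and one node degree 2, the returned vector is a
    # permutation like [n0, n1, n2, n2]: each node id appears once per edge
    # endpoint it still has to receive.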
    def add_edge_info(self, src_id, tgt_id, out_i, in_i):
        sql = 'select * from %s where src_id = %s and tgt_id = %s' % (self.rel_dict['alias'], src_id, tgt_id)
        res = self.db_handler.query(sql)
        if len(res) == 0:
            # deal with sth about out degree
            for i in range(len(self.node_out_degree), out_i + 1):
                self.node_out_degree.append(0)
            self.node_out_degree[out_i] += 1
            degreei = self.node_out_degree[out_i]
            ind = int(degreei / self.step)
            ind_ = int((degreei - 1) / self.step)
            if self.step == 1:
                ind -= 1
                ind_ -= 1
            for i in range(len(self.out_degree_nodes), ind + 1):
                self.out_degree_nodes.append([])
                self.out_degree_nums.append(0)
            if degreei == 1:
                self.out_degree_nodes[0].append(out_i)
                self.out_degree_nums[0] += 1
            elif degreei % self.step == 0:
                try:
                    self.out_degree_nodes[ind_].remove(out_i)
                    self.out_degree_nums[ind_] -= 1
                except Exception:
                    pass
                self.out_degree_nodes[ind].append(out_i)
                self.out_degree_nums[ind] += 1
            # deal with sth about in degree
            for i in range(len(self.node_in_degree), in_i + 1):
                self.node_in_degree.append(0)
            self.node_in_degree[in_i] += 1
            degreei = self.node_in_degree[in_i]
            ind = int(degreei / self.step)
            ind_ = int((degreei - 1) / self.step)
            if self.step == 1:
                ind -= 1
                ind_ -= 1
            for i in range(len(self.in_degree_nodes), ind + 1):
                self.in_degree_nodes.append([])
                self.in_degree_nums.append(0)
            if degreei == 1:
                self.in_degree_nodes[0].append(in_i)
                self.in_degree_nums[0] += 1
            elif degreei % self.step == 0:
                try:
                    self.in_degree_nodes[ind_].remove(in_i)
                    self.in_degree_nums[ind_] -= 1
                except Exception:
                    pass
                self.in_degree_nodes[ind].append(in_i)
                self.in_degree_nums[ind] += 1
            return 1
        else:
            return 0
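    # Degree-bucket sketch: with step == 2, a node of degree d lives in bucket
    # int(d / 2) (degree 1 is special-cased into bucket 0), and it only migrates
    # to a new bucket when its degree reaches a multiple of step; with step == 1
    # the indices are shifted so that bucket i holds the nodes of degree i + 1.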
    def gen_data(self, src_nodes, tgt_nodes):
        '''
        needs to send msgs to the parent process
        content: (flag, extra)
            flag : 0 means no need to add edges, 1 means edges have been added
            extra: when flag == 1, need to generate more nodes (extra)
        return: (flag, src_extra_nodes, tgt_extra_nodes)
            flag: 0, no edges generated
                  1, edges generated
            src_extra_nodes: generate extra_out more src nodes
            tgt_extra_nodes: ditto, extra_in
        '''
        lens = src_nodes[1] - src_nodes[0]
        lent = tgt_nodes[1] - tgt_nodes[0]
        self.src_fake_cnt += lens
        self.tgt_fake_cnt += lent
        rel_data = []
        if self.convert:
            # compute in vector
            self.pwl_instance.set_nodes(self.tgt_fake_cnt)
            self.pwl_instance.dtmn_max_degree()
            max_in_degree = self.pwl_instance.max_d1
            cur_in_degree = pwl.get_node_count(self.tgt_fake_cnt, max_in_degree, self.tau)
            delta = max_in_degree - len(self.in_degree_nums)
            last_in_degree = self.in_degree_nums[:] + [0] * delta
            delta_in_degree = [cur_in_degree[i] - last_in_degree[i] for i in range(len(cur_in_degree))]
            (flag, extra_in, in_vector) = self.build_vector(tgt_nodes[0], tgt_nodes[1],
                                                            max_in_degree, delta_in_degree,
                                                            self.in_degree_nodes)
            if flag == 0:
                self.src_fake_cnt -= lens
                self.tgt_fake_cnt -= lent
                return
                # return (0, 0, 0)
            # compute out vector
            self.pwl_instance.set_nodes(self.src_fake_cnt)
            self.pwl_instance.dtmn_max_degree()
            maxk1, maxk2, cur_out_degree = self.pwl_instance.dtmn_max_degree_2()
            delta = len(cur_out_degree) - len(self.out_degree_nums)
            last_out_degree = self.out_degree_nums[:] + [0] * delta
            delta_out_degree = [cur_out_degree[i] - last_out_degree[i] for i in range(len(cur_out_degree))]
            (flag, extra_out, out_vector) = self.build_vector(src_nodes[0], src_nodes[1],
                                                              maxk2, delta_out_degree,
                                                              self.out_degree_nodes)
            if flag == 0:
                self.src_fake_cnt -= lens
                self.tgt_fake_cnt -= lent
                return
                # return (0, 0, 0)
        else:
            # compute out vector
            self.pwl_instance.set_nodes(self.src_fake_cnt)
            self.pwl_instance.dtmn_max_degree()
            max_out_degree = self.pwl_instance.max_d1
            cur_out_degree = pwl.get_node_count(self.src_fake_cnt, max_out_degree, self.tau)
            delta = max_out_degree - len(self.out_degree_nums)
            last_out_degree = self.out_degree_nums[:] + [0] * delta
            delta_out_degree = [cur_out_degree[i] - last_out_degree[i] for i in range(len(cur_out_degree))]
            (flag, extra_out, out_vector) = self.build_vector(src_nodes[0], src_nodes[1],
                                                              max_out_degree, delta_out_degree,
                                                              self.out_degree_nodes)
            if flag == 0:
                self.src_fake_cnt -= lens
                self.tgt_fake_cnt -= lent
                return
                # return (0, 0, 0)
            # compute in vector
            self.pwl_instance.set_nodes(self.tgt_fake_cnt)
            self.pwl_instance.dtmn_max_degree()
            maxk1, maxk2, cur_in_degree = self.pwl_instance.dtmn_max_degree_2()
            delta = len(cur_in_degree) - len(self.in_degree_nums)
            last_in_degree = self.in_degree_nums[:] + [0] * delta
            delta_in_degree = [cur_in_degree[i] - last_in_degree[i] for i in range(len(cur_in_degree))]
            (flag, extra_in, in_vector) = self.build_vector(tgt_nodes[0], tgt_nodes[1],
                                                            maxk2, delta_in_degree,
                                                            self.in_degree_nodes)
            if flag == 0:
                self.src_fake_cnt -= lens
                self.tgt_fake_cnt -= lent
                return
                # return (0, 0, 0)
        # TODO: send extra data to parent process
        self.src_r_index += extra_out
        self.src_l_index = self.src_r_index
        self.parent_conn.send((self.rel_dict['alias'], self.rel_dict['source'],
                               macro.ADD_NODE, self.src_r_index))
        if self.tgt_dic is not None:
            self.tgt_r_index += extra_in
            self.tgt_l_index = self.tgt_r_index
            self.parent_conn.send((self.rel_dict['alias'], self.rel_dict['target'],
                                   macro.ADD_NODE, self.tgt_r_index))
        else:
            # TODO: correct ?
            self.src_r_index += extra_in
            self.src_l_index = self.src_r_index
            self.parent_conn.send((self.rel_dict['alias'], self.rel_dict['source'],
                                   macro.ADD_NODE, self.src_r_index))
        # end ?
        # generate edges
        in_remain = []
        out_remain = []
        random.shuffle(in_vector)
        random.shuffle(out_vector)
        upper = min(len(in_vector), len(out_vector))
        for i in range(upper):
            if in_vector[i] != out_vector[i]:
                # src_id = self.rel_dict['source'] + str(out_vector[i])
                # tgt_id = self.rel_dict['target'] + str(in_vector[i])
                src_id = out_vector[i]
                tgt_id = in_vector[i]
                if (src_id, tgt_id) in rel_data:
                    continue
                flg = self.add_edge_info(src_id, tgt_id, out_vector[i], in_vector[i])
                if flg == 1:
                    rel_data.append((src_id, tgt_id))
            else:
                out_remain.append(out_vector[i])
                in_remain.append(in_vector[i])
        while True:
            threshold = len(out_remain)
            remain_out = []
            remain_in = []
            random.shuffle(out_remain)
            random.shuffle(in_remain)
            for i in range(len(out_remain)):
                if out_remain[i] != in_remain[i]:
                    # src_id = self.rel_dict['source'] + str(out_remain[i])
                    # tgt_id = self.rel_dict['target'] + str(in_remain[i])
                    src_id = out_remain[i]
                    tgt_id = in_remain[i]
                    if (src_id, tgt_id) in rel_data:
                        continue
                    flg = self.add_edge_info(src_id, tgt_id, out_remain[i], in_remain[i])
                    if flg == 1:
                        rel_data.append((src_id, tgt_id))
                else:
                    remain_out.append(out_remain[i])
                    remain_in.append(in_remain[i])
            if len(remain_in) == 0 or len(remain_in) == threshold:
                break
            out_remain = remain_out
            in_remain = remain_in
        # generation end
        if self.mode == 'local':
            # dump data locally
            # TODO: correct ?
            sql = 'insert into %s values (?, ?)' % (self.rel_dict['alias'])
            self.db_handler.insert_many(sql, rel_data)
            print('[Edge Generation]: Name %s, Amount %s' % (self.rel_dict['alias'], len(rel_data)))
        elif self.mode == 'u-level':
            # send info to parent process
            self.parent_conn.send((self.rel_dict['alias'], '', macro.ADD_EDGE, rel_data))
        # return (1, extra_out, extra_in)
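    # Pairing sketch for gen_data above: in_vector and out_vector are multisets of
    # edge endpoint slots, so shuffling and zipping them realizes both degree
    # sequences at once; the retry loop then keeps re-shuffling only the leftover
    # slots (self loops and duplicates) until none remain or a full pass makes no
    # progress (len(remain_in) == threshold).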
    def homogeneous_with_uniform_comunity(self, out_vector, in_vector):
        last_old_node = self.src_r_index - 1
        new_nodes = max(max(out_vector), max(in_vector)) - last_old_node
        # [community id, remaining capacity]; every community gets the same quota
        tmp_comunty_size = [[_, math.ceil(new_nodes / self.ncomunty)] for _ in range(self.ncomunty)]
        # per-node, per-community degree counters; two independent lists are built
        # so the in and out counters do not alias the same inner rows
        node_comunty_in_degree = [[0 for _ in range(self.ncomunty)] for _ in range(new_nodes)]
        node_comunty_out_degree = [[0 for _ in range(self.ncomunty)] for _ in range(new_nodes)]
        node_comunty_ident = [set() for _ in range(new_nodes)]
        self.node_comunty_n_indegree.extend(node_comunty_in_degree)
        self.node_comunty_n_outdegree.extend(node_comunty_out_degree)
        # self.node_comunty_in_ident.extend(node_comunty_ident)
        # self.node_comunty_out_ident.extend(node_comunty_ident)
        self.node_comunty_ident.extend(node_comunty_ident)
        random.shuffle(out_vector)
        res_rel_data = []  # [(a, b), ...]
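        # Matching strategy: each iteration consumes one (out_node, in_node) pair.
        # If out_node has no community yet ("homeless"), try to attach it to one of
        # the emptiest communities through an in_node that already lives there
        # (new -> old); failing that, open the emptiest community with a fresh pair
        # (new -> new). If out_node already has communities, prefer pulling a
        # homeless in_node into one of them; otherwise just add the edge and
        # re-evaluate both endpoints' memberships against mu_parameter.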
        while len(out_vector) > 0 and len(in_vector) > 0:
            out_node = out_vector[0]
            if len(self.node_comunty_ident[out_node]) == 0:
                # homeless
                tmp_index = 0
                match_old = False
                sort_tmp_comunty_size = sorted(tmp_comunty_size, key=lambda x: x[1], reverse=True)
                while tmp_index < self.ncomunty:
                    recommend_cs = []
                    mx_val = sort_tmp_comunty_size[tmp_index][1]
                    while tmp_index < self.ncomunty and sort_tmp_comunty_size[tmp_index][1] == mx_val:
                        recommend_cs.append(sort_tmp_comunty_size[tmp_index][0])
                        tmp_index += 1
                    match = False
                    for recom_c in recommend_cs:
                        candidate_in_nodes = [node for node in in_vector
                                              if node != out_node and recom_c in self.node_comunty_ident[node]]
                        cand_size = len(candidate_in_nodes)
                        skip = False
                        if cand_size > 0:
                            for in_node in candidate_in_nodes:
                                if (out_node, in_node) not in res_rel_data:
                                    # found a legal edge
                                    # scene: new_node -> old_node
                                    res_rel_data.append((out_node, in_node))
                                    self.comunty_nodes[recom_c].add(out_node)
                                    self.node_comunty_ident[out_node].add(recom_c)
                                    self.node_comunty_n_indegree[in_node][recom_c] += 1
                                    self.node_comunty_n_outdegree[out_node][recom_c] += 1
                                    tmp_comunty_size[recom_c][1] -= 1
                                    out_vector.remove(out_node)
                                    in_vector.remove(in_node)
                                    skip = True
                                    match = True
                                    match_old = True
                                    break
                        if skip:
                            break
                    if match:
                        break
                if not match_old:
                    # scene: new_node -> new_node
                    recom_c = sort_tmp_comunty_size[0][0]
                    for in_node in in_vector:
                        if in_node != out_node and (out_node, in_node) not in res_rel_data:
                            res_rel_data.append((out_node, in_node))
                            self.comunty_nodes[recom_c].add(out_node)
                            self.comunty_nodes[recom_c].add(in_node)
                            self.node_comunty_ident[out_node].add(recom_c)
                            self.node_comunty_ident[in_node].add(recom_c)
                            self.node_comunty_n_indegree[in_node][recom_c] += 1
                            self.node_comunty_n_outdegree[out_node][recom_c] += 1
                            tmp_comunty_size[recom_c][1] -= 2
                            out_vector.remove(out_node)
                            in_vector.remove(in_node)
                            break
            else:
                # already in a community, i.e. old_node
                old_comunty = self.node_comunty_ident[out_node]
                sort_tmp_comunty_size = sorted(tmp_comunty_size, key=lambda x: x[1], reverse=True)
                mx_val = sort_tmp_comunty_size[0][1]
                recommend_cs = []
                tmp_index = 0
                while tmp_index < self.ncomunty:
                    if sort_tmp_comunty_size[tmp_index][1] == mx_val:
                        recommend_cs.append(sort_tmp_comunty_size[tmp_index][0])
                    tmp_index += 1
                recm_cs_set = set(recommend_cs)
                match = False
                inter_set = old_comunty.intersection(recm_cs_set)
                if len(inter_set) > 0:
                    # able to add a new node
                    candidate_in_nodes = [node for node in in_vector
                                          if node != out_node and len(self.node_comunty_ident[node]) == 0]
                    if len(candidate_in_nodes) > 0:
                        for in_node in candidate_in_nodes:
                            if (out_node, in_node) not in res_rel_data:
                                res_rel_data.append((out_node, in_node))
                                match = True
                                for recm_c in inter_set:
                                    self.comunty_nodes[recm_c].add(in_node)
                                    self.node_comunty_ident[in_node].add(recm_c)
                                    self.node_comunty_n_indegree[in_node][recm_c] += 1
                                    self.node_comunty_n_outdegree[out_node][recm_c] += 1
                                    tmp_comunty_size[recm_c][1] -= 1
                                out_vector.remove(out_node)
                                in_vector.remove(in_node)
                                break
                if not match:
                    for in_node in in_vector:
                        if in_node != out_node and (out_node, in_node) not in res_rel_data:
                            res_rel_data.append((out_node, in_node))
                            in_node_comuntys = self.node_comunty_ident[in_node]
                            for in_node_cmunty in in_node_comuntys:
                                self.node_comunty_n_indegree[in_node][in_node_cmunty] += 1
                            for out_node_comunty in old_comunty:
                                self.node_comunty_n_outdegree[out_node][out_node_comunty] += 1
                            # adjust memberships for in_node
                            in_degree = sum(self.node_comunty_n_indegree[in_node])
                            if in_degree > 0:
                                for c in range(self.ncomunty):
                                    if self.node_comunty_n_indegree[in_node][c] * 1. / in_degree > self.mu_parameter:
                                        # in_node belongs to community c
                                        self.node_comunty_ident[in_node].add(c)
                                        self.comunty_nodes[c].add(in_node)
                            # adjust memberships for out_node
                            out_degree = sum(self.node_comunty_n_outdegree[out_node])
                            for c in range(self.ncomunty):
                                if self.node_comunty_n_outdegree[out_node][c] * 1. / out_degree > self.mu_parameter:
                                    self.node_comunty_ident[out_node].add(c)
                                    self.comunty_nodes[c].add(out_node)
                            out_vector.remove(out_node)
                            in_vector.remove(in_node)
                            break
        return res_rel_data

    def heterogeneous_with_uniform_comunity(self, out_vector, in_vector):
        # heterogeneous network, same-size communities
        pass
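# A minimal, self-contained sketch of the membership rule used above: a node is
# assigned to every community that carries more than mu of its degree. The helper
# name and the numbers are illustrative, not part of the generator.
def _community_membership_sketch(degree_per_community, mu=0.4):
    total = sum(degree_per_community)
    if total == 0:
        return set()
    return {c for c, d in enumerate(degree_per_community) if d * 1. / total > mu}

# _community_membership_sketch([5, 4, 1]) -> {0}   (5/10 > 0.4; 4/10 is not > 0.4)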
class Analyses(object):
    ''' Analyse the quality of data produced by our generator '''

    def __init__(self, dic, db, in_, out, cutoff=6):
        '''
        dic: the config dict of one edge type
        db : db file name storing the generated graph data
        '''
        self.rel_dict = dic
        self.db_name = db
        self.db_handler = Store(db)
        self.db_handler.connect()
        self.cutoff = cutoff
        self.in_cnt = in_
        self.out_cnt = out

    def statistic(self, array):
        '''
        array: node ids, one occurrence per edge endpoint
        return: ans[1:], where ans[d] is the number of nodes with degree d,
                truncated at self.cutted_len (set here as a side effect)
        '''
        array = sorted(array)
        n = len(array)
        an_dic = {}
        i = 0
        size = 0
        while i < n:
            j = 1
            while i + 1 < n and array[i] == array[i + 1]:
                i += 1
                j += 1
            i += 1
            size = max(size, j)
            if j in an_dic:
                an_dic[j] += 1
            else:
                an_dic[j] = 1
        size += 1
        ans = [0] * size
        for i in range(size):
            if i in an_dic:
                ans[i] = an_dic[i]
        i = size - 1
        while i > 0:
            if ans[i] < self.cutoff:
                i -= 1
                continue
            else:
                break
        self.cutted_len = i
        return ans[1:self.cutted_len + 1]

    def get_data(self):
        try:
            rel_name = self.rel_dict['alias']
            src_name = self.rel_dict['source']
            tgt_name = self.rel_dict['target']
            sql_cntsrc = 'select count(*) from %s' % (src_name)
            sql_cnttgt = 'select count(*) from %s' % (tgt_name)
            sql_all = 'select * from %s' % (rel_name)
            # self.out_cnt = self.db_handler.query(sql_cntsrc)[0][0]
            # self.in_cnt = self.db_handler.query(sql_cnttgt)[0][0]
            res_all = self.db_handler.query(sql_all)
            out_array = [t[0] for t in res_all]
            in_array = [t[1] for t in res_all]
            return (out_array, in_array)
        except Exception as e:
            raise e
        finally:
            self.db_handler.close()

    def analyse(self):
        try:
            out_array, in_array = self.get_data()
            if isinstance(self.rel_dict['in']['lambd'], list):
                tau = self.rel_dict['out']['lambd']
                tau1 = self.rel_dict['in']['lambd'][0]
                tau2 = self.rel_dict['in']['lambd'][1]
                one = 'out'
                two = 'in'
            else:
                tau = self.rel_dict['in']['lambd']
                tau1 = self.rel_dict['out']['lambd'][0]
                tau2 = self.rel_dict['out']['lambd'][1]
                one = 'in'
                two = 'out'
            pwl_ins = PWL(self.out_cnt, tau, tau1, tau2)
            arrays = {'out': out_array, 'in': in_array}
            # analyse the single-lambda side
            node_count = getattr(self, '%s_cnt' % one)
            pwl_ins.set_nodes(node_count)
            o_list = self.statistic(arrays[one])
            pwl_ins.dtmn_max_degree()
            max_degree = pwl_ins.max_d1
            e_list = pwl.get_node_count(node_count, max_degree, tau)[:self.cutted_len]
            if self.cutted_len > len(e_list):
                o_list = o_list[:len(e_list)]
            # improve the effect of the hypothesis test
            cut_e = len(e_list) - 1
            while cut_e > 0:
                if e_list[cut_e] < self.cutoff:
                    cut_e -= 1
                else:
                    break
            o_list = o_list[:cut_e + 1]
            e_list = e_list[:cut_e + 1]
            # end improve
            # variance-like deviation: Euclidean distance between observed and expected
            eva_val = math.sqrt(sum((x[0] - x[1]) ** 2 for x in zip(o_list, e_list)))
            print('%s Euclidean D: %f' % (one, eva_val))
            one_pvalue = stats.chisquare(o_list, e_list).pvalue
            print('%s Chi-Square : %f' % (one, one_pvalue))
            # analyse the two-lambda side
            node_count = getattr(self, '%s_cnt' % two)
            pwl_ins.set_nodes(node_count)
            o_list = self.statistic(arrays[two])
            pwl_ins.dtmn_max_degree()
            _, _, e_list = pwl_ins.dtmn_max_degree_2()
            e_list = e_list[:self.cutted_len]
            if self.cutted_len > len(e_list):
                o_list = o_list[:len(e_list)]
            # improve the effect of the hypothesis test
            cut_e = len(e_list) - 1
            while cut_e > 0:
                if e_list[cut_e] < self.cutoff:
                    cut_e -= 1
                else:
                    break
            o_list = o_list[:cut_e + 1]
            e_list = e_list[:cut_e + 1]
            # end improve
            eva_val = math.sqrt(sum((x[0] - x[1]) ** 2 for x in zip(o_list, e_list)))
            print('%s Euclidean D: %f' % (two, eva_val))
            two_pvalue = stats.chisquare(o_list, e_list).pvalue
            print('%s Chi-Square : %f' % (two, two_pvalue))
            return {one: one_pvalue, two: two_pvalue}
        except Exception as e:
            raise e
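# Usage sketch for Analyses (assumes a db already populated by TestMain; the
# relation config shape matches the generator's, and the counts are illustrative):
#
#     rel_cfg = {'alias': 'follow', 'source': 'user', 'target': 'user',
#                'in': {'lambd': [2.0, 3.0]}, 'out': {'lambd': 2.2}}
#     Analyses(rel_cfg, 'sns.db', in_=10000, out=10000).analyse()
#     # prints Euclidean D and chi-square p-value per side, returns {'out': ..., 'in': ...}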
class TestMain(object):
    ''' test the full process '''

    def __init__(self, scheme):
        self.scheme = scheme
        self.db_name = scheme['alias'] + '.db'
        self.db_handler = Store(self.db_name)
        self.db_handler.connect()

    def db_exist(self):
        return os.path.exists(self.db_name)

    def create_db(self):
        try:
            crt_db_sql = tbl_sql(self.scheme)
            with open('ddl.sql', 'w') as f:
                f.write(crt_db_sql)
            self.db_handler.execute_script(crt_db_sql)
            print('[Create DB %s]: done' % (self.db_name))
        except Exception as e:
            raise e

    def clear_db(self):
        try:
            dlt_sql = delete_sql(self.scheme)
            self.db_handler.execute_script(dlt_sql)
            print('[Clear DB %s]: done' % (self.db_name))
        except Exception as e:
            raise e

    def main_proc(self, mode='local'):
        # TODO:
        ent_size = count_size(self.scheme)
        s_conn, c_conn = Pipe()
        ents = self.scheme['entity']
        rels = self.scheme['relation']
        ent_idx = {}
        ent_set = set()
        rel_set = set()
        rel_idx = {}
        idx = 0
        for one_ent in ents:
            ent_idx[one_ent['alias']] = idx
            ent_set.add(one_ent['alias'])
            idx += 1
        idx = 0
        for one_rel in rels:
            rel_set.add(one_rel['alias'])
            rel_idx[one_rel['alias']] = idx
            idx += 1
        # build the generators
        ent_gen_dict = {}
        ent_gen_list = []
        rel_gen_dict = {}
        rel_gen_list = []
        for one_ent in ents:
            ent_gen_dict[one_ent['alias']] = GenFE(one_ent, ent_size, self.db_name)
            ent_gen_list.append(ent_gen_dict[one_ent['alias']])
        for one_rel in rels:
            sc_name = one_rel['source']
            tg_name = one_rel['target']
            if sc_name == tg_name:
                rel_ins = GenDDR(self.db_name, one_rel, c_conn,
                                 ents[ent_idx[sc_name]], ent_size, mode=mode)
            else:
                rel_ins = GenDDR(self.db_name, one_rel, c_conn,
                                 ents[ent_idx[sc_name]], ent_size,
                                 ents[ent_idx[tg_name]], mode=mode)
            rel_gen_dict[one_rel['alias']] = rel_ins
            rel_gen_list.append(rel_ins)
        # build the sub processes and start them
        rel_proc_dict = {}
        for rel_name, gen_ins in rel_gen_dict.items():
            rel_proc_dict[rel_name] = Process(target=gen_ins.start)
            rel_proc_dict[rel_name].start()
        # core processing
        flag_rel_p = [True] * len(rel_idx)
        flag_ent_p = [True] * len(ent_idx)
        try:
            while sum(flag_rel_p) > 0:
                # message format:
                #   (rel_name, ent_name or '', sign,
                #    max_index (for node) or rel_data (for relation) or '')
                res = s_conn.recv()
                print('[Recv Msg]: from %s sub process' % (res[0]))
                # edge messages carry '' in the entity slot
                assert res[0] in rel_set and (res[1] in ent_set or res[1] == '')
                if res[2] == macro.ADD_NODE or res[2] == macro.FINISH_NODE:
                    _ent_ = res[1]
                    ent_gen_dict[_ent_].start(res[3])
                    if res[2] == macro.FINISH_NODE:
                        flag_ent_p[ent_idx[res[1]]] = False
                        print('[Node Generation]: Name %s, End' % (res[1]))
                elif res[2] == macro.ADD_EDGE:
                    ist_sql = 'insert into %s values (?, ?)' % (res[0])
                    self.db_handler.insert_many(ist_sql, res[3])
                    print('[Edge Generation]: Name %s, Amount %s' % (res[0], len(res[3])))
                elif res[2] == macro.FINISH_EDGE:
                    flag_rel_p[rel_idx[res[0]]] = False
                    print('[Edge Generation]: Name %s, End' % (res[0]))
        except Exception as e:
            raise e
        finally:
            self.db_handler.close()
            for name, proc in rel_proc_dict.items():
                proc.terminate()
            print('End of the generation process.')
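# Driver sketch. The scheme shape is inferred from TestMain.main_proc and from the
# keys GenDDR reads ('stage', 'ceiling', 'in'/'out' lambd); the field values are
# illustrative assumptions, and GenFE / count_size may require additional keys.
#
#     if __name__ == '__main__':
#         scheme = {
#             'alias': 'sns',
#             'entity': [{'alias': 'user', 'ceiling': 10000,
#                         'stage': [{'type': 'uniform', 'min': 50, 'max': 100}]}],
#             'relation': [{'alias': 'follow', 'source': 'user', 'target': 'user',
#                           'in': {'lambd': [2.0, 3.0]}, 'out': {'lambd': 2.2}}],
#         }
#         tm = TestMain(scheme)
#         if not tm.db_exist():
#             tm.create_db()
#         else:
#             tm.clear_db()
#         tm.main_proc(mode='local')
#         Analyses(scheme['relation'][0], tm.db_name, in_=10000, out=10000).analyse()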