def save_data(config, anew=True):
    # VK weights are not used, since 98% of them are identical
    labels = pd.Series(
        to_numpy(load_vector(config.join("labels.bin"), 'i'), 'l'))
    num_nodes = len(labels)
    nodes = None
    if not config.directed:
        nodes = to_numpy(load_vector(config.join("nodes.bin"), 'I'))
    train, test, _ = train_test_partition(config, labels, nodes, anew=anew)
    train_test = np.concatenate([train, test])
    y, num_classes, cities = get_target(config, labels, train_test)
    print("num_classes = %s" % num_classes)
    data = Data(y=y)
    data.num_nodes = num_nodes
    data.num_classes = num_classes
    # node-level boolean masks selecting the train and test subsets
    data.train_mask = torch.zeros(num_nodes, dtype=torch.uint8)
    data.train_mask[train] = 1
    data.test_mask = torch.zeros(num_nodes, dtype=torch.uint8)
    data.test_mask[test] = 1
    data.cities = cities
    torch.save(data, config.join('data%s.pt' % config.postfix))
def analyze(basedir, typecode):
    # e.g. basedir = '../data/twitter', typecode = 'L'
    cities = pd.read_csv(join(basedir, "geography.csv"), header=None)[0]
    labels = to_numpy(load_vector(join(basedir, "labels.bin"), 'i'))
    nodes = tools.load_vector(join(basedir, "nodes.bin"), typecode)
    id2ix = MapU64U32()
    for ix, _id in enumerate(nodes):
        id2ix[_id] = ix
    found = array.array('f')
    with open(join(basedir, "post_geo.csv")) as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header
        for line in tqdm(reader):
            ix = id2ix[int(line[0])]
            label = labels[ix]
            if label > -1:
                city = cities[label]
                splited = line[1].split(',')
                try:
                    index = splited[::2].index(city)
                except ValueError:
                    # the labeled city does not occur among the post geotags
                    continue
                counts = np.array(splited[1::2], dtype='I')
                # r = float(counts[index]) / counts.sum()
                r = counts[index]
                found.append(r)
    found = to_numpy(found)
    # quantile levels and the corresponding values, stacked row-wise
    qs = [[.2, .3, .4, .5, .6, .7, .8, .9]]
    qs.append(np.quantile(found, qs[0]))
    qs = np.array(qs)
    return qs
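# The loop in analyze() assumes that the second column of each post_geo.csv row
# is an interleaved "city,count,city,count,..." string: cities at even positions,
# counts at odd ones. A minimal parsing sketch with made-up values:
def _post_geo_row_demo():
    import numpy as np
    cell = "Kiev,12,Moscow,3,Minsk,1"
    splited = cell.split(',')
    cities, counts = splited[::2], np.array(splited[1::2], dtype='I')
    return dict(zip(cities, counts))  # {'Kiev': 12, 'Moscow': 3, 'Minsk': 1}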
def load_data(config: Config):
    Data.num_node_features = property(lambda self: self.x.shape[1])
    data = torch.load(config.pt_data_path)
    data.x = CSRMatrix(config.node_csr_data_path, config.node_csr_indices_path,
                       config.node_csr_indptr_path, config.cpu_count)
    if config.directed:
        Data.num_edge_features = property(
            lambda self: self.edge_attr.shape[1])
        data.edge_attr = CSRMatrix(config.edge_csr_data_path,
                                   config.edge_csr_indices_path,
                                   config.edge_csr_indptr_path,
                                   config.cpu_count)
        data.reverse_edge_map = to_numpy(
            load_vector(config.reverse_edge_map_path, 'I'))
    if not os.path.exists(config.stat_path):
        compute_mean_std(config, data)
    stat = torch.load(config.stat_path)
    data.stat = stat
    if config.directed:
        data.double_mu_edge = torch.cat([stat.mu_edge, stat.mu_edge])
        data.double_std_edge = torch.cat([stat.std_edge, stat.std_edge])
    for part_path in config.part_dirs:
        print(part_path)
        torch.cuda.empty_cache()
        if not config.dev_mode:
            data.predict_mask = torch.zeros(data.num_nodes, dtype=torch.uint8)
            if config.directed:
                findex = join(part_path, "predict_index.bin")
            else:
                findex = join(part_path, "nodes.bin")
            data.predict_mask[load_vector(findex, 'I')] = 1
        data.edge_index = torch.from_numpy(
            load_2d_vec(join(part_path, "colrow.bin"),
                        nrows=2, typecode='i', order='F'))
        deg = None
        if config.directed:
            deg = torch.from_numpy(
                to_numpy(load_vector(join(part_path, "degrees.bin"), 'I'), 'l'))
        nbr_sampler = NeighborSampler(config, data, deg=deg, batch_size=1000)
        yield nbr_sampler
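# Illustrative only: load_data is a generator that yields one NeighborSampler per
# entry in config.part_dirs, so downstream code is expected to iterate over it.
# This wrapper is a sketch, not part of the original pipeline; it assumes a ready
# Config instance and reuses the predict() function defined below.
def predict_all_partitions(config):
    for nbr_sampler in load_data(config):
        # each sampler carries the per-partition edge_index / predict_mask
        # on its shared `data` object
        predict(nbr_sampler, config)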
def check_russian():
    nodes = tools.load_vector('../data/facebook/nodes.bin', 'L')
    id2ix = get_id2ix(nodes)
    labels = tools.to_numpy(
        tools.load_vector('../data/facebook/labels.bin', 'i'))
    ixs = []
    with open('../data/facebook/queue2.txt') as f:
        for s in f:
            try:
                ixs.append(id2ix[int(s)])
            except (KeyError, ValueError):
                # profiles without edges were not included, which is expected
                continue
    sub = labels[ixs]
    return sub[sub > -1].shape
def train_test_partition(config, labels, nodes=None, anew=False):
    """Sample active users with cities, filter infrequent cities,
    return the train/test/predict split."""
    ftest = config.join("test_index.bin")
    ftrain = config.join("train_index.bin")
    fpredict = config.join("predict_index.bin")
    # sample only users having at least 30 friends with cities
    if anew:
        degrees_cities = to_numpy(
            load_vector(config.join("degrees_with_cities.bin"), 'f'))
        is_active_cities = degrees_cities > config.degree_city_threshold
        if config.directed:
            is_node_active = is_active_cities
        else:
            degrees = to_numpy(load_vector(config.join("degrees.bin"), 'I'))
            is_active = np.logical_and(degrees > config.degree_threshold,
                                       is_active_cities)
            is_node = np.zeros(len(labels), dtype=bool)
            is_node[nodes] = True
            is_node_active = is_node & is_active
        is_city = to_numpy(load_vector(config.join("is_city.bin"), 'B'))
        train_test = labels[(is_city == 1) & is_node_active]
        # predict = labels[~is_city & is_node_active]
        predict = labels[is_node_active]
        city_freq = train_test.value_counts()
        is_valid = np.zeros(city_freq.index.max() + 1, dtype=bool)
        # exclude very infrequent cities
        valid_cities = city_freq[city_freq > config.city_freq_threshold].index
        is_valid[valid_cities] = True
        train_test = train_test[is_valid[train_test]]
        train_size = min(config.max_train_size,
                         len(train_test) - config.test_size)
        train, test = train_test_split(train_test,
                                       stratify=train_test,
                                       test_size=config.test_size,
                                       train_size=train_size,
                                       random_state=11)
        # persist the node indices (the Series index) of each split
        with open(ftest, 'wb') as f1, \
                open(ftrain, 'wb') as f2, \
                open(fpredict, 'wb') as f3:
            test = array.array('I', test.index)
            test.tofile(f1)
            train = array.array('I', train.index)
            train.tofile(f2)
            predict = array.array('I', predict.index)
            predict.tofile(f3)
    else:
        test = load_vector(ftest, 'I')
        train = load_vector(ftrain, 'I')
        predict = load_vector(fpredict, 'I')
    return train, test, predict
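# Standalone sketch of the split-and-persist pattern used above: the labels are a
# pandas Series indexed by node index, train_test_split keeps that index on both
# halves, and only the node indices (Series.index) are written to disk as unsigned
# 32-bit integers. The toy data and the /tmp path are made up for illustration.
def _split_persist_demo(path="/tmp/test_index.bin"):
    import array
    import numpy as np
    import pandas as pd
    from sklearn.model_selection import train_test_split

    rng = np.random.default_rng(11)
    labels = pd.Series(rng.integers(0, 3, size=100))  # node index -> city id
    train, test = train_test_split(labels, stratify=labels,
                                   test_size=20, random_state=11)
    with open(path, 'wb') as f:
        array.array('I', test.index.tolist()).tofile(f)  # persist node indices
    restored = np.fromfile(path, dtype=np.uint32)
    assert set(restored) == set(test.index)
    return restored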
def compute_mean_std(config, data):
    def compute(x, ixs, chunk_size):
        device = config.device
        total = math.ceil(len(ixs) / chunk_size)
        s1 = torch.zeros(x.shape[1], dtype=torch.float32).to(device)
        s2 = torch.zeros(x.shape[1], dtype=torch.float32).to(device)
        counts = torch.zeros(x.shape[1], dtype=torch.int64).to(device)
        for ch_ixs in tqdm(tools.grouper(ixs, chunk_size),
                           total=total, desc="mean/std"):
            ch_ixs = np.array(ch_ixs, dtype='I')
            chunk = torch.from_numpy(x[ch_ixs]).to(device)
            s1 += chunk.sum(dim=0)
            s2 += torch.square(chunk).sum(dim=0)
            counts += (chunk != 0).sum(dim=0)
        # streaming moments: E[x] = s1 / n, Var[x] = E[x^2] - E[x]^2
        mu = s1 / x.shape[0]
        var = s2 / x.shape[0] - torch.square(mu)
        std = torch.pow(var, 0.5)
        std[std == 0] = 1.
        return counts, mu, std

    if config.directed:
        ixs = range(data.x.shape[0])
    else:
        ixs = load_vector(config.join("nodes.bin"), "I")
    counts_node, mu_node, std_node = compute(data.x, ixs, 10**6)
    less_than = (counts_node < config.city_freq_threshold).sum()
    if less_than:
        print("node features having less than %d counts: %d" %
              (config.city_freq_threshold, less_than))
    counts_edge, mu_edge, std_edge = None, None, None
    if config.directed:
        ixs = range(data.edge_attr.shape[0])
        counts_edge, mu_edge, std_edge = compute(data.edge_attr, ixs, 10**7)
    stat = Data(counts_node=counts_node,
                mu_node=mu_node,
                std_node=std_node,
                counts_edge=counts_edge,
                mu_edge=mu_edge,
                std_edge=std_edge)
    torch.save(stat, config.stat_path)
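# Quick check of the chunked moment computation used in compute_mean_std:
# accumulating the sum and the sum of squares over chunks and applying
# Var[x] = E[x^2] - E[x]^2 matches a direct per-column mean/std (population
# variance, ddof=0). Purely illustrative; not called anywhere in the pipeline.
def _streaming_mean_std_demo():
    import numpy as np

    x = np.random.default_rng(0).normal(size=(10_000, 4))
    s1 = np.zeros(x.shape[1])
    s2 = np.zeros(x.shape[1])
    for chunk in np.array_split(x, 10):      # process in chunks
        s1 += chunk.sum(axis=0)
        s2 += np.square(chunk).sum(axis=0)
    mu = s1 / x.shape[0]
    std = np.sqrt(s2 / x.shape[0] - np.square(mu))
    assert np.allclose(mu, x.mean(axis=0))
    assert np.allclose(std, x.std(axis=0))   # numpy's std defaults to ddof=0
    return mu, std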
def show():
    u64_nodes = tools.load_vector('../data/facebook/nodes.bin', 'L')
    edge_index = tools.load_2d_vec(
        '../data/facebook/colrow.bin',  # edgelist_double.bin
        nrows=2, typecode='i', order='F')
    # ======================================================================
    edge_map = tools.load_vector("../data/facebook/reverse_edge_map.bin", 'I')
    print_reverse_edges(2345, edge_index, edge_map)
    # ======================================================================
    # id2ix = get_id2ix(u64_nodes)
    x = CSRMatrix('../data/facebook/train/neib_ftrs_data.bin',
                  '../data/facebook/train/neib_ftrs_indices.bin',
                  '../data/facebook/train/neib_ftrs_indptr.bin',
                  multiprocessing.cpu_count())
    labels = tools.to_numpy(
        tools.load_vector("../data/facebook/labels.bin", 'i'))
    geo_splited = np.genfromtxt('../data/facebook/geography_splited.csv', str)
    geography = np.genfromtxt('../data/facebook/geography.csv', dtype=str)
    extid = 100001046296473
    query(extid, u64_nodes, labels, geography, group=True)
    print()
    pprint(get_neib_counts(extid, x, u64_nodes, geo_splited))
    # ======================================================================
    weights = CSRMatrix('../data/facebook/edge_ftrs_data.bin',
                        '../data/facebook/edge_ftrs_indices.bin',
                        '../data/facebook/edge_ftrs_indptr.bin',
                        multiprocessing.cpu_count())
    # cond = np.all(weights, axis=0)
    # eall = edge_index[:, cond]
    # wall = weights[:, cond]
    cond = np.array([23144], dtype='I')
    print_ix(cond, edge_index, weights, u64_nodes)
    ix1 = tools.binary_search(u64_nodes, 100009116587703)
    ix2 = tools.binary_search(u64_nodes, 100001989967439)
    cond = np.where(np.logical_and(edge_index[0] == ix1,
                                   edge_index[1] == ix2))[0].astype('I')
    print_ix(cond, edge_index, weights, u64_nodes)
    cond = np.where(np.logical_and(edge_index[0] == ix2,
                                   edge_index[1] == ix1))[0].astype('I')
    print_ix(cond, edge_index, weights, u64_nodes)
    # 100003289482076
    # co_Ukraine.r1_KievObl.ci_Kiev 100009116587703 100001989967439
def predict(nbr_sampler, config):
    data = nbr_sampler.data
    model, _ = load_model(config, data)
    model.eval()
    device = config.device
    thresholds = pd.read_csv(config.join('thresholds.csv'))
    thresholds1 = load_thresholds(thresholds, config.precision_min1, device)
    thresholds2 = load_thresholds(thresholds, config.precision_min2, device)
    geos = open(config.join('cities_splited.csv')).read().split()
    if config.directed:
        nodes = to_numpy(load_vector(config.join("nodes.bin"), 'L'))

    def last_starts_with(prefix):
        return torch.tensor(
            [s.split('.')[-1].startswith(prefix) for s in geos]).to(device)

    r2_mask = last_starts_with('r2_')
    r3_mask = last_starts_with('r3_')
    city_mask = tools.get_city_mask(geos, exclude_continent=False)
    city_mask = torch.tensor(city_mask).to(device)
    no_interest = ~(r2_mask | r3_mask | city_mask)
    output_queue = multiprocessing.Queue()
    writer_process = multiprocessing.Process(
        target=__writer,
        args=(config, output_queue, geos, city_mask.cpu().numpy()))
    writer_process.start()
    with torch.no_grad():
        total = tqdm_total(data.predict_mask, nbr_sampler.batch_size)
        for data_flow in tqdm(nbr_sampler(data.predict_mask), total=total):
            x = slice_data_flow(config, data, data_flow)
            logits = model(x, data_flow.to(device))
            # mask out uninteresting geo classes and scores below the
            # low-precision threshold; the final *= -1 flips those -inf entries
            # to +inf and marks scores below the high-precision threshold with
            # a sign change
            logits[:, no_interest] = -inf
            logits[logits < thresholds2] = -inf
            logits[logits < thresholds1] *= -1
            # keep only rows where at least one class survived the filtering
            cond = logits.min(dim=1)[0] < inf
            logits, ixs = torch.sort(logits[cond], dim=1)
            n_id = data_flow.n_id[cond].numpy()
            if config.directed:
                n_id = nodes[n_id]
            output_queue.put_nowait(
                (n_id, ixs.cpu().numpy(), logits.cpu().numpy()))
            # debugging example: dump one node's predictions vs thresholds
            # _id = 527
            # _ixs = ixs[_id].cpu().detach().numpy()
            # _probs = torch.exp(logits[_id]).cpu().detach().numpy()
            # df = pd.DataFrame([np.array(data.cities)[_ixs], _probs]).T
            # df.columns = ['geo', 'prob']
            # df['extid'] = data_flow.n_id[_id].detach().numpy()
            # df['th'] = torch.exp(thresholds)[_ixs].cpu().detach().numpy()
            # df['diff'] = df['prob'] - df['th']
            # df = df[['extid', 'geo', 'prob', 'th', 'diff']]
            # df.to_csv('../data/vk/example.csv', index=False)
    output_queue.put(None)  # sentinel telling the writer process to stop
    writer_process.join()
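# Toy illustration of the thresholding trick in predict(): scores below the
# low-precision threshold are set to -inf, and the subsequent `*= -1` both turns
# those -inf entries into +inf (so they sort last) and marks scores between the
# two thresholds with a positive sign. The numbers and threshold values below
# are made up for illustration.
def _threshold_demo():
    import torch
    inf = float('inf')

    logits = torch.tensor([[-0.1, -1.5, -4.0],
                           [-5.0, -6.0, -7.0]])
    thresholds1 = torch.tensor([-0.5, -0.5, -0.5])  # high-precision cutoff
    thresholds2 = torch.tensor([-3.0, -3.0, -3.0])  # low-precision cutoff
    logits[logits < thresholds2] = -inf
    logits[logits < thresholds1] *= -1
    keep = logits.min(dim=1)[0] < inf               # rows with any finite score
    # keep == [True, False]: the second row had no class above thresholds2
    return logits, keep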