# For every user, gather the (lat, lon) of the known check-ins that sit
# immediately before and after each check-in whose location is missing ('?').
# `loc` (location table) and `save_pkl` are provided outside this fragment.
user_miss_loc = {}
with open('raw/checkins_missing.txt', 'r') as f:
    for record in f:
        # Record layout: "<user>:<int>,<location>,<int>,<location>,..."
        user, payload = record.rstrip('\n').split(':')
        fields = payload.split(',')
        visits = [(int(fields[k]), fields[k + 1])
                  for k in range(0, len(fields), 2)]
        # Disabled US-only filter:
        # visits = [el for el in visits if el[1] == '?' or loc[el[1]]['country'] == 'US']
        last = len(visits) - 1
        for pos, (_, place) in enumerate(visits):
            if place != '?':
                continue
            # The user gets an entry as soon as one missing check-in is seen,
            # even when neither neighbour is usable.
            neighbours = user_miss_loc.setdefault(user, [])
            if pos > 0:
                before = visits[pos - 1][1]
                if before != '?':
                    neighbours.append((loc[before]['lat'], loc[before]['lon']))
            if pos < last:
                after = visits[pos + 1][1]
                if after != '?':
                    neighbours.append((loc[after]['lat'], loc[after]['lon']))
save_pkl('tmp/user_miss_loc.pkl', user_miss_loc)
#!/usr/bin/env python3
from utils.io import read_pkl, save_pkl

if __name__ == '__main__':
    # Build a tag -> class mapping from the hand-edited file 'manual/reduce.txt'.
    #
    # File layout as read by this script:
    #   * a line that does NOT start with a space is a header; the text between
    #     its first and second ':' becomes the current target class;
    #   * a line that starts with a space is a tag (the first two characters
    #     are dropped) and is mapped to the current target class.
    in_file = 'manual/reduce.txt'
    tag2class = {}
    target = ''
    with open(in_file, 'r') as f:
        for raw in f:
            # Strip the newline once instead of slicing off the last character:
            # the final line of a file may have no trailing '\n', in which case
            # `line[2:-1]` would silently cut the last letter off the tag.
            line = raw.rstrip('\n')
            if not line:
                # A completely empty line carries no header and no tag; it
                # previously crashed on `split(':')[1]`.
                continue
            if not line.startswith(' '):
                target = line.split(':')[1]
                continue
            tag2class[line[2:]] = target
    print(len(tag2class))
    save_pkl('tmp/tag2class.pkl', tag2class)
from utils.io import read_pkl, save_pkl

# NOTE(review): `np` and `normalize` are used below but are not imported in the
# visible text -- presumably `import numpy as np` and scikit-learn's
# `preprocessing.normalize` are imported outside this view; confirm.

if __name__ == '__main__':
    # For every node whose id ends in '?', fill feature columns 24..29 with the
    # normalised histogram of location groups visited by the node's user.
    num_groups = 6      # number of location groups (rows of the histogram)
    group_offset = 24   # first feature column that holds group features

    user_checkins = read_pkl('tmp/user_checkins.pkl')
    loc_db = read_pkl('tmp/location.pkl')
    nodes = read_pkl('tmp/nodes.pkl')
    node_features = read_pkl('tmp/features.pkl')

    for i, node in enumerate(nodes):
        if node[-1] != '?':
            continue
        # Skip nodes whose columns from `group_offset` on already carry values.
        if np.sum(node_features[i][group_offset:]) > 0:
            continue
        # The user key is the node id without its last two characters
        # (presumably a separator followed by '?' -- TODO confirm).
        user = node[:-2]
        group_features = np.zeros((num_groups, 1))
        for checkin in user_checkins[user]:
            if checkin in loc_db:
                g = loc_db[checkin]['group']
                group_features[g][0] += 1
        group_features = normalize(group_features, axis=0)
        for j in range(num_groups):
            # Assign the scalar, not the shape-(1,) row: implicit conversion of
            # a 1-element array to a scalar is deprecated since NumPy 1.25.
            node_features[i][group_offset + j] = group_features[j, 0]
    save_pkl('tmp/features.pkl', node_features)
# NOTE(review): this chunk starts in the middle of a loop (the statements that
# parse `checkins` belong to a `for line in ...` header that is outside the
# visible text) and its line breaks / indentation have been lost, so the code
# is left exactly as found.  What the visible statements do:
#   * split the check-in payload into (int, location-id) pairs and drop the
#     pairs whose location is '?';
#   * record [lat, lon] from `loc_db` for every location seen in a check-in
#     and for every key of the pickled 'tmp/candidate.pkl';
#   * fit `KMeans(n_clusters=6)` on those coordinates and store each
#     location's cluster label in loc_db[<location>]['group'];
#   * write the updated `loc_db` back to 'tmp/location.pkl'.
# NOTE(review): KMeans is created without a fixed seed here, so -- assuming
# this is scikit-learn's KMeans -- group ids may differ between runs; confirm
# whether downstream features depend on stable ids.
checkins = checkins.split(',') checkins = [(int(checkins[i]), checkins[i + 1])for i in range(0, len(checkins), 2)] checkins = [el for el in checkins if el[1] != '?'] # checkins = [el for el in checkins if el[1] != '?' and loc_db[el[1]]['country'] == 'US'] for checkin in checkins: if checkin[1] not in loc_in_checkins: loc_in_checkins[checkin[1]] = [loc_db[checkin[1]]['lat'], loc_db[checkin[1]]['lon']] candidate = read_pkl('tmp/candidate.pkl') for cand in candidate: if cand not in loc_in_checkins: loc_in_checkins[cand] = [loc_db[cand]['lat'], loc_db[cand]['lon']] loc_id_coord = [(k, loc_in_checkins[k]) for k in loc_in_checkins] loc_coord = [el[1] for el in loc_id_coord] loc_coord = np.array(loc_coord) cluster = KMeans(n_clusters=6) cluster.fit(loc_coord) for i in range(len(loc_coord)): loc_name = loc_id_coord[i][0] group_id = cluster.labels_[i] loc_db[loc_name]['group'] = group_id save_pkl('tmp/location.pkl', loc_db)
# NOTE(review): this chunk starts in the middle of a loop (the leading
# `continue` belongs to a condition that is outside the visible text) and its
# line breaks / indentation have been lost, so the code is left exactly as
# found.  What the visible statements do:
#   * for a node whose loc_db 'tag' equals the first element of some entry in
#     `keep_tags`, count its loc_db 'class' in `keep_classes`;
#   * `categorical` is the list of counted classes (in the iteration order of
#     `keep_classes`, presumably a dict) and fixes the column order of `labels`;
#   * `labels` is a zero matrix of shape (len(nodes), len(categorical)); for
#     every node that does not end in '?' and whose class is in `categorical`,
#     the node index is appended to `train_mask` and the matching column of
#     labels[i] is set to 1;
#   * `categorical`, `train_mask` and `labels` are pickled under tmp/.
# NOTE(review): `[el[0] for el in keep_tags]` is rebuilt for each node and
# `categorical.index(c)` is a linear scan; a precomputed set / index dict would
# avoid that -- confirm the loop structure before changing.
continue t = loc_db[node]['tag'] if t not in [el[0] for el in keep_tags]: continue c = loc_db[node]['class'] if c not in keep_classes: keep_classes[c] = 0 keep_classes[c] += 1 categorical = list(keep_classes) print('Num of classes:', len(categorical)) train_mask = [] labels = np.zeros((len(nodes), len(categorical))) for i, node in enumerate(nodes): if node[-1] == '?': continue c = loc_db[node]['class'] if c in categorical: train_mask.append(i) labels[i][categorical.index(c)] = 1 save_pkl('tmp/categorical.pkl', categorical) save_pkl('tmp/train_mask.pkl', train_mask) save_pkl('tmp/labels.pkl', labels)
# Build the location database from the tab-separated file named by `in_file`
# and pickle it as '<out_dir>/location.pkl'.  `in_file`, `word_normalize`,
# `os` and `save_pkl` are provided outside this fragment.
out_dir = 'tmp'
# exist_ok=True creates the directory only when it is missing, without a
# separate isdir() check, so there is no gap between checking and creating.
os.makedirs(out_dir, exist_ok=True)

loc_db = {}

# Disabled: earlier tag -> class-index mapping.
# tag2class = {}
# ptr = -1
#
# with open(in_file_1, 'r') as f:
#     for line in f:
#         if line[0] != ' ':
#             ptr += 1
#             continue
#
#         tag2class[word_normalize(line)] = ptr
# print(tag2class)

with open(in_file, 'r') as f:
    for line in f:
        # Columns: id, latitude, longitude, tag, country.
        data = line.rstrip('\n').split('\t')
        loc_db[data[0]] = {
            'lat': float(data[1]),
            'lon': float(data[2]),
            'tag': word_normalize(data[3]),
            'country': data[4],
        }

save_pkl('%s/location.pkl' % out_dir, loc_db)
# Attach a 'class' label to every candidate place and to every located
# check-in, then pickle the candidate table and the updated location database.
# A tag found in `tag2class` is replaced by its mapped class; any other tag is
# used as its own class.  `loc_db`, `tag2class` and `save_pkl` are provided
# outside this fragment.
candidate = {}
with open('raw/candidate_100_places.txt', 'r') as f:
    lines = [raw.rstrip('\n') for raw in f.readlines()]

for place in lines:
    entry = loc_db[place]
    # `candidate[place]` is the very same dict object as `loc_db[place]`, so
    # the 'class' written below is visible through both tables.
    candidate[place] = entry
    tag = entry['tag']
    entry['class'] = tag2class[tag] if tag in tag2class else tag

with open('raw/checkins_missing.txt', 'r') as f:
    for record in f:
        # Record layout: "<user>:<int>,<location>,<int>,<location>,..."
        user, payload = record.rstrip('\n').split(':')
        fields = payload.split(',')
        visits = [(int(fields[k]), fields[k + 1])
                  for k in range(0, len(fields), 2)]
        for _, place in visits:
            if place == '?':
                # Disabled US-only variant:
                # if place == '?' or loc_db[place]['country'] != 'US':
                continue
            entry = loc_db[place]
            tag = entry['tag']
            entry['class'] = tag2class[tag] if tag in tag2class else tag

save_pkl('tmp/candidate.pkl', candidate)
save_pkl('tmp/location.pkl', loc_db)
#!/usr/bin/env python3
import numpy as np
from utils.io import read_pkl, save_pkl

if __name__ == '__main__':
    # Map every user in 'raw/checkins_missing.txt' to the set of location ids
    # they checked in at that are present in the location database.
    loc_db = read_pkl('tmp/location.pkl')
    user_checkins = {}
    c = 0  # NOTE(review): never read in the visible code
    with open('raw/checkins_missing.txt', 'r') as f:
        for record in f:
            # Record layout: "<user>:<int>,<location>,<int>,<location>,..."
            user, payload = record.rstrip('\n').split(':')
            fields = payload.split(',')
            visits = [(int(fields[k]), fields[k + 1])
                      for k in range(0, len(fields), 2)]
            # Disabled US-only variant:
            # visits = [el for el in visits if el[1] in loc_db and loc_db[el[1]]['country'] == 'US']
            # Every user gets an entry, even when none of their locations is
            # known to `loc_db`.
            known = user_checkins.setdefault(user, set())
            known.update(place for _, place in visits if place in loc_db)
    save_pkl('tmp/user_checkins.pkl', user_checkins)