-
Notifications
You must be signed in to change notification settings - Fork 0
/
nn_complete.py
63 lines (56 loc) · 2.06 KB
/
nn_complete.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import graphlab as gl
# Candidate vocabulary. NOTE(review): a string argument to gl.SArray is
# presumably the path of a previously saved SArray -- confirm.
candidate_words = gl.SArray('cand_words')

# Build a bag-of-words for every row from its raw text, then keep only the
# candidate words (the 'highly unusual ones') in each bag.
sf = gl.load_sframe('sydney_sf')
word_counts = gl.text_analytics.count_words(sf['text'])
sf['bow'] = word_counts.dict_trim_by_keys(candidate_words, exclude=False)

# Delete rows with no keys left: expose each bag's key list in a temporary
# column and exclude the rows where that list is empty.
sf['bow_key'] = sf['bow'].dict_keys()
sf = sf.filter_by([[]], 'bow_key', exclude=True)

# Discard the temporary key column along with 'text', 'retweeted' and 'time'.
for unused_column in ('bow_key', 'text', 'retweeted', 'time'):
    del sf[unused_column]

# Convert user IDs to sparse boolean format: each ID becomes {id: 1}.
sf['user_id'] = sf['user_id'].apply(lambda uid: {uid: 1})
# For every time bucket (one column of 'time_buckets'), find all pairs of
# rows that lie within the query radius of each other and save the pairs
# under 'buckets/<bucket name>'.
buckets = gl.load_sframe('time_buckets')
for b in buckets.column_names():
    # Single-argument call form: emits exactly what the former
    # `print a, b` statement did on Python 2 and also parses on Python 3.
    print('%s %s' % ('Computing Neighbors for: ', b))
    # Restrict the data to the rows whose mongo_id appears in this bucket.
    cur_ids = buckets[b].unique()
    cur_sf = sf.filter_by(cur_ids, 'mongo_id')
    nn = gl.nearest_neighbors.create(
        cur_sf, label='mongo_id', features=['user_id', 'bow'])
    # The model is queried with the same rows it was built from; k=None
    # leaves the radius as the only limit on the matches returned.
    results = nn.query(cur_sf, label='mongo_id', k=None, radius=0.95)
    print(results.head(5))
    print('Deleting Loops...')
    # Because query set == reference set, every row matches itself. Drop
    # those self-pairs with one vectorized comparison instead of a row-wise
    # Python lambda followed by a join-based filter on a temporary column.
    results = results[results['query_label'] != results['reference_label']]
    results.save('buckets/' + b)
print('Nearest Neighbors Complete!\n')
# Create Graphs
# Reload the SFrame from disk: the in-memory copy was filtered and had
# columns removed earlier in this script.
sf = gl.load_sframe('sydney_sf')
print('Creating Graphs...')
# NOTE(review): the bucket count (B1..B670) is hard-coded here, while the
# neighbour step iterates buckets.column_names() -- confirm the two agree.
for bucket_no in range(1, 671):
    suffix = str(bucket_no)
    print('Creating Graph for B' + suffix)
    edges = gl.load_sframe('buckets/B' + suffix)
    # Every mongo_id that occurs at either end of an edge, deduplicated.
    endpoint_ids = edges['query_label'].append(
        edges['reference_label']).unique()
    # Vertex data: the full rows of exactly those endpoints.
    vertices = sf.filter_by(endpoint_ids, 'mongo_id')
    graph = gl.SGraph(vertices, edges,
                      vid_field='mongo_id',
                      src_field='query_label',
                      dst_field='reference_label')
    graph.save('graphs/B' + suffix)
print('Graph Creation Complete!\n')
# Calculate Components
# For each saved bucket graph graphs/B1 .. graphs/B670, run connected
# components and save the resulting model under components/B<i>.
print('Calculating Components...')
for i in range(1, 671):
    # Message previously read "for for B<i>" (duplicated word).
    print('Calculating Components for B' + str(i))
    g = gl.load_sgraph('graphs/B' + str(i))
    cc = gl.connected_components.create(g)
    cc.save('components/B' + str(i))
print('Success!')
exit()