/
Collapser.py
116 lines (82 loc) · 3.02 KB
/
Collapser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
from itertools import combinations

from FileFunc import FileFunc
from twokenize import simpleTokenize
def split(filename):
    """Read a comma-separated tweet file into a list of user/token records.

    Every entry returned by ``FileFunc.read_file_into_list_unicode``
    (presumably one per line of the file) is split on commas.  The first
    field is kept as the user and the second field is tokenized with
    ``simpleTokenize``.  Nothing is skipped, so a header line, if the file
    has one, comes back as the first record and must be dropped by the
    caller.

    Args:
        filename: Path of the tweet file to read.

    Returns:
        A list of ``{'user': ..., 'tweet': ...}`` dicts in input order,
        where ``'tweet'`` is whatever ``simpleTokenize`` returns for the
        second field.

    Raises:
        IndexError: If an entry contains no comma (there is no second
            field to tokenize).
    """
    # Read the file the caller asked for; the argument used to be ignored
    # and 'tweets.csv' was always loaded instead.
    lines = FileFunc.read_file_into_list_unicode(filename)
    data = []
    for line in lines:
        fields = line.split(',')
        # NOTE(review): only the second field is used, so tweet text that
        # itself contains a comma is cut at that comma -- confirm the file
        # format before switching to a real CSV parser.
        data.append({'user': fields[0], 'tweet': simpleTokenize(fields[1])})
    return data
def get_hashtag_users(hashtag):
    """Placeholder that is not implemented yet.

    The ``hashtag`` argument is accepted but unused; the function always
    returns ``None``.
    """
    return None
def parse():
    """Build the hashtag co-usage edge list and write it to 'edge.csv'.

    Reads the hashtags from 'allhashtags.csv' and the tweet records from
    'tweets.csv' (the first record is dropped as the header), collects the
    users whose tweet tokens contain each hashtag, and writes one row for
    every pair of users that share at least one hashtag.

    The output file starts with the header ``User1,User2,ComFreq``.  Within
    a row the two users are in sorted order, and ``ComFreq`` is the number
    of hashtags the two users have in common.

    Returns:
        None.  The result is the 'edge.csv' file, which is overwritten.
    """
    hashtags = FileFunc.read_file_into_list('allhashtags.csv')
    # Slice instead of pop(0): drops the header and tolerates an empty file.
    tweets = split('tweets.csv')[1:]

    # hashtag -> set of users with at least one tweet containing that hashtag.
    tag_users = {}
    for hashtag in hashtags:
        tag_users[hashtag] = set()
    for tweet in tweets:
        tokens = tweet['tweet']
        for hashtag, tagged in tag_users.items():
            if hashtag in tokens:
                tagged.add(tweet['user'])

    # (user_a, user_b) -> number of hashtags both users have used.
    edge_counts = {}
    for tagged in tag_users.values():
        # combinations() over the sorted users yields every unordered pair
        # exactly once, already ordered.  Looping over all ordered pairs, as
        # was done before, counted each shared hashtag twice.
        for pair in combinations(sorted(tagged), 2):
            edge_counts[pair] = edge_counts.get(pair, 0) + 1

    with open('edge.csv', 'w') as f:
        f.write('User1,User2,ComFreq\n')
        for (user_a, user_b), count in edge_counts.items():
            f.write(','.join((str(user_a), str(user_b), str(count))) + '\n')
def combine(filename1, filename2, output_filename='combinedEdges.csv'):
    """Merge two edge lists, summing the weights of identical edges.

    The first entry of each input is treated as a header and skipped.  Every
    other entry is split on commas: the first two fields identify the edge
    and the third is parsed as an integer weight.  Weights of edges that
    have the same (first, second) pair are added together, across and
    within both files.

    Args:
        filename1: Path of the first edge file.
        filename2: Path of the second edge file.
        output_filename: Path of the merged file to write.  Defaults to
            'combinedEdges.csv'.

    Returns:
        None.  The merged edges are written to ``output_filename`` under
        the header ``User1,User2,ComFreq``, in order of first appearance
        (all of ``filename1`` before ``filename2``).

    Raises:
        IndexError: If a non-blank row has fewer than three fields.
        ValueError: If the third field of a row is not an integer.
    """
    # NOTE(review): the pair is used as-is, so (a, b) and (b, a) are kept as
    # two separate edges -- confirm both inputs order their users the same
    # way if the edges are meant to be undirected.
    totals = {}
    for path in (filename1, filename2):
        # [1:] skips the header and, unlike pop(0), is safe on an empty file.
        for line in FileFunc.read_file_into_list(path)[1:]:
            if not line.strip():
                continue  # tolerate blank lines such as a trailing newline
            tokens = line.split(',')
            key = (tokens[0], tokens[1])
            totals[key] = totals.get(key, 0) + int(tokens[2])

    with open(output_filename, 'w') as f:
        f.write('User1,User2,ComFreq\n')
        for (user_a, user_b), weight in totals.items():
            f.write(','.join((str(user_a), str(user_b), str(weight))) + '\n')
if __name__ == "__main__":
    # parse() is what writes 'edge.csv'; it is left disabled here, so that
    # file must already exist when the script is run.
    # parse()
    hashtag_edges = 'edge.csv'
    network_edges = 'data/User_Network_Table.csv'
    combine(hashtag_edges, network_edges)