/
networkxstuff.py
291 lines (259 loc) · 9.89 KB
/
networkxstuff.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
import sqlite3 as lite
import sys
import networkx as nx
import pickle
import matplotlib.pyplot as plt
from classtweetgetter import DBTweetGetter
from time import sleep
import string
import sys
# Characters legal in a Twitter screen name: ASCII letters, digits, underscore.
# Used below to scan @-mentions out of tweet text.
valid_characters = string.ascii_letters + string.digits + '_'
#loop through all retweets
#Tag same retweets somehow
# Find one closest in time - check if following
# Repeat until found then assume that was intermediate source
# See how many find successful sources - probably missing tonnes of data
#ignore double tweets from same user
#Take text after RT @user: and try to match same tweets?
#So write part to remove urls, user mentions
#Unabbreviate text, compare edit distance for typos
# What does it mean that the original source is different?
# Can we solve the original, original source?
# Need to include modified retweets too - drop RT requirement just use edit distance, etc.
# Also gets original
#Use URLS - can get network by looping over URLs - no assumptions on text, create new thread when not retweet
#Can then see different threads from Tweet buttons on articles
#Can then plot propagation against time
# Write expansion for:
# http://fb.me/vUfScHp3
# http://flpbd.it/cly7C
# http://bit.ly/TI9AR5
# http://j.mp/Qj3YST
# http://ow.ly/fmgbG
# http://zite.to/XhGeRV
# http://vsb.li/676sfl
# http://tinyurl.com/akocj6z
# http://lnkd.in/kycnMe
# http://buff.ly/XMQ9hw
# Perhaps just visit URL then get final URL, strip off the # fragment
# Remove any # markers from URL
# http://grist.org/news/if-youre-27-or-younger-youve-never-experienced-a-colder-than-average-month/#.UKeZolfmFBx.twitter
def dbplotnaivertnetwork(dbname, tablename):
    """Build the naive retweet network for one hashtag table and save it.

    "Naive" means every retweet is attributed directly to the original
    source: for each retweet row an edge RetweetSource -> ScreenName is
    added (both lower-cased), weighted by the number of such retweets.
    The graph is written to "naivertnew<tablename>.gml".

    Parameters:
        dbname    -- path of the sqlite database holding the tweets
        tablename -- table to read; must be a plain identifier

    Raises:
        ValueError -- if tablename contains characters outside [A-Za-z0-9_]
    """
    # Table names cannot be bound as SQL parameters, so validate the
    # identifier instead of concatenating arbitrary text into the query.
    if not tablename or not all(c in valid_characters for c in tablename):
        raise ValueError("invalid table name: %r" % (tablename,))
    mintime = 1358090418  # epoch-seconds cutoff for the capture window
    con = lite.connect(dbname)
    try:
        cur = con.cursor()
        cur.execute(
            "SELECT ScreenName, RetweetSource FROM " + tablename +
            " WHERE IsRetweet=1 AND RetweetSource<>'' AND ConvertedTime>?",
            (mintime,))
        data = cur.fetchall()
    finally:
        # The original never closed the connection; make sure we do.
        con.close()
    graph = nx.DiGraph()
    for retweeter, source in data:
        src = source.lower()
        dst = retweeter.lower()
        # Count repeated retweets as edge weight instead of relying on a
        # bare except around the weight increment.
        if graph.has_edge(src, dst):
            graph[src][dst]['weight'] += 1
        else:
            graph.add_edge(src, dst, weight=1)
    print("Built graph")
    nx.write_gml(graph, "naivertnew" + tablename + ".gml")
    print("Wrote graph")
def dbplotffnetwork():
    # Build the directed friend/follower network among frequent hashtag
    # tweeters and write it to "newfriendfollowerhtccgt29.gml".
    # An edge u -> v means "u follows v" (v is a friend of u).
    graph=nx.DiGraph()
    #Plot following/follower network
    #restrict to those with 3 tweets or more
    mintime=1358090418  # capture window in epoch seconds — TODO confirm dates
    maxtime=1363963163
    mygetter=DBTweetGetter(None,None)  # project-local Twitter API wrapper
    con = lite.connect("tweetsdb.db")  # captured tweets, one table per hashtag
    cur=con.cursor()
    ucon = lite.connect("userdb.db")   # local cache: friends/followers/usermap
    ucur=ucon.cursor()
    tusers=[]  # every tweet author in the window, lower-cased, with repeats
    users=[]   # authors with more than 7 tweets in the window
    # NOTE(review): values are concatenated straight into SQL throughout this
    # function; parameterised queries would be safer if names can contain quotes.
    cur.execute("SELECT ScreenName FROM htglobalwarming WHERE ConvertedTime > "+str(mintime) +" AND ConvertedTime < " + str(maxtime) + " COLLATE NOCASE")
    temp=cur.fetchall()
    for item in temp:
        tusers.append(item[0].lower())
    # Keep authors with more than 7 tweets. tusers.count() inside the loop
    # makes this O(n^2); collections.Counter would do it in one pass.
    for item in tusers:
        if not (item in users):
            if tusers.count(item)>7:
                users.append(item)
    print len(users)
    # cur.execute("SELECT ScreenName FROM htclimatechange WHERE ConvertedTime > "+str(mintime)+" AND ConvertedTime < " + str(maxtime) + " COLLATE NOCASE")
    # temp=cur.fetchall()
    # tusers=[]
    # for item in temp:
    # tusers.append(item[0].lower())
    # for item in tusers:
    # if not (item in users):
    # if tusers.count(item)>29:
    # users.append(item)
    # print len(users)
    # cur.execute("SELECT ScreenName FROM htagw WHERE ConvertedTime > "+str(mintime) + " COLLATE NOCASE")
    # temp=cur.fetchall()
    # tusers=[]
    # for item in temp:
    # tusers.append(item[0].lower())
    # for item in tusers:
    # if not (item in users):
    # if tusers.count(item)>2:
    # users.append(item)
    # print len(users)
    #aim for 380
    #sys.exit("Hammertime")
    i=0  # progress counter for the log line below
    # Drop specific users known to break the downloads below.
    try:
        users.remove("undercoverzen")
        users.remove("jivelad")
        users.remove("anabananazavala")
        #TODO Formalise this
    except:
        # NOTE(review): if the first remove() raises, the later ones never run.
        pass
    # NOTE(review): both this loop and the nested one below remove items from
    # `users` while iterating over it, which silently skips elements.
    for user in users:
        print "User " + str(i)+"/"+str(len(users))
        i+=1
        #For each user check which other users are in friends, followers
        ucur.execute("SELECT FriendId FROM friends WHERE ScreenName='"+user.lower()+"' COLLATE NOCASE")
        frl=[]  # ids of the accounts this user follows ("friends")
        temp=ucur.fetchall()
        skip=False
        if len(temp)==0:
            # Not cached yet — fetch from the API and cache in userdb.
            #get friends
            print "Downloading friends for " + user.lower()
            friendslist=mygetter.getFriends(user.lower(), [], -1)
            if friendslist!="FAIL":
                for friend in friendslist:
                    ucur.execute("INSERT INTO friends VALUES('" + user.lower() + "'," + str(friend) + ")" )
                frl=friendslist
            else:
                skip=True
                try:
                    users.remove(user.lower())
                except:
                    pass
            sleep(10)  # crude API rate limiting — placement assumed, confirm
        else:
            for item in temp:
                frl.append(item[0])
        ucur.execute("SELECT FollowerId FROM followers WHERE ScreenName='"+user+"' COLLATE NOCASE")
        fol=[]  # ids of the accounts that follow this user
        temp=ucur.fetchall()
        # NOTE(review): this reset clobbers a friends-stage failure, so a user
        # whose friends download failed may still be processed below.
        skip=False
        if len(temp)==0:
            #get friends
            print "Downloading followers for " + user.lower()
            followerslist=mygetter.getFollowers(user.lower(), [], -1)
            if followerslist!="FAIL":
                for follower in followerslist:
                    ucur.execute("INSERT INTO followers VALUES('" + user.lower() + "'," + str(follower) + ")" )
                fol=followerslist
            else:
                skip=True
                try:
                    users.remove(user.lower())
                except:
                    pass
            sleep(10)  # crude API rate limiting — placement assumed, confirm
        else:
            for item in temp:
                fol.append(item[0])
        ucon.commit()  # persist any newly cached friends/followers
        if skip==False:
            graph.add_node(user.lower())
            # Link this user to every other tracked user by testing the other
            # user's numeric id against this user's friend/follower id lists.
            for other in users:
                skip2=False
                ucur.execute("SELECT UserId FROM usermap WHERE ScreenName='"+other.lower()+"' COLLATE NOCASE")
                temp=ucur.fetchall()
                if len(temp)==0:
                    # Screen name -> id not cached; resolve via the API.
                    #get ID from web
                    print "Downloading userid for " + other.lower()
                    x=mygetter.getIDfromUser(other.lower())
                    if x!="FAIL":
                        ucur.execute("INSERT INTO usermap VALUES('" +other.lower()+ "'," + x + ")" )
                        sid=x
                        ucon.commit()
                    else:
                        try:
                            users.remove(other.lower())
                        except:
                            pass
                        skip2=True
                    sleep(10)  # crude API rate limiting — placement assumed, confirm
                else:
                    sid=temp[0][0]
                # NOTE(review): `sid` is a string when freshly downloaded but
                # whatever type the DB column yields when cached; the `in`
                # tests below may compare str against int — verify.
                if skip2==False:
                    if sid in fol:
                        graph.add_edge(other.lower(), user.lower())
                    if sid in frl:
                        graph.add_edge(user.lower(), other.lower())
    print "Built graph"
    nx.write_gml(graph, "newfriendfollowerhtccgt29.gml")
    ucon.commit()
    con.close()
    ucon.close()
    print "Wrote graph"
    # nx.draw(graph)
    # plt.show()
def _extract_mentions(text):
    """Return the lower-cased screen names @-mentioned in *text*.

    A mention is '@' followed by a run of characters from
    ``valid_characters``; the run may be empty (the empty string is then
    returned for that mention), matching the inline scanner this replaces.
    """
    names = []
    pos = 0
    while True:
        at = text.find('@', pos)
        if at == -1:
            return names
        end = at + 1
        # Consume the screen-name characters that follow the '@'; stopping
        # at end-of-string here fixes the old scanner, which could hit an
        # IndexError and then read a stale (or unbound) loop variable.
        while end < len(text) and text[end] in valid_characters:
            end += 1
        names.append(text[at + 1:end].lower())
        pos = at + 1


def dbconversation(tablename):
    """Build the directed mention ("conversation") network for one table.

    Reads all non-retweet tweets newer than a fixed cutoff from
    tweetsdb.db, and for every @-mention of another user who also tweeted
    in the table adds an edge author -> mentioned (lower-cased), weighted
    by the number of mentions.  Writes "newconv<tablename>nortdir.gml".

    Parameters:
        tablename -- table to read; must be a plain identifier

    Raises:
        ValueError -- if tablename contains characters outside [A-Za-z0-9_]
    """
    # Table names cannot be bound as SQL parameters, so validate the
    # identifier instead of concatenating arbitrary text into the query.
    if not tablename or not all(c in valid_characters for c in tablename):
        raise ValueError("invalid table name: %r" % (tablename,))
    mintime = 1358090418  # epoch-seconds cutoff; matches the other builders
    con = lite.connect("tweetsdb.db")
    try:
        cur = con.cursor()
        cur.execute("SELECT DISTINCT ScreenName FROM " + tablename +
                    " WHERE ConvertedTime>? AND IsRetweet=0", (mintime,))
        # Set membership is O(1); the original scanned a list per mention.
        users = set(row[0].lower() for row in cur.fetchall())
        cur.execute("SELECT Tweet, ScreenName FROM " + tablename +
                    " WHERE ConvertedTime>? AND IsRetweet=0", (mintime,))
        rows = cur.fetchall()
    finally:
        # The original never closed the connection; make sure we do.
        con.close()
    graph = nx.DiGraph()
    for tweet, author in rows:
        lowered = tweet.lower()
        # Skip tweets with no mention at all and (historical behaviour)
        # anything containing an old-style "rt:" marker.
        if "@" not in lowered or "rt:" in lowered:
            continue
        src = author.lower()
        for name in _extract_mentions(tweet):
            # Only connect tracked users; ignore self-mentions.
            if name in users and name != src:
                if graph.has_edge(src, name):
                    graph[src][name]['weight'] += 1
                else:
                    graph.add_edge(src, name, weight=1)
    print("Built graph")
    nx.write_gml(graph, "newconv" + tablename + "nortdir.gml")
    print("Wrote graph")
if __name__ == "__main__":
    # Script entry point: build the mention network for the #globalwarming
    # capture. Guarded so importing this module no longer triggers the
    # full database run as a side effect.
    dbconversation("htglobalwarming")