#!/usr/bin/env python
# -*- coding: utf-8 -*-
# http://www.python.org/dev/peps/pep-0263/
"""Extract hashtags, @usernames and frequent phrases from streamed tweets and build a NetworkX concept graph."""
import argparse
import json
import sys
import logging
from ttp import ttp
import maksim_utils
import networkx as nx
import matplotlib.pyplot as plt
import make_ngrams
import colloc_analysis
plt.__str__ # silly way to stop pylint error (use plt in IPython)
# Usage:
# load datasets into memory, also output a text file for later parsing
# $ %run extractor_content.py --json-raw /media/2ndDrive/data/streaming-twitter-data/pycon/tweets_pycon0.json /media/2ndDrive/data/streaming-twitter-data/pycon/tweets_pycon.json -o clean_pycon_withtweets.json
# $ %run extractor_content.py --json-cleaned clean_pycon_withtweets.json --remove-nodes #pycon #python #pycon2013 @pycon --write-graphml pyconout.graphml --remove-hashtags-below 3 --remove-usernames-below 15 --remove-phrases-below 3
# pydata
# $ %run extractor_content.py --json-raw /media/2ndDrive/data/streaming-twitter-data/pycon/tweets_pydata0.json /media/2ndDrive/data/streaming-twitter-data/pycon/tweets_pydata.json -o clean_pydata.json
# $ %run extractor_content.py --json-cleaned clean_pydata.json --remove-nodes #pydata --remove-usernames-below 0 --remove-phrases-below 0 --write-graphml pydataout.graphml --draw-networkx
# brighton
# $ %run extractor_content.py --json-raw /media/2ndDrive/data/streaming-twitter-data/tweets_snapshot4/tweets_brighton.json -o clean_brighton.json
# $ %run extractor_content.py --json-cleaned clean_brighton.json --remove-nodes #brighton --draw-networkx --write-graphml brightonout.graphml
# london
# $ %run extractor_content.py --json-raw /media/2ndDrive/data/streaming-twitter-data/tweets_snapshot4/tweets_london.json -o clean_london.json
# $ %run extractor_content.py --json-cleaned clean_london.json --remove-nodes #london --remove-hashtags-below 80 --remove-usernames-below 100 --remove-phrases-below 80 --draw-networkx --write-graphml londonout.graphml
# london fashion week
# $ %run extractor_content.py --json-raw /media/2ndDrive/data/streaming-twitter-data/tweets_all_lfw_london_fashion_week.json -o clean_londonfashionweek.json
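
# Overall workflow (two passes over the data):
#   1) --json-raw: validate each raw streaming-API line, drop retweets, extract the
#      tweet text plus lowercased #hashtags and @usernames, and write one cleaned
#      JSON object per line to --output (or stdout).
#   2) --json-cleaned: read the cleaned file, build a co-occurrence graph of
#      hashtags, usernames, capitalised-word phrases and frequent collocations,
#      trim rare and named nodes, then optionally --draw-networkx and/or
#      --write-graphml.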
LOG_FILE = "extractor.log"
logging.basicConfig(filename=LOG_FILE, level=logging.DEBUG)


def get_tweets(tweets):
    """Generator to return entry from valid JSON lines"""
    for tweet in tweets:
        # load with json to validate
        try:
            tw = json.loads(tweet)
            yield tw
        except ValueError as err:
            logging.debug("Odd! We have a ValueError when json.loads(tweet): %r" % repr(err))

# def filter_http(tweets):
#     """Ignore tweets with http links (can be useful to ignore spam)"""
#     for tweet in tweets:
#         try:
#             if 'http' not in tweet['text']:
#                 yield tweet
#         except KeyError as err:
#             logging.debug("Odd! We have a KeyError: %r" % repr(err))
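# To re-enable this filter, one could insert e.g. `tweets = filter_http(tweets)`
# just before get_tweet_body(tweets) in the __main__ block below (hypothetical
# wiring; it is not used anywhere at present).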


def get_tweet_body(tweets):
    """Get tweets, ignore ReTweets"""
    for tweet in tweets:
        try:
            if 'text' in tweet:
                if not tweet['text'].startswith('RT'):
                    yield tweet
        except KeyError as err:
            logging.debug("Odd! We have a KeyError: %r" % repr(err))


def get_useful_information(tweet_parser, tweets):
    """Extract a set of useful information about the tweets that we want to graph"""
    for tweet in tweets:
        text = tweet['text']
        # replace newlines with nothing
        text = text.replace('\r', '')
        text = text.replace('\n', '')
        screen_name = tweet['user']['screen_name'].lower()
        result = tweet_parser.parse(text)
        hashtags = [tag.lower() for tag in result.tags]
        users = [user.lower() for user in result.users]
        items = {'hashtags': ['#' + h for h in hashtags],
                 'tweet': text,
                 'screen_name': screen_name,
                 'users': ['@' + usr for usr in users]}
        yield items
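# Shape of one yielded item (illustrative values only):
# {'hashtags': ['#pydata'], 'tweet': 'Loving the #PyData talks @ianozsvald',
#  'screen_name': 'someuser', 'users': ['@ianozsvald']}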


def files(file_list):
    """Yield lines from a list of input json data files"""
    for filename in file_list:
        f = open(filename)
        for line in f:
            yield line


def add_node(G, node_name):
    """Add a node to graph, make a label, increase weight if seen before"""
    typ = 2  # default to type 2 (phrase)
    label = node_name
    if node_name.startswith('#'):
        typ = 0
        label = node_name[1:]
    if node_name.startswith('@'):
        typ = 1
        label = node_name[1:]
    if not G.has_node(node_name):
        G.add_node(node_name, label=label, type=typ, weight=-1)
    G.node[node_name]['weight'] += 1
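# Node attributes: 'type' is 0 for a #hashtag, 1 for an @username, 2 for a phrase;
# 'weight' starts at 0 on the first sighting and is incremented on every later one,
# which is what the --remove-*-below thresholds compare against.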


def build_and_trim_network(json_cleaned_lines, remove_nodes, remove_usernames_below, remove_hashtags_below, remove_phrases_below):
    items = json_cleaned_lines
    hashtag_net = nx.Graph()
    top_collocations = colloc_analysis.extract_top_collocations(items)
    for item in items:
        # combine hashtags and users into one list of things to pair up
        all_items = item['hashtags'] + item['users']
        word_sequences = make_ngrams.get_cleaned_capitalised_word_sequences(item['tweet'])
        for word_sequence in word_sequences:
            if len(word_sequence) > 1:
                capitalised_words = " ".join(word_sequence)
                capitalised_words = capitalised_words.lower()  # normalise e.g. Github GitHub GITHUB -> github
                all_items.append(capitalised_words)
        # extract frequent collocations
        tweet_cleaned_lowercased = " ".join(colloc_analysis.tweet_as_terms(item['tweet']))
        for top_collocation in top_collocations:
            tc = " ".join(top_collocation)
            if tc in tweet_cleaned_lowercased:
                all_items.append(tc)  # add collocation phrase
        # add nodes with a default weight
        for node_name in all_items:
            add_node(hashtag_net, node_name)
        # connect every pair of items that co-occur in this tweet
        for t1 in all_items:
            for t2 in all_items:
                if t1 is not t2:
                    maksim_utils.add_or_inc_edge(hashtag_net, t1, t2)
    # drop rarely-seen nodes, using a separate threshold per node type
    for node in hashtag_net.nodes():
        if node.startswith('@'):
            if hashtag_net.node[node]['weight'] < remove_usernames_below:
                hashtag_net.remove_node(node)
        if node.startswith('#'):
            if hashtag_net.node[node]['weight'] < remove_hashtags_below:
                hashtag_net.remove_node(node)
        if not node.startswith('#') and not node.startswith('@'):
            # here if we have a phrase
            if hashtag_net.node[node]['weight'] < remove_phrases_below:
                hashtag_net.remove_node(node)
    # remove nodes that too many people might be connected to
    for removal in remove_nodes:
        try:
            hashtag_net.remove_node(removal)
        except nx.NetworkXError as err:
            logging.warning("Node %r not in the graph (error==%r)" % (removal, err))
    # remove singly-connected nodes until none are left
    while True:
        nbr_of_nodes = hashtag_net.number_of_nodes()
        logging.info("Trimming, currently we have %d nodes" % (nbr_of_nodes))
        hashtag_net = maksim_utils.trim_degrees(hashtag_net)
        if hashtag_net.number_of_nodes() == nbr_of_nodes:
            break
    return hashtag_net
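# Minimal usage sketch (assumes a cleaned file such as clean_pycon_withtweets.json
# produced by the --json-raw pass shown in the usage notes above):
#   items = [json.loads(line) for line in open('clean_pycon_withtweets.json')]
#   G = build_and_trim_network(items, ['#pycon', '@pycon'], 15, 3, 3)
#   nx.write_graphml(G, open('pyconout.graphml', 'w'))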


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Extract information from streaming tweet set')
    parser.add_argument('--json-raw', nargs="*", help='Input to analyse e.g. tweets.json')
    parser.add_argument('--output', "-o", help="Output to write (else stdout) e.g. -o pycon.json")
    parser.add_argument('--json-cleaned', help='Cleaned input json')
    parser.add_argument('--remove-nodes', nargs="*", default=[], help='Remove named nodes e.g. "--remove-nodes #pycon @pycon"')
    parser.add_argument('--draw-networkx', action="store_true", help='Draw the graph using networkX')
    parser.add_argument('--write-graphml', help='Filename for graphml output')
    parser.add_argument('--remove-usernames-below', type=int, default=50, help='Remove usernames that are mentioned fewer than n times e.g. "--remove-usernames-below 50"')
    parser.add_argument('--remove-hashtags-below', type=int, default=2, help='Remove hashtags that are mentioned fewer than n times e.g. "--remove-hashtags-below 2"')
    parser.add_argument('--remove-phrases-below', type=int, default=10, help='Remove phrases (>1 word) that are mentioned fewer than n times e.g. "--remove-phrases-below 10"')
    args = parser.parse_args()

    if args.json_raw:
        tweet_parser = ttp.Parser()
        # stream through a list of user-provided filenames
        all_json_lines = files(args.json_raw)
        tweets = get_tweets(all_json_lines)
        # get tweets (ignore rubbish from the streaming api), extract useful info
        stream = get_tweet_body(tweets)
        stream = get_useful_information(tweet_parser, stream)
        if args.output:
            output = open(args.output, 'w')
        else:
            output = sys.stdout  # use stdout if no file specified
        items = []
        for item in stream:
            outstr = json.dumps(item)
            output.write("%s\n" % (outstr))
            items.append(item)
        if args.output:
            output.close()  # don't close sys.stdout by mistake

    if args.json_cleaned:
        items = []
        for line in open(args.json_cleaned):
            items.append(json.loads(line))
        hashtag_net = build_and_trim_network(items, args.remove_nodes, args.remove_usernames_below, args.remove_hashtags_below, args.remove_phrases_below)

    if args.draw_networkx:
        # we can draw a network using networkx, optionally using graphviz
        # for improved layout
        graphviz = True
        try:
            import pygraphviz
            pygraphviz.release.version  # stupid statement to avoid pylint error
        except ImportError as err:
            graphviz = False
        if graphviz:
            logging.info("Drawing using GraphViz layout engine")
            nx.draw_graphviz(hashtag_net, edge_color="b")
        else:
            logging.info("Drawing using NetworkX layout engine")
            nx.draw_networkx(hashtag_net)
        plt.show()

    if args.write_graphml:
        nx.write_graphml(hashtag_net, open(args.write_graphml, "w"))
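# The graphml file can later be re-loaded with networkx, e.g.
# G = nx.read_graphml('pyconout.graphml'), or opened in a graph tool such as Gephi.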