forked from ianozsvald/twitter_networkx_concept_map
-
Notifications
You must be signed in to change notification settings - Fork 0
/
extract_tweet_updates_to_file.py
70 lines (60 loc) · 2.87 KB
/
extract_tweet_updates_to_file.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/usr/bin/env python
"""Extract tweets between a set period (using filtering)"""
# -*- coding: utf-8 -*-
# http://www.python.org/dev/peps/pep-0263/
import datetime
import argparse
import pytz
import dateutil.parser as dt_parser
import functools
import json
import tweet_generators
def filter_until(tweets, filter_from, filter_to):
    """Yield tweets created strictly after filter_from and before filter_to.

    Iteration stops at the first tweet whose 'created_at' is not earlier
    than filter_to; nothing after that tweet is examined.  Tweets at or
    before filter_from are skipped without stopping.

    NOTE(review): the early stop only makes sense if `tweets` arrives in
    ascending 'created_at' order -- confirm against the caller.

    Args:
        tweets: iterable of mappings that each have a 'created_at' key.
        filter_from: exclusive lower bound, comparable with 'created_at'.
        filter_to: exclusive upper bound, comparable with 'created_at'.

    Yields:
        Each tweet whose 'created_at' lies inside the open interval.
    """
    for tweet in tweets:
        created_at = tweet['created_at']
        if created_at >= filter_to:
            # End the generator with a plain return: since PEP 479 a
            # StopIteration raised inside a generator body is converted
            # into RuntimeError instead of ending the iteration.
            return
        if created_at > filter_from:
            yield tweet
def _parse_as_utc(date_text):
    """Parse date_text with dateutil and return an aware datetime in UTC.

    A value without timezone information is taken to already be UTC; a
    value carrying an explicit offset is converted to UTC rather than
    having its offset silently overwritten.
    """
    parsed = dt_parser.parse(date_text)
    if parsed.tzinfo is None:
        return parsed.replace(tzinfo=pytz.utc)
    return parsed.astimezone(pytz.utc)


def _json_default(obj):
    """json.dumps fallback: ISO-8601 text for datetimes, else None (null)."""
    if isinstance(obj, datetime.datetime):
        return obj.isoformat()
    return None


if __name__ == "__main__":
    # Default window is the last 30 days.  Take the current time directly in
    # UTC; labelling the local wall-clock time as UTC would shift the window
    # by the machine's UTC offset.
    default_to = datetime.datetime.now(pytz.utc)
    default_from = default_to - datetime.timedelta(days=30)
    filter_from_str = default_from.isoformat()
    filter_to_str = default_to.isoformat()

    parser = argparse.ArgumentParser(description='Extract information from streaming tweet set')
    parser.add_argument('--json-raw', nargs="*", help='Input to analyse e.g. tweets.json')
    parser.add_argument('--ff', type=str, default=filter_from_str, help="Filter From date range, defaults to '--ff %s'" % (filter_from_str))
    parser.add_argument('--ft', type=str, default=filter_to_str, help="Filter To date range, defaults to '--ft %s'" % (filter_to_str))
    parser.add_argument('--text-file', help="Filename for just the tweet update text, one per line e.g. '--text-file tweetsonly.txt'")
    parser.add_argument('--output', "-o", help="Output to write e.g. -o coords.txt")
    args = parser.parse_args()
    # Function-call form so the line is valid on both Python 2 and Python 3.
    print(args.json_raw)

    # Lazy pipeline: raw lines -> parsed tweets -> tweet bodies.
    all_json_lines = tweet_generators.files(args.json_raw)
    tweets = tweet_generators.get_tweets(all_json_lines)
    stream = tweet_generators.get_tweet_body(tweets)

    # An empty --ff/--ft falls back to the default 30-day window.
    filter_from = _parse_as_utc(args.ff) if args.ff else default_from
    filter_to = _parse_as_utc(args.ft) if args.ft else default_to
    print("Filtering from {} to {}".format(filter_from, filter_to))
    stream = filter_until(stream, filter_from=filter_from, filter_to=filter_to)

    # `stream` is a one-shot generator, so both outputs are written in a
    # single pass; two consecutive loops would leave the second file empty.
    open_files = []
    try:
        json_out = None
        text_out = None
        if args.output:
            json_out = open(args.output, "w")
            open_files.append(json_out)
        if args.text_file:
            # Binary mode: the text is encoded to UTF-8 bytes explicitly,
            # which behaves the same on Python 2 and Python 3.
            text_out = open(args.text_file, "wb")
            open_files.append(text_out)

        # With no output requested the stream is never consumed.
        if open_files:
            for tweet in stream:
                if json_out is not None:
                    json_out.write(json.dumps(tweet, default=_json_default) + "\n")
                if text_out is not None:
                    text_out.write(tweet['text'].encode('utf-8') + b"\n")
    finally:
        # Close whatever was opened, even if reading or writing failed.
        for handle in open_files:
            handle.close()