forked from minimaxir/download-tweets-ai-text-gen
-
Notifications
You must be signed in to change notification settings - Fork 0
/
download_tweets.py
171 lines (139 loc) · 5.32 KB
/
download_tweets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
import twint
from gingerit.gingerit import GingerIt
from langdetect import detect
import fire
import re
import csv
from tqdm import tqdm
import logging
from datetime import datetime
from time import sleep
import os
# Suppress random twint warnings: twint logs noisy warnings through the
# root logger, so disable it entirely for this script.
logger = logging.getLogger()
logger.disabled = True
def is_reply(tweet):
    """Heuristically determine whether a twint tweet object is a reply.

    twint does not expose an is-reply flag directly, so this checks the
    tweet's `reply_to` list against the tweet text.
    """
    # A tweet that replies to no other user carries exactly one
    # entry in reply_to (the author themself).
    reply_targets = tweet.reply_to
    if len(reply_targets) == 1:
        return False

    # Count how many of the remaining "replied" usernames actually appear
    # verbatim in the tweet text.
    others = reply_targets[1:]
    mentioned = sum(target['username'] in tweet.tweet for target in others)

    # Any username absent from the text marks this as a genuine reply.
    return mentioned < len(others)
def FilterTweet(tweet):
    """Return True if `tweet` is worth keeping, False otherwise.

    A tweet is kept when it is non-empty, langdetect classifies it as
    English, and GingerIt suggests fewer than 5 grammar corrections
    (heavily garbled text is treated as junk).

    Any failure in language detection or the grammar service is treated
    as a rejection rather than an error, preserving the original
    best-effort behavior.

    :param tweet: tweet text (str).
    :return: bool.
    """
    try:
        if tweet == '':
            return False
        if detect(tweet) != 'en':
            return False
        ginger_parser = GingerIt()
        ginger_grammar_results = ginger_parser.parse(tweet)
        ginger_corrections = ginger_grammar_results['corrections']
        # 5+ suggested corrections => too garbled to train on.
        if len(ginger_corrections) >= 5:
            return False
        return True
    except Exception:
        # Fix: was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit and made the script impossible to
        # interrupt cleanly. `Exception` still covers detect()/GingerIt
        # failures (network errors, LangDetectException, parse errors).
        return False
def FilterFunction(ch):
    """Character predicate for filter(): keep alphanumerics, spaces,
    '#' and '@'; drop all other characters (punctuation, emoji, etc.).

    :param ch: a single-character string.
    :return: True if the character should be kept, False otherwise.
    """
    # Bug fix: the original tested `str.isalnum` — the bound method object,
    # which is always truthy — instead of calling it, so the predicate
    # returned True for every character and filtered nothing. The parameter
    # is also renamed so it no longer shadows the builtin `str`; all call
    # sites pass it positionally via filter(), so this is compatible.
    return ch.isalnum() or ch in ' #@'
def download_tweets(username=None, limit=1000, include_replies=False,
                    strip_usertags=False, strip_hashtags=False):
    """Download public Tweets from a given Twitter account
    into a format suitable for training with AI text generation tools.
    Writes a one-column CSV named '{username}_tweets.txt' and uses a
    '.temp' file for twint pagination, removed on completion.
    :param username: Twitter @ username to gather tweets.
    :param limit: # of tweets to gather; None for all tweets.
    :param include_replies: Whether to include replies to other tweets.
    :param strip_usertags: Whether to remove user tags from the tweets.
    :param strip_hashtags: Whether to remove hashtags from the tweets.
    """
    # NOTE(review): despite the docstring, limit=None would crash on the
    # modulus below — a numeric multiple of 20 is effectively required.
    assert limit % 20 == 0, "`limit` must be a multiple of 20."

    # Base pattern removes URLs, pic.twitter.com links, non-breaking
    # spaces, and the ellipsis character from tweet text.
    pattern = r'http\S+|pic\.\S+|\xa0|…'

    if strip_usertags:
        pattern += r'|@[a-zA-Z0-9_]+'

    if strip_hashtags:
        pattern += r'|#[a-zA-Z0-9_]+'

    # Create an empty file to store pagination id
    with open('.temp', 'w', encoding='utf-8') as f:
        f.write(str(-1))

    print("Retrieving tweets for {}...".format(username))

    with open('{}_tweets.txt'.format(username), 'w', encoding='utf8') as f:
        w = csv.writer(f)
        w.writerow(['tweets'])  # gpt-2-simple expects a CSV header by default

        pbar = tqdm(range(limit),
                    desc="Oldest Tweet")

        # Each iteration is expected to yield ~20 new tweets (40 fetched,
        # minus a 20-tweet overlap after the first page).
        for i in range((limit // 20) - 1):
            tweet_data = []

            # twint may fail; give it up to 5 tries to return tweets
            # NOTE(review): range(0, 4) yields only 4 attempts, not 5.
            for _ in range(0, 4):
                if len(tweet_data) == 0:
                    c = twint.Config()
                    c.Store_object = True
                    c.Hide_output = True
                    # NOTE(review): c.Search treats `username` as a search
                    # term; twint's c.Username targets an account's timeline.
                    # Confirm which behavior this fork intends.
                    c.Search = username
                    c.Limit = 40
                    # Resume pagination from the id stored in .temp.
                    c.Resume = '.temp'

                    # twint appends fetched tweet objects to this list.
                    c.Store_object_tweets_list = tweet_data

                    twint.run.Search(c)

                    # If it fails, sleep before retry.
                    if len(tweet_data) == 0:
                        sleep(1.0)
                    else:
                        # NOTE(review): this `continue` is a no-op — the
                        # outer `if len(...) == 0` guard already skips
                        # further fetches once tweets have arrived.
                        continue

            # If still no tweets after multiple tries, we're done
            if len(tweet_data) == 0:
                break

            # After the first page, each 40-tweet fetch overlaps the
            # previous page by 20 tweets; drop the duplicates.
            if i > 0:
                tweet_data = tweet_data[20:]

            if not include_replies:
                tweets = [re.sub(pattern, '', tweet.tweet).strip()
                          for tweet in tweet_data
                          if not is_reply(tweet)]

                # On older tweets, if the cleaned tweet starts with an "@",
                # it is a de-facto reply.
                for tweet in tweets:
                    if tweet != '' and not tweet.startswith('@'):
                        # Strip everything but alphanumerics, spaces, '#', '@'.
                        tweet = ''.join(filter(FilterFunction, tweet))
                        if FilterTweet(tweet):
                            w.writerow([tweet])
                        else:
                            # NOTE(review): bumping `limit` here appears
                            # meant to compensate for rejected tweets, but
                            # it cannot extend the already-started range()
                            # loop or the tqdm total — effectively a no-op.
                            limit = limit + 1
                    else:
                        limit = limit + 1
            else:
                tweets = [re.sub(pattern, '', tweet.tweet).strip()
                          for tweet in tweet_data]

                for tweet in tweets:
                    tweet = ''.join(filter(FilterFunction, tweet))
                    if FilterTweet(tweet):
                        w.writerow([tweet])
                    else:
                        limit = limit + 1

            # First pass keeps all 40 fetched tweets; later passes only 20.
            if i > 0:
                pbar.update(20)
            else:
                pbar.update(40)

            if tweet_data:
                # tweet.datetime is a unix timestamp in milliseconds —
                # presumably UTC; verify against twint's tweet object.
                oldest_tweet = (datetime
                                .utcfromtimestamp(tweet_data[-1].datetime / 1000.0)
                                .strftime('%Y-%m-%d %H:%M:%S'))
                pbar.set_description("Oldest Tweet: " + oldest_tweet)

        pbar.close()

    os.remove('.temp')
if __name__ == "__main__":
    # Script entry point: hard-coded search target and tweet budget.
    target = 'Fake News'
    download_tweets(username=target, limit=100000)