-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape_trends.py
93 lines (84 loc) · 3.59 KB
/
scrape_trends.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import re
from twitter import OAuth, Twitter
from credentials import ACCESS_TOKEN, ACCESS_SECRET, CONSUMER_KEY, CONSUMER_SECRET
from markov_chain import MarkovChain
from markov_algorithms import *
# Build the Twitter API client from the OAuth credentials imported above.
oauth = OAuth(ACCESS_TOKEN, ACCESS_SECRET, CONSUMER_KEY, CONSUMER_SECRET)
twit = Twitter(auth=oauth, retry=2)

# Location ids (WOEIDs) for trends.place, per the original author's notes:
#   1        -> worldwide
#   23424977 -> US (the one queried below)
#   2487956  -> SF
# twit.trends.closest(lat=37.7749, long=-122.4194) can be used to look up
# the ids near a coordinate.
results = twit.trends.place(_id=23424977)[0]['trends']

# Keep only the trends that are hashtags; their names are later used verbatim
# as search queries.
trends = [
    entry['name'] for entry in results if entry['name'].startswith('#')
]
print(trends, '\n')
# Tweets whose text contains any of these terms are skipped (case-insensitive).
EXCLUDE_WORDS = re.compile(r'#prolife|#chooselife|#rape|#raping|#trump|#maga|pedophile|#fakenews|nigger', re.I)
# Matches every character that is NOT a letter, digit, space or one of the
# listed punctuation marks; used to blank out emoji and other symbols.
TEXT_ONLY = re.compile('[^A-Z0-9 .,+=!?&@_/#$%^*;:\'"()[\\]{}-]', re.I)
# Retweet marker with optional leading whitespace. The \b stops the pattern
# from eating the tail of upper-case words that end in "RT" (e.g. "ALERT ").
RETWEET = re.compile(r'\s?\bRT\s')
# @mentions, up to the next whitespace.
USER_NAME = re.compile(r'@\S+', re.I)
# Anything starting with http/https, up to the next whitespace.
LINKS = re.compile(r'https?\S*', re.I)
# HTML entities: the Twitter API returns tweet text with &, < and > escaped.
# AMPERSAND matches the escaped form and, for backward compatibility, a bare
# '&' -- but never the '&' that opens '&gt;' / '&lt;', so those entities
# survive until GT / LT decode them.
AMPERSAND = re.compile(r'&amp;|&(?![gl]t;)', re.I)
GT = re.compile(r'&gt;', re.I)
LT = re.compile(r'&lt;', re.I)
# A single non-word character with whitespace on both sides.
LONE_PUNCTUATION = re.compile(r'\s[^a-zA-Z0-9_]\s')
# A hashtag glued directly onto the preceding word, e.g. "word#tag".
TYPO_HASHTAGS = re.compile(r'\w+#\w+', flags=re.I)


def fix_hashtag(matchobj):
    """Return the matched text with a space inserted before every '#'.

    Intended as the replacement callback for ``TYPO_HASHTAGS.sub``.
    """
    glued = matchobj.group(0)
    return glued.replace('#', ' #')
# A period with word characters on both sides and no space, e.g. "end.Next".
TYPO_PERIOD = re.compile(r'\w+\.\w+', re.I)


def fix_period(matchobj):
    """Return the matched text with a space inserted after each period.

    Intended as the replacement callback for ``TYPO_PERIOD.sub``. A period
    that sits between two digits (``3.14``) is a decimal point rather than a
    missing space, so such a match is returned unchanged.
    """
    text = matchobj.group(0)
    before, dot, after = text.partition('.')
    # Slices instead of indexes so an empty side cannot raise IndexError.
    if dot and before[-1:].isdigit() and after[:1].isdigit():
        return text
    return text.replace('.', '. ')
# A question mark with word characters on both sides, e.g. "why?Because".
TYPO_QUESTION = re.compile(r'\w+\?\w+', flags=re.I)


def fix_question(matchobj):
    """Return the matched text with a space inserted after every '?'.

    Intended as the replacement callback for ``TYPO_QUESTION.sub``.
    """
    glued = matchobj.group(0)
    return glued.replace('?', '? ')
# An exclamation mark with word characters on both sides, e.g. "wow!Nice".
TYPO_EXCLAMATION = re.compile(r'\w+\!\w+', flags=re.I)


def fix_exclamation(matchobj):
    """Return the matched text with a space inserted after every '!'.

    Intended as the replacement callback for ``TYPO_EXCLAMATION.sub``.
    """
    glued = matchobj.group(0)
    return glued.replace('!', '! ')
def _clean_tweet(text):
    """Return *text* normalised for Markov-chain training.

    The substitutions run in a fixed order so that every tweet is cleaned the
    same way regardless of which result page it came from.
    """
    steps = (
        (TEXT_ONLY, ' '),                     # blank out non-whitelisted chars
        (RETWEET, ' '),                       # drop the "RT" marker
        (USER_NAME, ' '),                     # drop @mentions
        (LINKS, ' '),                         # drop URLs
        # Runs before LONE_PUNCTUATION so a free-standing '&' becomes the
        # word 'and' instead of being discarded as stray punctuation.
        (AMPERSAND, 'and'),
        (TYPO_HASHTAGS, fix_hashtag),         # "word#tag"  -> "word #tag"
        (TYPO_PERIOD, fix_period),            # "end.Next"  -> "end. Next"
        (TYPO_QUESTION, fix_question),        # "why?So"    -> "why? So"
        (TYPO_EXCLAMATION, fix_exclamation),  # "wow!Nice"  -> "wow! Nice"
        (LONE_PUNCTUATION, ' '),              # drop isolated punctuation
        (GT, '>'),
        (LT, '<'),
    )
    for pattern, replacement in steps:
        text = pattern.sub(replacement, text)
    return text


def _train_on_statuses(markov, statuses):
    """Feed every status that passes the EXCLUDE_WORDS filter to *markov*."""
    for status in statuses:
        full_text = status['full_text']
        if EXCLUDE_WORDS.search(full_text) is None:
            markov.train(_clean_tweet(full_text))


chain = MarkovChain()
for trend in trends:
    print(f'trend: {trend}')
    tweets = twit.search.tweets(q=trend, count=100, tweet_mode='extended', lang='en')
    _train_on_statuses(chain, tweets['statuses'])

    # Follow at most three further result pages for this trend.
    for _ in range(3):
        metadata = tweets['search_metadata']
        if 'next_results' not in metadata:
            break
        # Take the first run of digits in next_results as the max_id cursor.
        # Assumes the string starts with a non-digit prefix such as
        # '?max_id=' -- TODO confirm against the API response.
        next_id = re.split(r'\D+', metadata['next_results'])[1]
        tweets = twit.search.tweets(q=trend, count=100, tweet_mode='extended', max_id=next_id, lang='en')
        _train_on_statuses(chain, tweets['statuses'])

print(f'len(chain.tree): {len(chain.tree)}')

# Re-weight the trained chain with the three scaled fitness functions, running
# one iteration per entry in chain.tree, then persist it and print a sample.
chain.bulk_adjust_weights(
    fitness_functions=[
        aw_mult(aw_favor_complexity, .001),
        aw_mult(aw_favor_punctuation, .00015),
        aw_mult(aw_favor_alternating_complexity, .1),
    ],
    iterations=len(chain.tree),
)
chain.save_training('bin/twitter/trending.bin')
print(f'Sample tweet: {chain.generate_tweet(append_tag="Category: #trending")}')