-
Notifications
You must be signed in to change notification settings - Fork 0
/
huna_crawler.py
127 lines (108 loc) · 3.96 KB
/
huna_crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import os
from datetime import datetime
import difflib
import StringIO
import re
from time import sleep
import requests
from twx.botapi import TelegramBot
BOT_API_TOKEN=os.getenv('BOT_API_TOKEN')
CYCLE_TIME=60
CHAT_ID=-185049465
LASTFILENAME_LOG='filename.log'
DATA={
'hunahpus':{
'url': "http://www.hunahpusday.com",
'regexs': [
'data-slb-asset="[0-9]+"',
'SLB.View.assets, {"[0-9]+"',
'Object Caching [0-9]+/[0-9]+ objects using disk',
'Served from: hunahpusday.com @ (.*) by W3 Total Cache -->',
],
'facebook_urls': [],
'twitter_urls': []
}
}
def get_repeated_lines_content(
url='http://hunahpusday.com/breweries/?orderby=post_title&order=ASC&breweryNumber=-1',
line_regex='<h3 class="h2">(.*)</a></h3>',
content_regex='title="(.*)"',
):
response = requests.get(url)
lines = re.findall(line_regex, response.content)
content = []
for l in lines:
content = content + re.findall(content_regex, bl)
return content
def telegram_bot(url, regexs):
bot = TelegramBot(BOT_API_TOKEN)
bot.update_bot_info().wait()
print(bot.username)
bot.send_message(CHAT_ID, "[%s] Bot Starting" % str(datetime.now())).wait()
while (True):
diff = diff_html(url, regexs)
if diff:
for diff_message in diff:
ticket = ["Ticket", "ticket"]
print diff_message
for t in ticket:
print t
if t in diff_message:
bot.send_message(CHAT_ID, "TICKET WORD APPEARED IN A DIFF!!1111").wait()
sleep(2)
bot.send_message(CHAT_ID, "TICKET WORD APPEARED IN A DIFF!!1111").wait()
bot.send_message(CHAT_ID, "Last updates:\n %s" % diff_message).wait()
else:
print("LOG: No diff")
sleep(CYCLE_TIME)
def content_cleaner(content, regexs):
for regex in regexs:
content = re.sub(regex, '', content)
return content
def download_html(url, regexs, write=False):
request = requests.get(url)
if request.status_code == 200:
content = content_cleaner(content=request.content, regexs=regexs)
if write:
now = str(datetime.now())
now = now.replace(" ", "_").replace(":", "-").split(".")[0]
filename = "content_%s.html" % now
write_to_file(filename, content)
write_to_file(LASTFILENAME_LOG, filename)
return content
else:
print("Something went wrong accessing the URL. Status Code: %s" % request.status_code)
return "Error"
def write_to_file(file, data):
with open(file, "w") as fp:
fp.write(data)
def diff_html(url, regexs):
try:
with open(LASTFILENAME_LOG, "r") as fp:
last_filename = fp.read()
except IOError:
print("First time running, creating a new file")
download_html(url=url, regexs=regexs, write=True)
with open(LASTFILENAME_LOG, "r") as fp:
last_filename = fp.read()
try:
with open(last_filename, "r") as fp:
old_content = fp.read()
except IOError:
return "ERROR: FileNotFound %s" % last_filename
new_content = download_html(url=url, regexs=regexs, write=False)
new_content_buf = StringIO.StringIO(new_content)
old_content_buf = StringIO.StringIO(old_content)
diff = difflib.ndiff(new_content_buf.readlines(), old_content_buf.readlines())
differences = []
for line in diff:
if line.startswith("-"):
differences.append("Removed: %s" % line)
elif line.startswith("+"):
differences.append("Added: %s" % line)
# GENERATE THE FILE FOR THE NEXT COMPARISON
os.remove(last_filename)
download_html(url=url, regexs=regexs, write=True)
return differences
if __name__ == '__main__':
telegram_bot(url=DATA['hunahpus']['url'], regexs=DATA['hunahpus']['regexs'])