forked from MaruthiBasava/abc-bird-url-script
-
Notifications
You must be signed in to change notification settings - Fork 0
/
entry.py
85 lines (61 loc) · 1.96 KB
/
entry.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
from bs4 import BeautifulSoup
import requests
from linker import Linker
import re
from urllib.request import Request, urlopen
from urllib.error import HTTPError
# Shared Linker instance: supplies the links to process (getNextLink) and
# stores the resulting notes (addNote / save).
link = Linker()
# Row index read by the script's failure handler to decide where to resume.
current_row = 0
def format_url():
    """Return the Wayback Machine URL for the next link supplied by ``link``.

    The archive prefix and the fixed snapshot date are prepended to whatever
    ``link.getNextLink()`` returns.
    """
    snapshot_prefix = "".join(("https://web.archive.org/web/", "20001017050155/"))
    return snapshot_prefix + link.getNextLink()
def format_title_to_url(result):
    """Turn the text of a title element into a hyphen-separated URL slug.

    Args:
        result: object exposing a ``text`` attribute holding the page title.

    Returns:
        The title with hyphens treated as spaces, whitespace runs collapsed,
        disallowed characters dropped by ``remove_non_ascii`` and the
        remaining words joined with single hyphens.
    """
    # Hyphens are word separators just like whitespace.
    title = result.text.replace("-", " ")
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    title = re.sub(r"\s+", " ", title).strip()
    cleaned = remove_non_ascii(title)
    # Dropping a character that sat between two spaces ("A & B") leaves a
    # double space; split/join collapses those so the slug never contains
    # doubled, leading or trailing hyphens.
    return "-".join(cleaned.split())
def get_title_from_wayback(url):
    """Fetch ``url`` and derive a URL slug from the page's title element.

    The title is looked up first as a ``<span class="H1">`` and, failing
    that, as an ``<h1 align="center">``.

    Returns:
        The slug built by ``format_title_to_url``, or the sentinel string
        "no-wayback" when neither element is present.
    """
    response = requests.get(url)
    document = BeautifulSoup(response.text, 'html.parser')

    heading = document.find('span', class_='H1')
    if heading is None:
        # Fall back to the alternative title markup.
        heading = document.find('h1', {"align": "center"})
    if heading is None:
        return "no-wayback"
    return format_title_to_url(heading)
def remove_non_ascii(s):
    """Keep only ASCII letters, digits and spaces from ``s``, lowercased.

    Despite the name, ASCII punctuation is removed as well; only the
    characters ``A-Z``, ``a-z``, ``0-9`` and the space survive.

    Args:
        s: the string to clean.

    Returns:
        The filtered string converted to lowercase.
    """
    kept = [
        ch for ch in s
        if ch == " " or "0" <= ch <= "9" or "A" <= ch <= "Z" or "a" <= ch <= "z"
    ]
    # str.lower() returns a new string, so its result must be used.
    # Lowercasing after filtering keeps non-ASCII characters whose lowercase
    # form contains ASCII letters from slipping through.
    return "".join(kept).lower()
def return_working_title(url):
    """Check whether ``url`` is available and return it if so.

    Args:
        url: the candidate page address.

    Returns:
        ``url`` unchanged when the page loads and its text does not contain
        "404 "; "NOT ON NEW SITE" when opening it raises ``HTTPError``;
        "not in new site" when the page text contains "404 ".
    """
    request = Request(url, headers={'User-agent': 'Mozilla/5.0'})
    try:
        # Only the status matters here: urlopen raises HTTPError for error
        # responses. The context manager closes the response so the
        # connection is not leaked.
        with urlopen(request):
            pass
    except HTTPError:
        return "NOT ON NEW SITE"

    # NOTE(review): the page is fetched a second time via requests; the body
    # from urlopen could probably be reused - confirm before changing.
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Presumably catches error pages served with a success status.
    if "404 " in soup.text:
        return "not in new site"
    return url
def add_all_links(a,b):
    """Process rows ``a`` (inclusive) to ``b`` (exclusive), noting each result.

    For every row, the next link is looked up on the Wayback Machine, its
    title is turned into a candidate https://abcbirds.org/ URL, and the
    outcome is stored through ``link.addNote``. The row being processed is
    published in the module-level ``current_row`` so that a caller can
    resume after a failure.

    Args:
        a: first row to process; also assigned to ``link.rowNum``.
        b: row at which to stop (not processed).
    """
    global current_row
    link.rowNum = a
    for i in range(a, b):
        # Record progress before any network call can fail.
        current_row = i
        title = get_title_from_wayback(format_url())
        candidate = "https://abcbirds.org/" + title
        current_link = return_working_title(candidate)
        print(str(i) + " " + current_link + " \n " + candidate)
        # Exact match on the sentinel (an empty title is treated the same
        # way); a substring test would misclassify short real titles such
        # as "no" or "back".
        if title in ("", "no-wayback"):
            link.addNote("no-wayback-machine")
            print('WAYBACK')
        else:
            link.addNote(current_link)
    # NOTE(review): saved once after all rows; the failure handlers below
    # save partial progress.
    link.save()


try:
    add_all_links(1, link.length())
except KeyboardInterrupt:
    # Keep what was collected, but do not keep working after an interrupt.
    link.save()
    raise
except Exception:
    # Persist progress, skip the row that failed and process the rest, then
    # re-raise so the original failure is still reported.
    link.save()
    add_all_links(current_row + 1, link.length())
    raise