# MysqlGetUrl.py
#!/usr/bin/python
'''
'''
from sqlalchemy import create_engine
from pandas import DataFrame
import urllib2
import sys
import re
# from bs4 import BeautifulSoup
def get_link(url):
link_exr = re.compile(r'<a.*?\s*href=\"(.*?)\".*?>(.*?)</a>')
links = []
# open web content
f = urllib2.urlopen(url)
content = f.read()
# versi find html tag : find all url and save to links
# soup = BeautifulSoup(content, "lxml")
# for a in soup.find_all('a', href=True):
# if "detik.com" in a['href']:
# if "http:" not in a['href']:
# a['href'] = "http:" + a['href']
# print "Found the URL:", a['href']
# links.append(a['href'])
# versi regex : find all url and save to links
for link in link_exr.findall(content):
if "detik.com" in link[0]:
link_detik = link[0]
if "http:" not in link_detik:
link_detik = "http:" + link_detik
links.append(link_detik)
# save to DataFrame
df = DataFrame(links, columns=['detik url'])
df.drop_duplicates()
print df.head(0)
# create and save to sqlite database
detik_db = create_engine("mysql://root:root@localhost/data_detik")
df.to_sql('url_detik', detik_db, if_exists='replace')
if __name__ == "__main__":
url = sys.argv[1]
get_link(url)