/
afraidorg.py
148 lines (129 loc) · 4.23 KB
/
afraidorg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#! /usr/bin/env python
# -*- coding: utf-8 -*-
from BeautifulSoup import BeautifulSoup
import urllib
import logging
import time
import MySQLdb as mysql
from string import Template
### MySQL Parameters ###
# Connection settings for the local `afraidorg` database.
MHOST = "localhost"
MUSER = "root"
MPASSWORD = ""  # NOTE(review): empty root password -- acceptable for local dev only
MDBNAME = "afraidorg"
### Site-Specific Parameters ###
# The registry index is paginated as page-1.html, page-2.html, ...
url = "http://freedns.afraid.org/domain/registry/"
urlParameters = "page-$count.html"  # string.Template pattern; $count is substituted with the page number
SLEEP = 10  # seconds to wait before each page fetch (politeness delay)
class AfraidorgParser:
    """Scrape the freedns.afraid.org domain registry and mirror the
    (domain_name, host_count, register_date) rows into a MySQL table.

    Workflow: fetch page 1 to learn the total page count, then walk every
    page, parse its table rows with BeautifulSoup, and insert (or update
    on duplicate) each row via `insertToMySQL`.
    """

    # Redundant with the instance attribute set in __init__; kept so any
    # external code reading it off the class keeps working.
    pageCount = 1

    def readRemoteDocument(self, pageCount):
        """Fetch registry page *pageCount* and return its HTML as a string.

        Retries up to 10 times on IOError; returns None when every attempt
        fails (callers tolerate this -- BeautifulSoup(None) raises and is
        caught by their own retry/except logic).
        """
        finalUrl = self.url + self.urlParameters.substitute(count=pageCount)
        for attempt in xrange(10):
            try:
                self.logger.debug("Fetching remote document at " + finalUrl)
                docHandle = urllib.urlopen(finalUrl)
                try:
                    return docHandle.read()
                finally:
                    # The original leaked the handle; close it either way.
                    docHandle.close()
            except IOError as e:
                self.logger.debug(e)
        return None

    def mineContent(self):
        """Walk every registry page and persist the extracted rows."""
        # Get the page count first so we know how far to iterate.
        self.totalPage = self.getTotalPageNumber()
        if self.totalPage is None:
            self.logger.debug("Couldn't get the total page count, quitting")
            return
        for self.pageCount in xrange(self.pageCount, self.totalPage + 1):
            # Sleep before each connection to mimic a human visitor.
            self.sleep()
            tuples = []
            for attempt in xrange(10):
                # Reset on every attempt: the original kept partial rows
                # from a broken attempt, duplicating them on retry.
                tuples = []
                try:
                    doc = self.readRemoteDocument(self.pageCount)
                    soup = BeautifulSoup(doc)
                    rows = soup.center.table.findAll("tr")
                    for row in rows:
                        # Data rows have a plain first cell: no <form>
                        # (search/ad rows) and no <font> (header rows).
                        if row.td.contents[0] != None and row.form == None and row.td.font == None:
                            domainName = row.contents[0].a.string.strip()
                            # First cell holds "(N hosts)"; token 1 is "(N)".
                            numberOfHosts = int(self.chunkString(row.contents[0].span.contents[0].string.strip(), 1, " ").strip("()"))
                            # Fourth cell's 4th token is "(date)".
                            registerDate = self.chunkString(row.contents[3].string.strip(), 4, " ").strip("()")
                            tuples.append((domainName, numberOfHosts, registerDate))
                except AttributeError as e:
                    # A missing tag means a truncated/garbled fetch.
                    self.logger.debug("broken document, retrying")
                    continue
                break
            self.insertToMySQL(tuples)

    def getTotalPageNumber(self):
        """Return the total number of registry pages, or None on failure."""
        try:
            doc = self.readRemoteDocument(self.pageCount)
            soup = BeautifulSoup(doc)
            # Scan all <tr>s; the page count lives in the pager row's
            # third cell, inside a <font> tag.
            rows = soup.center.table.findAll("tr")
            for row in rows:
                if len(row.contents) > 2:
                    cell = row.contents[2]
                    if cell.font != None and len(cell.font.contents) > 1:
                        # First digit-only token of the pager text is the count.
                        for token in cell.font.contents[2].split():
                            if token.isdigit():
                                self.logger.debug("Got the total page count: " + token)
                                return int(token)
            return None
        except Exception as e:
            # Also covers the TypeError raised when the fetch returned
            # None. The original had a separate `except TypeError` after
            # `except Exception`, which was unreachable.
            self.logger.debug(e)
            return None

    def chunkString(self, str, index, splitter):
        """Return the 1-based *index*-th chunk of *str* split on *splitter*,
        or None when there are fewer chunks than *index*.

        (Parameter name `str` shadows the builtin; kept for signature
        compatibility with existing callers.)
        """
        chunks = str.split(splitter)
        if 1 <= index <= len(chunks):
            return chunks[index - 1]
        return None

    def connectToMySQL(self):
        """Open the MySQL connection; failures are logged, not raised."""
        try:
            self.conn = mysql.connect(MHOST, MUSER, MPASSWORD, MDBNAME)
        except Exception as e:
            self.logger.debug(e)

    def disconnectToMySQL(self):
        """Close the MySQL connection."""
        # Fix: the MySQLdb module has no `close()`; close the connection
        # object itself (the original `mysql.close(self.conn)` raised
        # AttributeError).
        self.conn.close()

    def insertToMySQL(self, tuples):
        """Insert each (domain, host_count, register_date) record; on a
        duplicate-key error (1062) fall back to an UPDATE with fresh data."""
        for record in tuples:
            try:
                cursor = self.conn.cursor()
                cursor.execute("INSERT INTO domains (domain_name, host_count, register_date) VALUES (%s, %s, %s)", record)
                self.conn.commit()
            except mysql.Error as e:
                if e.args[0] == 1062:  # ER_DUP_ENTRY
                    self.logger.debug("duplicated entry detected, updating db with fresh data")
                    self.updateMysql(record)
                else:
                    self.logger.debug(e)

    def updateMysql(self, tuple):
        """Refresh host_count and register_date for an existing domain."""
        cursor = self.conn.cursor()
        cursor.execute("UPDATE domains SET host_count = %s, register_date = %s where domain_name=%s", (tuple[1], tuple[2], tuple[0]))
        self.conn.commit()

    def sleep(self):
        """Pause SLEEP seconds between fetches to avoid hammering the site."""
        self.logger.debug("Sleeping for " + str(SLEEP) + " seconds")
        time.sleep(SLEEP)

    def __init__(self, url, urlParameters):
        """Set up state and a DEBUG console logger.

        url           -- base registry URL.
        urlParameters -- string.Template source with a $count placeholder.
        """
        self.url = url
        self.urlParameters = Template(urlParameters)
        self.totalPage = 0
        self.pageCount = 1
        # Logger; NOTE(review): each instantiation adds another stream
        # handler to the same named logger, duplicating output if more
        # than one parser is created in a process.
        self.logger = logging.getLogger("afraidorgparser")
        self.logger.setLevel(logging.DEBUG)
        self.consoleHandler = logging.StreamHandler()
        self.consoleHandler.setLevel(logging.DEBUG)
        self.logger.addHandler(self.consoleHandler)

    def main(self):
        """Connect to MySQL, then mine every registry page."""
        self.connectToMySQL()
        self.mineContent()
if __name__ == "__main__":
    # Script entry point: build a parser for the registry URL and run
    # the full scrape-into-MySQL pass.
    AfraidorgParser(url, urlParameters).main()