def parse(self, browser):
    """Extract links from a fetched page, store e-mails and queue URLs.

    Every distinct ``href=`` value found in ``browser.page`` is handled once:

    * a value containing ``mailto:`` has its address inserted into the
      ``emails`` table together with ``browser.get_url()``;
    * any other value is turned into an ``http://`` URL on ``browser.host``
      (unless it already starts with ``http://``) and inserted into the
      ``queue`` table, provided no row with the same URL was created within
      the last 1209600 seconds (14 days).

    Args:
        browser: object providing ``page`` (the document text), ``host``
            and ``get_url()``.
    """
    hrefs = re.findall(r'href=([\'"][^\'"]+[\'"]|[^\n\s>]*)', browser.page, re.S)

    # Compiled with DOTALL so quoted values spanning several lines are
    # unquoted too (passing re.S as the 4th positional argument of re.sub
    # would set ``count``, not ``flags``).
    surrounding_quotes = re.compile(r'^[\'"](.*)[\'"]$', re.S)

    seen_hrefs = set()   # raw href values already handled
    queued_urls = set()  # normalised URLs already checked against the queue

    for raw in hrefs:
        if raw in seen_hrefs or raw.strip() == '':
            continue
        seen_hrefs.add(raw)

        link = surrounding_quotes.sub(r'\1', raw).strip()

        mail = re.search(r'mailto:(?P<email>[a-zA-Z0-9@_\-\.]*)', link, re.S)
        if mail:
            # Parenthesised single-argument print behaves the same on
            # Python 2 and Python 3.
            print('mailto founded')  # debugging
            if mail.group('email'):
                db.query('INSERT into `emails` (`time`,`url`,`email`) VALUES (UNIX_TIMESTAMP(),"'+db.escape_string(browser.get_url())+'","'+db.escape_string(mail.group('email'))+'")')
                print('email inserted')  # debugging
            continue

        # NOTE: only values starting with "http://" are treated as absolute;
        # everything else is resolved against the root of browser.host.
        if not re.search('^http://', link):
            if link and link[0] != '/':
                link = '/' + link
            link = 'http://' + browser.host + link

        # Different raw hrefs can normalise to the same URL; query once.
        if link in queued_urls:
            continue
        queued_urls.add(link)

        db.query('SELECT `id` from `queue` WHERE `url`="'+db.escape_string(link)+'" AND `created`>UNIX_TIMESTAMP()-1209600')
        if not db.store_result().num_rows():
            db.query('INSERT into `queue` (`created`,`url`) VALUES (UNIX_TIMESTAMP(),"'+db.escape_string(link)+'")')
def get(self, id='', host=''):
    """Load a host record by id, or by host name when no id is given.

    With ``id``: the ``hosts`` table is queried for that id. If exactly one
    row matches, ``self.id`` and ``self.host`` are taken from it; otherwise
    ``self.host`` is set to ``host`` when one was supplied.

    With only ``host``: a leading ``www.`` is removed, then the table is
    queried by host name. If exactly one row matches, ``self.id`` and
    ``self.host`` are taken from it; otherwise ``self.host`` is set to the
    stripped host name.

    With neither argument nothing happens.

    Args:
        id: primary key of the host row (converted with ``str``).
        host: host name.
    """
    if id:
        db.query('SELECT `id`,`host` FROM `hosts` WHERE `id`='+db.escape_string(str(id)))
        res = db.store_result()
        if res.num_rows() == 1:
            # fetch_row() yields a tuple of rows; unpack the single row,
            # not the enclosing tuple.
            self.id, self.host = res.fetch_row()[0]
        elif host:
            self.host = host
    elif host:
        # Escape the dot so only a literal "www." prefix is removed.
        host = re.sub(r'^www\.', '', host)
        db.query('SELECT `id`,`host` FROM `hosts` WHERE `host`="'+db.escape_string(host)+'"')
        res = db.store_result()
        if res.num_rows() == 1:
            self.id, self.host = res.fetch_row()[0]
        else:
            self.host = host
def next(self):
    """Fetch the unfinished queue entry with the lowest id.

    Returns:
        The row as produced by ``fetch_row(how=1)`` (also stored in
        ``self.query``), or ``False`` when the query does not yield exactly
        one row.
    """
    db.query('SELECT * from `queue` WHERE `finished`=0 ORDER by `id` ASC LIMIT 1')
    pending = db.store_result()

    # Guard clause: nothing waiting in the queue.
    if pending.num_rows() != 1:
        return False

    self.query = pending.fetch_row(how=1)[0]
    return self.query
def get(self, id='', host_id='', path=''):
    """Load a page record by id, or by ``(host_id, path)`` when no id is given.

    If exactly one row of ``pages`` matches the lookup, ``self.id``,
    ``self.host_id`` and ``self.path`` are taken from it. Otherwise, when
    both ``host_id`` and ``path`` were supplied, they are stored on the
    instance as given.

    Args:
        id: primary key of the page row (converted with ``str``).
        host_id: id of the owning host.
        path: page path.
    """
    if id:
        by_id = 'SELECT `id`,`host_id`,`path` FROM `pages` WHERE `id`=' + db.escape_string(str(id))
        db.query(by_id)
        found = db.store_result()
        if found.num_rows() == 1:
            record = found.fetch_row()[0]
            self.id, self.host_id, self.path = record
        elif host_id and path:
            self.host_id = host_id
            self.path = path
        return

    # Without an id the lookup needs both parts of the location.
    if not (host_id and path):
        return

    by_location = (
        'SELECT `id`,`host_id`,`path` FROM `pages` WHERE `host_id`="'
        + db.escape_string(str(host_id))
        + '" AND `path`="'
        + db.escape_string(path)
        + '"'
    )
    db.query(by_location)
    found = db.store_result()
    if found.num_rows() != 1:
        self.host_id = host_id
        self.path = path
        return
    record = found.fetch_row()[0]
    self.id, self.host_id, self.path = record
def get(self, id='', page_id='', status=''):
    """Load a visit by id, or initialise one from ``page_id`` and ``status``.

    With ``id``: the ``visits`` table is queried and, if exactly one row
    matches, ``self.id``, ``self.page_id`` and ``self.status`` are taken
    from it. Without ``id``: when both ``page_id`` and ``status`` are
    supplied they are stored on the instance.

    Args:
        id: primary key of the visit row (converted with ``str``).
        page_id: id of the visited page.
        status: status value of the visit.
    """
    if not id:
        if page_id and status:
            self.page_id = page_id
            self.status = status
        return

    lookup = 'SELECT `id`,`page_id`,`status` FROM `visits` WHERE `id`=' + db.escape_string(str(id))
    db.query(lookup)
    found = db.store_result()
    if found.num_rows() == 1:
        record = found.fetch_row()[0]
        self.id, self.page_id, self.status = record
def get(self, visit_id='', errors=''):
    """Load the errors of a visit, or initialise them from ``errors``.

    With ``visit_id`` only: ``self.visit_id`` is set and the ``error_id`` of
    every ``page_errors`` row for that visit is appended to ``self.errors``
    (which must already exist on the instance). ``self.errors`` is rebound
    to a new list rather than extended in place.

    With ``visit_id`` and ``errors``: both are stored on the instance as
    given.

    Args:
        visit_id: id of the visit (converted with ``str``).
        errors: error ids to store instead of querying.
    """
    if visit_id and not errors:
        self.visit_id = visit_id
        db.query('SELECT `id`,`visit_id`,`error_id` FROM `page_errors` WHERE `visit_id`='+db.escape_string(str(visit_id)))
        res = db.store_result()
        if res.num_rows():
            error_ids = []
            while True:
                # Fetch once per iteration; an empty tuple marks the end of
                # the result set (indexing it would raise IndexError).
                rows = res.fetch_row()
                if not rows:
                    break
                # Third selected column is `error_id`.
                error_ids.append(rows[0][2])
            self.errors = self.errors + error_ids
    elif visit_id and errors:
        self.visit_id = visit_id
        self.errors = errors
def get(self, id="", visit_id="", page_id="", browser=""):
    """Load a cache entry by id, or fill one from a browser object.

    With ``id``: the ``cache`` table is queried and, if exactly one row
    matches, ``id``, ``visit_id``, ``page_id``, ``encoding``, ``doctype``,
    ``validity`` and ``content`` are copied onto the instance.

    Without ``id`` but with ``visit_id``, ``page_id`` and ``browser``: the
    ids, ``browser.headers`` and ``browser.page`` are stored. When
    ``browser.validate`` is truthy the validator's charset and doctype are
    stored as well, plus its validity (as ``int``) when that is truthy.

    Args:
        id: primary key of the cache row (converted with ``str``).
        visit_id: id of the visit the content belongs to.
        page_id: id of the page the content belongs to.
        browser: object providing ``headers``, ``page``, ``validate`` and
            ``validator``.
    """
    if id:
        lookup = (
            "SELECT `id`,`visit_id`,`page_id`,`encoding`,`doctype`,`validity`,`content` FROM `cache` WHERE `id`="
            + db.escape_string(str(id))
        )
        db.query(lookup)
        found = db.store_result()
        if found.num_rows() == 1:
            record = found.fetch_row()[0]
            (
                self.id,
                self.visit_id,
                self.page_id,
                self.encoding,
                self.doctype,
                self.validity,
                self.content,
            ) = record
        return

    if not (visit_id and page_id and browser):
        return

    self.visit_id = visit_id
    self.page_id = page_id
    self.headers = browser.headers
    if browser.validate:
        report = browser.validator
        self.encoding = report.charset
        self.doctype = report.doctype
        # Validity is only recorded when the validator reports a truthy value.
        if browser.validator.validity:
            self.validity = int(browser.validator.validity)
    self.content = browser.page