def next(self):
    """Load the oldest unfinished entry of the `queue` table.

    Selects the row with the lowest `id` whose `finished` column is 0.
    When exactly one row comes back it is stored on ``self.query``
    (fetched with ``how=1``) and returned.  Otherwise ``False`` is
    returned and ``self.query`` is left untouched.
    """
    db.query('SELECT * from `queue` WHERE `finished`=0 ORDER by `id` ASC LIMIT 1')
    pending = db.store_result()
    if pending.num_rows() != 1:
        return False
    self.query = pending.fetch_row(how=1)[0]
    return self.query
def save(self):
    """Replace the stored error list of this visit in `page_errors`.

    Requires both ``self.visit_id`` and ``self.errors`` to be truthy;
    otherwise nothing is written and ``False`` is returned.  On success
    all existing rows for the visit are deleted, one row is inserted per
    entry of ``self.errors``, and ``True`` is returned.
    """
    if not (self.visit_id and self.errors):
        return False

    visit = db.escape_string(str(self.visit_id))
    # Drop the previous rows first so the table mirrors self.errors exactly.
    db.query('DELETE from `page_errors` WHERE `visit_id`=' + visit)
    for error_id in self.errors:
        db.query(
            'INSERT into `page_errors` (`visit_id`,`error_id`) VALUES ("'
            + visit + '","' + db.escape_string(str(error_id)) + '")'
        )
    return True
def get(self, id='', page_id='', status=''):
    """Populate this visit from the database or from the given values.

    With a truthy ``id`` the matching `visits` row is looked up; if
    exactly one row is found, ``self.id``, ``self.page_id`` and
    ``self.status`` are set from it (nothing changes otherwise).
    Without an ``id``, ``page_id`` and ``status`` are copied onto the
    instance when both are truthy.  Returns nothing.
    """
    if id:
        db.query(
            'SELECT `id`,`page_id`,`status` FROM `visits` WHERE `id`='
            + db.escape_string(str(id))
        )
        found = db.store_result()
        if found.num_rows() == 1:
            row = found.fetch_row()[0]
            self.id, self.page_id, self.status = row
        return

    if page_id and status:
        self.page_id = page_id
        self.status = status
def save(self):
    """Insert or update this page in the `pages` table.

    Returns ``False`` without touching the database unless both
    ``self.host_id`` and ``self.path`` are truthy.  When ``self.id`` is
    set the existing row is updated; otherwise a new row is inserted and
    ``self.id`` takes the value of ``db.insert_id()``.  Returns ``True``
    after either write.
    """
    if not (self.host_id and self.path):
        return False

    host_id = db.escape_string(str(self.host_id))
    path = db.escape_string(self.path)
    if self.id:
        db.query(
            'UPDATE `pages` SET `host_id`="' + host_id + '", `path`="' + path
            + '" WHERE `id`=' + db.escape_string(str(self.id))
        )
    else:
        db.query(
            'INSERT into `pages` (`host_id`,`path`) VALUES ("'
            + host_id + '","' + path + '")'
        )
        self.id = db.insert_id()
    return True
def save(self):
    """Insert or update this host in the `hosts` table.

    Returns ``False`` when ``self.host`` is falsy.  Otherwise the row
    identified by ``self.id`` is updated, or - when ``self.id`` is
    falsy - a new row is inserted and ``self.id`` is set from
    ``db.insert_id()``.  Returns ``True`` after either write.
    """
    if not self.host:
        return False

    host = db.escape_string(self.host)
    if self.id:
        db.query(
            'UPDATE `hosts` SET `host`="' + host
            + '" WHERE `id`=' + db.escape_string(str(self.id))
        )
    else:
        db.query('INSERT into `hosts` (`host`) VALUES ("' + host + '")')
        self.id = db.insert_id()
    return True
def save(self):
    """Insert or update this visit in the `visits` table.

    Returns ``False`` unless both ``self.page_id`` and ``self.status``
    are truthy.  The `time` column is always set to ``UNIX_TIMESTAMP()``
    by the database.  An existing row (``self.id`` set) is updated;
    otherwise a row is inserted and ``self.id`` is set from
    ``db.insert_id()``.  Returns ``True`` after either write.
    """
    if not (self.page_id and self.status):
        return False

    page_id = db.escape_string(str(self.page_id))
    status = db.escape_string(str(self.status))
    if self.id:
        db.query(
            'UPDATE `visits` SET `page_id`="' + page_id
            + '", `time`=UNIX_TIMESTAMP(), `status`="' + status
            + '" WHERE `id`=' + db.escape_string(str(self.id))
        )
    else:
        db.query(
            'INSERT into `visits` (`page_id`,`time`,`status`) VALUES ("'
            + page_id + '",UNIX_TIMESTAMP(),"' + status + '")'
        )
        self.id = db.insert_id()
    return True
def get(self, visit_id='', errors=''):
    """Populate the error list of a visit.

    Nothing happens when ``visit_id`` is falsy.  Otherwise
    ``self.visit_id`` is set and:

    * if ``errors`` is truthy it is stored on ``self.errors`` as given;
    * else the `error_id` of every `page_errors` row belonging to the
      visit is appended to ``self.errors``.

    Returns nothing.
    """
    if not visit_id:
        return
    self.visit_id = visit_id

    if errors:
        self.errors = errors
        return

    db.query(
        'SELECT `id`,`visit_id`,`error_id` FROM `page_errors` WHERE `visit_id`='
        + db.escape_string(str(visit_id))
    )
    res = db.store_result()
    # maxrows=0 returns every remaining row in one call.  The previous loop
    # indexed [0] into the empty tuple returned at the end of the result
    # (IndexError) and called fetch_row() twice per iteration, silently
    # dropping every other row.
    rows = res.fetch_row(maxrows=0)
    if rows:
        # Build a new list instead of extending in place, as before.
        self.errors = self.errors + [row[2] for row in rows]
def get(self, id='', host=''):
    """Populate this host from the database or from the given name.

    * With a truthy ``id`` the `hosts` row with that id is loaded into
      ``self.id`` / ``self.host``.  If no single row matches and ``host``
      is truthy, ``self.host`` is set to ``host`` as given.
    * With only ``host``, a leading ``www.`` is removed and the `hosts`
      table is searched by name.  On a single match ``self.id`` and
      ``self.host`` come from the row; otherwise ``self.host`` is set to
      the stripped name so the host can be saved later.

    Returns nothing.
    """
    if id:
        db.query(
            'SELECT `id`,`host` FROM `hosts` WHERE `id`='
            + db.escape_string(str(id))
        )
        res = db.store_result()
        if res.num_rows() == 1:
            # fetch_row() returns a tuple of rows; take the first row
            # before unpacking (the missing [0] raised ValueError).
            self.id, self.host = res.fetch_row()[0]
        elif host:
            self.host = host
    elif host:
        # Escape the dot so only a literal "www." prefix is removed
        # (an unescaped "." also matched e.g. "wwwx").
        host = re.sub(r'^www\.', '', host)
        db.query(
            'SELECT `id`,`host` FROM `hosts` WHERE `host`="'
            + db.escape_string(host) + '"'
        )
        res = db.store_result()
        if res.num_rows() == 1:
            self.id, self.host = res.fetch_row()[0]
        else:
            self.host = host
def get(self, id='', host_id='', path=''):
    """Populate this page from the database or from the given values.

    Does nothing unless ``id`` or ``path`` is truthy.

    * With a truthy ``id`` the `pages` row with that id is loaded.  If
      no single row matches and both ``host_id`` and ``path`` are truthy,
      those two values are copied onto the instance instead.
    * Without ``id`` but with ``host_id`` and ``path``, the table is
      searched for that pair.  A single match fills ``self.id``,
      ``self.host_id`` and ``self.path`` from the row; otherwise the
      given ``host_id`` and ``path`` are stored.

    Returns nothing.
    """
    if not (id or path):
        return

    if id:
        db.query(
            'SELECT `id`,`host_id`,`path` FROM `pages` WHERE `id`='
            + db.escape_string(str(id))
        )
        by_id = db.store_result()
        if by_id.num_rows() == 1:
            self.id, self.host_id, self.path = by_id.fetch_row()[0]
        elif host_id and path:
            self.host_id = host_id
            self.path = path
        return

    if host_id and path:
        db.query(
            'SELECT `id`,`host_id`,`path` FROM `pages` WHERE `host_id`="'
            + db.escape_string(str(host_id))
            + '" AND `path`="' + db.escape_string(path) + '"'
        )
        by_location = db.store_result()
        if by_location.num_rows() == 1:
            self.id, self.host_id, self.path = by_location.fetch_row()[0]
        else:
            self.host_id = host_id
            self.path = path
def save(self):
    """Insert or update this entry in the `cache` table.

    Returns ``False`` unless both ``self.visit_id`` and ``self.page_id``
    are truthy.  ``visit_id``, ``page_id``, ``validity`` and ``headers``
    are passed through ``str()`` before escaping; ``encoding``,
    ``doctype`` and ``content`` are escaped as they are.  An existing row
    (``self.id`` set) is updated; otherwise a row is inserted and
    ``self.id`` is set from ``db.insert_id()``.  Returns ``True`` after
    either write.
    """
    if not (self.visit_id and self.page_id):
        return False

    esc = db.escape_string
    if self.id:
        statement = ''.join([
            'UPDATE `cache` SET `visit_id`="', esc(str(self.visit_id)),
            '", `page_id`="', esc(str(self.page_id)),
            '", `encoding`="', esc(self.encoding),
            '", `doctype`="', esc(self.doctype),
            '", `validity`="', esc(str(self.validity)),
            '", `headers`="', esc(str(self.headers)),
            '", `content`="', esc(self.content),
            '" WHERE `id`=', esc(str(self.id)),
        ])
        db.query(statement)
        return True

    statement = ''.join([
        'INSERT into `cache` (`visit_id`,`page_id`,`encoding`,`doctype`,'
        '`validity`,`content`,`headers`) VALUES ("',
        esc(str(self.visit_id)),
        '","', esc(str(self.page_id)),
        '","', esc(self.encoding),
        '","', esc(self.doctype),
        '","', esc(str(self.validity)),
        '","', esc(self.content),
        '","', esc(str(self.headers)),
        '")',
    ])
    db.query(statement)
    self.id = db.insert_id()
    return True
def get(self, id="", visit_id="", page_id="", browser=""):
    """Populate this cache entry from the database or from a browser.

    * With a truthy ``id`` the `cache` row with that id is loaded into
      ``id``, ``visit_id``, ``page_id``, ``encoding``, ``doctype``,
      ``validity`` and ``content``.  The `headers` column is not
      selected, so ``self.headers`` is left as it was.
    * Without ``id`` but with ``visit_id``, ``page_id`` and ``browser``
      all truthy, the entry is filled from the arguments:
      ``browser.headers`` and ``browser.page`` are always copied, while
      charset, doctype and (when truthy, converted with ``int``)
      validity are taken from ``browser.validator`` only if
      ``browser.validate`` is truthy.

    Returns nothing.
    """
    if id:
        db.query(
            "SELECT `id`,`visit_id`,`page_id`,`encoding`,`doctype`,`validity`,`content` FROM `cache` WHERE `id`="
            + db.escape_string(str(id))
        )
        stored = db.store_result()
        if stored.num_rows() == 1:
            (
                self.id,
                self.visit_id,
                self.page_id,
                self.encoding,
                self.doctype,
                self.validity,
                self.content,
            ) = stored.fetch_row()[0]
        return

    if not (visit_id and page_id and browser):
        return

    self.visit_id = visit_id
    self.page_id = page_id
    self.headers = browser.headers
    if browser.validate:
        self.encoding = browser.validator.charset
        self.doctype = browser.validator.doctype
        if browser.validator.validity:
            self.validity = int(browser.validator.validity)
    self.content = browser.page
def parse(self, browser):
    """Harvest ``href`` targets from ``browser.page``.

    Every distinct link is handled once:

    * A link containing ``mailto:`` is reported on stdout; when an
      address was captured it is inserted into the `emails` table
      together with ``browser.get_url()``.
    * Any other link is made absolute - anything that does not start
      with ``http://`` is treated as a path on ``browser.host`` - and is
      inserted into `queue` unless the same URL was already queued
      within the last 1209600 seconds (14 days).

    Returns nothing.
    """
    hrefs = re.findall(r'href=([\'"][^\'"]+[\'"]|[^\n\s>]*)', browser.page, re.S)

    # A set gives O(1) duplicate detection and avoids mutating the list of
    # matches while iterating over it, which used to skip elements.
    handled = set()
    for href in hrefs:
        if href.strip() == '':
            continue

        # Drop one pair of surrounding quotes.  The flag has to be passed
        # by keyword: the fourth positional argument of re.sub is `count`.
        link = re.sub(r'^[\'"](.*)[\'"]$', r'\1', href, flags=re.S).strip()

        mail = re.search(r'mailto:(?P<email>[a-zA-Z0-9@_\-\.]*)', link, re.S)
        if mail:
            if link in handled:
                continue
            handled.add(link)
            # Parenthesised single-argument print emits the same text
            # under both Python 2 and Python 3.
            print('mailto founded')  # debugging
            if mail.group('email'):
                db.query(
                    'INSERT into `emails` (`time`,`url`,`email`) VALUES (UNIX_TIMESTAMP(),"'
                    + db.escape_string(browser.get_url()) + '","'
                    + db.escape_string(mail.group('email')) + '")'
                )
                print('email inserted')  # debugging
            continue

        if not link.startswith('http://'):
            # Relative reference: anchor it at the root of the current host.
            if link and link[0] != '/':
                link = '/' + link
            link = 'http://' + browser.host + link

        if link in handled:
            continue
        handled.add(link)

        db.query(
            'SELECT `id` from `queue` WHERE `url`="' + db.escape_string(link)
            + '" AND `created`>UNIX_TIMESTAMP()-1209600'
        )
        if not db.store_result().num_rows():
            db.query(
                'INSERT into `queue` (`created`,`url`) VALUES (UNIX_TIMESTAMP(),"'
                + db.escape_string(link) + '")'
            )
def delete(self):
    """Delete this page's row from the `pages` table.

    Returns ``False`` without querying when ``self.id`` is falsy,
    ``True`` after the DELETE statement has been issued.
    """
    if not self.id:
        return False
    page_id = db.escape_string(str(self.id))
    db.query('DELETE from `pages` WHERE `id`=' + page_id)
    return True
def delete(self):
    """Delete this entry's row from the `cache` table.

    Returns ``False`` without querying when ``self.id`` is falsy,
    ``True`` after the DELETE statement has been issued.
    """
    if not self.id:
        return False
    cache_id = db.escape_string(str(self.id))
    db.query("DELETE from `cache` WHERE `id`=" + cache_id)
    return True
def aborted(self):
    """Flag the current queue entry as aborted.

    Sets `finished` to -1 for the `queue` row whose id is
    ``self.query['id']``.  Returns nothing.
    """
    queue_id = db.escape_string(str(self.query['id']))
    db.query('UPDATE `queue` SET `finished`=-1 WHERE `id`=' + queue_id)
def finished(self):
    """Flag the current queue entry as finished.

    Sets `finished` to ``UNIX_TIMESTAMP()`` for the `queue` row whose id
    is ``self.query['id']``.  Returns nothing.
    """
    queue_id = db.escape_string(str(self.query['id']))
    db.query('UPDATE `queue` SET `finished`=UNIX_TIMESTAMP() WHERE `id`=' + queue_id)