Exemple #1
0
 def parse(self, browser):
     """Harvest the links of a fetched page: store e-mails, queue new URLs.

     Every ``href=`` value found in ``browser.page`` is unquoted and handled
     once (repeated values are ignored, empty values are skipped):

     * a value containing ``mailto:`` has its address inserted into the
       ``emails`` table together with ``browser.get_url()``;
     * any other value is made absolute against ``browser.host`` unless it
       already starts with ``http://``, then inserted into the ``queue``
       table, provided no row for that URL was created during the last
       1209600 seconds (14 days).

     Fixes compared with the previous implementation:

     * the list of matches was mutated (blanked / ``remove``d) while being
       iterated, which could skip entries; duplicates are now tracked in
       sets instead;
     * ``re.S`` was passed to ``re.sub`` positionally, where it is taken as
       ``count`` rather than ``flags``; the pattern is now compiled with it;
     * the ``mailto:`` character class was corrupted (it contained the
       literal text "[email protected]") and could not match an address.

     :param browser: object exposing ``page``, ``host`` and ``get_url()``.
     :returns: ``None``.
     """
     href_values = re.findall(r'href=([\'"][^\'"]+[\'"]|[^\n\s>]*)', browser.page, re.S)
     # DOTALL so that a quoted value spanning several lines is still unquoted.
     unquote = re.compile(r'^[\'"](.*)[\'"]$', re.S)
     mailto = re.compile(r'mailto:(?P<email>[A-Za-z0-9@_.\-]*)')

     seen_links = set()   # unquoted href values already handled
     queued_urls = set()  # absolute URLs already offered to the queue
     for raw in href_values:
         link = unquote.sub(r'\1', raw).strip()
         if not link or link in seen_links:
             continue
         seen_links.add(link)

         mail = mailto.search(link)
         if mail:
             print('mailto founded')  # debugging
             if mail.group('email'):
                 db.query(
                     'INSERT into `emails` (`time`,`url`,`email`) VALUES (UNIX_TIMESTAMP(),"'
                     + db.escape_string(browser.get_url())
                     + '","'
                     + db.escape_string(mail.group('email'))
                     + '")'
                 )
                 print('email inserted')  # debugging
             continue

         # NOTE(review): only "http://" counts as absolute, as before; values
         # such as "https://..." or "#anchor" are still prefixed with the host.
         url = link
         if not url.startswith('http://'):
             if not url.startswith('/'):
                 url = '/' + url
             url = 'http://' + browser.host + url

         # Different href spellings ("foo" and "/foo") can yield the same URL.
         if url in queued_urls:
             continue
         queued_urls.add(url)

         db.query(
             'SELECT `id` from `queue` WHERE `url`="'
             + db.escape_string(url)
             + '" AND `created`>UNIX_TIMESTAMP()-1209600'
         )
         if not db.store_result().num_rows():
             db.query(
                 'INSERT into `queue` (`created`,`url`) VALUES (UNIX_TIMESTAMP(),"'
                 + db.escape_string(url)
                 + '")'
             )
Exemple #2
0
 def get(self, id='', host=''):
     """Load a host record by id or by name, falling back to the given name.

     * With ``id``: look the row up in ``hosts`` by id. When exactly one row
       is found, ``self.id`` and ``self.host`` are taken from it; otherwise
       ``self.host`` is set to ``host`` if one was supplied.
     * With only ``host``: a leading ``www.`` is removed, then the row is
       looked up by name. When exactly one row is found, ``self.id`` and
       ``self.host`` are taken from it; otherwise only ``self.host`` is set
       (to the stripped name).
     * With neither argument nothing happens.

     Fixes: the id branch unpacked ``fetch_row()`` itself instead of its
     first row, and the ``www.`` pattern had an unescaped dot that also
     stripped prefixes such as ``wwwx``.

     :param id: primary key of the host row (converted with ``str``).
     :param host: host name.
     :returns: ``None``.
     """
     if id:
         db.query('SELECT `id`,`host` FROM `hosts` WHERE `id`='+db.escape_string(str(id)))
         res = db.store_result()
         if res.num_rows() == 1:
             # fetch_row() yields a sequence of rows; unpack the single row.
             self.id, self.host = res.fetch_row()[0]
         elif host:
             self.host = host
     elif host:
         # Escaped dot: only a literal "www." prefix is removed.
         host = re.sub(r'^www\.', '', host)
         db.query('SELECT `id`,`host` FROM `hosts` WHERE `host`="'+db.escape_string(host)+'"')
         res = db.store_result()
         if res.num_rows() == 1:
             self.id, self.host = res.fetch_row()[0]
         else:
             self.host = host
Exemple #3
0
 def next(self):
     """Return the unfinished ``queue`` row with the lowest id, or ``False``.

     The row is fetched with ``how=1`` (dictionary rows in MySQLdb --
     confirm against the ``db`` wrapper in use) and is also remembered on
     ``self.query``. ``self.query`` is left untouched when nothing is
     pending.
     """
     db.query('SELECT * from `queue` WHERE `finished`=0 ORDER by `id` ASC LIMIT 1')
     pending = db.store_result()
     if pending.num_rows() != 1:
         return False
     self.query = pending.fetch_row(how=1)[0]
     return self.query
Exemple #4
0
 def get(self, id='', host_id='', path=''):
     """Load a page record by id, or by its (host_id, path) pair.

     * With ``id``: the ``pages`` row is looked up by id. A single match
       fills ``self.id``, ``self.host_id`` and ``self.path``; otherwise the
       supplied ``host_id`` and ``path`` are stored when both are given.
     * Without ``id`` but with both ``host_id`` and ``path``: the row is
       looked up by that pair. A single match fills all three attributes;
       otherwise only ``self.host_id`` and ``self.path`` are set from the
       arguments.
     * Any other combination leaves the object untouched.

     :returns: ``None``.
     """
     if id:
         db.query('SELECT `id`,`host_id`,`path` FROM `pages` WHERE `id`='+db.escape_string(str(id)))
         by_id = db.store_result()
         if by_id.num_rows() == 1:
             self.id, self.host_id, self.path = by_id.fetch_row()[0]
             return
         if host_id and path:
             self.host_id = host_id
             self.path = path
         return

     if not (host_id and path):
         return

     db.query('SELECT `id`,`host_id`,`path` FROM `pages` WHERE `host_id`="'+db.escape_string(str(host_id))+'" AND `path`="'+db.escape_string(path)+'"')
     by_location = db.store_result()
     if by_location.num_rows() == 1:
         self.id, self.host_id, self.path = by_location.fetch_row()[0]
         return
     self.host_id = host_id
     self.path = path
Exemple #5
0
 def get(self, id='', page_id='', status=''):
     """Load a visit by id, or initialise one from ``page_id`` and ``status``.

     With ``id`` the ``visits`` row is looked up and, when exactly one row
     is found, ``self.id``, ``self.page_id`` and ``self.status`` are taken
     from it. Without ``id``, the two other arguments are stored on the
     object when both are truthy. Otherwise nothing is changed.

     :returns: ``None``.
     """
     if not id:
         if page_id and status:
             self.page_id, self.status = page_id, status
         return

     db.query('SELECT `id`,`page_id`,`status` FROM `visits` WHERE `id`='+db.escape_string(str(id)))
     visit = db.store_result()
     if visit.num_rows() == 1:
         row = visit.fetch_row()[0]
         self.id, self.page_id, self.status = row
Exemple #6
0
 def get(self, visit_id='', errors=''):
     """Load the errors of a visit, or initialise them from ``errors``.

     * ``visit_id`` with ``errors``: both values are stored on the object.
     * ``visit_id`` alone: ``self.visit_id`` is stored and the ``error_id``
       (third selected column) of every matching ``page_errors`` row is
       appended to ``self.errors``, which must already exist because it is
       read before being extended.
     * Without ``visit_id`` nothing happens.

     Fix: the previous loop called ``fetch_row()`` twice per iteration (so
     every other row was dropped) and indexed ``[0]`` before checking for
     exhaustion, so it failed instead of reaching its ``break``.

     :returns: ``None``.
     """
     if not visit_id:
         return

     self.visit_id = visit_id
     if errors:
         self.errors = errors
         return

     db.query('SELECT `id`,`visit_id`,`error_id` FROM `page_errors` WHERE `visit_id`='+db.escape_string(str(visit_id)))
     res = db.store_result()
     while True:
         # One fetch per iteration; an empty result marks the end of the set.
         rows = res.fetch_row()
         if not rows:
             break
         # Rebind rather than append in place, as the original did.
         self.errors = self.errors + [rows[0][2]]
Exemple #7
0
 def get(self, id="", visit_id="", page_id="", browser=""):
     """Load a cache entry by id, or build one from a fetched page.

     * With ``id``: the ``cache`` row is looked up and, when exactly one
       row is found, its seven selected columns are copied onto the object
       (``id``, ``visit_id``, ``page_id``, ``encoding``, ``doctype``,
       ``validity``, ``content``).
     * Otherwise, when ``visit_id``, ``page_id`` and ``browser`` are all
       truthy: the ids, ``browser.headers`` and ``browser.page`` are stored;
       if ``browser.validate`` is truthy the validator's charset and doctype
       are stored too, and ``validity`` is stored as an ``int`` when it is
       truthy.
     * Any other combination leaves the object untouched.

     :returns: ``None``.
     """
     if id:
         sql = (
             "SELECT `id`,`visit_id`,`page_id`,`encoding`,`doctype`,`validity`,`content` FROM `cache` WHERE `id`="
             + db.escape_string(str(id))
         )
         db.query(sql)
         cached = db.store_result()
         if cached.num_rows() == 1:
             row = cached.fetch_row()[0]
             (
                 self.id,
                 self.visit_id,
                 self.page_id,
                 self.encoding,
                 self.doctype,
                 self.validity,
                 self.content,
             ) = row
         return

     if not (visit_id and page_id and browser):
         return

     self.visit_id = visit_id
     self.page_id = page_id
     self.headers = browser.headers
     if browser.validate:
         # Validation results are only copied when validation was enabled.
         self.encoding = browser.validator.charset
         self.doctype = browser.validator.doctype
         if browser.validator.validity:
             self.validity = int(browser.validator.validity)
     self.content = browser.page