def parse(self, response, request):
    """Extract links from an HTML response and schedule them for crawling.

    Every tag of the document is inspected for a URL-bearing attribute.
    Each URL found is resolved against the current base URL; when
    ``self.schedule.isOrigin`` accepts it and it is not yet in
    ``self.urlSet``, it is queued through ``self.schedule.addRequest``
    together with each of its parent directories. ``<form>`` tags are
    additionally handed to ``self.parseForm``.

    :param response: fetched page; ``response.url`` and ``response.text``
        are read.
    :param request: not used by this method.
    """
    base_url = response.url
    soup = BeautifulSoup(response.text)
    href_tags = {"a", "link", "area"}
    src_tags = {
        "form", "script", "img", "iframe", "frame", "embed", "source", "track"
    }
    param_names = {"movie", "href", "link", "src", "url", "uri"}

    for tag in soup.findAll():
        # Keep the tag name in its own variable: the <param>/<meta> branches
        # read a "name" *attribute*, which must not be confused with the tag
        # name when deciding below whether this tag is a form.
        tag_name = tag.name.lower()
        url = None
        if tag_name in href_tags:
            url = tag.get("href", None)
        elif tag_name in src_tags:
            url = tag.get("src", None)
        elif tag_name == "param":
            if tag.get("name", "").lower().strip() in param_names:
                url = tag.get("value", None)
        elif tag_name == "object":
            url = tag.get("data", None)
        elif tag_name == "applet":
            url = tag.get("code", None)
        elif tag_name == "meta":
            # Standard markup is <meta http-equiv="refresh"
            # content="5; url=...">. The name="http-equiv" spelling that
            # was matched previously is still accepted.
            http_equiv = tag.get("http-equiv", "").lower().strip()
            meta_name = tag.get("name", "").lower().strip()
            if http_equiv == "refresh" or meta_name == "http-equiv":
                content = tag.get("content", "")
                sep = content.find(";")
                if sep >= 0:
                    target = content[sep + 1:].strip()
                    if target[:4].lower() == "url=":
                        target = target[4:].strip()
                    # The target may be quoted; an empty target is no link.
                    url = target.strip("'\"") or None
        elif tag_name == "base":
            url = tag.get("href", None)
            if url is None:
                # <base> without href (e.g. target only): nothing to do.
                continue
            try:
                base_url = urljoin(base_url, url.strip(), allow_fragments=False)
            except Exception:
                # Best effort: an unparsable base is ignored.
                continue

        if url is not None:
            try:
                url = urljoin(base_url, url.strip())
            except Exception:
                # Best effort: skip values that cannot be resolved as URLs.
                continue
            if self.schedule.isOrigin(url) and url not in self.urlSet:
                self.urlSet.add(url)
                self.schedule.addRequest(Request.fromUrl(url, base_url))
                # Also queue every parent directory of the URL. Only the
                # path part is split, so a "/" inside the query string or
                # fragment does not create bogus directories.
                if url.startswith(self.domain):
                    relative = url[len(self.domain):]
                    for delimiter in ("#", "?"):
                        relative = relative.split(delimiter, 1)[0]
                    segments = relative.split('/')
                    for depth in range(1, len(segments)):
                        directory = "%s%s/" % (
                            self.domain, '/'.join(segments[:depth]))
                        if directory not in self.urlSet:
                            self.urlSet.add(directory)
                            self.schedule.addRequest(
                                Request.fromUrl(directory, url))

        # Forms are processed whether or not they carried a URL attribute.
        if tag_name == "form":
            self.parseForm(tag, base_url)
def parse(self, response, request):
    """Extract links from an HTML response and schedule them for crawling.

    Every tag of the document is inspected for a URL-bearing attribute.
    Each URL found is resolved against the current base URL; when
    ``self.schedule.isOrigin`` accepts it and it is not yet in
    ``self.urlSet``, it is queued through ``self.schedule.addRequest``
    together with each of its parent directories. ``<form>`` tags are
    additionally handed to ``self.parseForm``.

    :param response: fetched page; ``response.url`` and ``response.text``
        are read.
    :param request: not used by this method.
    """
    base_url = response.url
    soup = BeautifulSoup(response.text)
    href_tags = {"a", "link", "area"}
    src_tags = {
        "form", "script", "img", "iframe", "frame", "embed", "source", "track"
    }
    param_names = {"movie", "href", "link", "src", "url", "uri"}

    for tag in soup.findAll():
        # Keep the tag name in its own variable: the <param>/<meta> branches
        # read a "name" *attribute*, which must not be confused with the tag
        # name when deciding below whether this tag is a form.
        tag_name = tag.name.lower()
        url = None
        if tag_name in href_tags:
            url = tag.get("href", None)
        elif tag_name in src_tags:
            url = tag.get("src", None)
        elif tag_name == "param":
            if tag.get("name", "").lower().strip() in param_names:
                url = tag.get("value", None)
        elif tag_name == "object":
            url = tag.get("data", None)
        elif tag_name == "applet":
            url = tag.get("code", None)
        elif tag_name == "meta":
            # Standard markup is <meta http-equiv="refresh"
            # content="5; url=...">. The name="http-equiv" spelling that
            # was matched previously is still accepted.
            http_equiv = tag.get("http-equiv", "").lower().strip()
            meta_name = tag.get("name", "").lower().strip()
            if http_equiv == "refresh" or meta_name == "http-equiv":
                content = tag.get("content", "")
                sep = content.find(";")
                if sep >= 0:
                    target = content[sep + 1:].strip()
                    if target[:4].lower() == "url=":
                        target = target[4:].strip()
                    # The target may be quoted; an empty target is no link.
                    url = target.strip("'\"") or None
        elif tag_name == "base":
            url = tag.get("href", None)
            if url is None:
                # <base> without href (e.g. target only): nothing to do.
                continue
            try:
                base_url = urljoin(base_url, url.strip(), allow_fragments=False)
            except Exception:
                # Best effort: an unparsable base is ignored.
                continue

        if url is not None:
            try:
                url = urljoin(base_url, url.strip())
            except Exception:
                # Best effort: skip values that cannot be resolved as URLs.
                continue
            if self.schedule.isOrigin(url) and url not in self.urlSet:
                self.urlSet.add(url)
                self.schedule.addRequest(Request.fromUrl(url, base_url))
                # Also queue every parent directory of the URL. Only the
                # path part is split, so a "/" inside the query string or
                # fragment does not create bogus directories.
                if url.startswith(self.domain):
                    relative = url[len(self.domain):]
                    for delimiter in ("#", "?"):
                        relative = relative.split(delimiter, 1)[0]
                    segments = relative.split('/')
                    for depth in range(1, len(segments)):
                        directory = "%s%s/" % (
                            self.domain, '/'.join(segments[:depth]))
                        if directory not in self.urlSet:
                            self.urlSet.add(directory)
                            self.schedule.addRequest(
                                Request.fromUrl(directory, url))

        # Forms are processed whether or not they carried a URL attribute.
        if tag_name == "form":
            self.parseForm(tag, base_url)
def parseForm(self, form, base_url):
    """Schedule a request that submits *form* with its default values.

    The form's ``action`` is resolved against *base_url*; nothing is
    scheduled when ``self.schedule.isOrigin`` rejects the resulting URL.
    Otherwise the named fields of the form are collected into a
    ``name=value`` string joined with ``&`` and passed to ``Request``
    along with the upper-cased form method.

    :param form: parsed ``<form>`` tag.
    :param base_url: URL the action is resolved against; also passed as
        the last argument of ``Request``.
    """
    action = form.get('action', '')
    method = form.get('method', DEFAULT_METHOD).upper()
    url = urljoin(base_url, action)
    if not self.schedule.isOrigin(url):
        return

    fields = {}

    # <input> elements of these types all contribute name -> value. The
    # types are visited in this order, so on a name clash a later type
    # overrides an earlier one.
    for input_type in ('text', 'password', 'submit', 'hidden', 'checkbox'):
        for field in form.findAll('input',
                                  {'name': True, 'type': input_type}):
            fields[field['name']] = field.get('value', '')

    # Radio buttons: only the first button of each group supplies a value.
    seen_radio_groups = set()
    for field in form.findAll('input', {'name': True, 'type': 'radio'}):
        group = field['name']
        if group not in seen_radio_groups:
            seen_radio_groups.add(group)
            fields[group] = field.get('value', '')

    # <textarea>: first child node, or '' when the element is empty
    # (indexing an empty contents list would raise IndexError).
    for field in form.findAll('textarea', {'name': True}):
        fields[field['name']] = field.contents[0] if field.contents else ''

    # <select>: value of the first <option> that has a value attribute.
    for field in form.findAll('select', {'name': True}):
        options = field.findAll('option', value=True)
        if options:
            fields[field['name']] = options[0]['value']

    # NOTE: names and values are joined verbatim, without URL-encoding.
    # items() is used instead of iteritems() so this runs on Python 2 and 3.
    params = '&'.join(["%s=%s" % (key, value)
                       for key, value in fields.items()])
    request = Request(url, method, params, base_url)
    self.schedule.addRequest(request)
def parseForm(self, form, base_url):
    """Schedule a request that submits *form* with its default values.

    The form's ``action`` is resolved against *base_url*; nothing is
    scheduled when ``self.schedule.isOrigin`` rejects the resulting URL.
    Otherwise the named fields of the form are collected into a
    ``name=value`` string joined with ``&`` and passed to ``Request``
    along with the upper-cased form method.

    :param form: parsed ``<form>`` tag.
    :param base_url: URL the action is resolved against; also passed as
        the last argument of ``Request``.
    """
    action = form.get('action', '')
    method = form.get('method', DEFAULT_METHOD).upper()
    url = urljoin(base_url, action)
    if not self.schedule.isOrigin(url):
        return

    fields = {}

    # <input> elements of these types all contribute name -> value. The
    # types are visited in this order, so on a name clash a later type
    # overrides an earlier one.
    for input_type in ('text', 'password', 'submit', 'hidden', 'checkbox'):
        for field in form.findAll('input',
                                  {'name': True, 'type': input_type}):
            fields[field['name']] = field.get('value', '')

    # Radio buttons: only the first button of each group supplies a value.
    seen_radio_groups = set()
    for field in form.findAll('input', {'name': True, 'type': 'radio'}):
        group = field['name']
        if group not in seen_radio_groups:
            seen_radio_groups.add(group)
            fields[group] = field.get('value', '')

    # <textarea>: first child node, or '' when the element is empty
    # (indexing an empty contents list would raise IndexError).
    for field in form.findAll('textarea', {'name': True}):
        fields[field['name']] = field.contents[0] if field.contents else ''

    # <select>: value of the first <option> that has a value attribute.
    for field in form.findAll('select', {'name': True}):
        options = field.findAll('option', value=True)
        if options:
            fields[field['name']] = options[0]['value']

    # NOTE: names and values are joined verbatim, without URL-encoding.
    # items() is used instead of iteritems() so this runs on Python 2 and 3.
    params = '&'.join(["%s=%s" % (key, value)
                       for key, value in fields.items()])
    request = Request(url, method, params, base_url)
    self.schedule.addRequest(request)
def parseRobots(self):
    """Queue the paths listed in the site's robots.txt.

    Fetches ``self.domain + "robots.txt"`` and, for every line on which
    ``ROBOTS_ALLOW_PATH`` matches with a ``path`` group other than ``/``,
    adds a request for that path resolved against ``self.domain``.
    According to the original note this covers both Allow and Disallow
    entries (the pattern itself is defined elsewhere), for example
    http://www.gaoloumi.com/robots.txt

    The task's robots flag is set to ``'start'`` on entry and to
    ``'finish'`` on the way out, including when an exception propagates
    from the fetch or the parsing.
    """
    self.task.update_robots_flag('start')
    robots_url = self.domain + "robots.txt"
    try:
        # NOTE(review): if `requests` is the third-party Requests library,
        # requests.request() expects the HTTP method as its first
        # argument - confirm against the module actually imported here.
        response = requests.request(robots_url)
        if response:
            for entry in response.text.splitlines():
                found = ROBOTS_ALLOW_PATH.search(entry)
                if not found:
                    continue
                path = found.group('path')
                if path == '/':
                    # The site root needs no dedicated request.
                    continue
                target = urljoin(self.domain, path)
                # NOTE(review): parse() queues via self.schedule.addRequest;
                # confirm self.addRequest is the intended entry point.
                self.addRequest(Request.fromUrl(target, robots_url))
    finally:
        self.task.update_robots_flag('finish')
def parseRobots(self):
    """Queue the paths listed in the site's robots.txt.

    Fetches ``self.domain + "robots.txt"`` and, for every line on which
    ``ROBOTS_ALLOW_PATH`` matches with a ``path`` group other than ``/``,
    adds a request for that path resolved against ``self.domain``.
    According to the original note this covers both Allow and Disallow
    entries (the pattern itself is defined elsewhere), for example
    http://www.gaoloumi.com/robots.txt

    The task's robots flag is set to ``'start'`` on entry and to
    ``'finish'`` on the way out, including when an exception propagates
    from the fetch or the parsing.
    """
    self.task.update_robots_flag('start')
    robots_url = self.domain + "robots.txt"
    try:
        # NOTE(review): if `requests` is the third-party Requests library,
        # requests.request() expects the HTTP method as its first
        # argument - confirm against the module actually imported here.
        response = requests.request(robots_url)
        if response:
            for entry in response.text.splitlines():
                found = ROBOTS_ALLOW_PATH.search(entry)
                if not found:
                    continue
                path = found.group('path')
                if path == '/':
                    # The site root needs no dedicated request.
                    continue
                target = urljoin(self.domain, path)
                # NOTE(review): parse() queues via self.schedule.addRequest;
                # confirm self.addRequest is the intended entry point.
                self.addRequest(Request.fromUrl(target, robots_url))
    finally:
        self.task.update_robots_flag('finish')