Example #1
0
    def parse(self, response, request):
        """Extract links from an HTML response and schedule them for crawling.

        Every tag of the parsed document is inspected and the URL-bearing
        attribute for that kind of tag is resolved against the current base
        URL (a ``<base href>`` tag replaces the base for the tags after it).
        Same-origin URLs that are not yet in ``self.urlSet`` are scheduled,
        together with each of their parent directories.  ``<form>`` tags are
        additionally passed to ``parseForm``.

        :param response: fetched page; only ``url`` and ``text`` are read.
        :param request: accepted for interface compatibility; not used here.
        """
        base_url = response.url
        bs = BeautifulSoup(response.text)
        href_tags = {"a", "link", "area"}
        src_tags = {
            "form", "script", "img", "iframe", "frame", "embed", "source",
            "track"
        }
        param_names = {"movie", "href", "link", "src", "url", "uri"}

        for tag in bs.findAll():
            tag_name = tag.name.lower()
            url = None
            if tag_name in href_tags:
                url = tag.get("href", None)
            elif tag_name in src_tags:
                url = tag.get("src", None)
            elif tag_name == "param":
                # Keep the attribute in its own variable: overwriting the tag
                # name made <param name="form"> reach the form handler below.
                param_name = tag.get("name", "").lower().strip()
                if param_name in param_names:
                    url = tag.get("value", None)
            elif tag_name == "object":
                url = tag.get("data", None)
            elif tag_name == "applet":
                url = tag.get("code", None)
            elif tag_name == "meta":
                # Redirects look like
                #   <meta http-equiv="refresh" content="5; url=/next">
                # i.e. "http-equiv" is the attribute name, not a value of the
                # "name" attribute, and the target follows an optional "url=".
                http_equiv = tag.get("http-equiv", "").lower().strip()
                if http_equiv == "refresh":
                    content = tag.get("content", "")
                    _, sep, target = content.partition(";")
                    if sep:
                        key, eq, value = target.partition("=")
                        if eq and key.strip().lower() == "url":
                            target = value
                        target = target.strip().strip("'\"").strip()
                        if target:
                            url = target
            elif tag_name == "base":
                url = tag.get("href", None)
                if url is None:
                    # <base> without href carries nothing to resolve.
                    continue
                try:
                    base_url = urljoin(base_url,
                                       url.strip(),
                                       allow_fragments=False)
                except Exception:
                    # Best effort: a malformed base href is ignored.
                    continue
            if url is not None:
                try:
                    url = urljoin(base_url, url.strip())
                except Exception:
                    # Best effort: skip attributes that cannot be resolved.
                    continue
                if self.schedule.isOrigin(url) and url not in self.urlSet:
                    self.urlSet.add(url)
                    self.schedule.addRequest(Request.fromUrl(url, base_url))

                    # Also schedule every parent directory of the new URL.
                    segments = url.replace(self.domain, '').split('/')
                    prefixes = ['/'.join(segments[:x])
                                for x in range(1, len(segments))]
                    for prefix in prefixes:
                        u = "%s%s/" % (self.domain, prefix)
                        if u not in self.urlSet:
                            self.urlSet.add(u)
                            self.schedule.addRequest(Request.fromUrl(u, url))
            # Forms get their own request built from their fields.
            if tag_name == "form":
                self.parseForm(tag, base_url)
Example #2
0
    def parse(self, response, request):
        """Collect crawlable URLs from an HTML page and queue them.

        The document in ``response.text`` is parsed and each tag is checked
        for the attribute that carries a URL for that tag type.  The value is
        resolved against the running base URL (initially ``response.url``,
        replaced when a ``<base href>`` tag is met).  URLs accepted by
        ``self.schedule.isOrigin`` and absent from ``self.urlSet`` are
        recorded and scheduled, along with their parent directories.
        ``<form>`` tags are also forwarded to ``parseForm``.

        :param response: fetched page; only ``url`` and ``text`` are read.
        :param request: kept for interface compatibility; not used here.
        """
        base_url = response.url
        bs = BeautifulSoup(response.text)
        href_tags = {"a", "link", "area"}
        src_tags = {"form", "script", "img", "iframe", "frame", "embed",
                    "source", "track"}
        param_names = {"movie", "href", "link", "src", "url", "uri"}

        for tag in bs.findAll():
            tag_name = tag.name.lower()
            url = None
            if tag_name in href_tags:
                url = tag.get("href", None)
            elif tag_name in src_tags:
                url = tag.get("src", None)
            elif tag_name == "param":
                # The attribute value must not overwrite the tag name,
                # otherwise <param name="form"> is later treated as a form.
                param_name = tag.get("name", "").lower().strip()
                if param_name in param_names:
                    url = tag.get("value", None)
            elif tag_name == "object":
                url = tag.get("data", None)
            elif tag_name == "applet":
                url = tag.get("code", None)
            elif tag_name == "meta":
                # A redirect is written as
                #   <meta http-equiv="refresh" content="5; url=/next">
                # so test the http-equiv attribute and drop the "url=" key
                # before resolving the target.
                http_equiv = tag.get("http-equiv", "").lower().strip()
                if http_equiv == "refresh":
                    content = tag.get("content", "")
                    _, sep, target = content.partition(";")
                    if sep:
                        key, eq, value = target.partition("=")
                        if eq and key.strip().lower() == "url":
                            target = value
                        target = target.strip().strip("'\"").strip()
                        if target:
                            url = target
            elif tag_name == "base":
                url = tag.get("href", None)
                if url is None:
                    # Nothing to resolve on a <base> tag without href.
                    continue
                try:
                    base_url = urljoin(base_url, url.strip(),
                                       allow_fragments=False)
                except Exception:
                    # Best effort: ignore a malformed base href.
                    continue
            if url is not None:
                try:
                    url = urljoin(base_url, url.strip())
                except Exception:
                    # Best effort: skip values that cannot be resolved.
                    continue
                if self.schedule.isOrigin(url) and url not in self.urlSet:
                    self.urlSet.add(url)
                    self.schedule.addRequest(Request.fromUrl(url, base_url))

                    # Queue each parent directory of the new URL as well.
                    segments = url.replace(self.domain, '').split('/')
                    prefixes = ['/'.join(segments[:x])
                                for x in range(1, len(segments))]
                    for prefix in prefixes:
                        u = "%s%s/" % (self.domain, prefix)
                        if u not in self.urlSet:
                            self.urlSet.add(u)
                            self.schedule.addRequest(Request.fromUrl(u, url))
            # Forms are turned into a request by parseForm.
            if tag_name == "form":
                self.parseForm(tag, base_url)
Example #3
0
 def parseForm(self, form, base_url):
     """Schedule a request that submits ``form`` with its default values.

     The form's ``action`` is resolved against ``base_url``; nothing is
     scheduled when ``self.schedule.isOrigin`` rejects the resulting URL.
     Named ``<input>``, ``<textarea>`` and ``<select>`` fields are gathered
     into ``name=value`` pairs joined with ``&``.

     :param form: parsed ``<form>`` tag.
     :param base_url: URL used to resolve the action and passed on to
         ``Request`` as its last argument.
     """
     action = form.get('action', '')
     method = form.get('method', DEFAULT_METHOD).upper()
     url = urljoin(base_url, action)
     if not self.schedule.isOrigin(url):
         return
     fields = {}
     # Input types are visited in a fixed order so that, on a name clash,
     # the later type still overwrites the earlier one.
     for input_type in ('text', 'password', 'submit', 'hidden', 'checkbox'):
         for m in form.findAll('input', {'name': True, 'type': input_type}):
             fields[m['name']] = m.get('value', '')
     # Radio groups share a name: only the first button of a group is used.
     seen_radio = set()
     for m in form.findAll('input', {'name': True, 'type': 'radio'}):
         if m['name'] not in seen_radio:
             seen_radio.add(m['name'])
             fields[m['name']] = m.get('value', '')
     # An empty <textarea></textarea> has no children, so indexing
     # contents[0] unconditionally raised IndexError.
     for m in form.findAll('textarea', {'name': True}):
         fields[m['name']] = m.contents[0] if m.contents else ''
     # A <select> submits the value of its first <option value=...>.
     for m in form.findAll('select', {'name': True}):
         options = m.findAll('option', value=True)
         if options:
             fields[m['name']] = options[0]['value']
     # items() exists on both Python 2 and 3; iteritems() is Python 2 only.
     # NOTE(review): names and values are joined without URL-encoding --
     # confirm whether Request expects raw or encoded parameters.
     params = '&'.join(["%s=%s" % (k, v) for k, v in fields.items()])
     request = Request(url, method, params, base_url)
     self.schedule.addRequest(request)
Example #4
0
 def parseForm(self, form, base_url):
     """Turn an HTML form into a scheduled request using its default values.

     ``action`` is joined onto ``base_url``; the method returns without
     scheduling anything if ``self.schedule.isOrigin`` rejects that URL.
     Otherwise the named ``<input>``, ``<textarea>`` and ``<select>`` fields
     are collected and sent as ``name=value`` pairs joined with ``&``.

     :param form: parsed ``<form>`` tag.
     :param base_url: URL the action is resolved against; also handed to
         ``Request`` as its last argument.
     """
     action = form.get('action', '')
     method = form.get('method', DEFAULT_METHOD).upper()
     url = urljoin(base_url, action)
     if not self.schedule.isOrigin(url):
         return
     values = {}
     # Keep this order: when two fields share a name, the type listed
     # later wins, exactly as with the former one-loop-per-type code.
     simple_types = ('text', 'password', 'submit', 'hidden', 'checkbox')
     for kind in simple_types:
         for m in form.findAll('input', {'name': True, 'type': kind}):
             values[m['name']] = m.get('value', '')
     # Only the first radio button of each group contributes a value.
     radio_groups = set()
     for m in form.findAll('input', {'name': True, 'type': 'radio'}):
         group = m['name']
         if group in radio_groups:
             continue
         radio_groups.add(group)
         values[group] = m.get('value', '')
     for m in form.findAll('textarea', {'name': True}):
         # Guard against <textarea></textarea>: it has no contents, and
         # contents[0] used to raise IndexError.
         values[m['name']] = m.contents[0] if m.contents else ''
     for m in form.findAll('select', {'name': True}):
         # Use the first option that carries an explicit value attribute.
         options = m.findAll('option', value=True)
         if options:
             values[m['name']] = options[0]['value']
     # items() runs on Python 2 and 3 alike; iteritems() fails on Python 3.
     # NOTE(review): the pairs are not URL-encoded -- confirm what Request
     # expects before changing this.
     params = '&'.join(["%s=%s" % (k, v) for k, v in values.items()])
     request = Request(url, method, params, base_url)
     self.schedule.addRequest(request)
Example #5
0
 def parseRobots(self):
     """
     Fetch the site's robots.txt and queue the paths found in it.

     Every line is searched with ROBOTS_ALLOW_PATH (per the original note
     this covers both allow and disallow entries); a captured path other
     than '/' is joined onto self.domain and passed to addRequest.  The
     task's robots flag is set to 'start' on entry and to 'finish' on the
     way out, including when an exception propagates.
     Example file: http://www.gaoloumi.com/robots.txt
     """
     self.task.update_robots_flag('start')
     robotsUrl = self.domain + "robots.txt"
     try:
         # NOTE(review): the public `requests` library's request() takes
         # (method, url); confirm which `requests` is imported here.
         response = requests.request(robotsUrl)
         if response:
             for entry in response.text.splitlines():
                 found = ROBOTS_ALLOW_PATH.search(entry)
                 if not found:
                     continue
                 path = found.group('path')
                 if path == '/':
                     # The site root itself is never queued from here.
                     continue
                 target = urljoin(self.domain, path)
                 self.addRequest(Request.fromUrl(target, robotsUrl))
     finally:
         self.task.update_robots_flag('finish')
Example #6
0
 def parseRobots(self):
     """
     Read robots.txt for this domain and schedule each path it lists.

     Lines are matched against ROBOTS_ALLOW_PATH (the original note says
     both allow and disallow entries are covered).  Each captured path
     except '/' is resolved against self.domain and handed to addRequest.
     The robots flag on the task goes to 'start' first and to 'finish' in
     a finally clause, so it is updated even if an error escapes.
     Example file: http://www.gaoloumi.com/robots.txt
     """
     self.task.update_robots_flag('start')
     robotsUrl = self.domain + "robots.txt"
     try:
         # NOTE(review): request() in the public `requests` package expects
         # (method, url) -- verify which `requests` module this refers to.
         response = requests.request(robotsUrl)
         if not response:
             return
         for row in response.text.splitlines():
             hit = ROBOTS_ALLOW_PATH.search(row)
             if hit:
                 path = hit.group('path')
             else:
                 # A line without a match is treated like the root path.
                 path = '/'
             if path == '/':
                 continue
             resolved = urljoin(self.domain, path)
             self.addRequest(Request.fromUrl(resolved, robotsUrl))
     finally:
         self.task.update_robots_flag('finish')