コード例 #1
0
    def __init__(self,
                 url,
                 headers=None,
                 threads=10,
                 timeout=60,
                 sleep=2,
                 proxy=None,
                 level=False,
                 cert=None):
        """Set up crawler state and issue one initial request to ``url``.

        Args:
            url: Start URL; it is requested once here through ``BaseRequest``.
            headers: Mapping stored in ``settings['headers']``. ``None``
                (the default) yields a fresh empty dict per instance.
            threads: Stored as ``int`` in ``settings['threads']``.
            timeout: Stored as ``int`` in ``settings['timeout']``.
            sleep: Stored as ``int`` in ``settings['sleep']``.
            proxy: Mapping stored in ``settings['proxy']`` and handed to the
                initial ``BaseRequest`` and to ``BaseWebSite``. ``None``
                (the default) yields a fresh empty dict per instance.
            level: Stored unchanged in ``settings['level']``.
            cert: Stored unchanged on ``self.cert``.
        """
        threading.Thread.__init__(self)

        # A literal ``{}`` default would be one dict object shared by every
        # instance that relies on the default; build a new one per call.
        if headers is None:
            headers = {}
        if proxy is None:
            proxy = {}

        self.settings = {
            'threads': int(threads),
            'timeout': int(timeout),
            'sleep': int(sleep),
            'proxy': proxy,
            'level': level,
            'headers': headers,
        }
        self.session = Session()
        self.block = []  # kept as a list (the original noted set() as an alternative)
        self.cert = cert
        self.url = url

        # Request the start URL once and adopt the URL reported by the
        # response as the base URL.
        # NOTE(review): presumably this is the post-redirect URL -- confirm
        # against BaseRequest.response().
        req = BaseRequest(self.url,
                          proxy=self.settings['proxy'],
                          session=self.session)
        res = req.response()
        self.basereq = req
        self.basereq.url = res.url
        self.website = BaseWebSite(self.basereq.url,
                                   proxy=self.settings['proxy'],
                                   session=self.session)

        self.ISSTART = True
        self.ReqQueue = queue.Queue()
        self.ResQueue = queue.Queue()
        self.Directory = {}  # directory structure
        self.SubDomain = set()  # list of subdomains
        # Page sets, named after HTTP status-code classes.
        self.Page20x = set()
        self.Page30x = set()
        self.Page40x = set()
        self.Page50x = set()
コード例 #2
0
 def run(self):
     """Seed the crawl with the base URL, then feed queued requests to a pool.

     The base request is handled synchronously first; if that raises, the
     error is printed, ``ISSTART`` is cleared and the method returns.
     Afterwards the request queue is polled without blocking: every item
     found is spawned on the pool, and every empty poll sleeps one second
     and decrements ``self.FLAG``. The loop stops when ``ISSTART`` is false
     or ``FLAG`` reaches zero, then waits for the pool to finish.
     """
     workers = ThreadPool(self.settings['threads'])
     # Idle budget: one unit is spent per empty poll below. The original
     # note said "five minutes", but the real budget is settings['timeout']
     # polls of about one second each.
     # NOTE(review): FLAG may be reset by other methods -- not visible here.
     self.FLAG = self.settings['timeout']
     try:
         seed = BaseRequest(self.basereq.url,
                            headers=self.settings['headers'],
                            session=self.session,
                            proxy=self.settings['proxy'])
         self.request(seed)
     except Exception as err:
         print('err', err)
         self.ISSTART = False
         return

     while self.ISSTART and self.FLAG > 0:
         try:
             workers.spawn(self.request, self.ReqQueue.get_nowait())
         except queue.Empty:
             # Nothing queued right now: wait a second and spend idle budget.
             time.sleep(1)
             self.FLAG -= 1

     self.ISSTART = False
     workers.join()
コード例 #3
0
    def parse(self, response):
        """Scan a response body for URL-like strings and queue a request for each.

        Bodies whose ``content-type`` header mentions ``image`` or
        ``octet-stream`` are skipped. For everything else the text is
        matched against a set of regular expressions (``src``/``href``/``url``
        assignments, quoted absolute paths, quoted paths with a query
        string, and absolute http(s) URLs). Each non-empty match is passed
        through ``self.urljoin`` and queued with ``self.addreq``.

        When ``settings['level']`` is truthy, ``<form>`` elements are parsed
        as well and one request per form is queued using the form's action,
        method and ``<input>`` name/value pairs.

        Args:
            response: Object exposing ``headers`` (with ``.get``) and ``text``.
        """
        content_type = response.headers.get('content-type', 'text').lower()
        # Header values look like "image/png" or "application/octet-stream",
        # so match by substring; comparing the whole value against the bare
        # markers would never skip anything.
        if any(marker in content_type for marker in ("image", "octet-stream")):
            return

        html = response.text
        url_patterns = (
            r"""src[\s]*:[\s]*["'](.*?)["']""",
            r"""src[\s]*=[\s]*["'](.*?)["']""",
            r"""href[\s]*:[\s]*["'](.*?)["']""",
            r"""href[\s]*=[\s]*["'](.*?)["']""",
            r"""url[\s]*:[\s]*['"](.*?)['"]""",
            r"""url[\s]*=[\s]*['"](.*?)['"]""",
            # Quoted absolute paths such as "/static/app.js".
            r'''['"](/[^/\*'"][A-Za-z0-9\.\\/_-]{1,255})['"]''',
            # Quoted paths that carry a query string.
            r"""['"]([A-Za-z0-9\.\\/_-]{1,255}[a-zA-Z]\?[a-zA-Z].*?)['"]""",
            # Absolute http(s) URLs. Raw string: "\.", "\d" and "\?" are not
            # valid escapes in a plain string literal.
            r"""(http[s]?://(?:[-a-zA-Z0-9_]+\.)+[a-zA-Z]+(?::\d+)?(?:/[-a-zA-Z0-9_%./]+)*\??[-a-zA-Z0-9_&%=.]*)""",
        )

        urls = set()
        for pattern in url_patterns:
            urls.update(re.findall(pattern, html))

        for url in urls:
            if url:
                req = BaseRequest(self.urljoin(url),
                                  session=self.session,
                                  proxy=self.settings['proxy'])
                self.addreq(req)

        if not self.settings['level']:
            return

        posts = []
        for form in re.findall(r"""<form([\s\S]*?)</form>""", html):
            post = {}
            # Missing attributes fall back to './' and 'POST'.
            post['action'] = ''.join(
                re.findall(r"""action[\s]*=[\s]*["'](.*?)["']""",
                           form)) or './'
            post['method'] = ''.join(
                re.findall(r"""method[\s]*=[\s]*["'](.*?)["']""",
                           form)) or 'POST'
            post['data'] = {}
            for field in re.findall(r"""<input[\s\S]*?>""", form):
                name = ''.join(
                    re.findall(r"""name[\s]*=[\s]*["'](.*?)["']""", field))
                value = ''.join(
                    re.findall(r"""value[\s]*=[\s]*["'](.*?)["']""", field))
                # An input without a value is submitted with its own name.
                if not value:
                    value = name
                post['data'][name] = value
            posts.append(post)

        for post in posts:
            req = BaseRequest(self.urljoin(post['action']),
                              method=post['method'],
                              data=post['data'],
                              session=self.session,
                              proxy=self.settings['proxy'])
            self.addreq(req)
コード例 #4
0
    def parse(self, response):
        """Scan a textual response for URL-like strings and queue a request for each.

        Only bodies whose ``content-type`` header mentions ``text`` or
        ``javascript`` are processed. The text is matched against a set of
        regular expressions (quoted and unquoted ``src``/``href``
        attributes, ``url`` assignments, quoted absolute paths, quoted paths
        with a query string, and absolute http(s) URLs). Each match is passed
        through ``self.urljoin``; truthy results are queued with
        ``self.addreq``.

        When ``settings['level']`` is truthy, ``<form>`` elements are parsed
        as well and one request per form is queued using the form's action,
        method and ``<input>`` name/value pairs.

        Args:
            response: Object exposing ``headers`` (with ``.get``) and ``text``.
        """
        # Lower-case the header value so "Text/HTML" is accepted too.
        content_type = response.headers.get('content-type', 'text').lower()
        if 'text' not in content_type and 'javascript' not in content_type:
            return

        html = response.text
        url_patterns = (
            # Unquoted attribute values, e.g. <img src=/a.png>.
            r"""src=([^'"].*?[^'"])[>\s]""",
            r"""href=([^'"].*?[^'"])[>\s]""",
            r"""src[\s]*:[\s]*["'](.*?)["']""",
            r"""src[\s]*=[\s]*["'](.*?)["']""",
            r"""href[\s]*:[\s]*["'](.*?)["']""",
            r"""href[\s]*=[\s]*["'](.*?)["']""",
            r"""url[\s]*:[\s]*['"](.*?)['"]""",
            r"""url[\s]*=[\s]*['"](.*?)['"]""",
            # Quoted absolute paths such as "/static/app.js".
            r'''['"](/[^/\*'"][A-Za-z0-9\.\\/_-]{1,255})['"]''',
            # Quoted paths that carry a query string.
            r"""['"]([A-Za-z0-9\.\\/_-]{1,255}[a-zA-Z]\?[a-zA-Z].*?)['"]""",
            # Absolute http(s) URLs. Raw string: "\.", "\d" and "\?" are not
            # valid escapes in a plain string literal.
            r"""(http[s]?://(?:[-a-zA-Z0-9_]+\.)+[a-zA-Z]+(?::\d+)?(?:/[-a-zA-Z0-9_%./]+)*\??[-a-zA-Z0-9_&%=.]*)""",
        )

        urls = set()
        for pattern in url_patterns:
            urls.update(re.findall(pattern, html))

        for url in urls:
            url = self.urljoin(url)
            if url:
                req = BaseRequest(url,
                                  session=self.session,
                                  proxy=self.settings['proxy'])
                self.addreq(req)

        if not self.settings['level']:
            return

        posts = []
        # ``attrs`` is the text of the opening <form ...> tag (closing '>'
        # included); ``inner`` is everything up to </form>.
        for attrs, inner in re.findall(
                r"""<form([\s\S]*?>)([\s\S]*?)</form>""", html):
            post = {}
            # Missing attributes fall back to './' and 'POST'.
            post['action'] = ''.join(
                re.findall(r"""action[\s]*=["'\s]*(.*?)["'\s>]""",
                           attrs)) or './'
            post['method'] = ''.join(
                re.findall(r"""method[\s]*=["'\s]*(.*?)["'\s>]""",
                           attrs)) or 'POST'
            post['data'] = {}
            # Keep the closing '>' in the captured text, as the form regex
            # does: the attribute patterns below need a terminator, so an
            # unquoted final attribute (<input name=q>) is missed without it.
            for field in re.findall(r"""<input([\s\S]*?>)""", inner):
                name = ''.join(
                    re.findall(r"""name[\s]*=["'\s]*(.*?)["'\s>]""",
                               field))
                value = ''.join(
                    re.findall(r"""value[\s]*=["'\s]*(.*?)["'\s>]""",
                               field))
                # An input without a value is submitted with its own name.
                if not value:
                    value = name
                post['data'][name] = value
            posts.append(post)

        for post in posts:
            req = BaseRequest(self.urljoin(post['action']),
                              method=post['method'],
                              data=post['data'],
                              session=self.session,
                              proxy=self.settings['proxy'])
            self.addreq(req)