Ejemplo n.º 1
0
    def parse_item(self, response):
        page = response.meta['page']

        print utils.get_time_now(), "Target ==> " + (response.url)

        if page > MAX_PAGE_INDEX_NS:
            return

        #是否停止爬行
        isBreak = False

        itemList = []
        trs = response.xpath("//table[@class='table']/tbody/tr")
        if trs:
            for sel in trs:
                item = Item(
                    sel.xpath('td[1]/text()').extract()[0],
                    sel.xpath('td[2]/text()').extract()[0],
                    sel.xpath('td[4]/text()').extract()[0],
                    sel.xpath('td[5]/text()').extract()[0],
                    sel.xpath('td[3]//text()').extract()[0], '0',
                    sel.xpath('td[8]/text()').extract()[0])

                itemList.append(item)
            #排序
            try:
                import operator
            except ImportError:
                cmpfun = lambda x: x.count  # use a lambda if no operator module
            else:
                cmpfun = operator.attrgetter(
                    "last_verify_time"
                )  # use operator since it's faster than lambda

            itemList.sort(key=cmpfun, reverse=True)

            for item in itemList:
                if not self.daysDelta(item.last_verify_time):
                    isBreak = True
                    break
                else:
                    item_ = IPItem()
                    item_['ip'] = item.ip
                    item_['port'] = item.port
                    item_['anonymous'] = item.anonymous
                    item_['http_type'] = item.http_type
                    item_['location'] = item.location
                    item_['latency'] = item.latency
                    item_['last_verify_time'] = item.last_verify_time
                    item_['source'] = self.allowed_domains[0]

                    yield item_
        else:
            isBreak = True

        if not isBreak:
            yield Request(self.start_urls[0] + '?page=' + str(page + 1),
                          callback=self.parse_item,
                          meta={'page': page + 1})
Ejemplo n.º 2
0
    def parse_item(self, response):
        page = response.meta['page']
        url = response.meta['url']
        print utils.get_time_now(), "Target ==> " + (response.url)
        
#         if page > MAX_PAGE_INDEX_KDL:
#             return 
        
        #是否停止爬行
        isBreak = False
        
        itemList = []
        trs = response.xpath("//table[@id='ip_list']/tr")[1:]
        if trs:
            for sel in trs:
                item = Item(sel.xpath('td[3]/text()').extract()[0], sel.xpath('td[4]/text()').extract()[0], sel.xpath('td[6]/text()').extract()[0], 
                            sel.xpath('td[7]/text()').extract()[0], 
                            "".join(sel.xpath('td[5]//a/text()').extract() + sel.xpath('td[5]/text()').extract()).strip(), 
                            sel.xpath('td[9]/div/@title').extract()[0], 
                            sel.xpath('td[10]/text()').extract()[0])
                
#                 print item.to_string()
                itemList.append(item)
                
            #排序
            try: 
                import operator
            except ImportError: 
                cmpfun= lambda x: x.count # use a lambda if no operator module
            else: 
                cmpfun= operator.attrgetter("last_verify_time") # use operator since it's faster than lambda
               
            itemList.sort(key=cmpfun, reverse=True)
              
            for item in itemList:
                if not self.daysDelta(item.last_verify_time):
                    isBreak = True
                    break
                else:    
                    item_ = IPItem()
                    item_['ip'] = item.ip
                    item_['port'] = item.port
                    item_['anonymous'] = item.anonymous
                    item_['http_type'] = item.http_type
                    item_['location'] = item.location
                    item_['latency'] = item.latency
                    item_['last_verify_time'] = datetime.datetime.strptime(item.last_verify_time, '%y-%m-%d %H:%M')
                    item_['source'] = url
       
                    yield item_
        else:
            isBreak = True
                  
        if not isBreak:
            yield Request(url + str(page+1), callback=self.parse_item, meta={'page': page+1, 'url':url})
Ejemplo n.º 3
0
 def parse_item(self, response):
     print utils.get_time_now(), "Target ==> " + (response.url)
     js = PyV8.JSContext()
     js.enter()
     
     main_js = response.xpath("//body/script[1]/text()").extract()
     js.eval(main_js[0])
     
     itemList = []
     trs = response.xpath("//table[2]/tr[4]/td/table/tr") 
     
     if trs: 
         for tr in trs[4:-1]:
             ip_port = tr.xpath("td[1]/font[2]")
             ip = ip_port.xpath("text()").extract()
             port = ip_port.xpath("script/text()").extract()[0]
             port = port.split('<\/font>"+')[1]
              
             port_list = port[:-1].split("+")
             port = ""
             for val in port_list:
                 port = port + str(js.eval(val))
          
             item = Item(ip[0], port, tr.xpath("td[3]/font/text()").extract()[0], 
                         "".join(tr.xpath("td[2]/a/font[@class='spy1']/text()").extract() + tr.xpath("td[2]/a/font[@class='spy14']/text()").extract()), 
                         "".join(tr.xpath("td[4]/font/text()").extract() + tr.xpath("td[4]/font/font/text()").extract()), 
                         tr.xpath("td[6]/font/text()").extract()[0], tr.xpath("td[9]/font/font[@class='spy14']/text()").extract()[0] + tr.xpath("td[9]/font/text()").extract()[0])
              
             itemList.append(item)
         #排序
         try: 
             import operator
         except ImportError: 
             cmpfun= lambda x: x.count # use a lambda if no operator module
         else: 
             cmpfun= operator.attrgetter("last_verify_time") # use operator since it's faster than lambda
           
         itemList.sort(key=cmpfun, reverse=True)
          
         for item in itemList:
             if not self.daysDelta(str(item.last_verify_time)):
                 break
             else:    
                 item_ = IPItem()
                 item_['ip'] = item.ip
                 item_['port'] = item.port
                 item_['anonymous'] = item.anonymous
                 item_['http_type'] = item.http_type
                 item_['location'] = item.location
                 item_['latency'] = item.latency
                 item_['last_verify_time'] = datetime.datetime.strptime(item.last_verify_time[:-1] + ':00', '%d-%b-%Y %H:%M:%S')
                 item_['source'] = self.allowed_domains[0]
                 
                 yield item_ 
Ejemplo n.º 4
0
 def parse_item(self, response):
     page = response.meta['page']
     
     print utils.get_time_now(), "Target ==> " + (response.url)
     
     if page > MAX_PAGE_INDEX_NS:
         return 
     
     #是否停止爬行
     isBreak = False
     
     itemList = []
     trs = response.xpath("//table[@class='table']/tbody/tr")       
     if trs: 
         for sel in trs:
             item = Item(sel.xpath('td[1]/text()').extract()[0], sel.xpath('td[2]/text()').extract()[0], sel.xpath('td[4]/text()').extract()[0], 
                         sel.xpath('td[5]/text()').extract()[0], sel.xpath('td[3]//text()').extract()[0], '0', 
                         sel.xpath('td[8]/text()').extract()[0])
              
             itemList.append(item)
         #排序
         try: 
             import operator
         except ImportError: 
             cmpfun= lambda x: x.count # use a lambda if no operator module
         else: 
             cmpfun= operator.attrgetter("last_verify_time") # use operator since it's faster than lambda
          
         itemList.sort(key=cmpfun, reverse=True)
         
         for item in itemList:
             if not self.daysDelta(item.last_verify_time):
                 isBreak = True
                 break
             else:    
                 item_ = IPItem()
                 item_['ip'] = item.ip
                 item_['port'] = item.port
                 item_['anonymous'] = item.anonymous
                 item_['http_type'] = item.http_type
                 item_['location'] = item.location
                 item_['latency'] = item.latency
                 item_['last_verify_time'] = item.last_verify_time
                 item_['source'] = self.allowed_domains[0]
   
                 yield item_
     else:
         isBreak = True
             
     if not isBreak:
         yield Request(self.start_urls[0] + '?page=' + str(page+1), callback=self.parse_item, meta={'page': page+1})        
Ejemplo n.º 5
0
    def parse_item(self, response):
        page = response.meta['page']
        url = response.meta['url']
        print utils.get_time_now(), "Target ==> " + (response.url)

        #         if page > MAX_PAGE_INDEX_KDL:
        #             return

        #是否停止爬行
        isBreak = False

        itemList = []
        trs = response.xpath("//table[@id='ip_list']/tr")[1:]
        if trs:
            for sel in trs:
                item = Item(
                    sel.xpath('td[3]/text()').extract()[0],
                    sel.xpath('td[4]/text()').extract()[0],
                    sel.xpath('td[6]/text()').extract()[0],
                    sel.xpath('td[7]/text()').extract()[0], "".join(
                        sel.xpath('td[5]//a/text()').extract() +
                        sel.xpath('td[5]/text()').extract()).strip(),
                    sel.xpath('td[9]/div/@title').extract()[0],
                    sel.xpath('td[10]/text()').extract()[0])

                #                 print item.to_string()
                itemList.append(item)

            #排序
            try:
                import operator
            except ImportError:
                cmpfun = lambda x: x.count  # use a lambda if no operator module
            else:
                cmpfun = operator.attrgetter(
                    "last_verify_time"
                )  # use operator since it's faster than lambda

            itemList.sort(key=cmpfun, reverse=True)

            for item in itemList:
                if not self.daysDelta(item.last_verify_time):
                    isBreak = True
                    break
                else:
                    item_ = IPItem()
                    item_['ip'] = item.ip
                    item_['port'] = item.port
                    item_['anonymous'] = item.anonymous
                    item_['http_type'] = item.http_type
                    item_['location'] = item.location
                    item_['latency'] = item.latency
                    item_['last_verify_time'] = datetime.datetime.strptime(
                        item.last_verify_time, '%y-%m-%d %H:%M')
                    item_['source'] = url

                    yield item_
        else:
            isBreak = True

        if not isBreak:
            yield Request(url + str(page + 1),
                          callback=self.parse_item,
                          meta={
                              'page': page + 1,
                              'url': url
                          })
Ejemplo n.º 6
0
    def parse_item(self, response):
        print utils.get_time_now(), "Target ==> " + (response.url)
        js = PyV8.JSContext()
        js.enter()

        main_js = response.xpath("//body/script[1]/text()").extract()
        js.eval(main_js[0])

        itemList = []
        trs = response.xpath("//table[2]/tr[4]/td/table/tr")

        if trs:
            for tr in trs[4:-1]:
                ip_port = tr.xpath("td[1]/font[2]")
                ip = ip_port.xpath("text()").extract()
                port = ip_port.xpath("script/text()").extract()[0]
                port = port.split('<\/font>"+')[1]

                port_list = port[:-1].split("+")
                port = ""
                for val in port_list:
                    port = port + str(js.eval(val))

                item = Item(
                    ip[0], port,
                    tr.xpath("td[3]/font/text()").extract()[0], "".join(
                        tr.xpath(
                            "td[2]/a/font[@class='spy1']/text()").extract() +
                        tr.xpath(
                            "td[2]/a/font[@class='spy14']/text()").extract()),
                    "".join(
                        tr.xpath("td[4]/font/text()").extract() +
                        tr.xpath("td[4]/font/font/text()").extract()),
                    tr.xpath("td[6]/font/text()").extract()[0],
                    tr.xpath("td[9]/font/font[@class='spy14']/text()").extract(
                    )[0] + tr.xpath("td[9]/font/text()").extract()[0])

                itemList.append(item)
            #排序
            try:
                import operator
            except ImportError:
                cmpfun = lambda x: x.count  # use a lambda if no operator module
            else:
                cmpfun = operator.attrgetter(
                    "last_verify_time"
                )  # use operator since it's faster than lambda

            itemList.sort(key=cmpfun, reverse=True)

            for item in itemList:
                if not self.daysDelta(str(item.last_verify_time)):
                    break
                else:
                    item_ = IPItem()
                    item_['ip'] = item.ip
                    item_['port'] = item.port
                    item_['anonymous'] = item.anonymous
                    item_['http_type'] = item.http_type
                    item_['location'] = item.location
                    item_['latency'] = item.latency
                    item_['last_verify_time'] = datetime.datetime.strptime(
                        item.last_verify_time[:-1] + ':00',
                        '%d-%b-%Y %H:%M:%S')
                    item_['source'] = self.allowed_domains[0]

                    yield item_