Example #1
All of the examples below belong to the same BaseTemp Scrapy project and assume a common set of imports, collected here once:

import re
from datetime import datetime

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

# project-local modules; exact import paths are assumed
from BaseTemp import header_list
from BaseTemp.items import BasetempItem, ChaohaoItem, HcUrlItem, PhoneItem
from BaseTemp.utils import get_all_num, get_num  # sketched after Example #7


class A58comSpider(CrawlSpider):
    name = '58com'
    allowed_domains = ['58.com']
    start_urls = ['http://nj.58.com/hezu/0/']

    rules = (
        Rule(LinkExtractor(allow=r'hezu/0/pn\d+/'), follow=True),
        Rule(LinkExtractor(allow=r'hezu/.*?shtml'), callback='parse_item', follow=True),
    )

    headers = header_list.get_header()

    custom_settings = {
        # no login needed, so cookies can stay disabled
        'COOKIES_ENABLED': False,
        # 'ITEM_PIPELINES': {
        #     'BaseTemp.pipelines.ImdbMongoPipeline': 300,
        # },
        'DOWNLOADER_MIDDLEWARES': {
            'BaseTemp.middlewares.UserAgentMiddleware': 200,
        },
        # 'MONGO_DB': 'imdb',
        # persist crawl state so the job can be paused and resumed
        'JOBDIR': 'info/58crawl/001',
    }

    def parse_item(self, response):
        # stub: dump the page body for inspection
        print(response.text)
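
Every spider in this collection reads its request headers from a project-local header_list module and rotates user agents through BaseTemp.middlewares.UserAgentMiddleware; neither is shown in these examples. A minimal sketch of what they might look like, assuming a simple random-choice rotation:

import random

# Hypothetical sketch of the BaseTemp helpers; the real module is not shown.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36',
]


def get_header():
    # default headers attached to outgoing requests via headers=self.headers
    return {
        'User-Agent': random.choice(USER_AGENTS),
        'Accept': 'text/html,application/xhtml+xml,*/*;q=0.8',
    }


class UserAgentMiddleware(object):
    # downloader middleware: pick a fresh User-Agent for every request
    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(USER_AGENTS)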
Example #2
class LieSpider(CrawlSpider):
    name = 'lie'
    allowed_domains = ['chahaoba.com']
    start_urls = ['https://www.chahaoba.com/分类:骗子号码']

    rules = (
        Rule(LinkExtractor(allow=r'.*pagefrom.*'), follow=True),
        Rule(LinkExtractor(allow=r'.*?\d{11}'),
             callback='parse_item',
             follow=True),
    )

    headers = header_list.get_header()

    custom_settings = {
        # no login needed, so cookies can stay disabled
        'COOKIES_ENABLED': False,
        # 'ITEM_PIPELINES': {
        #     'BaseTemp.pipelines.ImdbMongoPipeline': 300,
        # },
        'DOWNLOADER_MIDDLEWARES': {
            'BaseTemp.middlewares.UserAgentMiddleware': 200,
        },
        # 'MONGO_DB': 'imdb',
        'JOBDIR': 'info/chahao/001',
    }

    def parse_item(self, response):
        # stub: log the matched detail URL
        print(response.url)
Example #3
class PhonenumSpider(scrapy.Spider):
    name = 'phonenum'
    allowed_domains = ['www.so.com']
    start_urls = ['https://www.so.com/s?q=13716919636']

    custom_settings = {
        # no login needed, so cookies can stay disabled
        'COOKIES_ENABLED': False,
        'ITEM_PIPELINES': {
            'BaseTemp.pipelines.MongoPipeline': 300,
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BaseTemp.middlewares.UserAgentMiddleware': 200,
        },
        'MONGO_DB': 'phone',
        'JOBDIR': 'info/phone/001',
        # 'LOG_FILE': 'imdb_log.txt',
    }
    headers = header_list.get_header()

    def parse(self, response):
        # parse the phone-number lookup result: the detail line reads
        # "<number> <area> <provider>", so splitting on whitespace works
        item = PhoneItem()
        msg = response.xpath(
            '//p[@class="mh-detail"]/text()').extract()[0].split()
        item['crawl_time'] = datetime.now().strftime('%Y-%m-%d')
        item['phone_num'] = msg[0]
        item['area'] = msg[1]
        item['service_provider'] = msg[2]
        item['mongo_collection'] = 'info'

        yield item
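
PhoneItem and BaseTemp.pipelines.MongoPipeline are also project-local and not shown. Judging from the fields assigned above and the MONGO_DB / mongo_collection convention used throughout these spiders, they plausibly look like the following sketch, assuming pymongo and an optional MONGO_URI setting:

import pymongo
import scrapy


class PhoneItem(scrapy.Item):
    # fields inferred from the assignments in parse()
    crawl_time = scrapy.Field()
    phone_num = scrapy.Field()
    area = scrapy.Field()
    service_provider = scrapy.Field()
    mongo_collection = scrapy.Field()


class MongoPipeline(object):
    # route each item into the collection named by its 'mongo_collection' field
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI',
                                           'mongodb://localhost:27017'),
            mongo_db=crawler.settings.get('MONGO_DB'),
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        data = dict(item)
        collection = data.pop('mongo_collection')
        self.db[collection].insert_one(data)
        return item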
Example #4
class CreditmanageSpider(scrapy.Spider):
    name = 'creditmanage'
    allowed_domains = ['credit-manage.com']
    start_urls = ['http://credit-manage.com/']
    headers = header_list.get_header()

    custom_settings = {
        # no login needed, so cookies can stay disabled
        'COOKIES_ENABLED': False,
        # retry hard on server errors and soft bans alike
        'RETRY_HTTP_CODES': [500, 503, 504, 400, 403, 404, 408],
        'RETRY_TIMES': 1000,
        'DOWNLOAD_DELAY': 0.3,
        'ITEM_PIPELINES': {
            'BaseTemp.pipelines.MongoPipeline': 300,
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BaseTemp.middlewares.UserAgentMiddleware': 200,
            # 'BaseTemp.middlewares.ProxyMiddleware': 10,
        },
        'MONGO_DB': 'creditmanage',
        # 'JOBDIR': 'info/hc360/002',
    }

    def parse(self, response):
        # POST the search form; '杨勇' (a person's name) is a sample query
        yield scrapy.FormRequest(url='http://credit-manage.com/search.htm',
                                 formdata={'condition': '杨勇'},
                                 callback=self.parse_page,
                                 headers=self.headers)

    def parse_page(self, response):
        # stub: dump the result page for inspection
        print(response.text)
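
scrapy.FormRequest URL-encodes the formdata dict and issues a POST with a form content type; spelled out by hand, the request above is equivalent to:

import scrapy

# equivalent hand-built POST, for comparison with the FormRequest above
request = scrapy.Request(
    url='http://credit-manage.com/search.htm',
    method='POST',
    body='condition=%E6%9D%A8%E5%8B%87',  # 'condition=杨勇', URL-encoded
    headers={'Content-Type': 'application/x-www-form-urlencoded'},
)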
Example #5
class ImdbcrawlSpider(CrawlSpider):
    name = 'imdbcrawl'
    allowed_domains = ['www.imdb.cn']
    start_urls = ['http://www.imdb.cn/NowPlaying/']

    rules = (
        Rule(LinkExtractor(allow=r'Sections/.*'), follow=True),
        Rule(LinkExtractor(allow=r'title/tt\d+'),
             callback='parse_item',
             follow=True),
    )

    headers = header_list.get_header()

    custom_settings = {
        # no login needed, so cookies can stay disabled
        'COOKIES_ENABLED': False,
        'ITEM_PIPELINES': {
            'BaseTemp.pipelines.ImdbMongoPipeline': 300,
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BaseTemp.middlewares.UserAgentMiddleware': 200,
        },
        'MONGO_DB': 'imdb',
        'JOBDIR': 'info/imdbcrawl.cn/001',
    }

    def parse_item(self, response):
        # build a movie item from the detail page
        movie_item = BasetempItem()
        movie_item['crawl_time'] = datetime.now().strftime('%Y-%m-%d')
        movie_item['title'] = response.xpath(
            '//div[@class="fk-3"]/div/h3/text()').extract()[0].strip()
        movie_item['time'] = self.get_time(response)
        movie_item['area'] = self.get_area(response)
        movie_item['mongo_collection'] = 'movie1'  # target Mongo collection

        yield movie_item

    def get_time(self, response):
        # release year, matched against the page's Chinese "上映时间" label
        match = re.search(r'<i>上映时间:</i><a.*?>(\d+)</a>', response.text)
        return match.group(1).strip() if match else ''

    def get_area(self, response):
        # country/region, matched against the page's "国家" label
        match = re.search(r'<i>国家:</i><a.*?>(.*?)</a>', response.text)
        return match.group(1).strip() if match else ''
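
BasetempItem is another unshown project class; its fields follow directly from the assignments in parse_item, so it might be declared like this (a sketch, not the actual BaseTemp.items definition; the remaining item classes, HcUrlItem among them, would follow the same pattern):

import scrapy


class BasetempItem(scrapy.Item):
    # fields inferred from the assignments in parse_item()
    crawl_time = scrapy.Field()
    title = scrapy.Field()
    time = scrapy.Field()
    area = scrapy.Field()
    mongo_collection = scrapy.Field()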
Example #6
class Hc360Spider(scrapy.Spider):
    name = 'hc360'
    allowed_domains = ['hc360.com']
    start_urls = ['https://js.hc360.com/category/cn.html']
    headers = header_list.get_header()

    custom_settings = {
        # no login needed, so cookies can stay disabled
        'COOKIES_ENABLED': False,
        # retry hard on server errors and soft bans alike
        'RETRY_HTTP_CODES': [500, 503, 504, 400, 403, 404, 408],
        'RETRY_TIMES': 1000,
        'DOWNLOAD_DELAY': 0.3,
        'ITEM_PIPELINES': {
            'BaseTemp.pipelines.MongoPipeline': 300,
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BaseTemp.middlewares.UserAgentMiddleware': 200,
            # 'BaseTemp.middlewares.ProxyMiddleware': 10,
        },
        'MONGO_DB': 'hc360',
        # 'JOBDIR': 'info/hc360/002',
    }

    def parse(self, response):
        # collect the per-region listing URLs from the category page
        area_url = response.xpath('//article/ul/li/a/@href').extract()
        # area_url = ['https://js.hc360.com/cn/sh/']
        for url in area_url:
            yield scrapy.Request(url=response.urljoin(url),
                                 headers=self.headers,
                                 callback=self.parse_page_num)

    def parse_page_num(self, response):
        # Read the total page count ("共N页") from the first page so every
        # page can be requested directly; following the site's own
        # next-page links gets the crawler banned.
        nums = re.search('共(.*)页', response.text).group(1)
        page_num = int(nums.strip())
        for num in range(2, page_num + 1):  # this response is page 1
            page_url = response.url + str(num) + '/'
            yield scrapy.Request(url=page_url,
                                 headers=self.headers,
                                 callback=self.parse_area)
        # the first page holds listings too, so parse it as well
        for item in self.parse_area(response):
            yield item

    def parse_area(self, response):
        # build one item per company link on the local-company listing
        comp_url = response.xpath('//article/ul/li/div')
        for url in comp_url:
            item = HcUrlItem()  # fresh item per company
            item['crawl_time'] = datetime.now().strftime('%Y-%m-%d')
            item['comp_name'] = url.xpath('a/text()').extract()[0]
            url_temp = url.xpath('a/@href').extract()[0]
            item['comp_id'] = re.search('/company-(.*?)/', url_temp).group(1)
            item['comp_page'] = item['comp_id'] + 'b2b.hc360.com/shop/company.html'
            item['mongo_collection'] = 'url'
            yield item
Example #7
class ChahaoSpider(scrapy.Spider):
    name = 'chahao'
    allowed_domains = ['chahaoba.com']
    start_urls = ['https://www.chahaoba.com/分类:骗子号码']

    custom_settings = {
        # no login needed, so cookies can stay disabled
        'COOKIES_ENABLED': False,
        'DOWNLOAD_DELAY': 1,
        'ITEM_PIPELINES': {
            'BaseTemp.pipelines.MongoPipeline': 300,
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BaseTemp.middlewares.UserAgentMiddleware': 200,
        },
        'MONGO_DB': 'swindler',
        'JOBDIR': 'info/chahaoba/001',
        # 'LOG_FILE': 'imdb_log.txt',
    }
    headers = header_list.get_header()

    def parse(self, response):
        # each category-list entry is one reported phone number
        temps = response.xpath('//div[@class="mw-category"]//li')
        for temp in temps:
            item = ChaohaoItem()  # fresh item per number
            nums = temp.xpath('a/text()').extract()[0]
            item['num'] = get_all_num(nums)
            item['lables'] = '诈骗'  # label: "scam" (field name sic)
            item['crawl_time'] = datetime.now().strftime('%Y-%m-%d')
            item['mongo_collection'] = 'tel'
            yield item

        try:
            # follow the "下一页" (next page) link; MediaWiki HTML-escapes
            # '&' in the href, hence the replace() calls below
            next_page = re.search('.*<a href="(.*?)" title=".*?">下一页</a>',
                                  response.text).group(1)
            next_page = response.urljoin(
                next_page.replace('amp;', '').replace('amp', ''))
            yield scrapy.Request(next_page,
                                 headers=self.headers,
                                 callback=self.parse)
        except Exception:
            print('reached the last page')
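
get_all_num above (and get_num, used by Example #9) and ChaohaoItem come from the project as well and are not shown. Given how they are called, plausible sketches:

import re

import scrapy


def get_all_num(text):
    # keep every digit in the scraped entry,
    # e.g. '+0222 739 3016' -> '02227393016'
    return ''.join(re.findall(r'\d', text))


def get_num(text):
    # same idea; Example #9 then checks for an 11-digit number
    # starting with '1' (a mainland mobile number)
    return ''.join(ch for ch in text if ch.isdigit())


class ChaohaoItem(scrapy.Item):
    num = scrapy.Field()
    lables = scrapy.Field()  # (sic) label text such as '诈骗'
    crawl_time = scrapy.Field()
    mongo_collection = scrapy.Field()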
Example #8
class LieSpider(scrapy.Spider):
    name = 'pianzi'
    allowed_domains = ['pianzi.com.cn']
    start_urls = ['http://www.pianzi.com.cn/shouji_1/']
    headers = header_list.get_header()

    custom_settings = {
        # no login needed, so cookies can stay disabled
        'COOKIES_ENABLED': False,
        'DOWNLOAD_DELAY': 1,
        'ITEM_PIPELINES': {
            'BaseTemp.pipelines.JsonPipeline': 300,
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BaseTemp.middlewares.UserAgentMiddleware': 200,
        },
        'MONGO_DB': 'swindler',
        'JOBDIR': 'info/pianzi/002',
        # 'LOG_FILE': 'imdb_log.txt',
    }

    def parse(self, response):
        # each list-entry title is a reported phone number
        temps = response.xpath(
            '//ul[@class="news_list"]/li/a/@title').extract()
        for temp in temps:
            item = ChaohaoItem()  # fresh item per number
            item['num'] = temp
            item['lables'] = '被举报电话'  # label: "reported number"
            item['crawl_time'] = datetime.now().strftime('%Y-%m-%d')
            # item['mongo_collection'] = 'tel'
            yield item

        try:
            # follow the "下一页" (next page) link
            next_page = re.search('.*<a href="(.*?)">下一页</a>',
                                  response.text).group(1)
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page,
                                 headers=self.headers,
                                 callback=self.parse)
        except Exception:
            print('reached the last page')
Example #9
class ChahaoSpider(scrapy.Spider):
    name = 'chahao'
    allowed_domains = ['chahaoba.com']
    start_urls = [
        'https://www.chahaoba.com/index.php?title=%E5%88%86%E7%B1%BB:%E9%AA%97%E5%AD%90%E5%8F%B7%E7%A0%81&amp%3Bpagefrom=%2B02227393016%EF%BC%9B%2B37911183&pagefrom=%2B0222999767#mw-pages'
    ]

    custom_settings = {
        # no login needed, so cookies can stay disabled
        'COOKIES_ENABLED': False,
        # 'ITEM_PIPELINES': {
        #     'BaseTemp.pipelines.ImdbMongoPipeline': 300,
        # },
        'DOWNLOADER_MIDDLEWARES': {
            'BaseTemp.middlewares.UserAgentMiddleware': 200,
        },
        # 'MONGO_DB': 'imdb',
        # 'JOBDIR': 'info/chahao/001',
        # 'LOG_FILE': 'imdb_log.txt',
    }
    headers = header_list.get_header()

    def parse(self, response):
        # keep only valid 11-digit mainland mobile numbers (leading '1')
        nums = response.xpath('//div[@class="mw-category"]//li')
        for num in nums:
            phone_num = get_num(num.xpath('a/text()').extract()[0])
            if len(phone_num) == 11 and phone_num.startswith('1'):
                num_link = response.urljoin(phone_num)
                yield scrapy.Request(url=num_link,
                                     headers=self.headers,
                                     meta={'phone_num': phone_num},
                                     callback=self.parse_detail)

        try:
            # follow the "下一页" (next page) link, unescaping '&amp;'
            next_page = re.search('.*<a href="(.*?)" title=".*?">下一页</a>',
                                  response.text).group(1)
            next_page = response.urljoin(
                next_page.replace('amp;', '').replace('amp', ''))
            yield scrapy.Request(next_page,
                                 headers=self.headers,
                                 callback=self.parse)
        except Exception:
            print('reached the last page')

    def parse_detail(self, response):
        # parse the number's detail page; the regexes match the page's
        # Chinese labels for region ("归属省份地区") and carrier ("电信运营商")
        try:
            area = re.search('归属省份地区:<a href=".*?">(.*?)</a>',
                             response.text).group(1)
        except AttributeError:
            area = 'unknown'
        try:
            provider = re.search('电信运营商:<a href=".*?">(.*?)</a>',
                                 response.text).group(1)
        except AttributeError:
            provider = 'unknown'

        title = 'scam phone number'

        print(title)
        print(area)
        print(provider)
Example #10
class ImdbSpider(scrapy.Spider):
    name = 'imdb'
    allowed_domains = ['www.imdb.cn']
    start_urls = ['http://www.imdb.cn/nowplaying/1']

    custom_settings = {
        # no login needed, so cookies can stay disabled
        'COOKIES_ENABLED': False,
        'ITEM_PIPELINES': {
            'BaseTemp.pipelines.ImdbMongoPipeline': 300,
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BaseTemp.middlewares.UserAgentMiddleware': 200,
        },
        'MONGO_DB': 'imdb',
        'JOBDIR': 'info/imdb.com/001',
        # 'LOG_FILE': 'imdb_log.txt',
    }
    headers = header_list.get_header()

    def parse(self, response):
        # extract the detail links on this listing page, then queue the
        # next page (the page number is the last path segment of the URL)
        movie_url = response.xpath(
            '//div[@class="ss-3 clear"]/a/@href').extract()
        for url in movie_url:
            yield scrapy.Request(url=response.urljoin(url),
                                 headers=self.headers,
                                 callback=self.parse_movie)

        page = int(response.url.rstrip('/').rsplit('/', 1)[-1])
        next_page = 'http://www.imdb.cn/nowplaying/{0}'.format(page + 1)
        yield scrapy.Request(url=next_page, callback=self.parse)

    def parse_movie(self, response):
        # parse the movie detail page
        movie_item = BasetempItem()
        movie_item['crawl_time'] = datetime.now().strftime('%Y-%m-%d')
        movie_item['title'] = response.xpath(
            '//div[@class="fk-3"]/div/h3/text()').extract()[0].strip()
        movie_item['time'] = self.get_time(response)
        movie_item['area'] = self.get_area(response)
        movie_item['mongo_collection'] = 'movie'  # target Mongo collection

        yield movie_item

    def get_time(self, response):
        # release year, matched against the page's "上映时间" label
        match = re.search(r'<i>上映时间:</i><a.*?>(\d+)</a>', response.text)
        return match.group(1).strip() if match else ''

    def get_area(self, response):
        # country/region, matched against the page's "国家" label
        match = re.search(r'<i>国家:</i><a.*?>(.*?)</a>', response.text)
        return match.group(1).strip() if match else ''
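
Any of these spiders can be run with scrapy crawl <name> from the project root; because configuration lives in each spider's custom_settings dict, no per-spider editing of settings.py is needed, and the spiders that set JOBDIR can be stopped with Ctrl-C and resumed later from the saved state. Running one programmatically takes a few lines (a sketch using Scrapy's documented CrawlerProcess):

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess()
process.crawl(ImdbSpider)  # custom_settings are applied automatically
process.start()  # blocks until the crawl finishes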