Example #1
    def parse(self, response):
        hrefs = response.xpath('//a/@href').extract()

        # Note: opening a new MongoClient on every parse() call is wasteful;
        # in practice create it once, e.g. in __init__ or an item pipeline.
        client = pymongo.MongoClient("localhost", 27017)
        db = client["sihu"]

        for h in hrefs:
            item = SpiderItem()  # fresh item per link; reusing one shared instance is a common bug
            if h.endswith('mp4'):
                # Only yield links not already stored in the 'mp4' collection;
                # count_documents() replaces the deprecated cursor.count().
                if db['mp4'].count_documents({"mp4": h}) == 0:
                    item['mp4'] = h
                    print(h)
                    yield item
                else:
                    print('already in')
            else:
                # Relative link: build the absolute URL and crawl it recursively.
                url = 'https://www.27sihu.com' + h
                item['html'] = url
                yield Request(url, callback=self.parse)
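Every example here yields a SpiderItem, but its definition is never shown. A minimal sketch of what the project's items.py might declare, assuming a single shared item class; the field names are exactly the item[...] keys used across these examples, though the real project may well split them over several item classes:

import scrapy

class SpiderItem(scrapy.Item):
    # Fields used in Examples #1 and #2
    mp4 = scrapy.Field()
    html = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
    score = scrapy.Field()
    # Single CSV-style field used by the app-market spiders (#3-#6, #9, #10)
    row = scrapy.Field()
    # Fields used by the chemical-catalogue spider (#7)
    name_EN = scrapy.Field()
    name_CH = scrapy.Field()
    purity = scrapy.Field()
    CAS = scrapy.Field()
    MDL = scrapy.Field()
    prod_num = scrapy.Field()
    structure = scrapy.Field()
    picture_url = scrapy.Field()
    Smiles = scrapy.Field()
    sell_info = scrapy.Field()
    # Fields used by the housing spider (#8)
    name_distinct = scrapy.Field()
    href_distinct = scrapy.Field()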
Example #2
    def parse(self, response):
        # print(response.url)   # print the page URL
        # print(response.text)  # print the page body

        movies = response.xpath("//dd")

        for movie in movies:
            item = SpiderItem()
            title = movie.xpath(
                "./div[@class='channel-detail movie-item-title']/a/text()").get()
            link = "https://maoyan.com" + movie.xpath(
                "./div[@class='channel-detail movie-item-title']/a/@href").get()

            rating = movie.xpath(
                './div[@class="channel-detail channel-detail-orange"]/text()').get()
            if rating == "暂无评分":  # page shows "no rating yet"
                score = rating
            else:
                # The score is rendered as two text nodes (integer part and
                # fraction, e.g. "9." and "2"); join the first two.
                score = "".join(movie.xpath('./div/i/text()').getall()[:2])

            item['score'] = score
            item["title"] = title
            item["link"] = link
            # Hand the partially-filled item to the detail-page callback.
            yield scrapy.Request(url=link, meta={'item': item}, callback=self.parse2)
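parse2 itself is not shown. The standard Scrapy pattern for the meta={'item': item} hand-off is to pull the item back out of response.meta in the callback, finish filling it in, and yield it; a sketch under that assumption:

    def parse2(self, response):
        # Recover the partially-filled item handed over via meta in parse().
        item = response.meta['item']
        # ... extract whatever detail-page fields the real parse2 collects ...
        yield item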
Example #3
 def parse(self, response):
     dt = add_date(0).strftime('%Y-%m-%d')
     apps = response.xpath('//*[@id="iconList"]/li')
     for app in apps:
         item = SpiderItem()  # fresh item per app; reusing one shared instance is a common bug
         app_name = app.xpath('./h3/a/text()').extract()[0]
         # apk name: final path segment of the download link, minus ".apk"
         apk_name = app.xpath('./a[2]/@href').extract()[0].split('/')[-1].replace('.apk', '')
         item['row'] = ','.join([dt, app_name, apk_name, '金融理财', '', '360应用市场'])
         yield item
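add_date is an external helper that is never defined here. Since add_date(0).strftime('%Y-%m-%d') plays the same role as datetime.datetime.now().strftime('%Y-%m-%d') in Examples #4 and #5, it presumably returns the current time shifted by a number of days; a plausible sketch:

import datetime

def add_date(days):
    # Assumed behavior: now shifted by `days`, so add_date(0) is today.
    return datetime.datetime.now() + datetime.timedelta(days=days)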
Example #4
 def parse_page(self, response):
     # The JSON payload is rendered inside a <p> tag; parse it out of the HTML.
     data = json.loads(response.xpath('/html/body/p/text()').extract()[0])['data']
     dt = datetime.datetime.now().strftime('%Y-%m-%d')
     for tem in data:
         item = SpiderItem()
         # Escape commas so the comma-joined 'row' field stays well-formed.
         app_name = tem['displayName'].replace(',', '&')
         apk_name = tem['packageName'].replace(',', '&')
         item['row'] = ','.join([dt, app_name, apk_name, '金融理财', '', '小米应用市场'])
         yield item
Example #5
 def parse(self, response):
     # Current offset, parsed back out of the pageContext query parameter.
     page_content = int(response.url.split('&')[-1].split('=')[1])
     result = json.loads(response.xpath('/html/body/p/text()').extract()[0])
     dt = datetime.datetime.now().strftime('%Y-%m-%d')
     if len(result['obj']) != 0:
         for tem in result['obj']:
             item = SpiderItem()
             app_name = tem['appName'].replace(',', '&')
             apk_name = tem['pkgName'].replace(',', '&')
             item['row'] = ','.join([dt, app_name, apk_name, '金融理财', '', '腾讯应用宝'])
             yield item
         # Request the next page; the recursion ends once 'obj' comes back empty.
         page_content += 20
         yield scrapy.Request('http://sj.qq.com/myapp/cate/appList.htm?orgame=1&categoryId=114&pageSize=20&pageContext=%s' % page_content, callback=self.parse)
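This spider pages through the Tencent list by bumping pageContext by the page size of 20 on every request. The initial request presumably starts the counter at zero, along the lines of:

    start_urls = [
        'http://sj.qq.com/myapp/cate/appList.htm'
        '?orgame=1&categoryId=114&pageSize=20&pageContext=0'
    ]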
Example #6
 def parse_page(self, response):
     dt = add_date(0).strftime('%Y-%m-%d')
     apps = response.xpath('//*[@id="doc"]/div[3]/div[1]/div/ul/li')
     for app in apps:
         item = SpiderItem()  # fresh item per app, not one shared instance
         app_name = app.xpath(
             './a/div/p[3]/span/@data_name').extract()[0].replace(',', '&')
         apk_name = app.xpath(
             './a/div/p[3]/span/@data_package').extract()[0].replace(',', '&')
         item['row'] = ','.join(
             [dt, app_name, apk_name, '金融理财', '', '百度应用市场'])
         yield item
Example #7
    def parse(self, response):
        trows = response.xpath(r'//*[@id="ctl00_ContentPlaceHolder1_up_Product_GridView1"]/tbody/tr')
        for row in trows:
            item = SpiderItem()
            item["name_EN"] = row.xpath("./td/div/div[1]/a/text()").extract()[0]
            item["name_EN"] = self.remove_space(item["name_EN"])
            # Strip any trailing ",<purity>%" suffix from the English name.
            item["name_EN"] = re.sub(r",[0-9.]{1,5}%", "", item["name_EN"])

            item["name_CH"] = row.xpath("./td/div/div[1]/a/span[1]/text()").extract()[0]
            item["name_CH"] = self.remove_space(item["name_CH"])

            item["purity"] = row.xpath("./td/div/div[2]/div[2]/ul/li[1]/text()").extract()[0]
            item["purity"] = self.remove_space(item["purity"]).split(":")[1]

            item["CAS"] = row.xpath("./td/div/div[2]/div[2]/ul/li[2]/a/text()").extract()[0]
            item["CAS"] = self.remove_space(item["CAS"])

            item["MDL"] = row.xpath("./td/div/div[2]/div[2]/ul/li[3]/a/text()").extract()[0]
            item["MDL"] = self.remove_space(item["MDL"])

            item["prod_num"] = row.xpath("./td/div/div[2]/div[2]/ul/li[4]/text()").extract()[0]
            item["prod_num"] = self.remove_space(item["prod_num"]).split(":")[1]

            item["structure"] = row.xpath("./td/div/div[2]/div[2]/ul/li[5]").xpath("string(.)").extract()[0]
            item["structure"] = self.remove_space(item["structure"]).split(":")[1]

            item['picture_url'] = row.xpath("./td/div/div[2]/div[1]/img/@src").extract()[0]
            item["picture_url"] = "{:s}{:s}".format(self.site_root, self.remove_space(item["picture_url"]))

            item["Smiles"] = self.get_smiles_by_name(item["name_EN"])

            # Per-product price/packaging table; the first <tr> is skipped below.
            sell_table = row.xpath('.//div[@class="PRODUCT_box"]/div[contains(@id,"up_PackAge_dv_PackAge")]/div/table/tbody')
            sell_info_list = []
            for info_table_row in sell_table.xpath("./tr")[1:]:
                tds = info_table_row.xpath("./td")
                sell_info = dict()
                sell_info["package"] = tds[0].xpath("string(.)").extract()[0]
                sell_info["unit_price"] = tds[1].xpath("string(.)").extract()[0]
                sell_info["arrive_time"] = tds[4].xpath("string(.)").extract()[0]
                # '现货\xa0详情' is "in stock" plus a no-break space plus a
                # "details" link caption; keep only the "in stock" part.
                if sell_info["arrive_time"] == '现货\xa0详情':
                    sell_info["arrive_time"] = "现货"
                sell_info_list.append(sell_info)
            item["sell_info"] = sell_info_list
            yield item
        next_url = response.xpath('//*[@id="ctl00_ContentPlaceHolder1_Next"]/@href').extract_first()
        # extract_first() returns None when there is no next link, so this guard works.
        if next_url:
            next_url = "http://{:s}{:s}".format(self.site_root, next_url)
            next_url = urllib.parse.quote(next_url, safe=":/")
            yield SplashRequest(url=next_url, callback=self.parse, endpoint='execute', args={"lua_source": self.lua_script, "wait": 2.0, "url": next_url}, encoding='utf-8')
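remove_space and get_smiles_by_name are helper methods on this spider that are not shown. Given that remove_space's output is fed straight into split(':') and string formatting, it presumably strips the whitespace (including the \xa0 no-break spaces this site uses) around the scraped text; a guess at its shape:

    def remove_space(self, text):
        # Assumed: normalize no-break spaces and trim surrounding whitespace.
        return text.replace('\xa0', ' ').strip()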
Example #8
    def parse(self, response):
        logging.info("********************New page*********************")
        for info in response.xpath('//div[@data-role="ershoufang"]/div[1]/a'):
            infoItem = SpiderItem()
            infoItem["name_distinct"] = info.xpath('.//text()').extract_first()
            distinct_link = info.xpath('.//@href').extract_first()
            infoItem["href_distinct"] = response.urljoin(distinct_link)
            next_page = infoItem["href_distinct"]
            if next_page is not None:
                # Note: this must be yield-ed for Scrapy to keep issuing requests.
                yield scrapy.Request(next_page, callback=self.parse_community)
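parse_community is not shown either; it receives each district page. A hypothetical stub, purely for illustration:

    def parse_community(self, response):
        # Hypothetical continuation: extract per-district listings here.
        logging.info("community page: %s", response.url)
        # ... real extraction logic would go here ...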
Example #9
 def parse_page(self, response):
     item = SpiderItem()
     dt = add_date(0).strftime('%Y-%m-%d')
     try:
         # Absolute XPaths like these are brittle: any layout change breaks them.
         app_name = response.xpath(
             '/html/body/div[5]/div[1]/div[6]/div[1]/ul/li[1]/text()'
         ).extract()[0].replace(',', '&')
         apk_name = response.xpath(
             '/html/body/div[5]/div[1]/div[6]/div[1]/ul/li[2]/text()'
         ).extract()[0].replace(',', '&')
         item['row'] = ','.join(
             [dt, app_name, apk_name, '金融理财', '', '木蚂蚁论坛'])
         yield item
     except Exception as e:
         print(e)  # swallow pages lacking the expected structure, but log the error
Example #10
 def parse_page(self, response):
     app_counts = len(response.xpath('//*[@id="j-tag-list"]/li'))
     dt = datetime.datetime.now().strftime('%Y-%m-%d')
     # Map the category code embedded in the URL path to a readable label.
     dic_label = {'5023_631': '支付', '5023_628': '炒股', '5023_627': '银行', '5023_958': '理财记账',
                  '5023_629': '彩票', '5023_955': '借贷', '5023_981': '投资', '5023_1003': '保险'}
     label_code = response.url.split('/')[4]
     label = dic_label[label_code]
     for i in range(app_counts):
         item = SpiderItem()
         # XPath positions are 1-based, hence i + 1.
         app_name = response.xpath('//*[@id="j-tag-list"]/li[%s]/div[2]/h2/a/text()' % (i + 1)).extract()
         apk_name = response.xpath('//*[@id="j-tag-list"]/li[%s]/div[2]/h2/a/@href' % (i + 1)).extract()
         if len(app_name) > 0 and len(apk_name) > 0:
             app_name = app_name[0].replace(',', '&')
             apk_name = apk_name[0].split('/')[-1].replace(',', '&')
             item['row'] = ','.join([dt, app_name, apk_name, '金融理财', label, '豌豆荚'])
             yield item
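The indexed-XPath loop above works, but it re-queries the whole document on every pass. Scrapy also lets you iterate the <li> selectors directly and use relative XPaths; an equivalent rewrite of the loop:

     for li in response.xpath('//*[@id="j-tag-list"]/li'):
         item = SpiderItem()
         app_name = li.xpath('./div[2]/h2/a/text()').extract_first()
         apk_name = li.xpath('./div[2]/h2/a/@href').extract_first()
         if app_name and apk_name:
             item['row'] = ','.join([dt, app_name.replace(',', '&'),
                                     apk_name.split('/')[-1].replace(',', '&'),
                                     '金融理财', label, '豌豆荚'])
             yield item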