def parse(self, response):
    """Collect .mp4 links from the page, store unseen ones as items, and
    follow every extracted link recursively.

    NOTE(review): a MongoClient is opened on every call; it works, but
    ideally the connection is created once (spider __init__ or pipeline).
    """
    item = SpiderItem()
    links = response.xpath('//a/@href').extract()
    client = pymongo.MongoClient("localhost", 27017)  # fixed typo: was `clinet`
    db = client["sihu"]
    for href in links:
        if href.endswith('mp4'):
            # Deduplicate against links already stored in MongoDB.
            if db['mp4'].find({"mp4": href}).count() == 0:
                item['mp4'] = href
                print(href)
                yield item
                # BUG FIX: the original also did `yield SpiderItem`, which
                # yields the item *class* object, not an instance — removed.
            else:
                print('already in')
        # Follow every href (mp4 or not) to keep crawling the site.
        url = 'https://www.27sihu.com' + href
        item['html'] = url
        yield Request(url, callback=self.parse)
def parse(self, response):
    """Extract title, detail link and score for each movie <dd> on a
    Maoyan list page, then request the detail page with parse2.

    The item is attached to the follow-up request via `meta` so parse2
    can complete it.
    """
    movies = Selector(response=response).xpath("//dd")
    for dd in movies:
        item = SpiderItem()
        title = dd.xpath(
            "./div[@class='channel-detail movie-item-title']/a/text()").get()
        href = dd.xpath(
            "./div[@class='channel-detail movie-item-title']/a/@href").get()
        # BUG FIX: <dd> nodes without the expected title/link (layout or ad
        # rows) made .get() return None and crashed the string
        # concatenation below with a TypeError — skip such nodes.
        if title is None or href is None:
            continue
        link = "https://maoyan.com" + href
        if dd.xpath(
                './div[@class="channel-detail channel-detail-orange"]/text()'
        ).get() == "暂无评分":
            score = "暂无评分"
        else:
            # Score is rendered as two <i> fragments (integer + fraction).
            parts = dd.xpath('./div/i/text()').getall()
            score = parts[0] + parts[1]
        item['score'] = score
        item["title"] = title
        item["link"] = link
        yield scrapy.Request(url=link, meta={'item': item}, callback=self.parse2)
def parse(self, response):
    """Yield one SpiderItem (CSV row) per app listed on a 360 app-market
    page."""
    dt = add_date(0).strftime('%Y-%m-%d')
    apps = response.xpath('//*[@id="iconList"]/li')
    for app in apps:
        # BUG FIX: the item was previously instantiated once outside the
        # loop and mutated for every yield, so every yielded object was the
        # same instance; create a fresh item per app (this also matches the
        # sibling spiders in this project).
        item = SpiderItem()
        app_name = app.xpath('./h3/a/text()').extract()[0]
        # The apk filename (minus extension) serves as the package name.
        apk_name = app.xpath('./a[2]/@href').extract()[0].split('/')[-1].replace('.apk', '')
        item['row'] = ','.join([dt, app_name, apk_name, '金融理财', '', '360应用市场'])
        yield item
def parse_page(self, response):
    """Parse the JSON payload of a Xiaomi app-market page and yield one
    SpiderItem (CSV row) per application."""
    payload = json.loads(response.xpath('/html/body/p/text()').extract()[0])
    today = datetime.datetime.now().strftime('%Y-%m-%d')
    for entry in payload['data']:
        item = SpiderItem()
        # Commas are field separators in the row — escape them to '&'.
        name = entry['displayName'].replace(',', '&')
        package = entry['packageName'].replace(',', '&')
        item['row'] = ','.join([today, name, package, '金融理财', '', '小米应用市场'])
        yield item
def parse(self, response):
    """Parse one page of Tencent MyApp's category JSON and paginate.

    The current offset is recovered from the request URL's last query
    parameter; while the result list is non-empty, the next page
    (offset + 20) is requested with this same callback.
    """
    offset = int(response.url.split('&')[-1].split('=')[1])
    payload = json.loads(response.xpath('/html/body/p/text()').extract()[0])
    today = datetime.datetime.now().strftime('%Y-%m-%d')
    if len(payload['obj']) != 0:
        for entry in payload['obj']:
            item = SpiderItem()
            # Escape commas — they delimit fields in the CSV row.
            name = entry['appName'].replace(',', '&')
            package = entry['pkgName'].replace(',', '&')
            item['row'] = ','.join([today, name, package, '金融理财', '', '腾讯应用宝'])
            yield item
        # Page size is 20; an empty 'obj' list ends the recursion.
        offset += 20
        yield scrapy.Request(
            'http://sj.qq.com/myapp/cate/appList.htm?orgame=1&categoryId=114&pageSize=20&pageContext=%s' % offset,
            callback=self.parse)
def parse_page(self, response):
    """Yield one SpiderItem (CSV row) per app on a Baidu app-market page."""
    dt = add_date(0).strftime('%Y-%m-%d')
    apps = response.xpath('//*[@id="doc"]/div[3]/div[1]/div/ul/li')
    for app in apps:
        # BUG FIX: the item used to be created once before the loop and
        # mutated for every yield, so all yields shared one instance;
        # create one item per app instead (matches the sibling spiders).
        item = SpiderItem()
        app_name = app.xpath(
            './a/div/p[3]/span/@data_name').extract()[0].replace(',', '&')
        apk_name = app.xpath(
            './a/div/p[3]/span/@data_package').extract()[0].replace(',', '&')
        item['row'] = ','.join(
            [dt, app_name, apk_name, '金融理财', '', '百度应用市场'])
        yield item
def parse(self, response): trows = response.xpath(r'//*[@id="ctl00_ContentPlaceHolder1_up_Product_GridView1"]/tbody/tr') for row in trows: item = SpiderItem() item["name_EN"] = row.xpath("./td/div/div[1]/a/text()").extract()[0] item["name_EN"] = self.remove_space(item["name_EN"]) if re.search(r"\,[0-9\.]{1,5}%", item["name_EN"]): item["name_EN"] = re.sub(r"\,[0-9\.]{1,5}%", "", item["name_EN"]) item["name_CH"] = row.xpath("./td/div/div[1]/a/span[1]/text()").extract()[0] item["name_CH"] = self.remove_space(item["name_CH"]) item["purity"] = row.xpath("./td/div/div[2]/div[2]/ul/li[1]/text()").extract()[0] item["purity"] = self.remove_space(item["purity"]).split(":")[1] item["CAS"] = row.xpath("./td/div/div[2]/div[2]/ul/li[2]/a/text()").extract()[0] item["CAS"] = self.remove_space(item["CAS"]) item["MDL"] = row.xpath("./td/div/div[2]/div[2]/ul/li[3]/a/text()").extract()[0] item["MDL"] = self.remove_space(item["MDL"]) item["prod_num"] = row.xpath("./td/div/div[2]/div[2]/ul/li[4]/text()").extract()[0] item["prod_num"] = self.remove_space(item["prod_num"]).split(":")[1] item["structure"] = row.xpath("./td/div/div[2]/div[2]/ul/li[5]").xpath("string(.)").extract()[0] item["structure"] = self.remove_space(item["structure"]).split(":")[1] item['picture_url'] = row.xpath("./td/div/div[2]/div[1]/img/@src").extract()[0] item["picture_url"] = "{:s}{:s}".format(self.site_root, self.remove_space(item["picture_url"])) item["Smiles"] = self.get_smiles_by_name(item["name_EN"]) sell_table = row.xpath('.//div[@class="PRODUCT_box"]/div[contains(@id,"up_PackAge_dv_PackAge")]/div/table/tbody') sell_info_list = [] for info_table_row in sell_table.xpath("./tr")[1:]: tds = info_table_row.xpath("./td") sell_info = dict() sell_info["package"] = tds[0].xpath("string(.)").extract()[0] sell_info["unit_price"] = tds[1].xpath("string(.)").extract()[0] sell_info["arrive_time"] = tds[4].xpath("string(.)").extract()[0] if sell_info["arrive_time"] == '现货\xa0详情': sell_info["arrive_time"] = "现货" 
sell_info_list.append(sell_info) item["sell_info"] = sell_info_list yield item next_url = response.xpath('//*[@id="ctl00_ContentPlaceHolder1_Next"]/@href').extract()[0] if next_url: next_url = "http://{:s}{:s}".format(self.site_root, next_url) next_url = urllib.parse.quote(next_url, safe=":/") yield SplashRequest(url=next_url, callback=self.parse, endpoint='execute', args={"lua_source": self.lua_script, "wait": 2.0, "url": next_url}, encoding='utf-8')
def parse(self, response):
    """On a Lianjia second-hand-housing landing page, collect each
    district's name and absolute URL, then crawl every district page with
    parse_community."""
    logging.info("********************New page*********************")
    district_links = response.xpath('//div[@data-role="ershoufang"]/div[1]/a')
    for anchor in district_links:
        infoItem = SpiderItem()
        infoItem["name_distinct"] = anchor.xpath('.//text()').extract_first()
        relative = anchor.xpath('.//@href').extract_first()
        infoItem["href_distinct"] = response.urljoin(relative)
        next_page = infoItem["href_distinct"]
        # Must be yielded (not just called) for Scrapy to schedule it.
        if next_page is not None:
            yield scrapy.Request(next_page, callback=self.parse_community)
def parse_page(self, response):
    """Build one CSV-row item from a Mumayi forum app-detail page.

    Best-effort: any extraction failure (missing nodes, unexpected page
    layout) is printed and the page is skipped without crashing the spider.
    """
    item = SpiderItem()
    dt = add_date(0).strftime('%Y-%m-%d')
    try:
        name_xpath = '/html/body/div[5]/div[1]/div[6]/div[1]/ul/li[1]/text()'
        pkg_xpath = '/html/body/div[5]/div[1]/div[6]/div[1]/ul/li[2]/text()'
        # Escape commas, which delimit fields in the row.
        app_name = response.xpath(name_xpath).extract()[0].replace(',', '&')
        apk_name = response.xpath(pkg_xpath).extract()[0].replace(',', '&')
        item['row'] = ','.join(
            [dt, app_name, apk_name, '金融理财', '', '木蚂蚁论坛'])
        yield item
    except Exception as e:
        print(e)
def parse_page(self, response):
    """Yield one SpiderItem (CSV row) per app on a Wandoujia tag page.

    The finance sub-label is derived from the tag code embedded in the
    request URL (path segment 4).
    """
    dt = datetime.datetime.now().strftime('%Y-%m-%d')
    dic_label = {'5023_631': '支付', '5023_628': '炒股', '5023_627': '银行',
                 '5023_958': '理财记账', '5023_629': '彩票', '5023_955': '借贷',
                 '5023_981': '投资', '5023_1003': '保险'}
    label_code = response.url.split('/')[4]
    label = dic_label[label_code]
    # IMPROVED: iterate the <li> nodes directly instead of counting them
    # and re-querying the whole document with a positional XPath for each
    # index — same results, one pass.
    for li in response.xpath('//*[@id="j-tag-list"]/li'):
        app_name = li.xpath('./div[2]/h2/a/text()').extract()
        apk_name = li.xpath('./div[2]/h2/a/@href').extract()
        # Skip <li> entries that lack the name/link anchor.
        if len(app_name) > 0 and len(apk_name) > 0:
            item = SpiderItem()
            name = app_name[0].replace(',', '&')
            package = apk_name[0].split('/')[-1].replace(',', '&')
            item['row'] = ','.join([dt, name, package, '金融理财', label, '豌豆荚'])
            yield item