def parse(self, response):
    try:
        # response.text is a property on scrapy.Response, not a method
        json_obj = json.loads(response.text)
        results = json_obj["results"]["reviews"]
        page_id = json_obj["results"]["page_id"]
        if len(results) > 0:
            with open("reviews.json", "a+") as f:
                for review in results:
                    review["page_id"] = page_id
                    f.write(json.dumps(review) + "\n")
        if "next_page_url" in json_obj["paging"]:
            next_page_url = json_obj["paging"]["next_page_url"]
            if len(next_page_url) > 0:
                logger.info("next paging: %s", next_page_url)
                # yield scrapy.Request(url=next_page_url,
                #                      headers=self.headers,
                #                      callback=self.parse)
        return
    except Exception as e:
        logger.exception("Error parsing review: %s", e)
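
# For reference, a minimal sketch of the response body that parse() above
# assumes. The keys are taken from the accesses in the code; the values are
# hypothetical and only illustrate the shape, not a real API payload.
example_response = {
    "results": {
        "page_id": "p-123",  # copied onto every review before it is written out
        "reviews": [
            {"author": "a", "text": "t"},  # each review is appended to reviews.json as one JSON line
        ],
    },
    "paging": {
        "next_page_url": "https://example.com/reviews?page=2",  # followed when non-empty
    },
}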

def parse_company_page(self, response):
    item = HuicongItem()
    item["companyName"] = response.meta["company_name"]
    item["companyUrl"] = response.meta['company_url']
    soup = BeautifulSoup(response.body, "html.parser")  # explicit parser avoids bs4's guessing warning
    detail_info = soup.select('div[class^="contentbox"] div[class^="detailsinfo"]')
    self.get_company_info(detail_info, item)
    try:
        item['memberInfo'] = re.sub(
            '[\t\n\r]', '',
            soup.select('div[class^="contentbox"] div[class^="memyear"]')[0]
            .find("span").text.strip())
    except Exception:
        item['memberInfo'], item['MyeeigIndex'], item['merchantGrade'] = "", "", ""
    else:
        if u"买卖通会员" not in item['memberInfo']:
            item['MyeeigIndex'] = ""
        else:
            item['MyeeigIndex'] = soup.select(
                'div[class^="contentbox"] div[class^="comInfogo"] span[class^="redbold14"]'
            )[0].a.text
        # the grade is encoded in the badge image's file name
        item['merchantGrade'] = soup.select(
            'div[class^="contentbox"] div[style^="color"] a[target^="_blank"]'
        )[0].img['src'].split('/')[-1].split('.')[0]
    item['contactPerson'], item['cellphone'], item['phone'], item['fax'] = "", "", "", ""
    try:
        contact_info = soup.select('div[class^="contentbox"] div[class^="contactbox"]')[0]
    except Exception as ex:
        logger.info(f"[SPIDERCONTACT] {response.url} {ex}")
    else:
        item['contactPerson'] = ''.join(
            info.text.strip()
            for info in contact_info.find_all("li")[0].find_all("span"))
        contact_ways = contact_info.find_all("li")[2:-1]
        try:
            for contact_way in contact_ways:
                parts = contact_way['title'].split(u":")
                if u'电话' in parts:  # phone
                    item['phone'] = parts[1]
                if u'手机' in parts:  # cellphone
                    item['cellphone'] = parts[1]
                if u'传真' in parts:  # fax
                    item['fax'] = parts[1]
        except Exception as ex:
            logger.info(f"[SPIDERCONTACTWAY] {response.url} {ex}")
    item['collctTime'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
    return item

def process_item(self, item, spider):
    if isinstance(item, BiqugeItem):
        flag = 1
        name = item['name']
        author = item['book_author']
        _id = item['book_id']
        url = item['book_url']
        category = item['category']
        status = item['status']
        ret = Sql.check_book_id(_id)
        if ret != 1:
            Sql.insert_message(name, author, category, status, url, _id, flag=flag)
        else:
            logger.info('Book {} already exists in the database'.format(name))
    if isinstance(item, BookItem):
        flag = 2
        num = item['num']
        chapter_name = item['chapter_name']
        chapter_url = item['chapter_url']
        chapter_id = item['chapter_id']
        book_id = item['book_id']
        book_name = item['book_name']
        chapter_content = item['chapter_content']
        ret = Sql.check_chapter_id(chapter_id)
        if ret != 1:
            Sql.insert_message(book_name, book_id, chapter_name, chapter_id,
                               chapter_url, chapter_content, flag=flag)
        else:
            logger.info('Chapter {} {} already exists in the database'.format(
                book_name, chapter_name))
    return item

def parse_login(self, response):
    info = json.loads(response.text)
    if info['error'] == '0':
        logger.info('Login succeeded :-)')
        return super().start_requests()
    logger.info('Login failed :-(, logging in again...')
    return self.start_requests()

def __init__(self):
    with open("data/list_user_agent.txt") as f:
        self.user_agent_list = []
        for line in f:
            self.user_agent_list.append(line.strip().replace(",", ""))
    self.number_of_requests_interval = 100
    logger.info("Random user-agent rotation enabled")
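
# The __init__ above only loads the user-agent pool; the matching
# process_request hook is not shown. A minimal sketch of what it plausibly
# looks like, assuming the middleware rotates with random.choice (the
# number_of_requests_interval field suggests the agent may only be refreshed
# every N requests, which is omitted here):
import random

def process_request(self, request, spider):
    # Hypothetical rotation hook: attach a random agent from the pool
    # loaded in __init__ to every outgoing request.
    user_agent = random.choice(self.user_agent_list)
    request.headers.setdefault(b"User-Agent", user_agent)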

def get_content(self, response):
    meta = response.meta
    item = JoviLonglasttimeItem()
    item['article_url'] = response.url
    item['first_tag'] = meta['first_tag']
    item['second_tag'] = meta['second_tag']
    host = urlparse(response.url).netloc
    xpath = self.xpath.get(host)
    if xpath:
        item['article_title'] = response.xpath(xpath['title']).get().strip()
        ps = response.xpath(xpath['ps']).getall()
    else:
        logger.info('No parsing XPath configured for this URL: {}'.format(response.url))
        return
    content = ''
    for p in ps:
        # skip boilerplate lines (editor credits, source attributions, image credits, ...)
        if re.search(r'责任编辑:|作者:|出处:|{}|来自:|来源 :|来源:|来源 : |图片来自|图片由|图:|更多精彩|请投稿至:|文|文/|编辑', p):
            continue
        # stop at trailing boilerplate (WeChat plugs, references, disclaimers, ...)
        elif re.search(r'关注微信公众号|参考资料|声明:|原网页已经由 ZAKER 转码排版 |推荐阅读', p):
            break
        else:
            content += p.strip()
    item['article_content'] = (content.replace('\n', '').replace('\r', '')
                               .replace('\t', '').replace('\u3000', '').replace('\xa0', ''))
    yield item

def close_spider(self, spider):
    if len(self.goods_items) > 0:
        self.insert_many_goods()  # flush any buffered items before closing
    self.client.close()
    self.stats.set_value("finaly_insert_item", self.insert_num)
    logger.info("Inserted {} items in total".format(self.insert_num))

def parse(self, response):
    # dump the raw page for offline debugging; the with-block closes the file
    with open('./detail.txt', 'wb') as f:
        f.write(response.body)
    itemdetail = TesterhomeDetailSpiderItem()
    itemdetail['topic_id'] = response.xpath(
        '//a[contains(@class, "qrcode")]/@data-url').extract()[0].split('/')[-1]
    itemdetail['topic_title'] = response.xpath(
        '//div[contains(@class, "media-body")]/h1/text()').extract()[0]
    topic_body = ''
    for i in response.xpath(
            '//div[contains(@class, "panel-body markdown")]/article/p/text()').extract():
        topic_body += i
    itemdetail['topic_body'] = topic_body
    itemdetail['topic_author'] = response.xpath(
        '//a[contains(@data-author, "true")]/@data-name').extract()[0]
    itemdetail['topic_like_num'] = response.xpath(
        '//a[contains(@class, "likeable")]/@data-count').extract()[0]
    itemdetail['topic_reply_num'] = response.xpath(
        '//div[contains(@class, "total panel-heading")]/b/text()').extract()[0]
    itemdetail['topic_timeago'] = response.xpath(
        '//abbr[contains(@class, "timeago")]/@title').extract()[0]
    yield itemdetail
    # reply parsing, currently disabled:
    # for sel in response.xpath('//div[contains(@class, "infos")]'):
    #     itemreplydetail = TesterhomeDetailReplySpiderItem()
    #     itemreplydetail['topic_reply_author'] = sel.xpath('div[contains(@class, info)]/span[contains(@class, "name")]/a/@data-name').extract()[0]
    #     itemreplydetail['topic_reply_timeago'] = sel.xpath('div[contains(@class, info)]/span[contains(@class, "time")]/abbr/@title').extract()[0]
    #     itemreplydetail['topic_reply_like_num'] = sel.xpath('div[contains(@class, info)]/span[contains(@class, "opts pull-right")]/a[contains(@class, "likeable")]/@data-count').extract()[0]
    #     for i in sel.xpath('div[contains(@class, markdown)]/p/text()').extract():
    #         itemreplydetail['topic_reply_author'] += i
    #     yield itemreplydetail
    if int(itemdetail['topic_id']) < 100:
        # walk to the next topic id until topic 100
        yield Request('http://testerhome.com/topics/' + str(int(itemdetail['topic_id']) + 1),
                      callback=self.parse)
        # self.start_urls.append('http://testerhome.com/topics/'+str(int(itemdetail['topic_id'])+1))
    else:
        logger.info('topic_id > 100')

def from_crawler(cls, crawler, *args, **kwargs):
    spider = super(ListeningKafkaSpider, cls).from_crawler(crawler, *args, **kwargs)
    if not hasattr(spider, 'topic') or not spider.topic:
        spider.topic = '%s-starturls' % spider.name
    hosts = crawler.settings.get('SCRAPY_KAFKA_HOSTS', 'localhost:9092')
    consumer_group = crawler.settings.get(
        'SCRAPY_KAFKA_SPIDER_CONSUMER_GROUP', 'scrapy-kafka')
    _kafka = SimpleClient(hosts)
    # wait at most 1 sec for more messages, otherwise continue
    spider.consumer = SimpleConsumer(_kafka, consumer_group, spider.topic,
                                     auto_commit=True, iter_timeout=1.0)
    # the idle signal fires when the spider has no requests left;
    # that's when we schedule new requests from the kafka topic
    crawler.signals.connect(spider.spider_idle, signal=signals.spider_idle)
    crawler.signals.connect(spider.item_scraped, signal=signals.item_scraped)
    logger.info("Reading URLs from kafka topic '%s'" % spider.topic)
    return spider
def process_request(self, request, spider): print("**************ProxyMiddleware have pass************" + self.proxy) request.meta['proxy'] = "http://" + self.proxy request.meta['handle_httpstatus_list'] = [200] logger.info("process_request eval! %s" % request.meta)

def setup_kafka(self, settings):
    """Set up the kafka consumer and idle signal.

    This should be called after the spider has set its crawler object.

    :param settings: The current Scrapy settings being used
    :type settings: scrapy.settings.Settings
    """
    if not hasattr(self, 'topic') or not self.topic:
        self.topic = '%s-starturls' % self.name
    hosts = settings.get('SCRAPY_KAFKA_HOSTS', 'localhost:9092')
    consumer_group = settings.get('SCRAPY_KAFKA_SPIDER_CONSUMER_GROUP', 'scrapy-kafka')
    _kafka = SimpleClient(hosts)
    # wait at most 1 sec for more messages, otherwise continue
    self.consumer = SimpleConsumer(_kafka, consumer_group, self.topic,
                                   auto_commit=True, iter_timeout=1.0)
    # the idle signal fires when the spider has no requests left;
    # that's when we schedule new requests from the kafka topic
    self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
    self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
    logger.info("Reading URLs from kafka topic '%s'" % self.topic)
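
# Both kafka snippets above connect spider_idle to a handler that is not
# shown. A hedged sketch of what such a handler typically does for this
# mixin, using the old kafka-python SimpleConsumer API the snippets already
# rely on; the message encoding and batch size are assumptions.
from scrapy import Request
from scrapy.exceptions import DontCloseSpider

def spider_idle(self):
    # Hypothetical idle handler: pull the next batch of URLs from the
    # consumer set up above and schedule them as requests.
    for message in self.consumer.get_messages(count=10):
        url = message.message.value.decode('utf-8')
        self.crawler.engine.crawl(Request(url, callback=self.parse), self)
    # raising DontCloseSpider keeps the spider alive and polling
    raise DontCloseSpider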

def parse_artclieItem(self, item, spider):
    article = dict()
    title = join(item['title']).replace(" ", "")
    auther = join(item['auther']).replace(" ", "")
    source = join(item['source']).replace(" ", "")
    catalog = join(item['catalog']).replace(" ", "")
    art_url = join(item['artUrl']).replace(" ", "")
    create_time = item['createTime']
    artTime = join(item['artTime']).replace(" ", "")
    artFromUrl = join(item['artFromUrl']).replace(" ", "")
    artImageUrl = join(item['artImageUrl']).replace(" ", "")
    conList = item['content']
    # allhtml = join(item['xpathTag'])
    # artInfo: header metadata identifying the article
    article['artInfo'] = {
        'catalog': catalog,
        'title': title,
        'source': source,
        'art_url': art_url,
        'auther': auther,
        'create_time': create_time,
        'artTime': artTime,
        'artFromUrl': artFromUrl,
        'artImageUrl': artImageUrl
    }
    # artTypeInfo: which content types the article contains
    isString = item['isString']
    isImage = item['isImage']
    isVideo = item['isVideo']
    article['artTypeInfo'] = {
        'isString': isString,
        'isImage': isImage,
        'isVideo': isVideo
    }
    # artContent: the article body as a list of content blocks
    article['artContent'] = {'content': conList}
    logger.info("pipelines->parse_artclieItem=%s\n itemlen=%s" % (article, len(article)))
    jsonOutPut = json.dumps(article, ensure_ascii=True, encoding='utf-8')
    logger.info("pipelines->parse_artclieItem=%s\n itemlen=%s" % (jsonOutPut, len(article)))
    # write the JSON to a file under the configured output directory
    fileName = "vgtime/" + title + ".json"
    filePath = Constant.jsonOutPath + "/" + fileName
    jsonOutPut = jsonOutPut.replace(" ", "")  # str.replace returns a new string
    with open(filePath, 'wb') as jsonFile:
        jsonFile.write(jsonOutPut)

def parse_login(self, response):
    # pdb.set_trace()
    info = json.loads(response.text)
    if info['error'] == '0':
        logger.info('ok')
        return super().start_requests()
    logger.info('failed')
    return self.start_requests()

def parse_login(self, response):
    info = json.loads(response.text)
    if info['error'] == '0':
        logger.info('Login Success')
        return super().start_requests()
    logger.info('Login Fail')
    return self.start_requests()

def parse_login(self, response):
    # decide from the response body whether the login succeeded
    info = json.loads(response.text)
    if info['error'] == '0':
        logger.info('Login succeeded')
        return super().start_requests()
    logger.info('Login failed')
    return self.start_requests()

def init(self):
    with codecs.open(CRAWLED_STORE_FILE_PATH, 'r') as file:
        for line in file.readlines():
            try:
                # json.loads takes no encoding argument for str input
                link = json.loads(line)['companyUrl'].strip()
                self.add(namespace_url(link))
            except Exception as ex:
                logger.info(f"[URLFILTERERROR] {line} {ex}")
    logger.info(f"[URLFILTERINIT] {self.count}")

def start_requests(self):
    for city, url in all_city_map().items():
        page = get_new_house_page(url)
        self.city_url = url
        self.city = city
        logger.info('{} has {} pages in total'.format(city, page))
        for i in range(1, page + 1):
            crawl_url = '{}/loupan/pg{}'.format(url, str(i))
            yield Request(crawl_url, self.parse, dont_filter=True)

def parse_login_ed(self, response):
    # check whether the login succeeded; if not, log in again.
    # The form response body is a JSON string carrying the authentication
    # result; after converting it to a dict, the 'error' field decides.
    info = json.loads(response.text)
    if info['error'] == '0':
        logger.info('Login succeeded :-)')
        return super().start_requests()
    else:
        logger.info('Login failed :-( logging in again...')
        return self.start_requests()

def process_request(self, request, spider):
    # example proxy pool (currently unused; self.proxy is set elsewhere)
    available = [
        "http://186.249.71.237:20183",
        'http://39.104.55.229:8080',
    ]
    print("************** ProxyMiddleware has run ************ " + self.proxy)
    request.meta['proxy'] = "http://" + self.proxy
    request.meta['download_timeout'] = 20
    request.meta['retry_times'] = 2
    logger.info("process_request eval! %s" % request.meta)

def process_item(self, item, spider):
    if isinstance(item, CommitItem):
        # a CommitItem signals that buffered rows should be flushed
        try:
            self.conn.commit()
        except Exception as e:
            logger.info('commit failed: %s', e)
    else:
        self.cursor.execute(
            self.sql,
            (item["bureau"], item["station"], item["name"], item["address"],
             item["passenger"], item["luggage"], item["package"], item["turn"]))

def parse_login(self, response):
    # huge pitfall!! this cost two hours: the JSON payload must not be
    # wrapped in outer double quotes, so strip the quoting/escaping first
    response_text = (response.text.replace("\"{", "{")
                     .replace("}\"", "}").replace("\\", ""))
    result = json.loads(response_text)
    print(response_text)
    print("Login result: " + str(result['isSuccess']))
    if result['isSuccess']:
        logger.info('Login succeeded')
        return super().start_requests()
    logger.info("Login failed, logging in again.")
    return self.start_requests()

def wrapper(self, item, spider):
    # message template for debugging; the doubled braces survive the first
    # format so the template can be filled with "executing"/"skipping" below
    msg = "{{0}} {0} pipeline step".format(self.__class__.__name__)
    # if this class is in the spider's pipeline, run process_item normally
    if self.__class__ in spider.pipeline:
        logger.info(msg.format("executing"))
        return process_item_method(self, item, spider)
    # otherwise, return the item untouched (skip this step in the pipeline)
    else:
        logger.info(msg.format("skipping"))
        return item
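
# The wrapper above is the body of the well-known per-spider pipeline
# toggle decorator. A hedged sketch of how it is usually assembled and used;
# the decorator name and the spider's `pipeline` attribute follow the common
# recipe, not code shown here.
import functools

def check_spider_pipeline(process_item_method):
    # Hypothetical decorator: wraps process_item with the toggle above.
    @functools.wraps(process_item_method)
    def wrapper(self, item, spider):
        ...  # body as in the snippet above
    return wrapper

# Usage sketch:
# class MyPipeline(object):
#     @check_spider_pipeline
#     def process_item(self, item, spider):
#         return item
#
# class MySpider(scrapy.Spider):
#     pipeline = {MyPipeline}  # opt this spider in to MyPipeline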

def process_request(self, request, spider):
    try:
        # enable the disk cache and disable image loading to speed things up
        driver = webdriver.PhantomJS(
            service_args=['--disk-cache=true', '--load-images=false'])
        wait = WebDriverWait(driver, 1)  # give the browser 1 s to load
        driver.set_window_size(100, 1200)
        if request.url.startswith("http://www.sohu.com"):
            other = request.url.split('depth')
            depth = int(other[1].split('=')[1])
            url = other[0]
            if depth == 1 or depth == 2:
                try:
                    time.sleep(1)
                    driver.get(url)
                except Exception:
                    logger.info('Exception at depth 1 or 2')
                # speeding up with Chrome instead of PhantomJS:
                # chrome_opt = webdriver.ChromeOptions()
                # prefs = {"profile.managed_default_content_settings.images": 2}
                # chrome_opt.add_experimental_option("prefs", prefs)
                # # driver = webdriver.Chrome(service_args=['--disk-cache=true','--load-images=false'])
                # driver = webdriver.Chrome(chrome_options=chrome_opt)
                # driver.set_window_size(100, 1000)
                # driver.get(request.url)
                # # scroll the page down by its full height
                # js = "document.documentElement.scrollTop=document.documentElement.scrollHeight"
                # end = driver.find_elements_by_xpath("//div[@class='more-load' and @style='']")
                # count = 1
                # while not len(end):
                #     print(count)
                #     driver.execute_script(js)
                #     time.sleep(1)
                #     end = driver.find_elements_by_xpath("//div[@class='more-load' and @style='']")
                #     count += 1
                # scroll to the bottom 50 times to trigger lazy loading
                count = 0
                while count < 50:
                    driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
                    count += 1
                    logger.info('Loading...')
                    time.sleep(0.5)
                content = driver.page_source.encode('utf-8')
                driver.close()
                return HtmlResponse(request.url, body=content)
            else:
                try:
                    time.sleep(1)
                    driver.get(url)
                except Exception:
                    logger.info('Exception at depth 3')
                content = driver.page_source.encode('utf-8')
                driver.close()
                return HtmlResponse(request.url, body=content)
    except Exception:
        logger.info('Exception in process_request')

def parse(self, response):
    t1 = time.time()
    html = scrapy.Selector(text=response.text)
    divs = html.css("#content_left > div .f13 .c-tools::attr(data-tools)")
    for div in divs:
        data_str = div.extract()
        data_dict = json.loads(data_str)
        url = None
        try:
            # follow Baidu's redirect to resolve the real target URL
            url = requests.get(data_dict['url'], timeout=5).url
            netloc = urllib.parse.urlparse(url).netloc
            sql = f"insert into seed(url,title,site_name,type) values('{url}','{data_dict['title']}','{netloc}',1)"
            self.mysql.excute_sql(sql)
        except Exception as e:
            logger.error(f"requests.get(data_dict['url']).url ===>>> {str(e)}")
    t2 = time.time()
    logger.info(f"Processed ===>>> {response.url} took {str(t2 - t1)} s")

def openlink(self, url):
    """Retry wrapper around requests.get (works around urlopen error 10060).

    :param url: the URL to request
    :return: the server response, or None if all attempts fail
    """
    maxTryNum = 15
    for tries in range(maxTryNum):
        try:
            logger.info("Requesting %s" % url)
            req = requests.get(url, timeout=13, headers=self.headers)
            logger.info('Request succeeded: %s' % url)
            return req
        except Exception:
            if tries < (maxTryNum - 1):
                continue
            else:
                logger.info("Failed to connect to %s after %d attempts!" % (url, maxTryNum))

def parse(self, response):
    t1 = time.time()
    html = scrapy.Selector(text=response.text)
    divs = html.css("div.results > div")
    for div in divs:
        vrwrap = div.css("div.vrwrap")
        if len(vrwrap) == 0:
            title = "".join(div.css("div.rb h3 a::text").extract())
            url = "https://www.sogou.com" + div.css("div.rb h3 a::attr(href)").extract()[0]
        else:
            title = "".join(div.css("div.vrwrap h3 a::text").extract())
            url = "https://www.sogou.com" + div.css("div.vrwrap h3 a::attr(href)").extract()[0]
        try:
            # Sogou links point to an interstitial page that redirects via
            # window.location.replace(); extract the real target URL from it
            _html = scrapy.Selector(text=requests.get(url, verify=False).text)
            url = _html.re(r'window\.location\.replace\("(.*?)"\)')[0]
            netloc = urllib.parse.urlparse(url).netloc
            sql = f"insert into seed(url,title,site_name,type) values('{url}','{title}','{netloc}',1)"
            self.mysql.excute_sql(sql)
        except Exception as e:
            logger.error(f"requests.get(url).url ===>>> {str(e)}")
    t2 = time.time()
    logger.info(f"Processed ===>>> {response.url} took {str(t2 - t1)} s")
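
# Both search-result parsers above interpolate scraped titles and URLs
# directly into the INSERT statement, which breaks on quotes and is open to
# SQL injection. A hedged sketch of a parameterized alternative, assuming a
# DB-API cursor is available (the signature of self.mysql.excute_sql is not
# shown):
sql = "insert into seed(url, title, site_name, type) values (%s, %s, %s, 1)"
cursor.execute(sql, (url, title, netloc))  # the driver handles quoting/escaping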

def parse(self, response):
    urls = response.css("#__next > div")[1].css(
        "div.eWvQyF > div > div > div > a::attr(href)").extract()
    if len(urls) == 0:
        logger.info("No urls found in menu bar")
        logger.info("Trying to fetch top categories instead")
        urls = response.css("#__next div.eIYDfA > div > div > a::attr(href)").extract()
    if len(urls) == 0:
        urls.append(response.url)
    with open("level2.txt", "a") as f:
        for url in urls:
            f.write(url + "\n")
    logger.info("Wrote %s urls to file", len(urls))

def parse_item(self, response):
    from spider_service.items import TBItem
    from spider_service.common.tools import LevelTool
    selector = scrapy.Selector(response)
    data_obj = TBItem()
    item_rsp_text = response.body_as_unicode()
    if response.status in [302, 301]:
        # follow the Location header unless it bounces through taobao's login jump
        redirect_url = response.url
        if (response.headers.get('Location')
                and 'login.taobao.com/jump?target=' in response.headers.get('Location')
                and 'login.taobao.com/jump' not in response.url):
            redirect_url = response.headers.get('Location')
        item_rsp = RequestHandle.get_rsp(redirect_url)
        item_rsp_text = item_rsp.text
        selector = scrapy.Selector(item_rsp)
    # note: the trailing "or True" makes this branch unconditional as written
    if 'detail.tmall.com/item.htm' in response.url or 'tmall.com' in response.url or True:
        title = selector.xpath('//h3[@class="tb-main-title"]/@data-title').extract_first()
        if not title:
            title = selector.xpath('//input[@name="title"]/@value').extract_first()
        if not title:
            logger.info('title not found, response url: ' + response.url)
            return
        # original price; fetching the promotional price often requires verification
        price = response.xpath("//em[@class='tb-rmb-num']/text()").extract_first()
        nick = selector.xpath('//div[@class="tb-shop-info-wrap"]/div/div[@class="tb-shop-seller"]//a[@class="tb-seller-name"]/text()').extract_first()
        nick_tmc = selector.xpath('//a[@class="slogo-shopname"]/strong/text()').extract_first()
        nick = nick_tmc if not nick and nick_tmc else nick
        shop_type = 'B' if nick_tmc else 'C'
        if nick:
            nick = nick.replace('\n', '').strip()
        else:
            # corporate shops: the seller nick has to be pulled from inline JS
            nick_list = re.findall('''[^\w]*sellerNick\s*:\s*['"](.+)['"]''', response.body)
            if not nick_list:
                return
            nick = urllib.unquote_plus(nick_list[0]).decode('gbk') if nick_list else nick
        # e.g. we can tell the seller is "level 3" but not whether that means
        # crowns or diamonds: the badge image sits in a CSS background property
        level_type = selector.xpath('//div[@class="tb-shop-info-hd"]/div[2]/@class').extract_first()
        level_num = len(selector.xpath('//div[@class="tb-shop-info-hd"]/div[2]//i'))
        rate_url = selector.xpath('//div[@class="tb-shop-info-hd"]/div[2]//a/@href').extract_first()
        # price_url looks like:
        # https://detailskip.taobao.com/service/getData/1/p1/item/detail/sib.htm?itemId=559466494297&sellerId=96216586&modules=dynStock,qrcode,viewer,price,duty,xmpPromotion,delivery,upp,activity,fqg,zjys,amountRestriction,couponActivity,soldQuantity,originalPrice,tradeContract
        price_url_list = re.findall('''[^\w]*wholeSibUrl\s*:\s*['"](.+)['"]''', response.body)
        count_url_list = re.findall('''[^\w]*counterApi\s*:\s*['"](.+)['"]''', response.body)
        # corporate shops load their shop info lazily
        shop_info_url_list = re.findall('''[^\w]*api\s*:\s*['"](.+alicdn\.com/asyn\.htm.*)['"]''', response.body)
        user_id = selector.xpath('//div[@id="J_Pine"]/@data-sellerid').extract_first()
        if shop_type == 'B':
            num_iid = selector.xpath('//div[@id="LineZing"]/@itemid').extract_first()
            sid = selector.xpath('//div[@id="LineZing"]/@shopid').extract_first()
            cid_str = selector.xpath('//div[@id="J_ZebraPriceDesc"]/@mdv-cfg').extract_first()
            m = re.findall('catId:(\d+)', cid_str)
            if not m:
                return
            cid = m[0] if m else ''
            js_data = re.findall('<\s*script[^>]*>[^<]*TShop.Setup\(([^<]*?)\)[^<]*<\s*/\s*script\s*', item_rsp_text)
            js_data = json.loads(js_data[0]) if js_data else {}
            user_id = js_data['itemDO']['userId']
            cid = js_data['itemDO']['categoryId']
            shop_info_url_list = [js_data['api']['fetchDcUrl']]
            price = js_data['detail']['defaultItemPrice']
            count_url_list = [js_data['apiBeans']]
        else:
            # C shop
            num_iid = selector.xpath('//div[@id="J_Pine"]/@data-itemid').extract_first()
            sid = selector.xpath('//div[@id="J_Pine"]/@data-shopid').extract_first()
            cid = selector.xpath('//div[@id="J_Pine"]/@data-catid').extract_first()
        detail_common_url = (
            'https://rate.taobao.com/detailCommon.htm?auctionNumId=%s&userNumId=%s' % (num_iid, user_id)
            + '&ua=098%23E1hvopvUvbpvUpCkvvvvvjiPPLFpsjDCPFSwsjthPmPh6j3CP2ShljnCPLShlj3UR4wCvvpvvUmmmphvLCCwXQvjOezOafmAdcOdYExrt8g7EcqyaNoxdB%2BaWXxrzjZcR2xVI4mxfXAK4Z7xfa3l5dUf85xr1jZ7%2B3%2BuaNLXSfpAOHmQD7zydiTtvpvIvvvvvhCvvvvvvUnvphvUivvv96CvpC29vvm2phCvhhvvvUnUphvp98yCvv9vvUvQ0%2FCUhOyCvvOWvvVvaZUCvpvVvmvvvhCv2QhvCPMMvvvtvpvhvvvvvv%3D%3D&callback=json_tbc_rate_summary')
        detail_common_rsp = RequestHandle.get_json_rsp(detail_common_url)
        detail_common_dict = ResponseTool.unpack_jsonp(detail_common_rsp.text)
        if price_url_list:
            price_url = 'https:' + price_url_list[0]
            # yield scrapy.Request(url=price_url, meta={'data': doc, 'cookiejar': 1}, callback=self.parse_price)
        count_dict = {}
        if count_url_list:
            count_url = 'https:' + count_url_list[0] + '&callback=jsonp109'
            count_rsp = RequestHandle.get_json_rsp(count_url)
            try:
                count_dict = ResponseTool.unpack_jsonp(count_rsp.text)
            except Exception as e:
                m_count = re.findall('ICCP_1_%s":(\d*)' % num_iid, count_rsp.text)
                if m_count:
                    count_dict = {'ICCP_1_%s' % num_iid: int(m_count[0])}
        # corporate shops: the shop info block is loaded asynchronously
        prom_price = None
        if 'tb-shop-info-wrap' not in response.body and title and nick and shop_info_url_list:
            shop_info_url = 'https:' + shop_info_url_list[0]
            shop_text = RequestHandle.get_rsp(shop_info_url).text.replace('\\r\\n', '').replace('\\"', '"').replace("\\'", "'")
            m = re.findall('(<div class="tb-shop".*)', shop_text)
            if m:
                page = lhtml.document_fromstring(m[0])
                # if shop_type == 'B':
                #     page = lhtml.document_fromstring(shop_text)
                #     prom_price_list = page.xpath('//p[@class="price"]/span/text()')
                #     prom_price = prom_price_list[0] if prom_price_list else None
                level_num = len(page.xpath('//div[@class="shop-rank-wrap"]/span/a/i'))
                level_type_obj = page.xpath('//div[@class="shop-rank-wrap"]/span/a')
                level_type = level_type_obj[0].get('class') if level_type_obj else level_type
        level = LevelTool.get_level(level_num, level_type)
        # print '===item==', nick, level_num, title, rate_url, price_url_list
        doc = {'nick': nick, 'level': level, 'item_title': title, 'origin_price': price}
        if response.meta.get('data'):
            doc.update(response.meta.get('data'))
        doc.update({'user_id': int(user_id), 'num_iid': int(num_iid), 'sid': int(sid), 'cid': int(cid),
                    'common_detail': detail_common_dict, 'count_detail': count_dict})
        data_obj.update(doc)
        yield data_obj

def process_request(self, request, spider):
    if self.user_agent:
        user_agent = self.get_random_ua()
        logger.info(user_agent)
        request.headers.setdefault(b'User-Agent', user_agent)
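
# get_random_ua is not shown. A minimal sketch of one plausible
# implementation; `user_agent_pool` is a hypothetical attribute (in the
# snippet above, self.user_agent may just be an on/off flag):
import random

def get_random_ua(self):
    # Hypothetical helper: pick one agent from the pool per request.
    return random.choice(self.user_agent_pool)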

def parse(self, response):
    item = ArticleItem()
    sel = scrapy.Selector(response)
    catalog = ""
    title = join(sel.xpath("//h1[@class='art_tit']/text()").extract())
    source = "vgtime"
    art_url = ""  # physical storage address
    auther = join(sel.xpath("//div[@class='editor_name']/span[1]/text()").extract())
    create_time = time.altzone  # note: time.altzone is a UTC offset, not a timestamp
    artTime = join(sel.xpath("//span[@class='time_box']/text()").extract())
    artFromUrl = response.url
    artImageUrl = ""  # download address of the preview image shown at list level
    allhtml = response.body
    # content summary flags
    isString = False
    isImage = False
    isVideo = False
    # parse the body into an ordered list of content blocks
    conList = []
    com = sel.xpath("//div[@class='topicContent front_content']/*")
    for index, content in enumerate(com):
        pSel = "//div[@class='topicContent front_content']/p[" + str(index + 1) + "]"
        # images and videos both sit under <figure>; the tag inside decides which
        imgMvSel = "//div[@class='topicContent front_content']/div[" + str(index + 1) + "]/figure"
        pCon = sel.xpath(pSel)
        ivCon = sel.xpath(imgMvSel)
        if len(pCon.extract()) > 0:  # text
            pText = pCon.xpath('string(.)').extract()
            conList.append(join(pText))
            isString = True
        if len(ivCon.extract()) > 0:  # image or video
            imgCon = ivCon.xpath("img")
            mvCon = ivCon.xpath("embed")
            mvCon_iframe = ivCon.xpath("iframe")
            if len(imgCon) > 0:  # image
                imgUrl = imgCon.xpath("@src").extract()
                imgWidth = imgCon.xpath("@style").re("[1-9]\d*")  # px
                con_type = Constant.typeImg
                imgObj = {
                    "src": join(imgUrl),
                    "width": join(imgWidth),
                    "type": str(con_type)
                }
                conList.append(imgObj)
                isImage = True
                if len(ivCon.xpath("figcaption")) > 0:  # caption below the image
                    imgDes = ivCon.xpath("figcaption/text()").extract()
                    conList.append(join(imgDes))
                    logger.info("image_!!!=" + join(imgDes))
            elif len(mvCon.extract()) > 0:  # video (embed)
                mvUrl = mvCon.xpath("@src").extract()
                mvParams = mvCon.xpath("@flashvars").extract()
                mvUrl.append(mvParams)  # concatenation of url + params
                mvWidth = mvCon.xpath("@width").extract()
                mvHeight = mvCon.xpath("@height").extract()
                con_type = Constant.typeVideo
                mvObj = {
                    "src": join(mvUrl),
                    "type": str(con_type),
                    "width": join(mvWidth),
                    "height": join(mvHeight)
                }
                conList.append(mvObj)
                isVideo = True
            elif len(mvCon_iframe.extract()) > 0:  # video (iframe)
                mvUrl = mvCon_iframe.xpath("@src").extract()
                mvWidth = "-1"
                mvHeight = "-1"
                if len(mvCon_iframe.xpath("@width").extract()):
                    mvWidth = mvCon_iframe.xpath("@width").extract()
                if len(mvCon_iframe.xpath("@height").extract()) > 0:
                    mvHeight = mvCon_iframe.xpath("@height").extract()
                con_type = Constant.typeVideo
                mvObj = {
                    "src": join(mvUrl),
                    "type": str(con_type),
                    "width": join(mvWidth),
                    "height": join(mvHeight)
                }
                conList.append(mvObj)
                isVideo = True
    # build the item to return
    item['title'] = title
    item['auther'] = auther
    item['source'] = source
    item['catalog'] = catalog
    item['artUrl'] = art_url
    item['createTime'] = create_time
    item['artTime'] = artTime
    item['artFromUrl'] = artFromUrl
    item['artImageUrl'] = artImageUrl
    item['content'] = conList
    # item['xpathTag'] = allhtml
    item['isString'] = str(isString)
    item['isImage'] = str(isImage)
    item['isVideo'] = str(isVideo)
    yield item

def process_exception(self, request, exception, spider):
    # swap in a fresh proxy and retry the failed request
    self.proxy = get_proxy()
    request.meta['proxy'] = "http://" + self.proxy
    logger.info("process_exception eval! %s" % request.meta)
    return request
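
# get_proxy is an external helper here. A hedged sketch of one common shape
# for it, assuming a local proxy-pool service that hands out "ip:port"
# strings over HTTP (the endpoint and response format are assumptions):
import requests

def get_proxy():
    # Hypothetical helper: fetch a fresh proxy from a local pool service.
    return requests.get("http://127.0.0.1:5010/get/").json().get("proxy")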