def parse_leftsecond(leftsecond_node):
    max_leftsec_num = get_max_num('leftsec')
    if max_leftsec_num is None:
        max_leftsec_num = 0
    curr_num_of_lesec = max_leftsec_num + 1
    nbanews = NbaNews()
    leftsecsel = Selector(text=str(leftsecond_node), type="html")
    leftsecimgsrcurl = leftsecsel.xpath('//img//@src').extract()[0].strip()
    leftsecfile_name = "leftsec_%s.jpg" % curr_num_of_lesec
    leftsecfile_path = os.path.join(r"D:\StefanClub\StefanClub\www\static\img\sinasports", leftsecfile_name)
    urllib.request.urlretrieve(leftsecimgsrcurl, leftsecfile_path)
    nbanews["number"] = curr_num_of_lesec
    #curr_num_of_lesec = curr_num_of_lesec + 1
    nbanews["imgsrcurl"] = "../static/img/sinasports/%s" % leftsecfile_name
    nbanews["imgurl"] = None
    nbanews["isvideo"] = None
    nbanews["title"] = None
    nbanews["titleurl"] = None
    nbanews["newstime"] = None
    nbanews["comment_url"] = None
    for j in range(1, 6):
        nbanews["tag%s" % j] = None
        nbanews["tag%surl" % j] = None
    nbanews["newstype"] = 'leftsec'
    return nbanews
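# Note: names referenced but not defined in this section (get_max_num, the item
# classes such as NbaNews and ZhihuHot, the pipeline objects duplicate_record,
# redis_deduplicate and inserttomysql, the shared requests session `s`, and the
# agentheaders/jsonheaders dicts) come from other modules of the project.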
def parse_zhihuhot_comment(response, hotid):
    max_comuserimg_num = get_max_num('zhihuhotcomments')
    if max_comuserimg_num is None:
        max_comuserimg_num = 0
    curr_num_of_comuser = max_comuserimg_num + 1
    #resultjson = json.loads(response)
    resultjson = response
    comments = resultjson['data']
    comment_item = ZhihuHotComment()
    for comment in comments:
        commentid = comment['id']
        author = comment['author']
        author_member = author['member']
        userimgsrcurl = author_member['avatar_url']
        url_token = author_member['url_token']
        userimgurl = '//www.zhihu.com/people/' + url_token
        username = author_member['name']
        replytime = comment['created_time']
        replytime = datetime.datetime.fromtimestamp(replytime)
        content = comment['content']
        infavorqty = comment['vote_count']
        replytouser = None
        replytouserurl = None
        if "reply_to_author" in comment.keys():
            reply_to_author = comment['reply_to_author']
            if reply_to_author is not None:
                reply_to_author_member = reply_to_author['member']
                replytouser = reply_to_author_member['name']
                replytouser_urltoken = reply_to_author_member['url_token']
                replytouserurl = '//www.zhihu.com/people/' + replytouser_urltoken
        file_name = "zhihuhotcomuser_%s.jpg" % curr_num_of_comuser
        file_path = os.path.join(r"D:\StefanClub\StefanClub\www\static\img\zhihu", file_name)
        '''proxies_list = [{'http': '121.193.143.249:80'}, {'http': '192.168.1.100:80'}]
        proxies = random.choice(proxies_list)
        proxy_handler = urllib.request.ProxyHandler(proxies)
        opener = urllib.request.build_opener(proxy_handler)
        data = opener.open(userimgsrcurl).read()'''
        data = s.get(userimgsrcurl, headers=agentheaders).content
        with open(file_path, "wb") as code:
            code.write(data)
        #urllib.request.urlretrieve(userimgsrcurl, file_path)
        comment_item["userimgsrcurl"] = "../static/img/zhihu/%s" % file_name
        comment_item["userimgnumber"] = curr_num_of_comuser
        curr_num_of_comuser = curr_num_of_comuser + 1
        comment_item["commentid"] = commentid
        comment_item["hotid"] = hotid
        comment_item["userimgurl"] = userimgurl
        comment_item["username"] = username
        comment_item["replytouser"] = replytouser
        comment_item["replytouserurl"] = replytouserurl
        comment_item["replytime"] = replytime
        comment_item["content"] = content
        comment_item["infavorqty"] = infavorqty
        if duplicate_record.process_item(comment_item) is not None:
            if redis_deduplicate.process_item(comment_item) is not None:
                inserttomysql.process_item(comment_item)
def parse_zhihuhot_content(response, hotid, hottype):
    zhihuhot_content = ZhihuHotContent()
    soup = BeautifulSoup(response, 'lxml')
    if hottype == 'question':
        post_node = soup.select("span[class='RichText ztext CopyrightRichText-richText']")
    else:
        post_node = soup.select("div[class='RichText ztext Post-RichText']")
    d = pq(str(post_node[0]))
    children = list(d.children())
    max_contentimg_num = get_max_num('zhihucontent')
    if max_contentimg_num is None:
        max_contentimg_num = 0
    curr_num_of_content = max_contentimg_num + 1
    for i in range(len(children)):
        part_str = None
        imgurl = None
        imgnumber = None
        videourl = None
        '''parttype = None
        if children[i].tag == 'p' or children[i].tag == 'div' or children[i].tag == 'blockquote' or children[i].tag == 'ul' or children[i].tag == 'hr':
            part_str = recursive(children[i])
            parttype = 'text'
        '''
        if children[i].tag == 'a':
            videourl, part_str = get_videourl(children[i])
            if videourl is not None:
                parttype = 'video'
            else:
                parttype = 'text'
        elif children[i].tag == 'figure':
            imgurl, imgnumber = get_img_info(children[i], curr_num_of_content, s, agentheaders)
            curr_num_of_content += 1
            parttype = 'img'
        else:
            part_str = recursive(children[i])
            parttype = 'text'
        if parttype is not None:
            zhihuhot_content['hotid'] = hotid
            zhihuhot_content['partno'] = i + 1
            zhihuhot_content['parttype'] = parttype
            zhihuhot_content['imgurl'] = imgurl
            zhihuhot_content['imgnumber'] = imgnumber
            zhihuhot_content['videourl'] = videourl
            zhihuhot_content['text'] = part_str
            if duplicate_record.process_item(zhihuhot_content) is not None:
                if redis_deduplicate.process_item(zhihuhot_content) is not None:
                    inserttomysql.process_item(zhihuhot_content)
def parse_lefttop(lefttopimg_node):
    max_lefttop_num = get_max_num('lefttop')
    if max_lefttop_num is None:
        max_lefttop_num = 0
    curr_num_of_letop = max_lefttop_num + 1
    nbanews = NbaNews()
    lefttopsel = Selector(text=str(lefttopimg_node), type="html")
    lefttoptitle = lefttopsel.xpath('//h3/text()').extract()[0].strip()
    lefttopurl = lefttopsel.xpath('//a//@href').extract()[0].strip()
    lefttopimgsrcurl = lefttopsel.xpath('//img//@src').extract()[0].strip()
    lefttopisvideo = lefttopurl[2:7]
    if lefttopisvideo == 'video':
        lefttopisvideo = 'TRUE'
    else:
        lefttopisvideo = 'FALSE'
    lefttopfile_name = "lefttop_%s.jpg" % curr_num_of_letop
    lefttopfile_path = os.path.join(r"D:\StefanClub\StefanClub\www\static\img\sinasports", lefttopfile_name)
    urllib.request.urlretrieve(lefttopimgsrcurl, lefttopfile_path)
    nbanews["number"] = curr_num_of_letop
    #curr_num_of_letop = curr_num_of_letop + 1
    nbanews["imgsrcurl"] = "../static/img/sinasports/%s" % lefttopfile_name
    nbanews["imgurl"] = lefttopurl
    nbanews["isvideo"] = lefttopisvideo
    nbanews["title"] = lefttoptitle
    nbanews["titleurl"] = None
    nbanews["newstime"] = None
    nbanews["comment_url"] = None
    for j in range(1, 6):
        nbanews["tag%s" % j] = None
        nbanews["tag%surl" % j] = None
    nbanews["newstype"] = 'lefttop'
    return nbanews
class CsdnSpider(scrapy.Spider):
    """Spider for the CSDN home page: carousel images, right-hand blocks and the article feed."""
    name = 'csdn'
    allowed_domains = ["www.csdn.net"]
    start_urls = ['http://www.csdn.net/']
    index_url = 'http://www.csdn.net/'
    more_artice_url = 'https://www.csdn.net/api/articles?type=more&category=home&shown_offset={shown_offset}'
    max_index_news_num = get_max_num('index_news')
    if max_index_news_num is None:
        max_index_news_num = 0
    curr_num_of_article = max_index_news_num + 1
    max_car_number = get_max_num('index_car', 'Carousel')
    if max_car_number is None:
        max_car_number = 0
    curr_num_of_car = max_car_number + 1
    max_car_r_number = get_max_num('index_car', 'Carousel_R')
    if max_car_r_number is None:
        max_car_r_number = 0
    curr_num_of_car_r = max_car_r_number + 1
    max_right_number = get_max_num('index_car', 'Right')
    if max_right_number is None:
        max_right_number = 0
    curr_num_of_right = max_right_number + 1

    def start_requests(self):
        yield Request(self.index_url, callback=self.parse_index)
        for i in range(1, 5):
            yield Request(self.more_artice_url.format(shown_offset=21 + (i - 1) * 10), callback=self.parse_more_index_art)

    def parse_index(self, response):
        carousel_item = IndexCarouselItem()
        index_news_item = IndexNews()
        soup = BeautifulSoup(response.text, 'lxml')
        post_nodes = soup.select(".carousel-inner .csdn-tracking-statistics")
        post_nodes1 = soup.select(".carousel-right .carousel-right-u")
        post_nodes2 = soup.select(".company_list li")
        post_nodes3 = soup.select(".feedlist_mod li[class='clearfix'] div[class='list_con']")
        for post_node in post_nodes:
            sel = Selector(text=str(post_node), type="html")
            title = sel.xpath('//div[@class="carousel-caption"]/text()').extract()[0].strip()
            url = sel.xpath('//a//@href').extract()[0].strip()
            img_url = sel.xpath('//img//@src').extract()[0].strip()
            file_name = "carousel_%s.jpg" % self.curr_num_of_car
            file_path = os.path.join(r"D:\StefanClub\StefanClub\www\static\img\csdn", file_name)
            urllib.request.urlretrieve(img_url, file_path)
            carousel_item["number"] = self.curr_num_of_car
            self.curr_num_of_car = self.curr_num_of_car + 1
            carousel_item["title"] = title
            carousel_item["url"] = url
            #carousel_item["img_url"] = img_url
            carousel_item["img_url"] = "../static/img/csdn/%s" % file_name
            carousel_item["item_class"] = "Carousel"
            yield carousel_item
        for post_node1 in post_nodes1:
            sel1 = Selector(text=str(post_node1), type="html")
            title1 = sel1.xpath('//p[@class="carousel-right-caption"]/span/text()').extract()[0].strip()
            url1 = sel1.xpath('//a//@href').extract()[0].strip()
            img_url1 = sel1.xpath('//img//@src').extract()[0].strip()
            file_name1 = "carousel_right_%s.jpg" % self.curr_num_of_car_r
            file_path1 = os.path.join(r"D:\StefanClub\StefanClub\www\static\img\csdn", file_name1)
            urllib.request.urlretrieve(img_url1, file_path1)
            carousel_item["number"] = self.curr_num_of_car_r
            self.curr_num_of_car_r = self.curr_num_of_car_r + 1
            carousel_item["title"] = title1
            carousel_item["url"] = url1
            #carousel_item["img_url"] = img_url
            carousel_item["img_url"] = "../static/img/csdn/%s" % file_name1
            carousel_item["item_class"] = "Carousel_R"
            yield carousel_item
        for post_node2 in post_nodes2:
            sel2 = Selector(text=str(post_node2), type="html")
            title2 = sel2.xpath('//h3/a/text()').extract()[0].strip()
            url2 = sel2.xpath('//h3/a//@href').extract()[0].strip()
            img_url2 = sel2.xpath('//img//@src').extract()[0].strip()
            file_name2 = "right_%s.jpg" % self.curr_num_of_right
            file_path2 = os.path.join(r"D:\StefanClub\StefanClub\www\static\img\csdn", file_name2)
            urllib.request.urlretrieve(img_url2, file_path2)
            carousel_item["number"] = self.curr_num_of_right
            self.curr_num_of_right = self.curr_num_of_right + 1
            carousel_item["title"] = title2
            carousel_item["url"] = url2
            # carousel_item["img_url"] = img_url
            carousel_item["img_url"] = "../static/img/csdn/%s" % file_name2
            carousel_item["item_class"] = "Right"
            yield carousel_item
        for post_node3 in post_nodes3:
            sel3 = Selector(text=str(post_node3), type="html")
            index_news_item["close_target_id"] = "myModal_%s" % self.curr_num_of_article
            index_news_item["close_target_id_ref"] = "#myModal_%s" % self.curr_num_of_article
            title3 = sel3.xpath('//div[@class="title"]/h2/a/text()').extract()[0].strip()
            url3 = sel3.xpath('//div[@class="title"]/h2/a//@href').extract()[0].strip()
            news_summary = sel3.xpath('//div[@class="summary oneline"]/text()').extract()[0].strip()
            user_url = sel3.xpath('//dt/a//@href').extract()[0].strip()
            user_img_url = sel3.xpath('//dt/a/img//@src').extract()[0].strip()
            user_name = sel3.xpath('//dd[@class="name"]/a/text()').extract()[0].strip()
            news_date = sel3.xpath('//dd[@class="time"]/text()').extract()[0].strip()
            label_url = ''
            news_label = ''
            news_reads = '0'
            news_comments = '0'
            label_list = sel3.xpath('//dd[@class="tag"]/a//@href')
            if len(label_list) > 0:
                label_url = sel3.xpath('//dd[@class="tag"]/a//@href').extract()[0].strip()
            label_list2 = sel3.xpath('//dd[@class="tag"]/a/text()')
            if len(label_list2) > 0:
                news_label = sel3.xpath('//dd[@class="tag"]/a/text()').extract()[0].strip()
            reads_num_list = sel3.xpath('//dd[@class="read_num"]/a/span[@class="num"]/text()')
            if len(reads_num_list) > 0:
                news_reads = sel3.xpath('//dd[@class="read_num"]/a/span[@class="num"]/text()').extract()[0].strip()
            comment_url = sel3.xpath('//dd[@class="common_num "]/a//@href').extract()[0].strip()
            comment_num_list = sel3.xpath('//dd[@class="common_num "]/a/span[@class="num"]/text()')
            if len(comment_num_list) > 0:
                news_comments = sel3.xpath('//dd[@class="common_num "]/a/span[@class="num"]/text()').extract()[0].strip()
            file_name3 = "userimg_%s.jpg" % self.curr_num_of_article
            index_news_item["number"] = self.curr_num_of_article
            self.curr_num_of_article = self.curr_num_of_article + 1
            file_path3 = os.path.join(r"D:\StefanClub\StefanClub\www\static\img\csdn", file_name3)
            urllib.request.urlretrieve(user_img_url, file_path3)
            index_news_item["title"] = title3
            index_news_item["url"] = url3
            index_news_item["news_summary"] = news_summary
            index_news_item["user_img_url"] = "../static/img/csdn/%s" % file_name3
            index_news_item["user_name"] = user_name
            index_news_item["user_url"] = user_url
            index_news_item["news_date"] = news_date
            index_news_item["label_url"] = label_url
            index_news_item["news_label"] = news_label
            index_news_item["news_reads"] = int(news_reads)
            index_news_item["comment_url"] = comment_url
            index_news_item["news_comments"] = int(news_comments)
            yield index_news_item

    def parse_more_index_art(self, response):
        """Parse the JSON feed returned by the 'more articles' API."""
        resultjson = json.loads(response.body)
        articles = resultjson['articles']
        index_news_item = IndexNews()
        for article in articles:
            title = article['title']
            url = article['url']
            news_summary = article['summary']
            user_img_url = article['avatar']
            close_target_id = "myModal_%s" % self.curr_num_of_article
            close_target_id_ref = "#myModal_%s" % self.curr_num_of_article
            file_name = "userimg_%s.jpg" % self.curr_num_of_article
            index_news_item["number"] = self.curr_num_of_article
            self.curr_num_of_article = self.curr_num_of_article + 1
            file_path = os.path.join(r"D:\StefanClub\StefanClub\www\static\img\csdn", file_name)
            urllib.request.urlretrieve(user_img_url, file_path)
            user_name = article['user_name']
            user_url = article['user_url']
            news_date = article['created_at']
            news_label = article['category']
            label_url = "/nav/%s" % article['category_id']
            news_reads = article['views']
            news_comments = article['comments']
            comment_url = "%s#comment_form" % article['url']
            index_news_item["close_target_id"] = close_target_id
            index_news_item["close_target_id_ref"] = close_target_id_ref
            index_news_item["title"] = title
            index_news_item["url"] = url
            index_news_item["news_summary"] = news_summary
            index_news_item["user_img_url"] = "../static/img/csdn/%s" % file_name
            index_news_item["user_name"] = user_name
            index_news_item["user_url"] = user_url
            index_news_item["news_date"] = news_date
            index_news_item["label_url"] = label_url
            index_news_item["news_label"] = news_label
            index_news_item["news_reads"] = news_reads
            index_news_item["comment_url"] = comment_url
            index_news_item["news_comments"] = news_comments
            yield index_news_item
# Standalone (requests-session based) counterpart of ZhihuSpider.parse_main below:
# it calls parse_zhihuhot_comment / parse_zhihuhot_content directly instead of
# yielding Scrapy requests.
def parse_main(response):
    max_userimg_num = get_max_num('zhihuhotuser')
    if max_userimg_num is None:
        max_userimg_num = 0
    curr_num_of_usim = max_userimg_num + 1
    max_newsimg_num = get_max_num('zhihuhotnews')
    if max_newsimg_num is None:
        max_newsimg_num = 0
    curr_num_of_neim = max_newsimg_num + 1
    zhihuhot = ZhihuHot()
    soup = BeautifulSoup(response, 'lxml')
    post_nodes = soup.select("div[class='Card TopstoryItem TopstoryItem-isRecommend']")
    for post_node in post_nodes:
        sel = Selector(text=str(post_node), type="html")
        '''feedsourceurl = sel.xpath('//a[@class="TopicLink"]//@href').extract()[0].strip()
        feedsourcetags = sel.xpath('//div[@aria-haspopup="true"]/text()')
        if len(feedsourcetags) > 0:
            feedsourcetag = sel.xpath('//div[@aria-haspopup="true"]/text()').extract()[0].strip()
        else:
            feedsourcetag = None
        userimgsrcurl = sel.xpath('//img[@class="Avatar AuthorInfo-avatar"]//@src').extract()[0].strip()
        userimgurls = sel.xpath('//a[@class="UserLink-link"]//@href')
        if len(userimgurls) > 0:
            userimgurl = sel.xpath('//a[@class="UserLink-link"]//@href').extract()[0].strip()
        else:
            userimgurl = None
        usernames1 = sel.xpath('//a[@class="UserLink-link"]/text()')
        usernames2 = sel.xpath('//span[@class="UserLink AuthorInfo-name"]/text()')
        if len(usernames1) > 0:
            username = sel.xpath('//a[@class="UserLink-link"]/text()').extract()[0].strip()
        elif len(usernames2) > 0:
            username = sel.xpath('//span[@class="UserLink AuthorInfo-name"]/text()').extract()[0].strip()
        else:
            username = None
        userinfolist = sel.xpath('//div[@class="AuthorInfo-detail"]/div/div/text()')
        if len(userinfolist) > 0:
            userinfo = sel.xpath('//div[@class="AuthorInfo-detail"]/div/div/text()').extract()[0].strip()
        else:
            userinfo = None'''  # zhihu has removed the feedsource and authorinfo
        newsimg = sel.xpath('//div[@class="RichContent-cover-inner"]/img//@src')
        newsimg2 = sel.xpath('//div[@class="RichContent-cover-inner"]/div//@data-src')
        if len(newsimg) > 0:
            newsimgsrcurl = sel.xpath('//div[@class="RichContent-cover-inner"]/img//@src').extract()[0].strip()
        elif len(newsimg2) > 0:
            newsimgsrcurl = sel.xpath('//div[@class="RichContent-cover-inner"]/div//@data-src').extract()[0].strip()
        else:
            newsimgsrcurl = None
        if newsimgsrcurl is None:
            zhihuhot["newsimgsrcurl"] = None
            zhihuhot["newsimgnumber"] = None
        else:
            file_name1 = "zhihuhotnews_%s.jpg" % curr_num_of_neim
            file_path1 = os.path.join(r"D:\StefanClub\StefanClub\www\static\img\zhihu", file_name1)
            urllib.request.urlretrieve(newsimgsrcurl, file_path1)
            zhihuhot["newsimgsrcurl"] = "../static/img/zhihu/%s" % file_name1
            zhihuhot["newsimgnumber"] = curr_num_of_neim
            curr_num_of_neim = curr_num_of_neim + 1
        hasvideo = sel.xpath('//div[@class="RichContent-cover-play"]')
        if len(hasvideo) > 0:
            isvideo = 'TRUE'
        else:
            isvideo = 'FALSE'
        title1 = sel.xpath('//h2[@class="ContentItem-title"]/div/a')
        title2 = sel.xpath('//h2[@class="ContentItem-title"]/a')
        if len(title1) > 0:
            title = sel.xpath('//h2[@class="ContentItem-title"]/div/a/text()').extract()[0].strip()
            titleurl = sel.xpath('//h2[@class="ContentItem-title"]/div/a//@href').extract()[0].strip()
        elif len(title2) > 0:
            title = sel.xpath('//h2[@class="ContentItem-title"]/a/text()').extract()[0].strip()
            titleurl = sel.xpath('//h2[@class="ContentItem-title"]/a//@href').extract()[0].strip()
        else:
            title = 'Empty title,It will be dropped by redis control except the first one'
            titleurl = None
        hottype = None
        if titleurl is not None:
            if titleurl[1:9] == 'question':
                titleurl = '//www.zhihu.com' + titleurl
                hottype = 'question'
            if titleurl[2:10] == 'zhuanlan':
                hottype = 'zhuanlan'
        hotid = None
        if titleurl is not None:
            hotid = get_zhihu_hotid(titleurl)
        newscontent = sel.xpath('//span[@class="RichText ztext CopyrightRichText-richText"]/text()').extract()[0].strip()
        infavorqty1 = sel.xpath('//button[@class="Button VoteButton VoteButton--up"]/text()').extract()[0].strip()
        infavorqty2 = ''
        infavorqty2list = sel.xpath('//button[@class="Button VoteButton VoteButton--up"]/text()')
        if len(infavorqty2list) > 1:
            infavorqty2 = sel.xpath('//button[@class="Button VoteButton VoteButton--up"]/text()').extract()[1].strip()
        infavorqty = infavorqty1 + infavorqty2
        infavorqty_list = list(infavorqty)
        infavorqty_list.insert(2, " ")
        infavorqty = "".join(infavorqty_list)
        comment_title = sel.xpath('//button[@class="Button ContentItem-action Button--plain Button--withIcon Button--withLabel"]/text()').extract()[0].strip()
        comment_qty = get_comment_qty(comment_title)
        comment_page = comment_qty // 20 + (1 if comment_qty % 20 > 0 else 0)
        '''file_name = "zhihuhotuser_%s.jpg" % curr_num_of_usim
        file_path = os.path.join("D:\StefanClub\StefanClub\www\static\img\zhihu", file_name)
        urllib.request.urlretrieve(userimgsrcurl, file_path)
        zhihuhot["userimgsrcurl"] = "../static/img/zhihu/%s" % file_name
        zhihuhot["userimgnumber"] = curr_num_of_usim
        curr_num_of_usim = curr_num_of_usim + 1'''  # zhihu has removed the feedsource and authorinfo
        zhihuhot["userimgsrcurl"] = None  # zhihu has removed the feedsource and authorinfo
        zhihuhot["userimgnumber"] = None  # zhihu has removed the feedsource and authorinfo
        zhihuhot["feedsourcetag"] = None  # feedsourcetag
        zhihuhot["feedsourceurl"] = None  # feedsourceurl
        zhihuhot["userimgurl"] = None  # userimgurl
        zhihuhot["username"] = None  # username
        zhihuhot["userinfo"] = None  # userinfo
        zhihuhot["newsimgurl"] = None
        zhihuhot["isvideo"] = isvideo
        zhihuhot["title"] = title
        zhihuhot["titleurl"] = titleurl
        zhihuhot["hotid"] = hotid
        zhihuhot["newscontent"] = newscontent
        zhihuhot["infavorqty"] = infavorqty
        zhihuhot["comment_url"] = None
        zhihuhot["comment_title"] = comment_title
        zhihuhot["share_url"] = None
        if hotid is not None:
            if duplicate_record.process_item(zhihuhot) is not None:
                if redis_deduplicate.process_item(zhihuhot) is not None:
                    if inserttomysql.process_item(zhihuhot) == 'toinsert':
                        if hottype == 'question':
                            for i in range(comment_page):
                                # html = s.get(answer_comment_url.format(hotid=hotid, offset=i * 20), headers=jsonheaders).text
                                html = s.get(answer_comment_url.format(hotid=hotid, offset=i * 20), headers=jsonheaders)
                                #result = json.dumps(html.json(), ensure_ascii=False)
                                result = html.json()
                                # parse_zhihuhot_comment(html,hotid)
                                parse_zhihuhot_comment(result, hotid)
                        elif hottype == 'zhuanlan':
                            for i in range(comment_page):
                                html = s.get(zhuanlan_comment_url.format(hotid=hotid, offset=i * 20), headers=jsonheaders)
                                result = html.json()
                                parse_zhihuhot_comment(result, hotid)
                        contenturl = 'https:' + titleurl
                        mainhtml = s.get(contenturl).text
                        parse_zhihuhot_content(mainhtml, hotid, hottype)
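# get_zhihu_hotid() is defined elsewhere in the project and is not shown in this
# section. The sketch below is illustrative only (an assumption, not the project's
# actual implementation): it pulls the last numeric path segment out of a title
# URL, e.g. '//www.zhihu.com/question/123/answer/456' -> '456' and
# '//zhuanlan.zhihu.com/p/789' -> '789'.
def get_zhihu_hotid_sketch(titleurl):
    import re
    digits = re.findall(r'\d+', titleurl)
    return digits[-1] if digits else None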
class ZhihuSpider(scrapy.Spider):
    """Spider for the Zhihu hot feed: logs in with Selenium, then crawls feed items, their comments and their content."""
    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['https://www.zhihu.com']
    zhuanlan_comment_url = 'https://www.zhihu.com/api/v4/articles/{hotid}/comments?include=data%5B*%5D.author%2Ccollapsed%2Creply_to_author%2Cdisliked%2Ccontent%2Cvoting%2Cvote_count%2Cis_parent_author%2Cis_author%2Calgorithm_right&order=normal&limit=20&offset={offset}&status=open'
    answer_comment_url = 'https://www.zhihu.com/api/v4/answers/{hotid}/comments?include=data%5B*%5D.author%2Ccollapsed%2Creply_to_author%2Cdisliked%2Ccontent%2Cvoting%2Cvote_count%2Cis_parent_author%2Cis_author%2Calgorithm_right&order=normal&limit=20&offset={offset}&status=open'
    # headers = {'User-Agent':"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",}
    headers = {}
    max_userimg_num = get_max_num('zhihuhotuser')
    if max_userimg_num is None:
        max_userimg_num = 0
    curr_num_of_usim = max_userimg_num + 1
    max_newsimg_num = get_max_num('zhihuhotnews')
    if max_newsimg_num is None:
        max_newsimg_num = 0
    curr_num_of_neim = max_newsimg_num + 1
    max_comuserimg_num = get_max_num('zhihuhotcomments')
    if max_comuserimg_num is None:
        max_comuserimg_num = 0
    curr_num_of_comuser = max_comuserimg_num + 1

    def start_requests(self):
        #display = Display(visible=0, size=(800, 600))
        #display.start()
        s = requests.Session()
        s.headers.clear()
        browser = webdriver.Chrome()
        '''browser2 = webdriver.Firefox()
        browser2.get("https://www.zhihu.com/signin")
        browser2.find_element_by_css_selector(".SignFlow-accountInput.Input-wrapper input").send_keys("13818248346")
        browser2.find_element_by_css_selector(".SignFlow-password input").send_keys("kaihua1010")
        browser2.find_element_by_css_selector(".Button.SignFlow-submitButton").click()
        time.sleep(5)
        Cookies2 = browser2.get_cookies()'''
        browser.get("https://www.zhihu.com/signin")
        browser.find_element_by_css_selector(".SignFlow-accountInput.Input-wrapper input").send_keys("13818248346")
        browser.find_element_by_css_selector(".SignFlow-password input").send_keys("kaihua1010")
        browser.find_element_by_css_selector(".Button.SignFlow-submitButton").click()
        time.sleep(10)
        Cookies = browser.get_cookies()
        cookie_dict = {}
        '''cookie = [item["name"] + "=" + item["value"] for item in Cookies]
        cookiestr = '; '.join(item for item in cookie)
        self.headers['cookie'] = cookiestr'''
        for cookie in Cookies:
            cookie_dict[cookie['name']] = cookie['value']
            s.cookies.set(cookie['name'], cookie['value'])
        browser.close()
        html2 = s.get('https://www.zhihu.com/question/263892920/answer/405697336').text
        print(html2)
        html3 = s.get(self.start_urls[0]).text
        print(html3)
        self.headers = s.headers
        #display.stop()
        '''cookie_dict = {}
        cookie_dict['_xsrf'] = 'a1807a2e-e8da-4464-bcc0-b11be259f42b'
        cookie_dict['_zap'] = '1e5ccdb9-7860-466f-9a59-ff9cb19e072d'
        cookie_dict['d_c0'] = 'ALAnFykIXg6PTtXBbHxOihH1-UmQKy8guOQ=|1539608541'
        cookie_dict['capsion_ticket'] = '2|1:0|10:1539790364|14:capsion_ticket|44:NWIxZTRlYjUxOTg3NGI5MjgwODBhZjYwNmEwNTFhYTI=|4c1810d7fdf17461da2dc94a82756002cab3899386af9fef230ec3efd8f410f9'
        cookie_dict['z_c0'] = '2|1:0|10:1539790372|4:z_c0|92:Mi4xV2ZvX0F3QUFBQUFBc0NjWEtRaGVEaVlBQUFCZ0FsVk5KS1MwWEFEXy0wZTFkX1I3SjhwTlRTSnUxSDRQajhfcHdR|a100b4ed646d6454be56a0be8f3257fddb1ead83eabe52056221759123a6d005'
        cookie_dict['q_c1'] = '1d154dfb0c3c49b5afcdf6c78fc70148|1539608544000|1539608544000'
        cookie_dict['tgw_l7_route'] = '170010e948f1b2a2d4c7f3737c85e98c'
        cookie_dict['__utma'] = '51854390.302433648.1539609652.1539609652.1539609652.1'
        cookie_dict['__utmc'] = '51854390'
        cookie_dict['__utmz'] = '51854390.1539609652.1.1.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/hot'
        cookie_dict['__utmv'] = '51854390.100--|2=registration_date=20160715=1^3=entry_date=20160715=1'
        cookie_dict['tst'] = 'r'
        cookie_dict['__gads'] = 'ID=fa1c97a341ad2943:T=1539696957:S=ALNI_MZJ7ws-b5ObURSAQlBAGi8pbTmD6g'
        '''
        yield Request(url=self.start_urls[0], dont_filter=True, meta={"cookies": cookie_dict}, cookies=cookie_dict, callback=self.parse_main)
        #yield Request(url=self.start_urls[0], dont_filter=True, meta={"cookies": cookie_dict}, cookies=cookie_dict,callback=self.parse_main, )
        #yield Request(url=self.start_urls[0], dont_filter=True, headers=self.headers, callback=self.parse_main)

    def parse_main(self, response):
        zhihuhot = ZhihuHot()
        cookie_dict = response.meta.get("cookies", "")
        soup = BeautifulSoup(response.text, 'lxml')
        post_nodes = soup.select("div[class='Card TopstoryItem']")
        for post_node in post_nodes:
            sel = Selector(text=str(post_node), type="html")
            feedsourceurl = sel.xpath('//a[@class="TopicLink"]//@href').extract()[0].strip()
            feedsourcetags = sel.xpath('//div[@aria-haspopup="true"]/text()')
            if len(feedsourcetags) > 0:
                feedsourcetag = sel.xpath('//div[@aria-haspopup="true"]/text()').extract()[0].strip()
            else:
                feedsourcetag = None
            userimgsrcurl = sel.xpath('//img[@class="Avatar AuthorInfo-avatar"]//@src').extract()[0].strip()
            userimgurls = sel.xpath('//a[@class="UserLink-link"]//@href')
            if len(userimgurls) > 0:
                userimgurl = sel.xpath('//a[@class="UserLink-link"]//@href').extract()[0].strip()
            else:
                userimgurl = None
            usernames1 = sel.xpath('//a[@class="UserLink-link"]/text()')
            usernames2 = sel.xpath('//span[@class="UserLink AuthorInfo-name"]/text()')
            if len(usernames1) > 0:
                username = sel.xpath('//a[@class="UserLink-link"]/text()').extract()[0].strip()
            elif len(usernames2) > 0:
                username = sel.xpath('//span[@class="UserLink AuthorInfo-name"]/text()').extract()[0].strip()
            else:
                username = None
            userinfolist = sel.xpath('//div[@class="AuthorInfo-detail"]/div/div/text()')
            if len(userinfolist) > 0:
                userinfo = sel.xpath('//div[@class="AuthorInfo-detail"]/div/div/text()').extract()[0].strip()
            else:
                userinfo = None
            newsimg = sel.xpath('//div[@class="RichContent-cover-inner"]/img//@src')
            newsimg2 = sel.xpath('//div[@class="RichContent-cover-inner"]/div//@data-src')
            if len(newsimg) > 0:
                newsimgsrcurl = sel.xpath('//div[@class="RichContent-cover-inner"]/img//@src').extract()[0].strip()
            elif len(newsimg2) > 0:
                newsimgsrcurl = sel.xpath('//div[@class="RichContent-cover-inner"]/div//@data-src').extract()[0].strip()
            else:
                newsimgsrcurl = None
            if newsimgsrcurl is None:
                zhihuhot["newsimgsrcurl"] = None
                zhihuhot["newsimgnumber"] = None
            else:
                file_name1 = "zhihuhotnews_%s.jpg" % self.curr_num_of_neim
                file_path1 = os.path.join(r"D:\StefanClub\StefanClub\www\static\img\zhihu", file_name1)
                urllib.request.urlretrieve(newsimgsrcurl, file_path1)
                zhihuhot["newsimgsrcurl"] = "../static/img/zhihu/%s" % file_name1
                zhihuhot["newsimgnumber"] = self.curr_num_of_neim
                self.curr_num_of_neim = self.curr_num_of_neim + 1
            hasvideo = sel.xpath('//div[@class="RichContent-cover-play"]')
            if len(hasvideo) > 0:
                isvideo = 'TRUE'
            else:
                isvideo = 'FALSE'
            title1 = sel.xpath('//h2[@class="ContentItem-title"]/div/a')
            title2 = sel.xpath('//h2[@class="ContentItem-title"]/a')
            if len(title1) > 0:
                title = sel.xpath('//h2[@class="ContentItem-title"]/div/a/text()').extract()[0].strip()
                titleurl = sel.xpath('//h2[@class="ContentItem-title"]/div/a//@href').extract()[0].strip()
            elif len(title2) > 0:
                title = sel.xpath('//h2[@class="ContentItem-title"]/a/text()').extract()[0].strip()
                titleurl = sel.xpath('//h2[@class="ContentItem-title"]/a//@href').extract()[0].strip()
            else:
                title = 'Empty title,It will be dropped by redis control except the first one'
                titleurl = None
            hottype = None
            if titleurl is not None:
                if titleurl[1:9] == 'question':
                    titleurl = '//www.zhihu.com' + titleurl
                    hottype = 'question'
                if titleurl[2:10] == 'zhuanlan':
                    hottype = 'zhuanlan'
            hotid = None
            if titleurl is not None:
                hotid = get_zhihu_hotid(titleurl)
            newscontent = sel.xpath('//span[@class="RichText ztext CopyrightRichText-richText"]/text()').extract()[0].strip()
            infavorqty1 = sel.xpath('//button[@class="Button VoteButton VoteButton--up"]/text()').extract()[0].strip()
            infavorqty2 = sel.xpath('//button[@class="Button VoteButton VoteButton--up"]/text()').extract()[1].strip()
            infavorqty = infavorqty1 + infavorqty2
            infavorqty_list = list(infavorqty)
            infavorqty_list.insert(2, " ")
            infavorqty = "".join(infavorqty_list)
            comment_title = sel.xpath('//button[@class="Button ContentItem-action Button--plain Button--withIcon Button--withLabel"]/text()').extract()[0].strip()
            comment_qty = get_comment_qty(comment_title)
            comment_page = comment_qty // 20 + (1 if comment_qty % 20 > 0 else 0)
            file_name = "zhihuhotuser_%s.jpg" % self.curr_num_of_usim
            file_path = os.path.join(r"D:\StefanClub\StefanClub\www\static\img\zhihu", file_name)
            urllib.request.urlretrieve(userimgsrcurl, file_path)
            zhihuhot["userimgsrcurl"] = "../static/img/zhihu/%s" % file_name
            zhihuhot["userimgnumber"] = self.curr_num_of_usim
            self.curr_num_of_usim = self.curr_num_of_usim + 1
            zhihuhot["feedsourcetag"] = feedsourcetag
            zhihuhot["feedsourceurl"] = feedsourceurl
            zhihuhot["userimgurl"] = userimgurl
            zhihuhot["username"] = username
            zhihuhot["userinfo"] = userinfo
            zhihuhot["newsimgurl"] = None
            zhihuhot["isvideo"] = isvideo
            zhihuhot["title"] = title
            zhihuhot["titleurl"] = titleurl
            zhihuhot["hotid"] = hotid
            zhihuhot["newscontent"] = newscontent
            zhihuhot["infavorqty"] = infavorqty
            zhihuhot["comment_url"] = None
            zhihuhot["comment_title"] = comment_title
            zhihuhot["share_url"] = None
            if hotid is not None:
                yield zhihuhot
                if hottype == 'question':
                    for i in range(comment_page):
                        yield Request(url=self.answer_comment_url.format(hotid=hotid, offset=i * 20), meta={"hotid": hotid}, cookies=cookie_dict, callback=self.parse_zhihuhot_comment)
                elif hottype == 'zhuanlan':
                    for i in range(comment_page):
                        yield Request(url=self.zhuanlan_comment_url.format(hotid=hotid, offset=i * 20), meta={"hotid": hotid}, cookies=cookie_dict, callback=self.parse_zhihuhot_comment)
                yield Request(url=titleurl, meta={"hotid": hotid, "hottype": hottype}, cookies=cookie_dict, callback=self.parse_zhihuhot_content)

    def parse_zhihuhot_comment(self, response):
        hotid = response.meta.get("hotid", "")
        resultjson = json.loads(response.body)
        comments = resultjson['data']
        comment_item = ZhihuHotComment()
        for comment in comments:
            commentid = comment['id']
            author = comment['author']
            author_member = author['member']
            userimgsrcurl = author_member['avatar_url']
            url_token = author_member['url_token']
            userimgurl = '//www.zhihu.com/people/' + url_token
            username = author_member['name']
            replytime = comment['created_time']
            replytime = datetime.datetime.fromtimestamp(replytime)
            content = comment['content']
            infavorqty = comment['vote_count']
            replytouser = None
            replytouserurl = None
            if "reply_to_author" in comment.keys():
                reply_to_author = comment['reply_to_author']
                if reply_to_author is not None:
                    reply_to_author_member = reply_to_author['member']
                    replytouser = reply_to_author_member['name']
                    replytouser_urltoken = reply_to_author_member['url_token']
                    replytouserurl = '//www.zhihu.com/people/' + replytouser_urltoken
            file_name = "zhihuhotcomuser_%s.jpg" % self.curr_num_of_comuser
            file_path = os.path.join(r"D:\StefanClub\StefanClub\www\static\img\zhihu", file_name)
            urllib.request.urlretrieve(userimgsrcurl, file_path)
            comment_item["userimgsrcurl"] = "../static/img/zhihu/%s" % file_name
            comment_item["userimgnumber"] = self.curr_num_of_comuser
            self.curr_num_of_comuser = self.curr_num_of_comuser + 1
            comment_item["commentid"] = commentid
            comment_item["hotid"] = hotid
            comment_item["userimgurl"] = userimgurl
            comment_item["username"] = username
            comment_item["replytouser"] = replytouser
            comment_item["replytouserurl"] = replytouserurl
            comment_item["replytime"] = replytime
            comment_item["content"] = content
            comment_item["infavorqty"] = infavorqty
            yield comment_item

    def parse_zhihuhot_content(self, response):
        hotid = response.meta.get("hotid", "")
        partno = 1
        hottype = response.meta.get("hottype", "")
        zhihuhot_content = ZhihuHotContent()
        soup = BeautifulSoup(response.text, 'lxml')
        if hottype == 'question':
            post_node = soup.select("span[class='CopyrightRichText-richText']")
        else:
            post_node = soup.select("div[class='Post-RichText']")
        sel = Selector(text=str(post_node), type="html")
        test = sel.xpath('*')
class TaobaoSpider(scrapy.Spider):
    """Spider that searches Taobao for each keyword via Selenium and pages through further results with the search API."""
    name = 'taobao'
    allowed_domains = ['www.taobao.com']
    start_urls = ['http://www.taobao.com/']
    keywords = ['UNIQLO', 'SUPERME', 'NIKE', 'ADIDAS', 'APPLE', 'HUAWEI']
    #keywords = ['UNIQLO', 'SUPERME', 'NIKE', 'ADIDAS']
    last_twelve_url = 'https://s.taobao.com/api?_ksTS=1537784279315_208&callback=jsonp209&ajax=true&m=customized&stats_click=search_radio_all:1&q={keyword}&s=36&imgfile=&initiative_id=staobaoz_20180924&bcoffset=0&js=1&ie=utf8&rn=91a38a1dc028b177e8b2f5d17a1f1e05'
    next_page_url = 'https://s.taobao.com/search?data-key=s&data-value={datavalue}&ajax=true&_ksTS=1537791664734_887&callback=jsonp888&q={keyword}&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20180920&ie=utf8&bcoffset=4&p4ppushleft=1%2C48'
    max_img_num = get_max_num('taobaoproduct')
    if max_img_num is None:
        max_img_num = 0
    curr_num_of_img = max_img_num + 1

    def start_requests(self):
        yield scrapy.Request('http://www.taobao.com/', callback=self.parse)

    def parse(self, response):
        for keyword in self.keywords:
            browser = webdriver.Chrome()
            browser.get('https://www.taobao.com/')
            browser.find_element_by_class_name("search-combobox-input").send_keys(keyword)
            browser.find_element_by_class_name("btn-search").click()
            time.sleep(5)
            while browser.page_source.find('g_page_config') == -1:
                browser.refresh()
            page_source = browser.page_source
            browser.close()
            #yield self.parse_first_batch(page_source,keyword)
            g_page_config = page_source[page_source.index('g_page_config = {') + 15:page_source.index('g_srp_loadCss()')].strip()
            g_page_config_json = json.loads(g_page_config[:-1])
            modsfirst = g_page_config_json["mods"]
            itemlist_first = modsfirst["itemlist"]
            data_first = itemlist_first["data"]
            auctions_first = data_first["auctions"]
            for auction_first in auctions_first:
                taobaoproduct = parse_taobao_products(auction_first, keyword, self.curr_num_of_img)
                self.curr_num_of_img = self.curr_num_of_img + 1
                yield taobaoproduct
            last_twelve_response = requests.get(self.last_twelve_url.format(keyword=keyword))
            while last_twelve_response.text == '':
                last_twelve_response = requests.get(self.last_twelve_url.format(keyword=keyword))
            #yield self.parse_last_twelve(last_twelve_response.text,keyword)
            dict_response_last = last_twelve_response.text[11:]
            dict_response_last = dict_response_last[:-2]
            json_response_last = json.loads(dict_response_last)
            customizedapi = json_response_last['API.CustomizedApi']
            itemlist_last = customizedapi['itemlist']
            auctions_last = itemlist_last['auctions']
            for auction_last in auctions_last:
                taobaoproduct = parse_taobao_products(auction_last, keyword, self.curr_num_of_img)
                self.curr_num_of_img = self.curr_num_of_img + 1
                yield taobaoproduct
            # get next page by click the nextlink
            # browser.find_elements_by_partial_link_text('下一页')[0].click()
            # next_page_source = browser.page_source
            for i in range(30):
                nextpage_response = requests.get(self.next_page_url.format(datavalue=44 * (i + 1), keyword=keyword))
                while nextpage_response.text == '':
                    nextpage_response = requests.get(self.next_page_url.format(datavalue=44 * (i + 1), keyword=keyword))
                #yield self.parse_next_page(nextpage_response.text,keyword)
                dict_response_next = nextpage_response.text[11:]
                dict_response_next = dict_response_next[:-2]
                json_response_next = json.loads(dict_response_next)
                modsnext = json_response_next['mods']
                itemlist_next = modsnext['itemlist']
                data_next = itemlist_next['data']
                auctions_next = data_next['auctions']
                for auction_next in auctions_next:
                    taobaoproduct = parse_taobao_products(auction_next, keyword, self.curr_num_of_img)
                    self.curr_num_of_img = self.curr_num_of_img + 1
                    yield taobaoproduct

    def parse_first_batch(self, page_source, keyword):
        g_page_config = page_source[page_source.index('g_page_config = {') + 15:page_source.index('g_srp_loadCss()')].strip()
        g_page_config_json = json.loads(g_page_config[:-1])
        mods = g_page_config_json["mods"]
        itemlist = mods["itemlist"]
        data = itemlist["data"]
        auctions = data["auctions"]
        for auction in auctions:
            taobaoproduct = parse_taobao_products(auction, keyword, self.curr_num_of_img)
            self.curr_num_of_img = self.curr_num_of_img + 1
            yield taobaoproduct

    def parse_last_twelve(self, response, keyword):
        dict_response = response[11:]
        dict_response = dict_response[:-2]
        json_response = json.loads(dict_response)
        customizedapi = json_response['API.CustomizedApi']
        itemlist = customizedapi['itemlist']
        auctions = itemlist['auctions']
        for auction in auctions:
            taobaoproduct = parse_taobao_products(auction, keyword, self.curr_num_of_img)
            self.curr_num_of_img = self.curr_num_of_img + 1
            yield taobaoproduct

    def parse_next_page(self, response, keyword):
        dict_response = response[11:]
        dict_response = dict_response[:-2]
        json_response = json.loads(dict_response)
        mods = json_response['mods']
        itemlist = mods['itemlist']
        data = itemlist['data']
        auctions = data['auctions']
        for auction in auctions:
            taobaoproduct = parse_taobao_products(auction, keyword, self.curr_num_of_img)
            self.curr_num_of_img = self.curr_num_of_img + 1
            yield taobaoproduct
class SinasportsSpider(scrapy.Spider):
    """Spider for the Sina Sports home page: live matches, carousel, hot-match blocks and the tianyi news feeds."""
    name = 'sinasports'
    allowed_domains = ['sports.sina.com.cn']
    start_urls = ['http://sports.sina.com.cn/']
    matcher_api_url = 'http://sports.sina.com.cn/iframe/js/2015/live.js?dpc=1'
    nbanews_url = '''http://cre.mix.sina.com.cn/get/cms/feed?callback=jQuery111304795819583663854_1535281251386
&pcProduct=31&ctime=&merge=3&mod=pcsptw&cre=tianyi&statics=1
&length=12&ad=%7B%22rotate_count%22%3A100%2C%22platform%22%3A%22pc%22%2C%22channel%22%3A%22
tianyi_pcspt%22%2C%22page_url%22%3A%22http%3A%2F%2Fsports.sina.com.cn%2F%22%2C%22
timestamp%22%3A1535281251422+%7D&_=1535281251395]'''
    intsoc_url = '''http://cre.mix.sina.com.cn/get/cms/feed?callback=jQuery111306164420163923745_1535782658772
&pcProduct=30&ctime=&merge=3&mod=pcsptw&cre=tianyi&statics=1&length=12&ad=%7B%22rotate_count
%22%3A100%2C%22platform%22%3A%22pc%22%2C%22channel%22%3A%22tianyi_pcspt%22%2C%22page_url%22
%3A%22http%3A%2F%2Fsports.sina.com.cn%2F%22%2C%22timestamp%22%3A1535782658814+%7D&_=1535782658971'''
    chisoc_url = '''http://cre.mix.sina.com.cn/get/cms/feed?callback=jQuery111306164420163923745_1535782658772
&pcProduct=29&ctime=&merge=3&mod=pcsptw&cre=tianyi&statics=1&length=12&ad=%7B%22rotate_count
%22%3A100%2C%22platform%22%3A%22pc%22%2C%22channel%22%3A%22tianyi_pcspt%22%2C%22page_url%22%3A
%22http%3A%2F%2Fsports.sina.com.cn%2F%22%2C%22timestamp%22%3A1535782658814+%7D&_=1535782659005'''
    cba_url = '''http://cre.mix.sina.com.cn/get/cms/feed?callback=jQuery111306164420163923745_1535782658772&pcProduct=32
&ctime=&merge=3&mod=pcsptw&cre=tianyi&statics=1&length=12&ad=%7B%22rotate_count%22%3A100%2C%22platform
%22%3A%22pc%22%2C%22channel%22%3A%22tianyi_pcspt%22%2C%22page_url%22%3A%22http%3A%2F%2Fsports.sina.com.cn
%2F%22%2C%22timestamp%22%3A1535782658814+%7D&_=1535782659010'''
    sum_url = '''http://cre.mix.sina.com.cn/get/cms/feed?callback=jQuery111306164420163923745_1535782658772&pcProduct=33
&ctime=&merge=3&mod=pcsptw&cre=tianyi&statics=1&length=12&ad=%7B%22rotate_count%22%3A100%2C%22platform
%22%3A%22pc%22%2C%22channel%22%3A%22tianyi_pcspt%22%2C%22page_url%22%3A%22http%3A%2F%2Fsports.sina.com.cn
%2F%22%2C%22timestamp%22%3A1535782658814+%7D&_=1535782659015'''
    max_car_number = get_max_num('sinacar')
    if max_car_number is None:
        max_car_number = 0
    curr_num_of_car = max_car_number + 1
    max_hotmatnews_num = get_max_num('hotmatch_news')
    if max_hotmatnews_num is None:
        max_hotmatnews_num = 0
    curr_num_of_hmn = max_hotmatnews_num + 1
    max_nbanews_num = get_max_num('nbanews')
    if max_nbanews_num is None:
        max_nbanews_num = 0
    curr_num_of_nba = max_nbanews_num + 1
    max_lefttop_num = get_max_num('lefttop')
    if max_lefttop_num is None:
        max_lefttop_num = 0
    curr_num_of_letop = max_lefttop_num + 1
    max_leftsec_num = get_max_num('leftsec')
    if max_leftsec_num is None:
        max_leftsec_num = 0
    curr_num_of_lesec = max_leftsec_num + 1

    def start_requests(self):
        yield Request(self.nbanews_url, meta={"newstype": "NBA", "request_url": self.nbanews_url}, callback=self.parse_nba_news)
        yield Request(self.intsoc_url, meta={"newstype": "INTSOC", "request_url": self.intsoc_url}, callback=self.parse_nba_news)
        yield Request(self.chisoc_url, meta={"newstype": "CHISOC", "request_url": self.chisoc_url}, callback=self.parse_nba_news)
        yield Request(self.cba_url, meta={"newstype": "CBA", "request_url": self.cba_url}, callback=self.parse_nba_news)
        yield Request(self.sum_url, meta={"newstype": "SUM", "request_url": self.sum_url}, callback=self.parse_nba_news)
        for start_url in self.start_urls:
            yield Request(start_url, callback=self.parse_main)
        yield Request(self.matcher_api_url, callback=self.parse_matches)

    def parse_matches(self, response):
        hotmatches = HotMatches()
        html = response.text
        html = html[:-13]
        html = html[42:]
        resultjson = json.loads(html)
        matches = resultjson['matches']
        for match in matches:
            livecast_id = match['livecast_id']
            shorttitle = match['ShortTitle']
            round_cn = match['Round_cn']
            title = shorttitle + round_cn
            team1 = match['Team1']
            team2 = match['Team2']
            score1 = match['Score1']
            score2 = match['Score2']
            if not score1.strip() and not score2.strip():
                matchtype = 'pre'
            else:
                matchtype = 'post'
            matchdate = match['date']
            matchdate = matchdate[5:]
            matchtime = match['time']
            newsurl = match['NewsUrl']
            liveurl = match['LiveUrl']
            match_url = match['match_url']
            hotmatches['livecast_id'] = livecast_id
            hotmatches['type'] = matchtype
            hotmatches['title'] = title
            hotmatches['team1'] = team1
            hotmatches['team2'] = team2
            hotmatches['score1'] = score1
            hotmatches['score2'] = score2
            hotmatches['matchdate'] = matchdate
            hotmatches['matchtime'] = matchtime
            hotmatches['newsurl'] = newsurl
            hotmatches['liveurl'] = liveurl
            hotmatches['match_url'] = match_url
            if hotmatches['team1'] and hotmatches['team2']:
                yield hotmatches

    def parse_main(self, response):
        sinacarousel = SinaCarousel()
        hotmatchnews = HotMatchNews()
        soup = BeautifulSoup(response.text, 'lxml')
        post_nodes = soup.select("ul[class='slide-focus-d-cont'] li[class='clearfix thumbnail-b-gra']")
        post_nodes1 = soup.select("div[node-type='tytopwrap']")
        lefttopimg_node = soup.select("div[data-sudaclick='blk_focusvideo'] div[class='thumbnail-b thumbnail-b-gra thumbnail-b-video']")[0]
        post_nodes2 = soup.select("div[data-sudaclick='blk_focusvideo'] div[class='layout-mt-g news-list-e'] p")
        leftsecond_node = soup.select("div[class='layout-mt-h layout-mb-e news-hot']")[0]
        lefttop_nbanews = parse_lefttop(lefttopimg_node)
        yield lefttop_nbanews
        lefttoplines_nbanewslist = parse_lefttoplines(post_nodes2)
        for i in range(0, len(lefttoplines_nbanewslist)):
            if lefttoplines_nbanewslist[i] is not None:
                yield lefttoplines_nbanewslist[i]
        leftsec_nbanews = parse_leftsecond(leftsecond_node)
        yield leftsec_nbanews
        leftsectxt_nbanewslist = parse_leftsectxt(leftsecond_node)
        for i in range(0, len(leftsectxt_nbanewslist)):
            yield leftsectxt_nbanewslist[i]
        for post_node in post_nodes:
            sel = Selector(text=str(post_node), type="html")
            title = sel.xpath('//p/text()').extract()[0].strip()
            url = sel.xpath('//a//@href').extract()[0].strip()
            img_url = sel.xpath('//img//@src').extract()[0].strip()
            file_name = "carousel_%s.jpg" % self.curr_num_of_car
            file_path = os.path.join(r"D:\StefanClub\StefanClub\www\static\img\sinasports", file_name)
            urllib.request.urlretrieve(img_url, file_path)
            sinacarousel["number"] = self.curr_num_of_car
            self.curr_num_of_car = self.curr_num_of_car + 1
            sinacarousel["title"] = title
            sinacarousel["url"] = url
            #carousel_item["img_url"] = img_url
            sinacarousel["img_url"] = "../static/img/sinasports/%s" % file_name
            yield sinacarousel
        for post_node1 in post_nodes1:
            sel1 = Selector(text=str(post_node1), type="html")
            titles = sel1.xpath('//h3/a')
            title1 = title2 = title3 = ''
            title1url = title2url = title3url = ''
            if len(titles) == 3:
                title1 = sel1.xpath('//h3/a/text()').extract()[0].strip()
                title2 = sel1.xpath('//h3/a/text()').extract()[1].strip()
                title3 = sel1.xpath('//h3/a/text()').extract()[2].strip()
                title1url = sel1.xpath('//h3/a//@href').extract()[0].strip()
                title2url = sel1.xpath('//h3/a//@href').extract()[1].strip()
                title3url = sel1.xpath('//h3/a//@href').extract()[2].strip()
            elif len(titles) == 2:
                title1 = sel1.xpath('//h3/a/text()').extract()[0].strip()
                title2 = sel1.xpath('//h3/a/text()').extract()[1].strip()
                title3 = ''
                title1url = sel1.xpath('//h3/a//@href').extract()[0].strip()
                title2url = sel1.xpath('//h3/a//@href').extract()[1].strip()
                title3url = ''
            elif len(titles) == 1:
                title1 = sel1.xpath('//h3/a/text()').extract()[0].strip()
                title2 = ''
                title3 = ''
                title1url = sel1.xpath('//h3/a//@href').extract()[0].strip()
                title2url = ''
                title3url = ''
            else:
                pass
            imgurl = sel1.xpath('//div[@class="ty-card-thumb-w"]/a//@href').extract()[0].strip()
            imgsrcurl = sel1.xpath('//img//@src').extract()[0].strip()
            imgsrcurl = 'http:' + imgsrcurl
            file_name1 = "hotmatchnews_%s.jpg" % self.curr_num_of_hmn
            file_path1 = os.path.join(r"D:\StefanClub\StefanClub\www\static\img\sinasports", file_name1)
            urllib.request.urlretrieve(imgsrcurl, file_path1)
            li1 = sel1.xpath('//li').extract()[0].strip()
            li2 = sel1.xpath('//li').extract()[1].strip()
            li3 = sel1.xpath('//li').extract()[2].strip()
            subsel1 = Selector(text=str(li1), type="html")
            subsel2 = Selector(text=str(li2), type="html")
            subsel3 = Selector(text=str(li3), type="html")
            lia1 = subsel1.xpath('//a')
            lia2 = subsel2.xpath('//a')
            lia3 = subsel3.xpath('//a')
            line1 = line2 = line3 = line4 = line5 = line6 = line7 = line8 = line9 = ''
            line1url = line2url = line3url = line4url = line5url = line6url = line7url = line8url = line9url = ''
            if len(lia1) == 3:
                line1 = subsel1.xpath('//a/text()').extract()[0].strip()
                line2 = subsel1.xpath('//a/text()').extract()[1].strip()
                line3 = subsel1.xpath('//a/text()').extract()[2].strip()
                line1url = subsel1.xpath('//a//@href').extract()[0].strip()
                line2url = subsel1.xpath('//a//@href').extract()[1].strip()
                line3url = subsel1.xpath('//a//@href').extract()[2].strip()
            elif len(lia1) == 2:
                line1 = subsel1.xpath('//a/text()').extract()[0].strip()
                line2 = subsel1.xpath('//a/text()').extract()[1].strip()
                line3 = ''
                line1url = subsel1.xpath('//a//@href').extract()[0].strip()
                line2url = subsel1.xpath('//a//@href').extract()[1].strip()
                line3url = ''
            elif len(lia1) == 1:
                line1 = subsel1.xpath('//a/text()').extract()[0].strip()
                line2 = ''
                line3 = ''
                line1url = subsel1.xpath('//a//@href').extract()[0].strip()
                line2url = ''
                line3url = ''
            else:
                pass
            if len(lia2) == 3:
                line4 = subsel2.xpath('//a/text()').extract()[0].strip()
                line5 = subsel2.xpath('//a/text()').extract()[1].strip()
                line6 = subsel2.xpath('//a/text()').extract()[2].strip()
                line4url = subsel2.xpath('//a//@href').extract()[0].strip()
                line5url = subsel2.xpath('//a//@href').extract()[1].strip()
                line6url = subsel2.xpath('//a//@href').extract()[2].strip()
            elif len(lia2) == 2:
                line4 = subsel2.xpath('//a/text()').extract()[0].strip()
                line5 = subsel2.xpath('//a/text()').extract()[1].strip()
                line6 = ''
                line4url = subsel2.xpath('//a//@href').extract()[0].strip()
                line5url = subsel2.xpath('//a//@href').extract()[1].strip()
                line6url = ''
            elif len(lia2) == 1:
                line4 = subsel2.xpath('//a/text()').extract()[0].strip()
                line5 = ''
                line6 = ''
                line4url = subsel2.xpath('//a//@href').extract()[0].strip()
                line5url = ''
                line6url = ''
            else:
                pass
            if len(lia3) == 3:
                line7 = subsel3.xpath('//a/text()').extract()[0].strip()
                line8 = subsel3.xpath('//a/text()').extract()[1].strip()
                line9 = subsel3.xpath('//a/text()').extract()[2].strip()
                line7url = subsel3.xpath('//a//@href').extract()[0].strip()
                line8url = subsel3.xpath('//a//@href').extract()[1].strip()
                line9url = subsel3.xpath('//a//@href').extract()[2].strip()
            elif len(lia3) == 2:
                line7 = subsel3.xpath('//a/text()').extract()[0].strip()
                line8 = subsel3.xpath('//a/text()').extract()[1].strip()
                line9 = ''
                line7url = subsel3.xpath('//a//@href').extract()[0].strip()
                line8url = subsel3.xpath('//a//@href').extract()[1].strip()
                line9url = ''
            elif len(lia3) == 1:
                line7 = subsel3.xpath('//a/text()').extract()[0].strip()
                line8 = ''
                line9 = ''
                line7url = subsel3.xpath('//a//@href').extract()[0].strip()
                line8url = ''
                line9url = ''
            else:
                pass
            hotmatchnews["number"] = self.curr_num_of_hmn
            self.curr_num_of_hmn = self.curr_num_of_hmn + 1
            hotmatchnews["title1"] = title1
            hotmatchnews["title2"] = title2
            hotmatchnews["title3"] = title3
            hotmatchnews["title1url"] = title1url
            hotmatchnews["title2url"] = title2url
            hotmatchnews["title3url"] = title3url
            hotmatchnews["imgsrcurl"] = "../static/img/sinasports/%s" % file_name1
            hotmatchnews["imgurl"] = imgurl
            hotmatchnews["line1"] = line1
            hotmatchnews["line2"] = line2
            hotmatchnews["line3"] = line3
            hotmatchnews["line1url"] = line1url
            hotmatchnews["line2url"] = line2url
            hotmatchnews["line3url"] = line3url
            hotmatchnews["line4"] = line4
            hotmatchnews["line5"] = line5
            hotmatchnews["line6"] = line6
            hotmatchnews["line4url"] = line4url
            hotmatchnews["line5url"] = line5url
            hotmatchnews["line6url"] = line6url
            hotmatchnews["line7"] = line7
            hotmatchnews["line8"] = line8
            hotmatchnews["line9"] = line9
            hotmatchnews["line7url"] = line7url
            hotmatchnews["line8url"] = line8url
            hotmatchnews["line9url"] = line9url
            yield hotmatchnews

    def parse_nba_news(self, response):
        nbanews = NbaNews()
        newstype = response.meta.get("newstype", "")
        request_url = response.meta.get("request_url", "")
        html = response.text
        if html[0:6] == 'jQuery':
            html = html[:-3]
            html = html[43:]
        prejson = json.loads(html)
        if "data" in prejson.keys():
            nbanewslist = prejson['data']
            for nbanewsitem in nbanewslist:
                imgsrcurl = nbanewsitem["thumb"]
                imgurl = ''
                isvideo = 'FALSE'
                file_name = ''
                nbanews["number"] = -1
                if imgsrcurl:
                    imgurl = nbanewsitem["url"]
                    if imgurl is None:
                        imgurl = nbanewsitem["url_https"]
                    #if nbanewsitem.has_key('video_id'):
                    if "video_id" in nbanewsitem.keys():
                        video_id = nbanewsitem["video_id"]
                        if video_id is not None:
                            isvideo = 'TRUE'
                    file_name = "nbanews_%s.jpg" % self.curr_num_of_nba
                    file_path = os.path.join(r"D:\StefanClub\StefanClub\www\static\img\sinasports", file_name)
                    urllib.request.urlretrieve(imgsrcurl, file_path)
                    nbanews["number"] = self.curr_num_of_nba
                    self.curr_num_of_nba = self.curr_num_of_nba + 1
                title = nbanewsitem["title"]
                titleurl = nbanewsitem["url"]
                if titleurl is None:
                    titleurl = nbanewsitem["url_https"]
                newstime = nbanewsitem["mtime"]
                newstime = datetime.datetime.fromtimestamp(newstime)
                comment_id = nbanewsitem["new_commentid"]
                channel = comment_id[0:2]
                newsid = comment_id[3:-2]
                comment_url = "http://comment5.news.sina.com.cn/comment/skin/default.html?channel=" + channel + "&newsid=" + newsid
                labels = nbanewsitem["labels"]
                labellist = [''] * 5
                if isinstance(labels, dict):
                    i = 0
                    for key in labels.keys():
                        labellist[i] = key
                        i += 1
                nbanews["imgsrcurl"] = "../static/img/sinasports/%s" % file_name
                nbanews["imgurl"] = imgurl
                nbanews["isvideo"] = isvideo
                nbanews["title"] = title
                nbanews["titleurl"] = titleurl
                nbanews["newstime"] = newstime
                nbanews["comment_url"] = comment_url
                jj = 1
                for j in labellist:
                    nbanews["tag%s" % jj] = j
                    nbanews["tag%surl" % jj] = "//tags.sports.sina.com.cn/" + j
                    jj += 1
                nbanews["newstype"] = newstype
                yield nbanews
        else:
            yield Request(request_url, meta={"newstype": newstype, "request_url": request_url}, callback=self.parse_nba_news)
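# parse_nba_news() above strips the jQuery JSONP callback wrapper from the Sina
# feed responses with fixed offsets (html[43:], html[:-3]); parse_matches() uses
# similar fixed slicing for live.js. As an illustrative sketch only (not what the
# spider does as written), a more defensive way to unwrap a JSONP payload is to
# take everything between the first '(' and the last ')' and parse that as JSON:
def strip_jsonp_sketch(text):
    import json
    start = text.index('(') + 1
    end = text.rindex(')')
    return json.loads(text[start:end])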