import json
import logging
import re
import time

import requests
import scrapy
from bs4 import BeautifulSoup, NavigableString
from scrapy.loader import ItemLoader

# Project-local helpers; these module paths are assumptions based on usage below.
from jiepai.items import HCJiePaiGroup
from jiepai.utils import time_utils
from jiepai.utils.bmob_upload_helper import BMobUploadHelper


class JiepaiHuaBanSpider(scrapy.Spider):
    name = 'jiepai_hua_ban'
    allowed_domains = ['huaban.com']
    start_urls = ['http://huaban.com/favorite/beauty/']
    # start_urls = ['http://huaban.com/boards/24116838/?md=newbn&beauty=']

    def __init__(self, name=None, **kwargs):
        super().__init__(name, **kwargs)
        self.bmob_helper = BMobUploadHelper()

    def parse(self, response):
        bsp = BeautifulSoup(response.body, 'lxml')
        hua_ban_group = bsp.select_one("#waterfall")
        hua_ban_items = hua_ban_group.select(".pin.wfc")
        cur_hua_ban_time = time_utils.get_jie_pai_hua_ban_scrapy_time()
        for hua_ban_item in hua_ban_items:
            if "data-created-at" in hua_ban_item.attrs:
                # time
                hua_ban_item_time_stamp = hua_ban_item["data-created-at"]
                time_array = time.localtime(int(hua_ban_item_time_stamp))
                hua_ban_item_time = time.strftime("%Y-%m-%d %H:%M:%S", time_array)
                logging.info("time: " + hua_ban_item_time)
                if hua_ban_item_time < cur_hua_ban_time:
                    logging.info("time is out of date, hua_ban_item_time: " +
                                 hua_ban_item_time)
                    break
            # image
            hua_ban_pic_item = hua_ban_item.select_one(".img.x.layer-view.loaded > img")
            hua_ban_pic = hua_ban_pic_item["src"]
            split_index = hua_ban_pic.index("_")
            hua_ban_url = "http:" + hua_ban_pic[0:split_index] + "_fw658"
            # hua_ban_url = "http:" + hua_ban_pic
            group_content = self.bmob_helper.get_group_content(hua_ban_url, "")
            group_url = "https://api2.bmob.cn/1/classes/Beauty"
            logging.info("parse_hua_ban_detail group data: " +
                         json.dumps(group_content, ensure_ascii=False))
            point_group_id = self.bmob_helper.upload_to_bmob(group_url, group_content)
        time_utils.save_jie_pai_hua_ban_scrapy_time(time_utils.get_next_day_time())
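# The incremental-scrape checkpoints used above come from a small `time_utils`
# module that is not shown in this section. Below is a minimal sketch of what it
# might look like, assuming each checkpoint is a "%Y-%m-%d %H:%M:%S" string kept
# in a local file (file names and the storage format are assumptions, not the
# project's actual code); the other get_*/save_* pairs used by the remaining
# spiders would follow the same pattern.

# time_utils.py (hypothetical sketch)
import datetime

TIME_FORMAT = "%Y-%m-%d %H:%M:%S"


def _read_time(path, default="1970-01-01 00:00:00"):
    # Return the saved checkpoint string, or a default far in the past.
    try:
        with open(path) as f:
            return f.read().strip()
    except IOError:
        return default


def get_jie_pai_hua_ban_scrapy_time():
    return _read_time("jie_pai_hua_ban_time.txt")


def save_jie_pai_hua_ban_scrapy_time(value):
    with open("jie_pai_hua_ban_time.txt", "w") as f:
        f.write(value)


def get_next_day_time():
    # Strings in this fixed-width format compare chronologically, which is what
    # the spiders rely on when they compare a scraped time to the checkpoint.
    next_day = datetime.datetime.now() + datetime.timedelta(days=1)
    return next_day.strftime(TIME_FORMAT)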
class JiepaiThreeAppSpider(scrapy.Spider):
    name = 'jiepai_three_app'
    allowed_domains = ['app.3ajiepai.com']
    start_urls = [
        'http://app.3ajiepai.com/thread/list?fid=170&page=1&pageSize=20'
    ]

    def __init__(self, name=None, **kwargs):
        super().__init__(name, **kwargs)
        self.cur_time = time_utils.get_jie_pai_three_m_scrapy_time()
        self.cookies_jie_pai = {}
        self.bmob_helper = BMobUploadHelper()

    def start_requests(self):
        jsession_id, jie_pai = self.get_login_info()
        self.cookies_jie_pai = {
            '__cfduid': 'd038136efebfcd498fc25c12f2a9cbad81539412011',
            'JSESSIONID': jsession_id,
            '3ajiepai': jie_pai
        }
        for url in self.start_urls:
            yield scrapy.Request(url, cookies=self.cookies_jie_pai)

    def get_login_info(self):
        login_url = "http://app.3ajiepai.com/wechat/login?code=onQGp1RAFbnzN6m4y259Qma2vMu4"
        response = requests.get(login_url)
        jsession_id = response.cookies["JSESSIONID"]
        jie_pai = response.cookies["3ajiepai"]
        logging.info("response cookies: " + str(response.cookies))
        return jsession_id, jie_pai

    def parse(self, response):
        logging.info("jiepai_three_app response: " + response.text)
        json_content = json.loads(response.body)
        data_array = json_content["data"]
        for data_item in data_array:
            # time
            data_date = data_item["dateline"]
            time_array = time.localtime(int(data_date))
            jie_pai_time = time.strftime("%Y-%m-%d %H:%M:%S", time_array)
            if jie_pai_time < self.cur_time:
                logging.info("time is out of date, jie_pai_time: " + jie_pai_time)
                return
            # cover
            data_thumb = data_item["thumb"]
            # title
            data_subject = data_item["subject"]
            # id
            data_tid = data_item["tid"]
            detail_url = "http://app.3ajiepai.com/thread/" + str(data_tid)
            yield scrapy.Request(detail_url,
                                 meta={
                                     "data_thumb": data_thumb,
                                     "data_subject": data_subject,
                                 },
                                 cookies=self.cookies_jie_pai,
                                 callback=self.handle_detail)
        # everything is done; persist the checkpoint
        time_utils.save_jie_pai_three_m_scrapy_time(time_utils.get_next_day_time())

    def handle_detail(self, response):
        data_thumb = response.meta["data_thumb"]
        data_subject = response.meta["data_subject"]
        data_detail = json.loads(response.body)
        photos = data_detail["data"]["photos"]
        point_group_id = ""
        sub_pic_url = "https://api2.bmob.cn/1/classes/CardPicBean"
        first_photo = True
        for photo in photos:
            img_url = photo["origin"]
            # first photo: upload it as the group cover
            if first_photo:
                group_content = self.bmob_helper.get_group_content(img_url, data_subject)
                group_url = "https://api2.bmob.cn/1/classes/CardPicGroup"
                logging.info("parse_wei_bo_detail group data: " +
                             json.dumps(group_content, ensure_ascii=False))
                point_group_id = self.bmob_helper.upload_to_bmob(group_url, group_content)
                first_photo = False
            else:
                # remaining photos become sub-pics pointing at the group
                detail_content = self.bmob_helper.get_detail_content("", img_url, point_group_id)
                logging.info("upload sub_pics json: " +
                             json.dumps(detail_content, ensure_ascii=False))
                self.bmob_helper.upload_to_bmob(sub_pic_url, detail_content)
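# `BMobUploadHelper` is a project-local wrapper around the Bmob REST API that is
# not shown in this section. Below is a minimal sketch of the interface the
# spiders use, assuming the standard Bmob REST headers; the payload shapes
# returned by get_group_content/get_detail_content are assumptions inferred from
# usage, and the other builders (get_news_detail_content,
# get_group_content_with_title) would be similar dict factories.

# bmob_upload_helper.py (hypothetical sketch)
import json

import requests


class BMobUploadHelper(object):
    HEADERS = {
        "X-Bmob-Application-Id": "<your-app-id>",       # placeholder credentials
        "X-Bmob-REST-API-Key": "<your-rest-api-key>",
        "Content-Type": "application/json",
    }

    def get_group_content(self, cover_url, title):
        # Assumed field names; the real project may use different keys.
        return {"cover": cover_url, "title": title}

    def get_detail_content(self, desc, img_url, group_id):
        return {"desc": desc, "img": img_url, "groupId": group_id}

    def upload_to_bmob(self, url, content):
        # POST one record into a Bmob class table and return its objectId.
        response = requests.post(
            url,
            headers=self.HEADERS,
            data=json.dumps(content, ensure_ascii=False).encode("utf-8"))
        return response.json().get("objectId", "")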
class JiepaiWeiBoSpider(scrapy.Spider):
    name = 'jiepai_wei_bo'
    allowed_domains = ['weibo.com']
    # start_urls = ['http://photo.weibo.com/1304494805/talbum/index#!/mode/2/page/1']
    start_urls = [
        'https://weibo.com/u/1304494805?is_all=1',  # 街拍美
        'https://weibo.com/u/3757458303?is_all=1',  # 街拍摄美
        # 'https://weibo.com/tajiepai?is_all=1'  # 她街拍
    ]

    def __init__(self, name=None, **kwargs):
        super().__init__(name, **kwargs)
        self.bmob_helper = BMobUploadHelper()
        self.key_words = ["四年"]
        self.craw_count = 0

    def hit_key_word(self, word):
        # True if any configured keyword appears in the given text
        for my_word in self.key_words:
            if my_word in word:
                return True
        return False

    def parse(self, response):
        bsp = BeautifulSoup(response.body, 'lxml')
        wei_bo_group = bsp.select_one(".WB_feed.WB_feed_v3.WB_feed_v4")
        time.sleep(1)
        wei_bo_items = wei_bo_group.select(".WB_feed_detail.clearfix")
        cur_wei_bo_time = time_utils.get_jie_pai_wei_bo_scrapy_time()
        for wei_bo_item in wei_bo_items:
            # time
            wei_bo_time_item = wei_bo_item.select_one(".WB_from.S_txt2 > a")
            wei_bo_time = wei_bo_time_item["title"]
            if wei_bo_time < cur_wei_bo_time:
                logging.info("time is out of date, wei_bo_time: " + wei_bo_time)
                continue
            # title
            wei_bo_title_item = wei_bo_item.select_one(".WB_text.W_f14")
            wei_bo_title = wei_bo_title_item.text
            wei_bo_title = wei_bo_title.replace("\n", "").strip()
            reobj = re.compile(r"\(.*\)")
            wei_bo_title_result, number = reobj.subn("", wei_bo_title)
            # filter some posts by keyword
            if self.hit_key_word(wei_bo_title):
                logging.info("hit_key_word title: " + wei_bo_title)
                continue
            img_urls = []
            # pics
            wei_bo_pics = wei_bo_item.select(".WB_pic")
            for wei_bo_pic in wei_bo_pics:
                img_item = wei_bo_pic.select_one("img")
                img_url = img_item["src"]
                final_img_url = ""
                if "thumb150" in img_url:
                    final_img_url = "http:" + img_url.replace("thumb150", "mw690")
                elif "orj360" in img_url:
                    final_img_url = "http:" + img_url.replace("orj360", "mw690")
                img_urls.append(final_img_url)
            if len(img_urls) > 0:
                # cover
                cover_url = img_urls[0]
                # upload cover
                group_content = self.bmob_helper.get_group_content(
                    cover_url, wei_bo_title_result)
                group_url = "https://api2.bmob.cn/1/classes/CardPicGroup"
                logging.info("parse_wei_bo_detail group data: " +
                             json.dumps(group_content, ensure_ascii=False))
                point_group_id = self.bmob_helper.upload_to_bmob(group_url, group_content)
                # upload sub_pics
                sub_pic_url = "https://api2.bmob.cn/1/classes/CardPicBean"
                for index in range(1, len(img_urls)):
                    detail_content = self.bmob_helper.get_detail_content(
                        "", img_urls[index], point_group_id)
                    logging.info("upload sub_pics json: " +
                                 json.dumps(detail_content, ensure_ascii=False))
                    self.bmob_helper.upload_to_bmob(sub_pic_url, detail_content)
        self.craw_count = self.craw_count + 1
        if self.craw_count == len(self.start_urls):
            # everything is done; persist the checkpoint
            time_utils.save_jie_pai_weibo_scrapy_time(time_utils.get_next_day_time())
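# To try one of these spiders outside a full Scrapy project, they can be run
# programmatically with Scrapy's CrawlerProcess. This is a usage sketch; the
# USER_AGENT value is just an example, and the spider class should be imported
# from wherever it is defined in your project.

# run_spiders.py (usage sketch)
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={
    "USER_AGENT": "Mozilla/5.0",  # many of the scraped sites reject the default UA
    "LOG_LEVEL": "INFO",
})
process.crawl(JiepaiWeiBoSpider)
process.start()  # blocks until the crawl finishes

# Inside a Scrapy project the equivalent is simply:
#   scrapy crawl jiepai_wei_bo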
class JiepaiSpider(scrapy.Spider):
    name = 'jiepai'
    allowed_domains = ['blog.sina.com.cn', 'www.bucuo.me']
    start_urls = [
        'http://blog.sina.com.cn/s/articlelist_1340398703_4_1.html'
        # ,
        # 'https://www.bucuo.me/app/1583407618504778'
    ]

    def __init__(self, name=None, **kwargs):
        super().__init__(name, **kwargs)
        self.sina = "http://blog.sina.com.cn/s/articlelist_1340398703_4_1.html"
        self.bucou = "https://www.bucuo.me/app/1583407618504778"
        self.cur_time = time_utils.get_jie_pai_scrapy_time()
        self.bmob_helper = BMobUploadHelper()
        self.point_group_id = ""

    def parse(self, response):
        bsp = BeautifulSoup(response.body, 'lxml')
        cur_url = response.url
        article_class = self.get_article_list_class_by(cur_url)
        article_list = bsp.select(article_class)
        for article in article_list:
            article_time_class = self.get_article_time_class_by(cur_url)
            article_time = article.select(article_time_class)
            if cur_url == self.sina:
                scrap_time = article_time[0].string
            else:
                scrap_time = article_time[0]["title"]
            # if the first article already fails the check, bail out immediately
            if self.cur_time > scrap_time:
                logging.error("jie_pai_group time is out of date cur_time: " +
                              self.cur_time + " scrap_time: " + scrap_time)
                return
            link_title_class = self.get_title_class_by(cur_url)
            link_title = article.select(link_title_class)
            title = link_title[0].string
            link = link_title[0]['href']
            logging.info('jie_pai_group title: ' + title + ' link: ' + link +
                         " scrap_time: " + scrap_time)
            jie_pai_group_loader = ItemLoader(item=HCJiePaiGroup(), selector=response)
            jie_pai_group_loader.add_value('jie_pai_title', title)
            time.sleep(3)
            yield jie_pai_group_loader.load_item()
            if cur_url == self.sina:
                call_back = self.parse_sina_detail
                detail_url = link
            else:
                call_back = self.parse_bu_cuo_detail
                detail_url = "https://www.bucuo.me" + link
            yield scrapy.Request(detail_url,
                                 meta={"group_title": title},
                                 callback=call_back)
        # everything is done; persist the checkpoint
        time_utils.save_jie_pai_scrapy_time(time_utils.get_next_day_time())

    def parse_bu_cuo_detail(self, response):
        bsp = BeautifulSoup(response.body, 'lxml')
        for br in bsp('br'):
            br.extract()
        title = response.meta["group_title"]
        point_group_id = ""
        jie_pai_details = bsp.select(".body > p")
        is_first = True
        img_url = ""
        img_desc = ""
        for jie_pai_detail in jie_pai_details:
            img = jie_pai_detail.select('img')
            if img:
                # image: remember its URL and wait for the caption paragraph
                img_url = img[0]["src"]
                continue
            else:
                # text
                img_desc = jie_pai_detail.string
            # the first image is used as the cover
            if is_first:
                upload_group_content = self.bmob_helper.get_group_content(img_url, title)
                url = "https://api2.bmob.cn/1/classes/CardPicGroup"
                logging.info("parse_bu_cuo_detail group data: " +
                             json.dumps(upload_group_content, ensure_ascii=False))
                # point_group_id = self.bmob_helper.upload_to_bmob(url, upload_group_content)
                is_first = False
            else:
                # subsequent images become sub-pics
                upload_detail_content = self.bmob_helper.get_news_detail_content(
                    img_desc, img_url, point_group_id)
                url = "https://api2.bmob.cn/1/classes/CardPicBean"
                logging.info("parse_bu_cuo_detail detail data: " +
                             json.dumps(upload_detail_content, ensure_ascii=False))
                # self.bmob_helper.upload_to_bmob(url, upload_detail_content)

    def parse_sina_detail(self, response):
        bsp = BeautifulSoup(response.body, 'lxml')
        for br in bsp('br'):
            br.extract()
        title = response.meta["group_title"]
        jie_pai_details = bsp.select_one('#sina_keyword_ad_area2')
        jie_pai_detail_links = jie_pai_details.select('a')
        is_first = True
        for jie_pai_detail in jie_pai_detail_links:
            link_content = jie_pai_detail['href']
            if 'photo.blog.sina.com.cn' in link_content:
                result = self.process_detail(is_first, jie_pai_detail, title)
                if result and is_first:
                    is_first = False
            else:
                logging.info('jie_pai_detail end')

    def process_detail(self, is_first, jie_pai_detail, title):
        jie_pai_detail_img = jie_pai_detail.select('img')
        img_width = 0
        img_height = 0
        if "width" in jie_pai_detail_img[0].attrs:
            img_width = int(jie_pai_detail_img[0]['width'])
        if "height" in jie_pai_detail_img[0].attrs:
            img_height = int(jie_pai_detail_img[0]['height'])
        if img_height > img_width:
            img_url = jie_pai_detail_img[0]['real_src']
            img_desc = jie_pai_detail.next_sibling
            # alternative way to obtain img_desc when the sibling is not usable
            img_desc = self.get_img_desc_if_needed(img_desc, jie_pai_detail)
            logging.info("type: " + str(type(img_desc)))
            # the first image is used as the cover
            if is_first:
                # upload the group
                upload_group_content = self.bmob_helper.get_group_content_with_title(
                    img_url, img_desc, title)
                url = "https://api2.bmob.cn/1/classes/StyleNews"
                logging.info("upload_group_content data: " +
                             json.dumps(upload_group_content, ensure_ascii=False))
                self.point_group_id = self.bmob_helper.upload_to_bmob(
                    url, upload_group_content)
            elif type(img_desc) == NavigableString:
                upload_detail_content = self.bmob_helper.get_news_detail_content(
                    img_desc, img_url, self.point_group_id)
                url = "https://api2.bmob.cn/1/classes/StyleDetailItem"
                logging.info("upload json: " +
                             json.dumps(upload_detail_content, ensure_ascii=False))
                self.bmob_helper.upload_to_bmob(url, upload_detail_content)
            return True
        else:
            logging.info("jie_pai_detail img_height: " + str(img_height) +
                         " img_width: " + str(img_width))
            return False

    def get_img_desc_if_needed(self, img_desc, jie_pai_detail):
        while img_desc == '\n':
            img_desc = jie_pai_detail.next_sibling
            jie_pai_detail = jie_pai_detail.next_sibling
        # div
        if jie_pai_detail.name == "div":
            img_desc = jie_pai_detail.text
        return img_desc

    def get_article_list_class_by(self, url):
        if url == self.sina:
            return '.articleCell.SG_j_linedot1'
        elif url == self.bucou:
            return ".art-item"
        return ""

    def get_title_class_by(self, url):
        if url == self.sina:
            return '.atc_title > a'
        elif url == self.bucou:
            return "h2 > a"
        return ""

    def get_article_time_class_by(self, url):
        if url == self.sina:
            return '.atc_tm.SG_txtc'
        elif url == self.bucou:
            return ".title-info > span"
        return ""

    def get_details_class_by(self, url):
        if url == self.sina:
            return '#sina_keyword_ad_area2'
        elif url == self.bucou:
            return ".art-item > a"
        return ""
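# The ItemLoader in JiepaiSpider.parse expects an HCJiePaiGroup item class that
# is not defined in this section. A minimal sketch consistent with the single
# field populated above ('jie_pai_title') would be:

# items.py (hypothetical sketch)
import scrapy


class HCJiePaiGroup(scrapy.Item):
    # Only the field actually populated by JiepaiSpider is shown here; the real
    # project may define more.
    jie_pai_title = scrapy.Field()

# Note that ItemLoader collects values into lists by default, so yielded items
# will carry jie_pai_title as a one-element list unless an output processor
# such as TakeFirst is configured on the field or the loader.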