def parse(self, response):
    item_list = response.xpath("//div[@class='post_item']")
    for item in item_list:
        blogItem = BaiduItem()
        # blogItem['title'] = item.xpath('./div[2]/h3[1]/a/text()')[0].extract().strip()
        # blogItem['titleLink'] = item.xpath('./div[2]/h3[1]/a/@href')[0].extract()
        # blogItem['suggestNum'] = item.xpath('./div[1]/div[1]/span/text()')[0].extract().strip()
        # content = item.xpath('./div[2]/p[1]/text()')
        # for c in content:
        #     if c.extract().strip() != '':
        #         blogItem['content'] = c.extract().strip()
        #     else:
        #         blogItem['content'] = ''
        # blogItem['author'] = item.xpath('./div[2]/div[1]/a/text()')[0].extract().strip()
        # blogItem['publishDate'] = item.xpath('./div[2]/div[1]/text()')[1].extract().strip()
        yield blogItem

    # handle next page
    next_text = response.xpath("//div[@class='pager']/a[last()]/text()")[0].extract().strip()
    print(next_text)
    if 'Next' in next_text:
        nextUrlStr = response.xpath("//div[@class='pager']/a[last()]/@href")[0].extract().strip()
        # the page number is the last character of the pager href
        nextNum = int(nextUrlStr[-1])
        if nextNum < 5:
            nextUrl = 'https://www.cnblogs.com/#p/{0}'.format(nextNum)
            print(nextUrl)
            yield scrapy.Request(nextUrl, callback=self.parse)
def parse_rel(self, html): """解析相关搜索""" item = BaiduItem() for row in html.find_all('a'): item['url'] = row['href'] item['title'] = row.get_text().strip() yield item
def parse(self, response):
    print(response.text)
    # Drive a real browser
    self.driver.get("http://www.baidu.com")
    # Locate Baidu's search box by id
    input_element = self.driver.find_element_by_id('kw')
    # Type the search keyword
    input_element.send_keys("爬虫")
    # Locate Baidu's submit button by id
    click_button = self.driver.find_element_by_id('su')
    # Simulate the click
    click_button.click()
    # Wait up to 5 seconds
    self.driver.implicitly_wait(5)
    # Collect the result blocks by class name
    result_list = self.driver.find_elements_by_class_name('result')
    for result in result_list:
        item = BaiduItem()
        print("Result title: " + result.text)
        print("===========================================================================")
        item["name"] = result.text
        yield item
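# NOTE: find_element_by_id / find_elements_by_class_name were deprecated and later removed
# in Selenium 4, so the method above only runs against Selenium 3.x. A minimal sketch of
# the same flow with the Selenium 4 locator API; `driver` and `keyword` are assumed
# arguments, not part of the original spider.
from selenium.webdriver.common.by import By

def baidu_search_results(driver, keyword):
    """Search Baidu for `keyword` and return the text of the result blocks."""
    driver.get("http://www.baidu.com")
    driver.find_element(By.ID, "kw").send_keys(keyword)       # search box
    driver.find_element(By.ID, "su").click()                  # submit button
    driver.implicitly_wait(5)                                 # wait up to 5 s
    return [r.text for r in driver.find_elements(By.CLASS_NAME, "result")]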
def parse_lyric(self, response):
    hxs = HtmlXPathSelector(response)
    item = BaiduItem()
    # lyric
    content = hxs.select('//*[@id="lyricCont"]/text()')
    lyric = ''
    for word in content:
        lyric += word.extract()
    item['lyric'] = lyric
    # hot: how popular the song is
    hot = hxs.select('//span[@class="num"]/text()').extract()[0]
    item['hot'] = hot
    # singer
    singer = hxs.select('//span[@class="author_list"]/a/text()').extract()[0]
    patt = r'\s'
    item['singer'] = re.sub(patt, '', singer)
    # songName
    songName = hxs.select('//ul[@class="path-list clearfix"]/li[4]/text()').extract()[0]
    item['songName'] = songName
    return item
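# NOTE: HtmlXPathSelector and .select() are long-deprecated Scrapy APIs; current Scrapy
# exposes the same queries through response.xpath() with .get()/.getall(). A minimal
# sketch of the lyric extraction above rewritten that way (same XPaths and BaiduItem
# fields; assumes `import re` as in the original).
def parse_lyric_modern(self, response):
    item = BaiduItem()
    # lyric: join all text nodes of the lyric container
    item['lyric'] = ''.join(response.xpath('//*[@id="lyricCont"]/text()').getall())
    # hot: popularity counter
    item['hot'] = response.xpath('//span[@class="num"]/text()').get()
    # singer: strip all whitespace from the first author link
    singer = response.xpath('//span[@class="author_list"]/a/text()').get(default='')
    item['singer'] = re.sub(r'\s', '', singer)
    # songName: fourth breadcrumb entry
    item['songName'] = response.xpath('//ul[@class="path-list clearfix"]/li[4]/text()').get()
    return item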
def parse_item(self, response):
    sel = Selector(response)
    items = []
    item = BaiduItem()
    title = sel.xpath('//span[@class="ask-title "]/text()').extract()
    for i in title:
        items.append(i)
    item['TitleName'] = items
    return item
def parse(self, response):
    filename = response.url.split(".")[-1] + '.txt'
    for h2 in response.xpath('//article[@class="entry-common clearfix"]'):
        item = BaiduItem()
        item['title'] = str.strip(h2.xpath('header/h2/a/text()').extract()[0])
        item['url'] = h2.xpath('header/h2/a/@href').extract()[0]
        # item['desc'] = str.strip(h2.xpath('header/div[2]/span/text()').extract()[0])
        yield item
def parse(self, response):
    jsonresp = json.loads(response.body_as_unicode())
    for o in jsonresp['data']:
        if 'thumbURL' in o:
            l = ItemLoader(item=BaiduItem(), response=response)
            # image_url
            l.add_value('image_urls', o['thumbURL'])
            # NOTE: `tag` is not defined in this method; it is assumed to be supplied
            # elsewhere on the spider (see the sketch below).
            l.add_value('image_type', tag)
            yield l.load_item()
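# NOTE: a hedged sketch of one common way to supply the `tag` used above: attach it to
# the request in start_requests() and read it back from response.meta. The 'tag' meta
# key, the `tags` attribute, and build_search_url() are hypothetical, not taken from the
# original spider; parse() would then read `tag = response.meta['tag']`.
def start_requests(self):
    for tag in getattr(self, 'tags', []):
        # the image-search API URL for this tag is spider-specific and elided here
        url = self.build_search_url(tag)  # hypothetical helper
        yield scrapy.Request(url, meta={'tag': tag}, callback=self.parse)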
def get_tab2(self, response):
    items = []
    for rank in response.xpath('./div[@class="grayborder"]/table[@class="list-table"]/tbody/tr[7]/following-sibling::tr'):
        item = BaiduItem()
        item['rank_category'] = 'zongyi'
        item['rank_name'] = rank.xpath('./td[@class="keyword"]/a[@class="list-title"]/text()').extract()
        item['rank_index'] = rank.xpath('./td[@class="last"]/span/text()').extract()
        items.append(item)
    return items
def parse_week(self, browser):
    # TODO: define variables to collect the results
    lst = []
    for actived in ['搜索指数', '资讯指数']:
        # import ipdb; ipdb.set_trace()
        # act = browser.find_element_by_xpath('//*[text()[contains(.,"搜索指数")]]').text
        if actived == '搜索指数':
            browser.find_element_by_xpath('//*[text()[contains(.,"搜索指数")]]').click()
        else:
            browser.find_element_by_xpath('//*[text()[contains(.,"资讯指数")]]').click()
        try:
            items = browser.find_elements_by_css_selector('.tab-content .list > .list-item')
            # import ipdb; ipdb.set_trace()
            data = browser.find_elements_by_css_selector('.date-text')[0].text
            one = browser.find_elements_by_css_selector('.actived')[0].text
            for item in items:
                # rank
                rank = item.find_element_by_css_selector('.content .rank').text.strip()
                # name
                name = item.find_element_by_css_selector('.content .name').text.strip()
                # index bar
                line = item.find_element_by_css_selector('.content .line-light')
                # index bar - actual width
                real_value = float(line.value_of_css_property('width').replace('px', ''))
                # index bar - element carrying the maximum width
                line_max = line.find_element_by_xpath('..')
                # maximum width
                max_value = float(line_max.value_of_css_property('width').replace('px', ''))
                # index value
                index = round(100 * real_value / max_value, 2)
                # index string
                index_str = str(index).rstrip('0').rstrip('.') + '%'
                cc = BaiduItem()
                cc['one'] = one
                cc['data'] = data
                cc['actived'] = actived
                cc['rank'] = rank
                cc['name'] = name
                cc['index_str'] = index_str
                print(index_str)
                print('-' * 30)
                # yield cc
                lst.append(cc)
            # import ipdb; ipdb.set_trace()
        except (NoSuchElementException, StaleElementReferenceException):
            import ipdb; ipdb.set_trace()
            pass
    # TODO: return the results
    return lst
def parse(self, response):
    sel = Selector(response)
    items = []
    item = BaiduItem()
    title = sel.xpath('//div[@class="question-title"]/a/text()').extract()
    for i in title:
        items.append(i)
    item['TitleName'] = items
    # print(item['TitleName'])
    return item
def parse_week(self, browser, nav_name, someweek, exponent):
    '''Search index.'''
    # Define variables to collect the results
    # import ipdb; ipdb.set_trace()
    result = []
    try:
        # Every row on the current page
        items = browser.find_elements_by_css_selector('.tab-content .list > .list-item')
        # Iterate over each celebrity's row
        for item in items:
            baidu = BaiduItem()
            # rank
            baidu['rank'] = item.find_element_by_css_selector('.content .rank').text.strip()
            # name
            baidu['name'] = item.find_element_by_css_selector('.content .name').text.strip()
            if nav_name == '周上升榜':
                # trend
                # import ipdb; ipdb.set_trace()
                trend = item.find_element_by_xpath('//div[@class="trend"]/span').text
                baidu['trend'] = trend
            else:
                # trend
                trend = item.find_element_by_xpath('//span[@class="trend-icon"]/i').get_attribute('class')
                baidu['trend'] = re.sub('icon trend-', '', trend)
            # index bar
            line = item.find_element_by_css_selector('.content .line-light')
            # index bar - actual width
            real_value = float(line.value_of_css_property('width').replace('px', ''))
            # index bar - element carrying the maximum width
            line_max = line.find_element_by_xpath('..')
            # maximum width
            max_value = float(line_max.value_of_css_property('width').replace('px', ''))
            # index value
            index = round(100 * real_value / max_value, 2)
            # index string
            index_str = str(index).rstrip('0').rstrip('.') + '%'
            # index percentage
            baidu['index_str'] = index_str
            # ranking-list name
            baidu['nav_name'] = nav_name
            # search index or news index
            baidu['exponent'] = exponent
            # date (week)
            baidu['week'] = someweek
            result.append(baidu)
            print(nav_name, '--->', exponent, '--->', someweek, '--->', index_str)
            print('-' * 30)
        # import ipdb; ipdb.set_trace()
    except (NoSuchElementException, StaleElementReferenceException):
        import ipdb; ipdb.set_trace()
    # Return the results
    return result
def parse(self, response):
    item = BaiduItem()
    soup = BeautifulSoup(response.text, 'html.parser')
    title = soup.select('.lemmaWgt-lemmaTitle-title h1')[0].text
    subhead = soup.select('.lemmaWgt-lemmaTitle-title h2')
    if len(subhead) != 0:
        # print(subhead[0].text)
        title = title + subhead[0].text
    item['title'] = title

    info_list = soup.select('.lemma-summary div')
    info = ''
    for temp in info_list:
        # Grab the text content
        info += temp.text
        # If there are hyperlinks, follow and crawl them too
        a_list = temp.select('a')
        if len(a_list) != 0:
            for a in a_list:
                if a.has_attr('href'):
                    yield Request(self.base_url + a['href'], headers=self.headers)
    item['info'] = info

    properties_list = soup.select('.basicInfo-block dt')
    properties = ''
    for pro in properties_list:
        properties += '###' + pro.text.strip().replace('\n', '')
        # If there are hyperlinks, follow and crawl them too
        a_list = pro.select('a')
        if len(a_list) != 0:
            for a in a_list:
                if a.has_attr('href'):
                    yield Request(self.base_url + a['href'], headers=self.headers)
    item['properties'] = properties

    values_list = soup.select('.basicInfo-block dd')
    values = ''
    for val in values_list:
        values += '###' + val.text.strip().replace('\n', '')
        # If there are hyperlinks, follow and crawl them too
        a_list = val.select('a')
        if len(a_list) != 0:
            for a in a_list:
                if a.has_attr('href'):
                    yield Request(self.base_url + a['href'], headers=self.headers)
    item['values'] = values

    if len(soup.select('.summary-pic img')) != 0:
        item['img'] = soup.select('.summary-pic img')[0]['src']
    print(item['title'])
    yield item
def parse(self, response):
    # Build a Selector from the response for data extraction
    sel = Selector(response)
    items = []
    # Create the BaiduItem object
    item = BaiduItem()
    title = sel.xpath('//div[@class="question-title"]/a/text()').extract()
    for i in title:
        items.append(i)
    item['TitleName'] = items
    return item
def DownlordImg(self, response):
    img = BaiduItem()
    content = response.xpath("//*[@id='big-pic']/p/a/img")
    if content:
        img['image_urls'] = content.xpath(".//@src").extract()
        yield img
    links = response.xpath("/html/body/div[3]/div[2]/div[9]/ul/li")
    for j in links:
        if j.xpath(".//a/text()").extract_first() == '下一页':
            next_link = 'https://www.aitaotu.com/' + j.xpath(".//a/@href").extract_first()
            yield scrapy.Request(next_link, callback=self.DownlordImg)
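# NOTE: the `image_urls` field filled above is the field name Scrapy's built-in
# ImagesPipeline expects. A minimal, assumed configuration sketch (not from the original
# project; requires Pillow) that would make those URLs actually download, e.g. as a
# spider class attribute:
custom_settings = {
    'ITEM_PIPELINES': {'scrapy.pipelines.images.ImagesPipeline': 1},
    'IMAGES_STORE': 'downloaded_images',  # hypothetical local directory for the files
}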
def parse(self, response):
    num = 0
    company = response.meta["company"]
    list_content = response.css("#content_left").xpath("./div")
    for i in list_content:
        url = i.css(".t").xpath("./a/@href").get()
        title = i.css(".t").xpath("./a").xpath("string(.)").get()
        summary = i.css(".c-abstract").xpath("string(.)").get()
        item = BaiduItem()
        if url and title and summary:
            num += 1
            item["url"] = url
            item["title"] = title
            item["summary"] = summary
            yield item
    if num < 10:
        # fewer than 10 usable results: emit an error item for this company
        item1 = BaiduItem()
        item1["error"] = company
        yield item1
def get_tieba(self, response):
    '''Crawl every tieba (forum) listed under this category.'''
    tiebas = response.xpath('//*[@id="ba_list"]/div')
    for tieba in tiebas:
        item = BaiduItem()
        # Initialise the item and populate its fields
        try:
            try:
                item['title'] = response.meta['title']
            except KeyError:
                item['title'] = 'title invalid!'
            try:
                item['tag'] = response.meta['tag']
            except:
                item['tag'] = 'tag invalid!'
            item['name'] = tieba.xpath('a/div/p[@class="ba_name"]/text()').extract()[0]
            item['img'] = tieba.xpath('a/img/@src').extract()[0]
            item['user_num'] = tieba.xpath(
                'a/div/p[@class="ba_num clearfix"]/span[@class="ba_m_num"]/text()').extract()[0]
            item['topic_num'] = tieba.xpath(
                'a/div/p[@class="ba_num clearfix"]/span[@class="ba_p_num"]/text()').extract()[0]
            try:
                item['description'] = tieba.xpath('a/div/p[@class="ba_desc"]/text()').extract()[0]
            except KeyError:
                item['description'] = 'not find description!'
            except IndexError:
                item['description'] = 'not find description!'
            item['url'] = 'http://tieba.baidu.com' + tieba.xpath('a/@href').extract()[0]
        except Exception as e:
            print('------------------------')
            print(e)
        yield item

    next_url = response.xpath('//*[@class="next"]/@href').extract()
    if next_url:
        # Check whether a next page exists; if so, follow it
        url = 'http://tieba.baidu.com/' + next_url[0]
        # print(url)
        yield Request(url, callback=self.get_tieba,
                      meta={'title': response.meta['title'], 'tag': response.meta['tag']})
def parse(self, response):
    url = response.url
    title = response.xpath("//*[@class='lemmaWgt-lemmaTitle-title']/h1/text()").extract()[0]
    new_urls = response.xpath("//*[starts-with(@href,'/item/')]/@href").extract()
    summary = response.xpath("//*[@class='lemma-summary']/div")
    summary = summary.xpath('string(.)').extract()[0]
    item = BaiduItem(url=url, title=title, summary=summary)
    yield item
    for url in new_urls:
        self.x += 1
        if self.x > 100:
            break
        self.url = url
        yield Request(response.urljoin(url=url), callback=self.parse)
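# NOTE: the manual self.x counter above caps the crawl at 100 follow-up links. A hedged
# alternative sketch using Scrapy's built-in settings instead of spider state (the values
# are illustrative; custom_settings is a standard Spider class attribute):
custom_settings = {
    'DEPTH_LIMIT': 2,              # stop following /item/ links beyond this depth
    'CLOSESPIDER_ITEMCOUNT': 100,  # CloseSpider extension stops after ~100 items
}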
def parse(self, response):
    div_list = response.xpath('//div[@class="t_con cleafix"]/div[2]/div')
    for div in div_list:
        item = BaiduItem()
        item["author"] = div.xpath('./div[2]/span/@title').extract_first()
        item["title"] = div.xpath("./div/a/text()").extract_first()
        item["href"] = div.xpath("./div/a/@href").extract_first()
        item["time"] = div.xpath("./div[2]/span[2]/text()").extract_first()
        if item["author"] and item["title"] and item["href"]:
            item["href"] = "http://tieba.baidu.com" + item['href']
            yield scrapy.Request(item["href"], callback=self.parse_detail, meta=item)

    next_url = response.xpath('//a[@class="next pagination-item "]/@href').extract_first()
    if next_url is not None:
        next_url = response.urljoin(next_url)
        yield scrapy.Request(next_url, callback=self.parse)
def parse(self, response):
    sel = Selector(response)
    sel_0 = sel.xpath('/html/body/div[@id="body"]')
    sel_1 = sel_0.xpath('.//ul/li')
    base_url = get_base_url(response)
    no = 0
    for site in sel_1:
        no += 1
        title = site.xpath('a/text()').extract()
        if (len(title) != 0 and len(''.join(title)) > 4):
            link = site.xpath('a/@href').extract()
            url_m = (str(link))[3:-2]
            url_new = urljoin(base_url, url_m)
            item = BaiduItem(title=title, link=link, manufacturer='baiduguonei')
            yield scrapy.Request(url_new, callback=self.parse_body, meta={'item': item})
def parse_search(self, response):
    import hashlib
    params = response.meta.get('params')
    keywords = response.meta.get('keywords')
    timestamp = json.loads(response.body_as_unicode())['timestamp']
    json_res = json.loads(response.body_as_unicode())['data']
    for news in json_res.get('list'):
        item = BaiduLoader(item=BaiduItem())
        # Only keep news published within the last 7 days
        if timestamp - news.get('publicTime') > 7 * 24 * 3600:
            return None
        item.add_value('post_time', time.strftime("%Y-%m-%dT%H:%M:%S+08:00",
                                                  time.localtime(float(news.get('publicTime')))))
        item.add_value('search_word', params.get('word'))
        item.add_value('author', news.get('author'))
        item.add_value('title', news.get('title'))
        item.add_value('link', news.get('url'))
        item.add_value('hash_id', hashlib.md5(news.get('url').encode('utf8')).hexdigest())
        item.add_value('cover_urls', news.get('imgUrl'))  # cover image url
        intro = item.load_item()
        new_url = 'https://news.baidu.com/news?tn=bdapiinstantfulltext&src=' + news.get('url')
        if keywords is None:
            print('error')
            print(news.get('url'))
        yield Request(url=new_url,
                      meta={'intro': intro,
                            'enter_url': news.get('url'),
                            'keywords': response.meta.get('keywords').split(','),
                            },
                      callback=self.parse)

    total = json_res.get('total')
    is_continue = len(json_res.get('list')) == params['rn'] and params['pn'] < total
    if is_continue:
        params['pn'] = params['pn'] + params['rn']
        url = "http://m.news.baidu.com/news?tn=bdapinewsearch&" + urllib.parse.urlencode(params)
        yield scrapy.Request(url=url, meta={'params': params, 'keywords': keywords},
                             callback=self.parse_search)
def parse(self, response):
    try:
        print("Entering try..........................")
        item = BaiduItem()
        item['comment_num'] = re.findall(
            r'<span class="red" style="margin-right:3px">(.+?)</span>', response.text)
        item['read_num'] = ''
        item['up_num'] = ''
        item['zhuanfa'] = ''
        item['url'] = response.meta.get('url')
        item["MonitorName"] = response.meta.get('MonitorName')
        if item['comment_num'] == []:
            item['comment_num'] = ''
            print("---------------------------------")
        else:
            item['comment_num'] = item['comment_num'][0]
            print(item["url"], item["MonitorName"], item['comment_num'], "====================")
    except:
        print("Entering except.......................")
    yield item
def parse_phone(self, response):
    item = BaiduItem()
    sort = 0
    soup = bs(response.body, 'lxml')
    urlarr = self.get_url_query(response.url)
    search = urlarr['wd'] if 'wd' in urlarr.keys() else urlarr['word']
    if soup.find('div', class_='rw-list') is not None:
        rel = self.parse_rel(soup.find('div', class_='rw-list'))
    else:
        rel = None
    if soup.find('div', class_='hint-toprq-tips') is not None:
        rec = self.parse_rel(soup.find('div', class_='hint-toprq-tips'))
    else:
        rec = None
    if soup.find('div', id='page-controller').find('a', class_='new-nextpage') is not None:
        next_page_url = soup.find('div', id='page-controller').find('a', class_='new-nextpage')['href']
    else:
        next_page_url = soup.find('div', id='page-controller').find('a', class_='new-nextpage-only')['href']
    if not re.search(r'ms=1', next_page_url):
        next_page_url = next_page_url + '&ms=1'
    connects = soup.find('div', id='results').find_all('div', class_='c-result')
    for index, connect in enumerate(connects):
        if (self.page_num == 1):
            url = 'https://m.baidu.com/su?pre=1&p=3&json=1&wd=%s&sugmode=2&_=1493098255100' % search
            recommendList = self.get_recommend(url)
            recLen = len(recommendList)
            if recLen - 1 >= index:
                item['recom_search'] = recommendList[index]
            else:
                item['recom_search'] = ''
        else:
            item['recom_search'] = ''
        sort += 1
        tag_a = connect.find('div', class_='c-container').find('a')
        item['title'] = tag_a.get_text()
        if rel is not None:
            try:
                data = next(rel)
                item['rel_url'] = response.url + data['url']
                item['rel_tit'] = data['title']
            except StopIteration:
                item['rel_url'] = ''
                item['rel_tit'] = ''
        else:
            item['rel_url'] = ''
            item['rel_tit'] = ''
        if rec is not None:
            try:
                data = next(rec)
                item['rec_url'] = self.parse_url(response.url + data['url'])
                item['rec_tit'] = data['title']
            except StopIteration:
                item['rec_url'] = ''
                item['rec_tit'] = ''
        else:
            item['rec_url'] = ''
            item['rec_tit'] = ''
        if connect['data-log'] is not None:
            data_log = json.loads(connect['data-log'].replace("'", '"'))
            item['sort'] = data_log['order']
            if data_log['mu'] != '':
                item['url'] = data_log['mu']
            else:
                item['url'] = self.parse_url(tag_a['href'])
        else:
            item['url'] = self.parse_url(tag_a['href'])
        if connect.find('span', class_='c-gray') is not None:
            item['time'] = connect.find('span', class_='c-gray').get_text()
        else:
            item['time'] = ''
        item['page'] = '第%s页' % self.page_num
        # self.write_log(tag_a.get_text())
        yield item

    if next_page_url != '' and self.page_num < self.total_page:
        self.page_num += 1
        yield scrapy.Request(url=next_page_url, callback=self.parse_phone)
def parse_pc(self, response): """提取相关数据""" sort = 0 item = BaiduItem() soup = bs(response.body, 'lxml') urlarr = self.get_url_query(response.url) search = urlarr['wd'] if urlarr['wd'] else urlarr['word'] connects = soup.find_all(attrs={"class": "c-container"}) if soup.find('span', 'hint_toprq_tips_items') is not None: rec = self.parse_rec(soup.find('span', 'hint_toprq_tips_items')) else: rec = None if soup.find('div', id='rs') is not None and sort == 0: rel = self.parse_rel(soup.find('div', id='rs')) else: rel = None for index, connect in enumerate(connects): if (self.page_num == 1): url = 'https://sp0.baidu.com/5a1Fazu8AA54nxGko9WTAnF6hhy/su?wd=%s&cb=&json=1' % search recommendList = self.get_recommend(url) recLen = len(recommendList) if recLen - 1 >= index: item['recom_search'] = recommendList[index] else: item['recom_search'] = '' else: item['recom_search'] = '' sort += 1 if connect.find('h3', 't') is None: item['title'] = connect.find('a').get_text() item['url'] = connect.find('a')['href'] else: item['title'] = connect.find('h3').get_text().strip() item['url'] = self.parse_url(connect.find('h3').a['href']) item['page'] = '第%s页' % self.page_num item['sort'] = sort if connect.find('span', ' newTimeFactor_before_abs m') is not None: item['time'] = connect.find( 'span', ' newTimeFactor_before_abs m').get_text().replace( u'\xa0-\xa0', '') else: item['time'] = '' if rec is not None: try: data = next(rec) item['rec_url'] = self.parse_url(response.url + data['url']) item['rec_tit'] = data['title'] except StopIteration: item['rec_url'] = '' item['rec_tit'] = '' else: item['rec_url'] = '' item['rec_tit'] = '' if rel is not None: try: data = next(rel) item['rel_url'] = response.url + data['url'] item['rel_tit'] = data['title'] except StopIteration: item['rel_url'] = '' item['rel_tit'] = '' else: item['rel_url'] = '' item['rel_tit'] = '' yield item if self.page_num < self.total_page: self.page_num += 1 next_url = response.urljoin( soup.find('div', id='page').find('strong').next_sibling['href']) yield scrapy.Request(next_url, callback=self.parse_pc)
def parse_week(self, browser, ban, index_t):
    # TODO: define variables to collect the results
    data = []
    try:
        # Which week this page shows
        zhou = browser.find_element_by_css_selector('.date-text').text.strip()
        items = browser.find_elements_by_css_selector('.tab-content .list > .list-item')
        for item in items:
            # rank
            rank = item.find_element_by_css_selector('.content .rank').text.strip()
            # name
            name = item.find_element_by_css_selector('.content .name').text.strip()
            # Check whether the current tab is the weekly-rise list ('周上升榜')
            if ban.text == '周上升榜':
                # number of positions risen
                index_str = item.find_element_by_css_selector('.value.upvalue').text.strip()
            else:
                # index bar
                line = item.find_element_by_css_selector('.content .line-light')
                # index bar - actual width
                real_value = float(line.value_of_css_property('width').replace('px', ''))
                # index bar - element carrying the maximum width
                line_max = line.find_element_by_xpath('..')
                # maximum width
                max_value = float(line_max.value_of_css_property('width').replace('px', ''))
                # index value
                index = round(100 * real_value / max_value, 2)
                # index string
                index_str = str(index).rstrip('0').rstrip('.') + '%'
            trend = None
            if len(item.find_elements_by_css_selector('.icon.trend-fair')) > 0:
                trend = 'fair'
            elif len(item.find_elements_by_css_selector('.icon.trend-down')) > 0:
                trend = 'down'
            else:
                trend = 'up'
            print('rank: {0}, name: {1}, {2}: {3}, {4}: {5}'.format(
                rank, name, ban.text, zhou, index_t.text, index_str))
            print('-' * 30)
            db_item = BaiduItem()
            db_item['rank'] = rank
            db_item['name'] = name
            db_item['index'] = index_str
            db_item['trend'] = trend
            db_item['tab_item'] = ban.text
            db_item['index_type'] = index_t.text
            db_item['zhou'] = zhou
            data.append(db_item)
    except (NoSuchElementException, StaleElementReferenceException):
        import ipdb
        ipdb.set_trace()
    # TODO: return the results
    return data
def parse(self, response, typeid, category): print "---> begin execute parse method" item = BaiduItem() items = [] # print response.body print "---> tags: " + typeid data = response.body # print data print "---> category: " + category s = json.loads(data) # print s MyData = s["data"] # print "Data++++++++" # print data num = len(MyData) for i in xrange(0, num - 1): # print MyData[i] print '\n--------------------------------------------------------------------------%d---------------------------------------------------\n' % ( i) print '---> now is : ' + str( time.strftime('%Y-%m-%d %H:%M %S', time.localtime( time.time()))) # print MyData[i]["hoverURL"] width = MyData[i]["width"] height = MyData[i]["height"] size = str(width) + "*" + str(height) print '---> title: ' + MyData[i]["fromPageTitleEnc"] #测试下载使用gif地址 # image = str(MyData[i]["middleURL"]) # print "image------->" + image simid = MyData[i]["os"] Hash = simid.split(",")[0] # now = datetime.utcnow().replace(microsecond=0).isoformat(' ') now = time.strftime('%Y%m%d%H%M%S', time.localtime(time.time())) item['imgsize'] = size item['imgid'] = Hash item['category'] = category item['tag'] = typeid item['updateTime'] = now item['scrawl_time'] = now item['title'] = MyData[i]["fromPageTitleEnc"] item['fromURLHost'] = MyData[i]["fromURLHost"] item['author'] = MyData[i]["fromURLHost"] ObjURL = MyData[i]["objURL"] fromURL = MyData[i]["fromURL"] e = { "w": "a", "k": "b", "v": "c", "1": "d", "j": "e", "u": "f", "2": "g", "i": "h", "t": "i", "3": "j", "h": "k", "s": "l", "4": "m", "g": "n", "5": "o", "r": "p", "q": "q", "6": "r", "f": "s", "p": "t", "7": "u", "e": "v", "o": "w", "8": "1", "d": "2", "n": "3", "9": "4", "c": "5", "m": "6", "0": "7", "b": "8", "l": "9", "a": "0", "_z2C$q": ":", "_z&e3B": ".", "AzdH3F": "/" } #破译obj编码 使用字符替换 没有使用正则(正则学的太烂) TODO:正则 print ObjURL ekeys = e.keys() # print ekeys strLength = len(ekeys) URL_O = ObjURL URL_O = URL_O.replace("_z2C$q", ":") URL_O = URL_O.replace("_z&e3B", ".") URL_O = URL_O.replace("AzdH3F", "/") URLLength = len(URL_O) print URL_O URL_F = fromURL URL_F = URL_F.replace("_z2C$q", ":") URL_F = URL_F.replace("_z&e3B", ".") URL_F = URL_F.replace("AzdH3F", "/") URL_FLength = len(URL_F) s_f = "" #解析fromURL TODO for j in xrange(0, URL_FLength): URLKey = URL_F[j] url = ord(URLKey) if (url >= ord('a') and url <= ord('w')) or (url >= ord('0') and url <= ord('9')): str_url = e[str(URLKey)] s_f = s_f + str_url else: s_f = s_f + URLKey print s_f s = "" for j in xrange(0, URLLength): URLKey = URL_O[j] url = ord(URLKey) if (url >= ord('a') and url <= ord('w')) or (url >= ord('0') and url <= ord('9')): str_url = e[str(URLKey)] s = s + str_url else: s = s + URLKey hash_url = hashlib.md5(s).hexdigest()[8:-8] item['linkmd5id'] = hash_url print "---> hash_url: " + hash_url print "---> url_orgin: " + s #图片下载本地地址 # path = "/Users/chenxingwang/Desktop/"+ category +"/"+hash_url[:2]+"/"+hash_url uploadUrl = "/" + category + "/" + hash_url[:2] + "/" + hash_url path = "../output/gif" + uploadUrl print "---> folder for saving image: " + path isExists = os.path.exists(path) if not isExists: os.makedirs(path) origin_filename = "origin.gif" static_filename = "static.jpg" thumb_filename = "thumb.gif" detail_filename = "detail.gif" url_orgin = s url_thumb = MyData[i]["thumbURL"] + ".gif" url_hover = MyData[i]["hoverURL"] url_middle = MyData[i]["middleURL"] + ".gif" item['fromURL'] = s_f item['objURL'] = uploadUrl + "/" + origin_filename item['hoverURL'] = uploadUrl + "/" + static_filename item['thumbURL'] = uploadUrl + "/" + 
thumb_filename item['middleURL'] = uploadUrl + "/" + detail_filename item['filesize'] = 0 item['frame'] = 0 yield item sleep(1) # items.append(item) # STEP 1. 下载缩略图 print "---> STEP 1. begin download thumb image: " + url_thumb req = urllib2.Request( url_thumb, headers={ "Upgrade-Insecure-Requests": "1", "X-DevTools-Emulate-Network-Conditions-Client-Id": "7A55439C-E6CF-420D-B294-7635B17E648B", "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", "Accept-Encoding": "gzip, deflate, sdch", "Accept-Language": "zh-CN,zh;q=0.8" }) try: # print "thumb image to download ----------->" + url_thumb img_path = path + "/" + thumb_filename imgData = urllib2.urlopen(req).read() f = file(img_path, "wb") f.write(imgData) f.close() # generate static.jpg Image.open(img_path).convert('RGB').save(path + "/" + static_filename) print('------> thumb image downloaded. saved to ' + img_path) except IOError as err: print("------> IO error:{0}".format(err)) except: print '------> download thumb image occured some error' print("------> Unexpected error:", sys.exc_info()) file_name = path + "/" + origin_filename # STEP 2. 下载原图 print "---> STEP 2. begin download origin image: " + url_orgin try: # 2 / 0 urllib.urlretrieve(url_orgin, '%s' % (file_name)) print('------> origin image downloaded. saved to ' + file_name) except IOError as err: print("------> IO error:{0}".format(err)) except: print("------> Unexpected error:", sys.exc_info()) print '------> download origin image occurred some error, skip this item. !!!!!!!!!!!!!!!!!!!' item[ 'customer_exceptions'] = 'download origin image occurred some error' yield item # STEP 3. 生成详情图:detail.gif print "---> STEP 3. 
begin generate detail image" try: # extract fileinfo of origin.gif im = Image.open(file_name) # in KB origin_size = os.stat(file_name).st_size / 1024 item['filesize'] = str(origin_size) origin_frame_count = 1 try: while 1: im.seek(im.tell() + 1) origin_frame_count = origin_frame_count + 1 except EOFError: pass # end of sequence item['frame'] = str(origin_frame_count) print "------> origin image info : size-" + str( origin_size) + "KB, frames-" + str(origin_frame_count) # generate detail.gif origin_size_threshold = 1.5 if origin_size > origin_size_threshold * 1024: print "------> origin image is bigger than " + str( origin_size_threshold) + "M" im = Image.open(file_name) tmp_path = path + "/temp/" if not os.path.exists(tmp_path): os.makedirs(tmp_path) print '------> origin file info: ' + str(im.info) if 'duration' in im.info.keys(): origin_duration = im.info['duration'] / 1000.00 else: origin_duration = 0 temp_filenames = [] # index = 1 reader = imageio.get_reader(file_name) for i, tmp_im in enumerate(reader): imageio.imwrite("%sframe%d.png" % (tmp_path, i), tmp_im) temp_filenames.append("%sframe%d.png" % (tmp_path, i)) # for frame in ImageSequence.Iterator(im): # frame.save("%sframe%d.png" % (tmp_path, index)) # temp_filenames.append("%sframe%d.png" % (tmp_path, index)) # index += 1 # print temp_filenames with imageio.get_writer( path + "/" + detail_filename, mode='I', duration=origin_duration) as writer: for temp_filename in temp_filenames: tmp_im = Image.open(temp_filename) tmp_im.thumbnail((230, 230)) tmp_im.save(temp_filename) image = imageio.imread(temp_filename) writer.append_data(image) shutil.rmtree(tmp_path) print '------> end: generate detail.gif' else: print "------> copy origin.gif as detail.gif" shutil.copyfile(file_name, path + "/" + detail_filename) # a = 2 / 0 except IOError as err: print("------> IO error:{0}".format(err)) except: print("------> Unexpected error:", sys.exc_info()) print '------> generate detail.gif occured some error' print "---> finished data parse : " + hash_url yield item sleep(1)
def parse(self, response, typeid, category): print "---> begin execute parse method" item = BaiduItem() items = [] # print response.body print "---> tags: " + typeid data = response.body # print data print "---> category: " + category s = json.loads(data) # print s MyData = s["data"] # print "Data++++++++" # print data num = len(MyData) for i in xrange(0, num - 1): # print MyData[i] print '\n--------------------------------------------------------------------------%d---------------------------------------------------\n' % ( i) print '---> now is : ' + str( time.strftime('%Y-%m-%d %H:%M %S', time.localtime( time.time()))) # print MyData[i]["hoverURL"] width = MyData[i]["width"] height = MyData[i]["height"] size = str(width) + "*" + str(height) print '---> title: ' + MyData[i]["fromPageTitleEnc"] #测试下载使用gif地址 # image = str(MyData[i]["middleURL"]) # print "image------->" + image simid = MyData[i]["os"] Hash = simid.split(",")[0] # now = datetime.utcnow().replace(microsecond=0).isoformat(' ') now = time.strftime('%Y%m%d%H%M%S', time.localtime(time.time())) item['imgsize'] = size item['imgid'] = Hash item['category'] = category item['tag'] = typeid item['updateTime'] = now item['scrawl_time'] = now item['title'] = MyData[i]["fromPageTitleEnc"] item['fromURLHost'] = MyData[i]["fromURLHost"] item['author'] = MyData[i]["fromURLHost"] ObjURL = MyData[i]["objURL"] fromURL = MyData[i]["fromURL"] e = { "w": "a", "k": "b", "v": "c", "1": "d", "j": "e", "u": "f", "2": "g", "i": "h", "t": "i", "3": "j", "h": "k", "s": "l", "4": "m", "g": "n", "5": "o", "r": "p", "q": "q", "6": "r", "f": "s", "p": "t", "7": "u", "e": "v", "o": "w", "8": "1", "d": "2", "n": "3", "9": "4", "c": "5", "m": "6", "0": "7", "b": "8", "l": "9", "a": "0", "_z2C$q": ":", "_z&e3B": ".", "AzdH3F": "/" } #破译obj编码 使用字符替换 没有使用正则(正则学的太烂) TODO:正则 print ObjURL ekeys = e.keys() # print ekeys strLength = len(ekeys) URL_O = ObjURL URL_O = URL_O.replace("_z2C$q", ":") URL_O = URL_O.replace("_z&e3B", ".") URL_O = URL_O.replace("AzdH3F", "/") URLLength = len(URL_O) print URL_O URL_F = fromURL URL_F = URL_F.replace("_z2C$q", ":") URL_F = URL_F.replace("_z&e3B", ".") URL_F = URL_F.replace("AzdH3F", "/") URL_FLength = len(URL_F) s_f = "" #解析fromURL TODO for j in xrange(0, URL_FLength): URLKey = URL_F[j] url = ord(URLKey) if (url >= ord('a') and url <= ord('w')) or (url >= ord('0') and url <= ord('9')): str_url = e[str(URLKey)] s_f = s_f + str_url else: s_f = s_f + URLKey print s_f s = "" for j in xrange(0, URLLength): URLKey = URL_O[j] url = ord(URLKey) if (url >= ord('a') and url <= ord('w')) or (url >= ord('0') and url <= ord('9')): str_url = e[str(URLKey)] s = s + str_url else: s = s + URLKey hash_url = hashlib.md5(s).hexdigest()[8:-8] item['linkmd5id'] = hash_url print "---> hash_url: " + hash_url print "---> url_orgin: " + s #图片下载本地地址 # path = "/Users/chenxingwang/Desktop/"+ category +"/"+hash_url[:2]+"/"+hash_url uploadUrl = "/" + category + "/" + hash_url[:2] + "/" + hash_url path = "../output/gif" + uploadUrl print "---> folder for saving image: " + path # isExists=os.path.exists(path) # if not isExists: # os.makedirs(path) origin_filename = "origin.gif" static_filename = "static.jpg" thumb_filename = "thumb.gif" detail_filename = "detail.gif" url_orgin = s url_thumb = MyData[i]["thumbURL"] + ".gif" url_hover = MyData[i]["hoverURL"] url_middle = MyData[i]["middleURL"] + ".gif" item['fromURL'] = s_f item['objURL'] = uploadUrl + "/" + origin_filename item['hoverURL'] = uploadUrl + "/" + static_filename item['thumbURL'] = uploadUrl + "/" + 
thumb_filename item['middleURL'] = uploadUrl + "/" + detail_filename item['filesize'] = 0 item['frame'] = 0 item['source_thumb_url'] = url_thumb item['source_original_url'] = s sleep(1) yield item
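# NOTE: both parse() methods above inline the same character-substitution table to
# de-obfuscate Baidu's objURL / fromURL fields. A standalone sketch of that decode step,
# equivalent to the inline loops (the helper name is ours, not from the original code):
def decode_baidu_url(obfuscated):
    """Decode an obfuscated Baidu image URL by straight character substitution."""
    table = {
        "w": "a", "k": "b", "v": "c", "1": "d", "j": "e", "u": "f", "2": "g",
        "i": "h", "t": "i", "3": "j", "h": "k", "s": "l", "4": "m", "g": "n",
        "5": "o", "r": "p", "q": "q", "6": "r", "f": "s", "p": "t", "7": "u",
        "e": "v", "o": "w", "8": "1", "d": "2", "n": "3", "9": "4", "c": "5",
        "m": "6", "0": "7", "b": "8", "l": "9", "a": "0",
    }
    # multi-character tokens first, then per-character substitution (x/y/z pass through)
    for token, repl in (("_z2C$q", ":"), ("_z&e3B", "."), ("AzdH3F", "/")):
        obfuscated = obfuscated.replace(token, repl)
    return ''.join(table.get(ch, ch) for ch in obfuscated)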