def crawl(self):
    url = self.key
    html_stream = _get_url(url)
    soup = HandleContent.get_BScontext(html_stream)
    content = soup.find_all('div', 'Index_ShowDetail_Content')
    content = clear_label(content, root=url)
    comment = {}
    text = HandleContent.get_BScontext(content, text=True).text
    comment['content'] = clear_space(text)
    xp_title = "//div[@class='Index_ShowDetail_Title']/h1/text()"
    xp_putime = "//div[@class='Index_ShowDetail_Time']//text()"
    title = HandleContent.get_title(html_stream, xpath=xp_title)
    pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
    date = new_time()
    crawl_data = {
        'url': url,
        'province': u'全国',
        'title': title,
        'content': content,
        'pubtime': pubtime,
        'publisher': u'中国质量新闻网',
        'crtime_int': date.get('crtime_int'),
        'crtime': date.get('crtime'),
        'source': 'cqn',
        'source_type': u'中国质量新闻网',
        'type': u'文章',
        'comment': comment,
    }
    model = ZjldArticleModel(crawl_data)
    export(model)
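# The crawlers in this module lean on shared helpers (new_time, clear_space,
# extract_category, ...) defined elsewhere. As a hedged sketch only -- not the
# project's actual implementation -- new_time() appears to return a dict with
# a UTC crawl time and a microsecond epoch integer, judging from the
# expire = crtime_int / 1000000 + 604800 arithmetic in the weibo cardlist
# crawler further down.
import time
from datetime import datetime

def new_time():
    now = time.time()
    return {
        'crtime': datetime.utcfromtimestamp(now),  # UTC crawl timestamp
        'crtime_int': int(now * 1000000),          # microseconds since epoch
    }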
def crawl(self):
    url = self.key
    html_stream = _get_url(url)
    soup = HandleContent.get_BScontext(html_stream)
    content = soup.find_all('div', 'contaner_nr')
    content = clear_label(content, root=url)
    comment = {}
    text = HandleContent.get_BScontext(content, text=True).text
    comment['content'] = clear_space(text)
    xp_title = "//div[@class='contaner']/div[@class='contaner_bt']/text()"
    xp_putime = "//div[@class='contaner']/div[@class='contaner_ly']/text()"
    xp_author = "//div[@class='contaner']/div[@class='contaner_ly']/text()"
    title = HandleContent.get_title(html_stream, xpath=xp_title)
    pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
    author = HandleContent.get_author(html_stream, xpath=xp_author)
    date = new_time()
    crawl_data = {
        'url': url,
        'province': u'浙江',
        'title': title,
        'content': content,
        'pubtime': pubtime,
        'crtime_int': date.get('crtime_int'),
        'crtime': date.get('crtime'),
        'source': u'zjbts',
        'publisher': u'浙江质监局',
        'source_type': u'质监局',
        'author': author,
        'type': u'文章',
        'comment': comment,
    }
    model = ZjldArticleModel(crawl_data)
    export(model)
def crawl(self):
    key = str(self.key)
    data = self.data
    homepage = "http://api.weibo.cn/2/guest/statuses_extend?v_f=2&\
uid=1001979296366&lfid=230584&checktoken=70b77970ea4b0fb23e95549430204e44&\
c=android&wm=2468_1001&did=006997cad1bdce0960777445e8b8fed211d91950&\
luicode=10000228&from=1051295010&lang=zh_CN&lcardid=" + key + "&\
skin=default&i=5c7d1a1&id=" + key + "&fromlog=230584&s=9bad809a&\
gsid=4wkmda923WBtPcv1v5vMS15OcAo5U&ua=HUAWEI-HUAWEI%20T8950__weibo__5.1.2__android__android4.0.4&\
oldwm=2468_1001&is_recom=-1&uicode=10000002"
    homepage = clear_space(homepage)
    html_stream = _get_url(homepage)
    json_stream = change_to_json(str(html_stream.text))
    reposts_count = json_stream.get('reposts_count', 0)
    comments_count = json_stream.get('comments_count', 0)
    attitudes_count = json_stream.get('attitudes_count', 0)
    date = new_time()
    crawl_data = {
        'id': data.get('wid'),
        'reposts': reposts_count,
        'comments': comments_count,
        'likes': attitudes_count,
        'type': data.get('type'),
        'crtime_int': date.get('crtime_int'),
        'expire': data.get('expire')
    }
    model = WeiboHotModel(crawl_data)
    export(model)
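# Hedged sketch of the assumed clear_space() helper: the weibo crawlers build
# their API URLs with backslash-continued string literals and then pass the
# result through clear_space(), so it plausibly just strips all whitespace
# (which also works for the extracted Chinese article text above).
import re

def clear_space(text):
    return re.sub(r'\s+', '', text)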
def crawl(self):
    fid = self.key
    category_data = extract_category(self)
    count = 3   # page count; recomputed from wareCount on the first page
    pages = 1   # start from page 1
    while pages <= count:
        url = self.get_url(fid, pages)
        try:
            jsons = ProcessData.get_json_data(url)
            if pages == 1:
                # use float division so math.ceil actually rounds up
                count = math.ceil(int(jsons['wareCount']) / 100.0)
            lists = jsons['wareInfo']
        except Exception, e:
            self.logger.error(url)
            self.logger.error(e)
            print 'error ', url
            return
        if lists == []:
            return {}
        for i in range(len(lists)):
            wareId = lists[i]['wareId']
            try:
                f = lambda x: int(x[:-1]) / 100.00
                ecsumscores = float(f(lists[i]['good']))   # overall score
            except:
                ecsumscores = 0
            crawl_data = {
                'source_id': wareId,
                'source': self.data.get('source'),
                'summary': {},
                'title': lists[i]['wname'],
                'adword': lists[i]['adword'],
                'price': float(lists[i]['jdPrice']),
                'original_price': float(lists[i]['martPrice']),
                'score': ecsumscores
            }
            crawl_data.update(category_data)
            data = {
                'priorcategory': self.data['priorcategory'],
                'presentcategory': self.data['priorcategory']
            }
            model = EcBasicModel(crawl_data)
            export(model)
            data["uuid"] = model["id"]
            Scheduler.schedule(DetailCrawler.type, key=wareId, data=data)
            Scheduler.schedule(CommentCrawler.type, key=wareId, data=data)
        pages += 1
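# The page-count fix above uses math.ceil over float division. An integer
# alternative that avoids floats entirely (a sketch, not part of the original
# code):
def page_count(total, per_page=100):
    # ceil(total / per_page) for positive integers
    return (total + per_page - 1) // per_page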
def crawler_data(self, tree):
    category_data = extract_category(self)
    XPATH = self.search_list_xpath
    if len(tree.xpath(XPATH('list'))) == 0:
        XPATH = self.product_list_xpath
    dom = tree.xpath(XPATH('list'))
    for item in dom:
        crawl_data = {}
        craw = [
            'title', 'adword',
            'price', 'original_price',
            'source_id', 'score',
        ]
        for value in craw:
            crawl_data[value] = self.mackining(item.xpath(XPATH(value)))
        crawl_data['price'] = float(crawl_data['price'])
        try:
            f = lambda x: int(x[:-1]) / 100.00
            crawl_data['score'] = float(f(crawl_data['score']))
        except:
            crawl_data['score'] = 0
        crawl_data.update(category_data)
        crawl_data['source'] = 'yhd'
        model = EcBasicModel(crawl_data)
        export(model)
        data = {
            'priorcategory': self.data['priorcategory'],
            'presentcategory': self.data['priorcategory']
        }
        data["uuid"] = model["id"]
        Scheduler.schedule(DetailCrawler.type, key=str(self.key), data=data)
def crawl(self):
    wareId = self.key
    ids = self.data.get('uuid')
    category_data = extract_category(self)
    url = 'http://m.360buy.com/product/guige/%s.html' % (str(wareId))
    html_stream = ProcessData.get_web_data(url)
    tree = etree.HTML(html_stream.text)
    xpath = "//table[@class='Ptable']/tr/td/text()"
    dom = tree.xpath(xpath)
    specifications = {}
    temporary = ''
    i = 0
    for item in dom:
        item = item.strip()
        if item == '':
            continue
        if i % 2 == 0:
            # even cells are spec names, odd cells the matching values
            temporary = extract_title(item)
            specifications[temporary] = ''
        else:
            specifications[temporary] = extract_text(item)
        i += 1
    introduce = IntroduceCrawler.crawl(wareId, ids)
    ecbrands = introduce[u'品牌'] if introduce.get(u'品牌') else ''
    ecnames = introduce[u'商品名称'] if introduce.get(u'商品名称') else ''
    crawl_data = {
        'id': ids,
        'source': self.data.get('source'),
        'source_id': wareId,
        'summary': specifications,
        'introduce': introduce,
        'name': ecnames,
        'brand': ecbrands
    }
    crawl_data.update(category_data)
    model = EcDetailModel(crawl_data)
    export(model)
def crawl(self):
    homepage = self.key
    data = self.data
    html_stream = _get_url(homepage)
    soup = HandleContent.get_BScontext(html_stream)
    content = soup.find_all('div', class_=['rich_media_content',
                                           'rich_media_thumb_wrp'])
    xp_title = "//div[@class='rich_media_area_primary']/h2[@class='rich_media_title']/text()"
    xp_putime = "//div/em[@class='rich_media_meta rich_media_meta_text']/text()"
    xp_author = "//div/em[@class='rich_media_meta rich_media_meta_text'][2]/text()"
    xp_publisher = "//div/a[@id='post-user']/text()"
    title = HandleContent.get_title(html_stream, xpath=xp_title)
    pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
    author = HandleContent.get_author(html_stream, xpath=xp_author)
    publisher = HandleContent.get_author(html_stream, xpath=xp_publisher)
    comment = {}
    content = clear_label(content, root=homepage)
    text = HandleContent.get_BScontext(content, text=True).text
    comment['content'] = clear_space(text)
    date = new_time()
    crawl_data = {
        'province': self.data.get('province', ''),
        'city': self.data.get('city', ''),
        'district': self.data.get('district', ''),
        'url': homepage,
        'title': title,
        'content': content,
        'pubtime': pubtime,
        'crtime_int': date.get('crtime_int'),
        'crtime': date.get('crtime'),
        'source': 'sogou',
        'author': author,
        'publisher': self.data.get('publisher', publisher),
        'origin_source': u'微信公共账号',
        'type': u'微信',
        'comment': comment
    }
    if data.get('key'):
        crawl_data.update(data)
        model = SearchArticleModel(crawl_data)
    else:
        model = WeixinArticleModel(crawl_data)
    export(model)
def crawl(self):
    key = self.key
    data = self.data
    homepage = "http://card.weibo.com/article/aj/articleshow?cid=" + key
    url = "http://weibo.com/p/" + key
    html_stream = _get_url(homepage)
    json_stream = change_to_json(str(html_stream.text))
    html_stream = json_stream['data']['article']
    soup = HandleContent.get_BScontext(html_stream, text=True)
    title = soup.select('.title')[0].text
    pubtime = soup.select('.time')[0].text
    pubtime = HandleContent.strformat(str(pubtime))
    content = soup.select('.WBA_content')[0]
    content = clear_label(list(content))
    comment = {}
    text = HandleContent.get_BScontext(content, text=True).text
    comment['content'] = clear_space(text)
    publishers = soup.select('.S_link2')
    try:
        publisher = publishers[1].text if len(publishers) > 1 else publishers[0].text
    except:
        publisher = ''
    date = new_time()
    crawl_data = {
        'title': title,
        'pubtime': pubtime,
        'source': 'weibo',
        'publisher': publisher,
        'crtime_int': date.get('crtime_int'),
        'crtime': date.get('crtime'),
        'origin_source': u'微博搜索',
        'url': url,
        'key': data.get('key', ''),
        'type': u'元搜索',
        'source_type': data.get('source_type', ''),
        'content': content,
        'comment': comment,
    }
    model = SearchArticleModel(crawl_data)
    export(model)
def crawler_data(self, tree):
    ids = self.data.get('uuid')
    category_data = extract_category(self)
    introduce = tree.xpath(self.ware_xpath('introduce'))
    specifications = tree.xpath(self.ware_xpath('specifications'))
    introd = {}
    ecnorms = {}
    for item in introduce:
        item = item.strip()
        if item == '':
            continue
        item = item.split(u':', 1)
        try:
            introd[item[0]] = item[1]
        except:
            pass
    for item in specifications:
        label = item.xpath(self.ware_xpath('label'))
        names = []
        values = []
        for i in label:
            i = i.strip()
            if i == '':
                continue
            names.append(i)
        dd = item.xpath(self.ware_xpath('item'))
        for i in dd:
            i = i.strip()
            if i == '':
                continue
            values.append(i)
        # pair each spec name with its value
        ecnorms.update(map(lambda x, y: [x, y], names, values))
    crawl_data = {
        'id': ids,
        'source': self.data.get('source'),
        'source_id': str(self.key),
        'summary': ecnorms,
        'introduce': introd,
        'version': ecnorms.get(u'型号', ''),
        'brand': ecnorms.get(u'商品品牌', '')
    }
    crawl_data.update(category_data)
    model = EcDetailModel(crawl_data)
    export(model)
def crawl(self):
    wareId = self.key
    ecid = self.data['uuid']
    category_data = extract_category(self)
    pages = 1
    count = True
    while count:
        number = 0   # counts comments that failed to save, for dedup
        url = self.get_url(wareId, pages)
        html_stream = ProcessData.get_web_data(url)
        try:
            tree = etree.HTML(html_stream.text)
        except:
            print 'error: ', url
            break
        xpath = "//div[@id='comments-list']/div[@class='mc']"
        dom = tree.xpath(xpath)
        if dom == []:
            count = False
            continue
        for item in dom:
            datas = self.handle(item)
            comment_data = {
                'ecid': ecid,                      # commodity table foreign key
                'source_id': wareId,
                'source': self.data.get('source'),
                'comment_id': datas['commentid'],  # review id
                'score': datas['score'],           # commodity score
                'pubtime': datas['commenttime'],
                'buytime': datas['buytime'],
                'user_id': datas['url'],
                'useful': datas['useful'],
                'reply': datas['reply'],
                'content': datas['comment'],
                'province': datas['province']
            }
            comment_data.update(category_data)
            model = EcCommentModel(comment_data)
            is_saved = export(model)
            if not is_saved:
                number += 1
                if number > 10:
                    break
        pages += 1
def crawl(self):
    data = self.data
    url = self.key
    html_stream = _get_url(url)
    title = get_titles(html_stream)
    pubtime = local2utc(get_publish_times(html_stream))
    soup = Readability(html_stream.text, url)
    content = soup.content
    comment = {}
    try:
        text = HandleContent.get_BScontext(content, text=True).text
        comment['content'] = clear_space(text)
    except:
        content = ''
        comment['content'] = ''   # keep the key so the check below cannot fail
    date = new_time()
    crawl_data = {
        'url': url,
        'province': data.get('province'),
        'city': data.get('city', u''),
        'district': data.get('district', u''),
        'title': title,
        'content': content,
        'pubtime': pubtime,
        'crtime_int': date.get('crtime_int'),
        'crtime': date.get('crtime'),
        'source': u'yuqing',
        'publisher': data.get('publisher', u''),
        'source_type': data.get('type'),
        'type': u'文章',
        'comment': comment,
    }
    if comment['content']:
        model = ZjldArticleModel(crawl_data)
        export(model)
def crawlHtml(self, html):
    ids = self.data['uuid']
    source = "amazon"
    source_id = self.key
    category_data = extract_category(self)
    summary = {}
    ecbrands = ""
    ecnames = ""
    introduce = {}
    # walk the productDetailsTable rows
    prodDetails = html.xpath(
        "//table[@id='productDetailsTable']//tr/td[@class='bucket']/div[@class='content']/ul/li")
    for proditem in prodDetails:
        k = proditem.xpath("b/text()")[0].strip()[:-1]
        if k == "用户评分":              # customer rating
            summary[k] = proditem.xpath(
                "span[@class='crAvgStars']/span/a/span/span/text()")[0].strip()[2:-1]
        elif k == "亚马逊热销商品排名":  # best sellers rank: skip
            pass
        else:
            summary[k] = proditem.xpath("text()")[0].strip()
    crawl_data = {
        'id': ids,
        'source': source,
        'source_id': source_id,
        'summary': summary,
        'introduce': introduce,
        'name': ecnames,
        'brand': ecbrands
    }
    crawl_data.update(category_data)
    model = EcDetailModel(crawl_data)
    export(model)
def crawl(self):
    url = self.key
    data = self.data
    html_stream = _get_url(url)
    soup = HandleContent.get_BScontext(html_stream)
    content = soup.find_all('div', id='right-text_d')
    content = clear_label(content, root=url)
    comment = {}
    text = HandleContent.get_BScontext(content, text=True).text
    comment['content'] = clear_space(text)
    xp_title = "//div[@id='right-title_d']//text()"
    xp_putime = "//div[@class='article']/p[@class='info']/span/text()"
    title = HandleContent.get_title(html_stream, xpath=xp_title)
    pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
    date = new_time()
    crawl_data = {
        'url': url,
        'province': u'广东',
        'city': u'佛山',
        'title': title,
        'content': content,
        'pubtime': data.get('pubtime', pubtime),
        'crtime_int': date.get('crtime_int'),
        'crtime': date.get('crtime'),
        'source': u'fsjsjd',
        'publisher': u'广东佛山质监局',
        'source_type': u'质监局',
        'type': u'文章',
        'comment': comment,
    }
    model = ZjldArticleModel(crawl_data)
    export(model)
def crawl(self):
    url = self.key
    html_stream = _get_url(url)
    soup = HandleContent.get_BScontext(html_stream)
    content = soup.find_all('td', id='td_news_content')
    content = clear_label(content, root=url)
    comment = {}
    text = HandleContent.get_BScontext(content, text=True).text
    comment['content'] = clear_space(text)
    xp_title = "//tr/td[@class='content-title']/div/text()"
    xp_putime = "//tr/td[@class='bottom-line-gray']/text()"
    title = HandleContent.get_title(html_stream, xpath=xp_title)
    pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
    date = new_time()
    crawl_data = {
        'url': url,
        'province': u'广东',
        'city': u'广州',
        'title': title,
        'content': content,
        'pubtime': pubtime,
        'crtime_int': date.get('crtime_int'),
        'crtime': date.get('crtime'),
        'source': u'gzq',
        'publisher': u'广东广州质监局',
        'source_type': u'质监局',
        'type': u'文章',
        'comment': comment,
    }
    model = ZjldArticleModel(crawl_data)
    export(model)
def crawl(self):
    url = self.key
    html_stream = _get_url(url)
    soup = HandleContent.get_BScontext(html_stream)
    content = soup.find_all('div', ['article-box', 'files'])
    content = clear_label(content, root=url)
    comment = {}
    text = HandleContent.get_BScontext(content, text=True).text
    comment['content'] = clear_space(text)
    xp_title = "//div[@class='article']/h2/text()|//h3/text()"
    xp_putime = "//div[@class='article']/p[@class='info']/span/text()"
    title = HandleContent.get_title(html_stream, xpath=xp_title)
    pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
    date = new_time()
    crawl_data = {
        'url': url,
        'province': u'湖北',
        'title': title,
        'content': content,
        'pubtime': pubtime,
        'crtime_int': date.get('crtime_int'),
        'crtime': date.get('crtime'),
        'source': u'hbzljd',
        'publisher': u'湖北质监局',
        'source_type': u'质监局',
        'type': u'文章',
        'comment': comment,
    }
    model = ZjldArticleModel(crawl_data)
    export(model)
def crawl(self):
    CatID = self.key
    category_data = extract_category(self)
    page = 1
    page_count = 1
    while page <= page_count:
        jsons = self.get_response(CatID, page)
        if page == 1:
            page_count = self.get_page_count(jsons)
        for goods in jsons['ProductListItems']:
            source_id = goods["Code"]
            task_data = self.has_goods(source_id)
            if task_data:
                crawl_data = {
                    "id": task_data["uuid"],
                    "title": goods["Title"],
                    "price": goods["Price"]["CurrentPrice"],
                    "source_id": source_id,
                    "source": self.data["source"],
                    "status": task_data["status"],
                    "brand": task_data["brand"],
                    "version": task_data["version"],
                    "series": task_data["series"],
                    "comment": {
                        "is_Bbc": task_data["isBbc"],
                    },
                }
                crawl_data.update(category_data)
                crawl_data.update(get_ctime())
                model = EcBasicModel(crawl_data)
                export(model)
            else:
                detail_data = {
                    "priorcategory": self.data["priorcategory"],
                }
                Scheduler.schedule(DetailCrawler.type,
                                   key=source_id,
                                   data=detail_data)
        page += 1
def crawl(self):
    ecid = self.data['uuid']
    goodsNo = str(self.key)
    category_data = extract_category(self)
    totalpage = int(self.get_page(goodsNo))
    if totalpage == 0:
        print "get_page fail"
        return {}
    for i in range(totalpage):
        url = self.get_url(goodsNo, i)
        json = ProcessData.get_json_data(url)
        try:
            appraise = json['appraiseArray']
        except Exception, e:
            self.logger.error(url)
            self.logger.error(e)
            print "get appraise fail"
            continue   # skip this page instead of using an unbound name
        for item in appraise:
            comment_data = {
                'ecid': ecid,                    # commodity table foreign key
                'source_id': goodsNo,
                'source': self.data.get('source'),
                'comment_id': item['id'],        # review id
                'score': item['appraiseGrade'],  # commodity score
                'pubtime': ProcessData.str_datetime(item['appraiseTime']),
                'user_id': item['appraiseName'],
                'content': item['summary']
            }
            comment_data.update(category_data)
            model = EcCommentModel(comment_data)
            export(model)
def crawl(self):
    url = self.key
    html_stream = _get_url(url)
    soup = HandleContent.get_BScontext(html_stream)
    content = soup.find_all('td', 'conzt')
    content = clear_label(content, root=url)
    comment = {}
    text = HandleContent.get_BScontext(content, text=True).text
    comment['content'] = clear_space(text)
    xp_title = "//tr/td/p[@class='sub_title']/preceding-sibling::h1/text()"
    xp_putime = "//table[@class='normal']/tbody/tr[3]/td/text()"
    xp_author = "//table[@class='normal']/tbody/tr[3]/td/text()"
    title = HandleContent.get_title(html_stream, xpath=xp_title)
    pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
    author = HandleContent.get_author(html_stream, xpath=xp_author,
                                      xp_text=u'来源:')
    date = new_time()
    crawl_data = {
        'url': url,
        'province': u'山东',
        'title': title,
        'content': content,
        'pubtime': pubtime,
        'crtime_int': date.get('crtime_int'),
        'crtime': date.get('crtime'),
        'source': u'sdqts',
        'publisher': u'山东质监局',
        'source_type': u'质监局',
        'author': author,
        'type': u'文章',
        'comment': comment,
    }
    model = ZjldArticleModel(crawl_data)
    export(model)
def crawl(self):
    data = self.data
    url = self.key
    html_stream = _get_url(url)
    title = get_titles(html_stream)
    pubtime = local2utc(get_publish_times(html_stream))
    soup = Readability(html_stream.text, url)
    content = soup.content
    comment = {}
    try:
        text = HandleContent.get_BScontext(content, text=True).text
        comment['content'] = clear_space(text)
    except:
        content = ''
        comment['content'] = ''   # keep the key so the check below cannot fail
    date = new_time()
    crawl_data = {
        'url': url,
        'province': data.get('province'),
        'city': data.get('city', u''),
        'district': data.get('district', u''),
        'title': title,
        'content': content,
        'pubtime': pubtime,
        'crtime_int': date.get('crtime_int'),
        'crtime': date.get('crtime'),
        'source': self.data["source"],
        'publisher': data.get('publisher', u''),
        'source_type': data.get('type'),
        'comment': comment,
    }
    if comment['content']:
        model = ZjldArticleModel(crawl_data)
        export(model)
def crawl(self):
    url = self.key
    html_stream = _get_url(url)
    soup = HandleContent.get_BScontext(html_stream)
    content = soup.find_all('div', 'Custom_UnionStyle')
    content = clear_label(content, root=url)
    comment = {}
    text = HandleContent.get_BScontext(content, text=True).text
    comment['content'] = clear_space(text)
    xp_title = "//div[@id='cTitle']/text()"
    xp_putime = "//tr/td[@align='center']/text()"
    title = HandleContent.get_title(html_stream, xpath=xp_title)
    pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
    date = new_time()
    crawl_data = {
        'url': url,
        'province': u'广东',
        'title': title,
        'content': content,
        'pubtime': pubtime,
        'crtime_int': date.get('crtime_int'),
        'crtime': date.get('crtime'),
        'source': u'gdqts',
        'publisher': u'广东质监局',
        'source_type': u'质监局',
        'type': u'文章',
        'comment': comment,
    }
    model = ZjldArticleModel(crawl_data)
    export(model)
def crawl(self):
    url = self.key
    html_stream = _get_url(url)
    soup = HandleContent.get_BScontext(html_stream)
    content = soup.find_all('div', 'TRS_Editor')
    content = clear_label(content, root=url)
    comment = {}
    text = HandleContent.get_BScontext(content, text=True).text
    comment['content'] = clear_space(text)
    xp_title = "//tr/td[@align='center']/h1/text()"
    xp_putime = "//div[@class='xj2']/text()"
    title = HandleContent.get_title(html_stream, xpath=xp_title)
    pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
    date = new_time()
    crawl_data = {
        'url': url,
        'province': u'全国',
        'title': title,
        'content': content,
        'pubtime': pubtime,
        'crtime_int': date.get('crtime_int'),
        'crtime': date.get('crtime'),
        'source': u'aqsiq',
        'publisher': u'国家质量监督检验检疫总局',
        'source_type': u'国家质量监督检验检疫总局',
        'type': u'文章',
        'comment': comment,
    }
    model = ZjldArticleModel(crawl_data)
    export(model)
def crawl(self):
    category_data = extract_category(self)
    page_size = self.get_page_size(self.key)
    page = 1
    while page <= page_size:
        json_data = ProcessData.get_json_data(self.get_url(self.key, page))
        reviews = json_data.get("commodityReviews", [])
        if not reviews:
            return
        for review in reviews:
            crawl_data = {
                "comment_id": self.get_comment_id(review),
                "content": review["content"],
                "tags": self.get_tags(review),
                "show_pic": self.get_show_pic(review),
                "pubtime": self.get_pubtime(review),
                "score": float(review["qualityStar"]),
                "useful": int(review["usefulCnt"]),
                "reply": 1 if review.get("replyInfo", {}) else 0,
                "user_name": review.get("userInfo", {}).get("nickName", ""),
                "eid": self.data["uuid"],
                "brand": self.data["brand"],
                "version": self.data["version"],
                "series": self.data["series"],
                "source": self.data["source"],
                "source_id": self.key,
                "status": self.data["status"],
                "comment": {
                    "is_Bbc": self.data["is_Bbc"],
                },
            }
            crawl_data.update(category_data)
            crawl_data.update(get_ctime())
            model = EcCommentModel(crawl_data)
            export(model)
        page += 1
def crawl(self):
    url = self.key
    html_stream = _get_url(url)
    soup = HandleContent.get_BScontext(html_stream)
    content = soup.find_all('span', 'ny')
    content = clear_label(content, root=url)
    comment = {}
    text = HandleContent.get_BScontext(content, text=True).text
    comment['content'] = clear_space(text)
    xp_title = "//tr/td[@class='dhz']/span/text()"
    xp_putime = "//td/table/tbody/tr/td[@align='center']/span/text()"
    title = HandleContent.get_title(html_stream, xpath=xp_title)
    pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
    date = new_time()
    crawl_data = {
        'url': url,
        'province': u'浙江',
        'city': u'杭州',
        'title': title,
        'content': content,
        'pubtime': pubtime,
        'crtime_int': date.get('crtime_int'),
        'crtime': date.get('crtime'),
        'source': u'hzqts',
        'publisher': u'浙江杭州质监局',
        'source_type': u'质监局',
        'type': u'文章',
        'comment': comment,
    }
    model = ZjldArticleModel(crawl_data)
    export(model)
def crawl(self):
    url = self.key
    html_stream = _get_url(url)
    soup = HandleContent.get_BScontext(html_stream)
    content = soup.find_all('li', 'show_con')
    content = clear_label(content, root=url)
    comment = {}
    text = HandleContent.get_BScontext(content, text=True).text
    comment['content'] = clear_space(text)
    xp_title = "//ul/li[@class='show_title']/text()"
    xp_putime = "//ul/li[@class='show_date']/text()"
    xp_author = "//ul/li[@class='show_date']/text()"
    title = HandleContent.get_title(html_stream, xpath=xp_title)
    pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
    author = HandleContent.get_author(html_stream, xpath=xp_author)
    date = new_time()
    crawl_data = {
        'url': url,
        'province': u'江西',
        'title': title,
        'content': content,
        'pubtime': pubtime,
        'crtime_int': date.get('crtime_int'),
        'crtime': date.get('crtime'),
        'source': u'jxzj',
        'publisher': u'江西质监局',
        'source_type': u'质监局',
        'author': author,
        'type': u'文章',
        'comment': comment,
    }
    model = ZjldArticleModel(crawl_data)
    export(model)
def crawl(self):
    ids = self.data['uuid']
    url = self.key   # product url
    print "url:" + url
    source = "amazon"
    category_data = extract_category(self)
    # fetch and parse the product page
    html_stream = ProcessData.get_web_data(url)
    html = etree.HTML(html_stream.text)
    # product detail block
    prodDetails = html.xpath("//div[@id='prodDetails']")
    if len(prodDetails) == 0:
        # fall back to the template page that only carries basic information
        detailed = getDetailedGoods(
            type=self.type,
            key=self.key,
            data=self.data
        ).crawlHtml(html)
    else:
        goodinfo = prodDetails[0].xpath(
            "div[@class='wrapper CNlocale']//table/tbody/tr")
        summary = {}
        ecbrands = ""
        ecnames = ""
        introduce = {}
        for info in goodinfo:
            label = info.xpath("td[@class='label']")
            if label != []:
                if label[0].text == "用户评分":        # customer rating
                    summary[label[0].text] = info.xpath(
                        "td[@class='value']")[0].xpath(
                        "//div[@id='averageCustomerReviewRating']")[0].text.strip()[2:-1]
                elif label[0].text.strip() == "品牌":  # brand
                    ecbrands = info.xpath("td[@class='value']")[0].text.strip()
                else:
                    summary[label[0].text] = info.xpath(
                        "td[@class='value']")[0].text.strip()
        # store into cassandra
        crawl_data = {
            'id': ids,
            'source': source,
            'source_id': url,
            'summary': summary,
            'introduce': introduce,
            'name': ecnames,
            'brand': ecbrands
        }
        crawl_data.update(category_data)
        model = EcDetailModel(crawl_data)
        export(model)
def crawl(self):
    keyid = self.key
    source = "amazon"
    score = 0   # product score
    category_data = extract_category(self)
    priorcategory = self.data["priorcategory"]
    presentcategory = self.data["presentcategory"]
    count = getPageSize(self.get_url(keyid, 1))   # total page count
    page = 1   # start from page 1
    content = "//div[@id='mainResults']/div"
    while page <= count:
        url = self.get_url(keyid, page)
        html_stream = ProcessData.get_web_data(url)
        html = etree.HTML(html_stream.text)
        # one element per product in the result grid
        itempath = html.xpath(content)
        if itempath != None and itempath != []:
            for item in itempath:
                title = item.xpath("h3[@class='newaps']/a")
                # price cell
                pric = item.xpath(
                    "ul[@class='rsltGridList grey']/li[@class='newp']/div")
                if pric == None:
                    pric = item.xpath("ul/li[@class='newp']/div")
                # product score
                socreitmem = item.xpath(
                    "ul[@class='rsltGridList grey']/li[@class='rvw']/span/span/a")
                if socreitmem != []:
                    scoreinfo = socreitmem[0].get('alt')
                    if scoreinfo != None:
                        score = float(scoreinfo[2:-1])
                for t in title:
                    # title and url of the product
                    original_price = u"¥0.00"
                    if pric == None or pric == []:
                        price = u"¥0.00"
                    else:
                        try:
                            price = pric[0].xpath("a/span")[0].text
                        except:
                            print url
                            print "price parse error", pric
                            price = u"¥0.00"   # keep price bound on failure
                    if pric != None and pric != [] and pric[0].xpath("a/del") != []:
                        # there is an original (pre-discount) price
                        original_price = pric[0].xpath("a/del")[0].text
                    else:
                        # no original price: fall back to the current price
                        original_price = price
                    data = {
                        'priorcategory': priorcategory,
                        'presentcategory': presentcategory
                    }
                    if price != None and price.strip() != '' and pric != [] and pric[0] != '':
                        try:
                            float(price.strip()[1:].replace(",", ""))
                        except:
                            self.logger.error("bad price:" + price)
                            self.logger.error("bad price:" + original_price)
                        crawl_data = {
                            'source_id': t.get("href"),
                            'source': source,
                            'summary': {},
                            'title': t.xpath("span")[0].text,
                            'adword': '',
                            'price': float(price.strip()[1:].replace(",", "")),
                            'original_price': float(original_price.strip()[1:].replace(",", "")),
                            'score': 0   # note: the score parsed above is not stored
                        }
                        crawl_data.update(category_data)
                        model = EcBasicModel(crawl_data)
                        export(model)
                        data["uuid"] = model["id"]
                        Scheduler.schedule(DetailCrawler.type,
                                           key=t.get("href"), data=data)
                        Scheduler.schedule(CommentCrawler.type,
                                           key=t.get("href"), data=data)
        page += 1
def crawl(self):
    goodid = self.data['uuid']   # commodity id
    source = "amazon"
    url = self.key
    source_id = url
    category_data = extract_category(self)
    count = getCommSize(self.get_url(url, 1))   # total page count
    page = 1   # start from page 1
    while page <= count:
        newurl = self.get_url(url, page)
        print newurl
        # fetch and parse the review page
        html_stream = ProcessData.get_web_data(newurl)
        html = etree.HTML(html_stream.text)
        # review blocks
        comment = html.xpath("//table[@id='productReviews']//tr/td/div")
        for comitem in comment:
            # review text
            item = comitem.xpath("div[@class='reviewText']//text()")
            # score
            scoreitem = comitem.xpath(
                "div[@style='margin-bottom:0.5em;']/span/span/span")
            # publish time
            pubtimeitem = comitem.xpath(
                "div[@style='margin-bottom:0.5em;']/span[@style='vertical-align:middle;']/nobr")
            # reviewer profile link
            user_iditem = comitem.xpath(
                "div[@style='margin-bottom:0.5em;']/div/div[@style='float:left;']/a")
            # "x/y found this useful" text
            usefulitem = comitem.xpath("div[@style='margin-bottom:0.5em;']")
            oninfo = ""
            for i in item:
                oninfo += i
            # useful votes out of total votes
            if usefulitem != None and usefulitem != []:
                tmpuseful = usefulitem[0].text.strip()
            else:
                tmpuseful = "0"
            if tmpuseful == "":
                tmpuseful = "0"
            elif tmpuseful != "0":
                tmpuseful = tmpuseful[0:tmpuseful.index("/")]
            # publish date, e.g. "2014年1月1日" -> datetime
            pubtim = datetime.strptime("1971-01-1", '%Y-%m-%d')
            if pubtimeitem != None and pubtimeitem != []:
                pubtim = datetime.strptime(
                    pubtimeitem[0].text.replace("年", "-").replace(
                        "月", "-").replace("日", ""), '%Y-%m-%d')
            sorce = "0.0"
            if scoreitem != None and scoreitem != []:
                sorce = scoreitem[0].text[2:-1].strip()
            userid = ''
            if user_iditem != None and user_iditem != []:
                userid = str(user_iditem[0].get("href"))
            comment_data = {
                "ecid": goodid,
                "source_id": source_id,
                "source": source,
                "comment_id": "",
                "pubtime": pubtim,
                "buytime": pubtim,
                "score": float(sorce),
                "user_id": userid,
                "useful": int(tmpuseful),
                'reply': 0,
                "content": oninfo.strip()
            }
            # store with both original and current category info
            comment_data.update(category_data)
            model = EcCommentModel(comment_data)
            export(model)
        page += 1
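# Hedged sketch: the useful-vote text handled above looks like "3/5 ..." and
# the slice up to '/' takes the useful count, falling back to "0". A regex
# version of the same parsing, shown only for clarity:
import re

def parse_useful(text):
    m = re.match(r'\s*(\d+)\s*/\s*\d+', text or '')
    return int(m.group(1)) if m else 0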
def crawl(self):
    key = str(self.key)
    data = self.data
    homepage = "http://api.weibo.cn/2/cardlist?\
gsid=_2A254IZdKDeTxGeRM7lUR8CnKyT2IHXVZdq2CrDV6PUJbrdAKLUf7kWptw4_No8F1OjQMCarBH4hZxZcrwA..&\
wm=3333_2001&i=27bd163&b=1&from=1051293010&c=iphone&v_p=18&skin=default&\
v_f=1&s=d2672a12&lang=zh_CN&ua=iPhone7,2__weibo__5.1.2__iphone__os8.1.3&\
uicode=10000198&featurecode=10000085&luicode=10000003&count=20&\
extparam=100103type=1&cuid=2257007621&sid=t_wap_ios&category=1&\
pos=1_-1&wm=3333_2001&containerid=" + key + "_-_WEIBO_SECOND_PROFILE_WEIBO&\
fid=" + key + "_-_WEIBO_SECOND_PROFILE_WEIBO&lfid=100103type%3D1&\
sourcetype=page&lcardid=user&page=1"
    homepage = clear_space(homepage)
    html_stream = _get_url(homepage)
    json_stream = change_to_json(str(html_stream.text))
    cards = json_stream['cards']
    for item in cards:
        scheme = re.search(r'=(.+?)$', item.get('scheme', ''))
        scheme = scheme.group(1) if scheme else ''
        url = "http://weibo.com/%s/%s?type=comment" % (data.get('id', ''),
                                                       scheme)
        item = item.get('mblog', {})
        item = item.get('retweeted_status', item)
        text = item.get('text', '')
        # prefer a 【...】 headline, then a #...# topic, then a text prefix
        title = re.search(ur'【(.+?)】', text)
        title = title.group(1) if title else ''
        if not title:
            title = re.search(ur'#(.+?)#', text)
            title = title.group(1) if title else text[0:20] + '...'
        subtitle = re.search(ur'#(.+?)#', text)
        subtitle = subtitle.group(1) if subtitle else ''
        pubtime = item.get('created_at', '')
        pubtime = HandleContent.strformat(str(pubtime))
        reposts_count = item.get('reposts_count', '')
        comments_count = item.get('comments_count', '')
        attitudes_count = item.get('attitudes_count', '')
        mid = item.get('mid', '')
        author = item.get('user', {}).get('name', '')
        comment = {
            'reposts_count': str(reposts_count),
            'attitudes_count': str(attitudes_count),
            'comments_count': str(comments_count)
        }
        subtitles = [subtitle]
        date = new_time()
        crawl_data = {
            'province': self.data.get('province', ''),
            'city': self.data.get('city', ''),
            'district': self.data.get('district', ''),
            'url': url,
            'title': title,
            'subtitle': subtitles,
            'content': text,
            'pubtime': pubtime,
            'crtime_int': date.get('crtime_int'),
            'crtime': date.get('crtime'),
            'source': 'weibo',
            'publisher': self.data.get('publisher', ''),
            'author': author,
            'origin_source': u'新浪微博',
            'type': u'微博',
            'comment': comment
        }
        model = WeiboArticleModel(crawl_data)
        if export(model):
            # re-crawl the counters every 6 hours until expiry (one week)
            againt_data = {
                'wid': model['id'],
                'type': u'微博',
                'expire': date.get('crtime_int') / 1000000 + 604800,
            }
            Scheduler.schedule(AgainCrawler.type,
                               key=mid,
                               data=againt_data,
                               reset=True,
                               interval=21600)
def crawl(self):
    goodsNo = str(self.key)
    category_data = extract_category(self)
    url = self.get_detail_url(goodsNo)
    html = ProcessData.get_web_data(url)
    tree = etree.HTML(html.text)
    xpath = {
        "introduce": "//div[@class='guigecanshu']/text()",
        "summary": "//ul[@id='prd_data']/li[2]/ul/li/span/text()",
    }
    summary = self.parse_summary(tree, xpath["summary"])
    introduce = self.parse_intr(tree, xpath["introduce"])
    version = get_version(summary, introduce)
    series = get_series(summary, introduce)
    brand = get_brand(summary, introduce)
    json = ProcessData.get_json_data(self.get_basic_url(goodsNo))
    isBbc_str = json["isBbc"]
    isBbc = "Y" if isBbc_str == "Y" or isBbc_str == "y" else "N"
    status_str = json["onSale"]
    status = 0 if status_str == "N" or status_str == "n" else 1
    skulist = json['skuList']
    for sku in skulist:
        ecname = sku['skuName']
        ecimglist = sku['skuSourceImgUrl']
        detail_data = {
            'source': self.data.get('source'),
            'source_id': goodsNo,
            'summary': summary,
            'introduce': introduce,
            'name': ecname,
            'images': ecimglist,
            'status': status,
            'brand': brand,
            'version': version,
            'series': series,
            'comment': {
                'is_Bbc': isBbc,
                'skuID': self.data['skuID'],
            },
        }
        detail_data.update(category_data)
        detail_data.update(get_ctime())
        model = EcDetailModel(detail_data)
        export(model)
        comment_data = {
            'uuid': model["id"],
            'brand': brand,
            'version': version,
            'series': series,
            'is_Bbc': isBbc,
            'status': status,
            'priorcategory': self.data['priorcategory'],
            'skuID': self.data['skuID'],
        }
        Scheduler.schedule(CommentCrawler.type,
                           key=goodsNo,
                           data=comment_data)
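# Hedged sketch of the assumed get_ctime() helper used by the crawlers above
# that call crawl_data.update(get_ctime()): by symmetry with new_time() it
# plausibly returns the same crawl-time fields, ready to be merged into a
# record via dict.update().
def get_ctime():
    date = new_time()   # reuses the new_time() sketch near the top
    return {
        'crtime': date['crtime'],
        'crtime_int': date['crtime_int'],
    }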