Esempio n. 1
0
    def dealmoon(self,source):#OK
        """Crawl one dealmoon category and store its new deals in MongoDB.

        Starting at ``source['href']`` the listing pages are walked by
        following the "next" link of each page.  Every ``div.mlist`` entry is
        parsed, its article page is fetched for the mall name and the
        purchase link, and the result is inserted into the ``dealmoon_item``
        collection of ``first_mongodb[self.database]``.

        The crawl of the category stops when an entry that is already stored
        is met (incremental update), when a listing page cannot be loaded,
        or when the page offers no further "next" link.

        This is Python 2 code (``str.decode``, ``urllib.urlopen``).

        Args:
            source: mapping providing ``'href'`` (first listing URL),
                ``'item_name'`` (label, only printed) and ``'item_id'``
                (stored as the category id of every item).

        Returns:
            None.
        """
        print("dealmoon")

        item_collection_name = "dealmoon_item"
        mongodbItem = first_mongodb[self.database][item_collection_name]

        clr = Color()   # helper used for coloured console output

        source_url = source['href']
        source_name = source['item_name']
        clr.print_red_text(source_url)
        clr.print_red_text(source_name)

        while 1:
            clr.print_red_text(source_url)

            selector = loadHtmlSelector(source_url, headers=None)
            if selector is None:
                return

            for entry in selector.findAll("div", {"class": "mlist"}):
                item = Item()
                item.categoryid = source['item_id']

                # Item id
                item.itemid = int(entry.attrs['data-id'])
                print(item.itemid)

                # Incremental update: the first already-stored item ends the
                # crawl of this category.
                item_num = mongodbItem.find({"itemid": item.itemid}).count()
                if item_num != 0:
                    return

                # The headline is an <h2> when present, otherwise an <h1>.
                heading = entry.find("h2") or entry.find("h1")

                # Item title
                item.name = heading.find("span", {"class": None}).get_text().strip()

                # Product image
                item.image = entry.find("div", {"class": "mpic"}).find("img", {"alt": True}).attrs['src']
                print(item.image)

                # Publication time, given as a relative "N minutes/hours/days
                # ago" text; entries without a date are skipped.
                date_div = entry.find("div", {"class": "date"})
                if not date_div:
                    continue
                updatetimeitem = date_div.get_text().strip()
                amount = int("".join(ch for ch in updatetimeitem if ch.isdigit()))
                if "分钟".decode('utf-8') in updatetimeitem:      # "N minutes ago"
                    age = datetime.timedelta(minutes=amount)
                elif "小时".decode('utf-8') in updatetimeitem:    # "N hours ago"
                    age = datetime.timedelta(hours=amount)
                else:                                             # "N days ago"
                    age = datetime.timedelta(days=amount)
                updatetime = datetime.datetime.now() - age
                item.updatetime = time.mktime(updatetime.timetuple())
                print(item.updatetime)

                # Price
                item.price = heading.find("a").find("span", {"class": "notice_item"}).get_text().strip()

                # Article page (source of the mall name and purchase link)
                articleurl = heading.find("a").attrs['href']

                # Some article links are dead; probe the status first and
                # release the connection right away.
                response = urllib.urlopen(articleurl)
                try:
                    status = response.code
                finally:
                    response.close()
                if status == 404:
                    clr.print_red_text("return 404 error")
                    continue

                articleselector = loadHtmlSelector(articleurl, headers=None)
                if articleselector is None:
                    # The loader can come back empty (the listing-page call
                    # above guards against the same thing); skip the item
                    # instead of failing on None.
                    clr.print_red_text("article page could not be loaded")
                    continue

                # Mall name and URL; both stay empty when the page has none.
                originmall = ''
                originmallurl = ''
                mall_line = articleselector.find("div", {"class": "gn_line"})
                if mall_line:
                    mall_link = mall_line.find("a", {"style": "color:#003399;", "trk": None})
                    if mall_link:
                        # Strip the fixed prefix/suffix around the mall name.
                        originmall = mall_link.get_text().strip().replace("来自".decode('utf-8'),"").replace("的折扣".decode('utf-8'),"")
                        originmallurl = mall_link.attrs['href']
                print(originmall)

                # Purchase link; items without one are dropped.
                buy_box = articleselector.find("div", {"class": "mpic"})
                if not buy_box:
                    continue
                item.href = "http://cn.dealmoon.com" + str(buy_box.find("div", {"class": "buy"}).find("a", {"trk": True}).attrs['href'])
                print(item.href)

                minfo = entry.find("div", {"class": "minfo"})

                # Recommendation count (0 when there is no like button)
                like_btn = minfo.find("span", {"class": "like_btn"})
                if like_btn:
                    like_text = like_btn.find("em").get_text()
                    goodcountnum = int("".join(ch for ch in like_text if ch.isdigit()))
                else:
                    goodcountnum = 0
                print("goodcountnum is %d" % goodcountnum)

                # Comment count
                commentcountnum = int(entry.attrs['cmtcn'])
                print("commentcountnum is %d" % commentcountnum)

                # Favourite count
                favcountnum = int(minfo.find("span", {"class": "fav_btn"}).find("em").attrs['num'])
                print("favcountnum is %d" % favcountnum)

                # NOTE(review): "fav_conut" is a misspelling of "fav_count",
                # but it is the key written to the collection; it is kept
                # as-is so stored documents stay consistent.
                item_dict = item.createItemdic({"articleurl":articleurl, "good_count":goodcountnum, "comment_count":commentcountnum, "fav_conut":favcountnum, "originmall":originmall, "originmallurl":originmallurl})
                print(item_dict)

                # Check again that the item is not stored yet, then insert
                # it (one document per insert).
                num = mongodbItem.find({"itemid": item.itemid}).count()
                if num == 0:
                    mongodbItem.insert(item_dict)
                    clr.print_red_text("insert sucessfully")
                else:
                    clr.print_red_text("item exits, num is %s"  % num)

            # Next page; a page without a pager or without a "next" link
            # ends the crawl.
            pager = selector.find("div", {"class": "pages"})
            next_page = pager.find("a", {"class": "next_link"}) if pager else None
            if next_page:
                source_url = next_page.attrs['href']
            else:
                clr.print_red_text("exit")
                break
Esempio n. 2
0
    def smzdm_yh(self, source):
        """Crawl one smzdm "youhui" category and store its new deals in MongoDB.

        Starting at ``source['href']`` the listing pages are walked by
        following the "pagedown" link of each page.  Every ``div`` carrying
        an ``articleid`` attribute is parsed and inserted into the
        ``smzdm_yh_item`` collection of ``first_mongodb[self.database]``.

        Entries are skipped when they are marked as expired, have no image,
        have no purchase box, or link to a URL containing ``baoxian`` or
        ``baidu``.  The crawl of the category stops when an entry that is
        already stored is met (incremental update), when a listing page
        cannot be loaded, or when there is no further page.

        Args:
            source: mapping providing ``'href'`` (first listing URL),
                ``'item_name'`` (label, only printed) and ``'item_id'``
                (stored as the category id of every item).

        Returns:
            None.
        """
        print("smzdm_yh")

        item_collection_name = "smzdm_yh_item"
        mongodbItem = first_mongodb[self.database][item_collection_name]

        clr = Color()   # helper used for coloured console output

        source_url = source['href']
        source_name = source['item_name']
        clr.print_red_text(source_name)
        clr.print_red_text(source_url)

        while 1:
            print(source_url)

            selector = loadHtmlSelector(source_url, headers=None)
            if selector is None:
                return

            for div in selector.findAll("div", articleid=True):
                item = Item()
                item.categoryid = source['item_id']

                # Skip expired entries.
                if div.find("div", {"class": "listTitle"}).find("span", {"class": "icon"}):
                    continue

                item.itemid = int(div.attrs['articleid'].split("_")[-1])
                print(item.itemid)

                # Incremental update: the first already-stored item ends the
                # crawl of this category.
                item_num = mongodbItem.find({"itemid": item.itemid}).count()
                if item_num != 0:
                    clr.print_red_text("%s update over " %source_name)
                    return

                # 'timesort' is converted with fromtimestamp(), i.e. read as
                # a Unix timestamp and rendered in local time.
                item.updatetime = int(div.attrs['timesort'])
                article_time = datetime.datetime.fromtimestamp(item.updatetime).strftime("%Y-%m-%d %H:%M:%S %A")
                print(item.updatetime)
                print(article_time)

                title = div.find("h2", {"class": "itemName"})
                item.name = title.find("a").get_text().strip()

                # Product image; entries without one are dropped.
                image = div.find("img", alt=True)
                if not image:
                    continue
                item.image = image.attrs['src']
                print(item.image)

                # Price
                item.price = title.find("span", {"class": "red"}).get_text()

                # Purchase link; entries without one are dropped, as are
                # insurance ("baoxian") and baidu links.
                buy_box = div.find("div", {"class": "buy"})
                if not buy_box:
                    continue
                item.href = buy_box.find("a", {"target": "_blank"}).attrs['href']
                if "baoxian" in item.href or "baidu" in item.href:
                    continue
                print(item.href)

                # Mall name and URL; both stay empty when none is listed.
                originmallitem = div.find("div", {"class": "botPart"}).find("a", {"class": "mall"})
                if originmallitem:
                    originmall = originmallitem.get_text()
                    originmallurl = originmallitem.attrs['href']
                    print(originmall)
                else:
                    originmall = ""
                    originmallurl = ""

                itemelse = div.find("div", {"class": "lrBot"})

                # "Worth it" votes
                goodcountnum = int(itemelse.find("a", {"class": "good"}).find("span", {"class": "scoreTotal"}).attrs['value'])
                print("goodcountnum is %d" % goodcountnum)

                # "Not worth it" votes
                badcountnum = int(itemelse.find("a", {"class": "bad"}).find("span", {"class": "scoreTotal"}).attrs['value'])
                print("badcountnum is %d" % badcountnum)

                # Favourite count.  The title is decoded to unicode like the
                # other Chinese literals in this file, so the comparison
                # never mixes byte and unicode strings.
                favcountnum = int(itemelse.find("a", {"title": "收藏".decode('utf-8')}).find("em").get_text())
                print("favcountnum is %d" % favcountnum)

                # Comment count
                commentcountnum = int(itemelse.find("a", {"class": "comment"}).get_text())
                print("commentcountnum is %d" % commentcountnum)

                # Article link
                article_url = title.find("a").attrs['href']
                print(article_url)

                item_dict = item.createItemdic({"article_url":article_url, "article_time":article_time, "good_count":goodcountnum, "bad_count":badcountnum, "fav_count":favcountnum, "comment_count":commentcountnum, "originmall":originmall, "originmallurl":originmallurl})
                print(item_dict)

                # Check again that the item is not stored yet, then insert.
                item_num = mongodbItem.find({"itemid": item.itemid}).count()
                if item_num == 0:
                    mongodbItem.insert(item_dict)
                    print("insert successfully")
                else:
                    print("item exits, num is %s"  % item_num)

            # Next page; a page without a pager or without a "pagedown"
            # entry ends the crawl.
            pager = selector.find("ul", {"class": "pagination"})
            next_page = pager.find("li", {"class": "pagedown"}) if pager else None
            if next_page:
                source_url = next_page.find("a").attrs['href']
            else:
                print("exit")
                break
Esempio n. 3
0
    def smzdm_fx(self,source):
        """Crawl one smzdm "faxian" category and store its new deals in MongoDB.

        Starting at ``source['href']`` the listing pages are walked by
        following the "pagedown" link of each page.  Every ``li.list`` entry
        is parsed, its article page is fetched for the mall name, the
        discount / tip texts, the product description and the vote counts,
        and the result is inserted into the ``smzdm_fx_item`` collection of
        ``first_mongodb[self.database]``.

        Entries are skipped when their title or price matches one of the
        non-product keywords, when they have no image, when a non-empty
        price contains no digit, or when the article page cannot be loaded.
        The crawl of the category stops (without printing "all done") when
        an entry that is already stored is met or when a listing page cannot
        be loaded; it finishes normally when there is no further page.

        Args:
            source: mapping providing ``'href'`` (first listing URL),
                ``'item_name'`` (label, only printed) and ``'item_id'``
                (stored as the category id of every item).

        Returns:
            None.
        """
        def squeeze(text):
            # Remove tabs and line breaks, then trim surrounding blanks.
            return text.replace("\t","").replace("\n", "").replace("\r", "").strip()

        print("smzdm_fx")

        item_collection_name = "smzdm_fx_item"
        mongodbItem = first_mongodb[self.database][item_collection_name]

        clr = Color()   # helper used for coloured console output

        source_url = source['href']
        source_name = source['item_name']
        clr.print_red_text(source_url)
        clr.print_red_text(source_name)

        while 1:
            print(source_url)

            selector = loadHtmlSelector(source_url, headers=None)
            if selector is None:
                return

            for entry in selector.findAll("li", {"class": "list"}):
                item = Item()
                item.categoryid = source['item_id']      # category id

                # Item id
                item.itemid = int(entry.attrs['articleid'].split("_")[-1])
                print(item.itemid)

                # Incremental update: the first already-stored item ends the
                # crawl of this category.
                num = mongodbItem.find({"itemid": item.itemid}).count()
                if num != 0:
                    return

                # 'timesort' is converted with fromtimestamp(), i.e. read as
                # a Unix timestamp and rendered in local time.
                item.updatetime = int(entry.attrs['timesort'])
                article_time = datetime.datetime.fromtimestamp(item.updatetime).strftime("%Y-%m-%d %H:%M:%S %A")
                print(item.updatetime)
                print(article_time)

                title = entry.find("h2", {"class": "itemName"})

                # Item name; coupon / campaign / members-only posts are not
                # products and are skipped.
                item.name = title.find("span", {"class": "black"}).get_text().strip()
                if "优惠券".decode('utf-8') in item.name:
                    continue
                if "活动".decode('utf-8') in item.name:
                    continue
                if "专享".decode('utf-8') in item.name:
                    continue

                # Product image; entries without one are dropped.
                image = entry.find("img", alt=True)
                if not image:
                    continue
                item.image = image.attrs['src']
                print(item.image)

                # Price; promotion / red-packet posts and non-empty prices
                # without any digit are skipped.
                item.price = title.find("span", {"class": "red"}).get_text()
                if "促销".decode('utf-8') in item.price:
                    continue
                if "红包".decode('utf-8') in item.price:
                    continue
                if item.price != '' and not re.search(r'\d', item.price):
                    continue

                # Purchase link
                item.href = entry.find("div", {"class": "item_buy_mall"}).find("a", {"class": "directLink"}).attrs['href']
                clr.print_blue_text(item.href)

                votes = entry.find("div", {"class": "zan_fav_com"})

                # Recommendation count
                goodcountnum = int(votes.find("a", {"class": "zan"}).find("em").get_text())
                print("goodcountnum is %d" % goodcountnum)

                # Comment count
                commentcountnum = int(votes.find("a", {"class": "comment"}).get_text())
                print("commentcountnum is %d" % commentcountnum)

                # Article link
                article_url = title.find("a").attrs['href']
                clr.print_blue_text(article_url)

                article_selector = loadHtmlSelector(article_url, headers=None)
                if article_selector is None:
                    # The loader can come back empty (the listing-page call
                    # above guards against the same thing); skip the item
                    # instead of failing on None.
                    clr.print_yellow_text("article page could not be loaded")
                    continue

                # Mall name (empty when the page names none)
                mall_link = article_selector.find("div", {"class": "article-meta-box"}).find("a", {"onclick": None})
                originmall = mall_link.get_text() if mall_link else ""

                # Discount text and original tip text.  Pages without the
                # "item-preferential" box carry a single inner block whose
                # whole text is used as the discount text.
                preferential = article_selector.find("div", {"class": "item-box item-preferential"})
                if preferential:
                    inner = preferential.find("div", {"class": "inner-block"})
                    youhui_content = squeeze(inner.find("p").get_text()) if inner else ""
                    baoliao = preferential.find("div", {"class": "baoliao-block"})
                    baoliao_content = squeeze(baoliao.find("p").get_text()) if baoliao else ""
                else:
                    youhui_content = squeeze(article_selector.find("article", {"class": "article-details"}).find("div", {"class": "inner-block"}).get_text())
                    baoliao_content = ""

                # Product description: first paragraph of the inner block of
                # the second "item-box", when all of these exist.
                item_description = ""
                item_boxes = article_selector.findAll("div", {"class": "item-box"})
                if len(item_boxes) >= 2:
                    inner = item_boxes[1].find("div", {"class": "inner-block"})
                    paragraph = inner.find("p") if inner else None
                    if paragraph:
                        item_description = squeeze(paragraph.get_text())

                # "Not worth it" votes
                badcountnum = int(article_selector.find("div", {"class": "score_rate"}).find("span", {"id": "rating_unworthy_num"}).get_text().strip())
                print("badcountnum is %d" % badcountnum)

                # Favourite count
                favcountnum = int(article_selector.find("div", {"class": "operate_box"}).find("div", {"class": "operate_icon"}).find("a", {"class": "fav"}).find("em").get_text())
                print(favcountnum)

                item_dict = item.createItemdic({"originmall":originmall, "baoliao_content":baoliao_content, "youhui_content":youhui_content, "item_description":item_description, "bad_count":badcountnum, "fav_count":favcountnum, "article_url":article_url, "article_time":article_time, "good_count":goodcountnum, "comment_count":commentcountnum})
                print(item_dict)

                # Check again that the item is not stored yet, then insert
                # it (one document per insert).
                num = mongodbItem.find({"itemid": item.itemid}).count()
                if num == 0:
                    mongodbItem.insert(item_dict)
                    clr.print_red_text("insert successfully")
                else:
                    clr.print_yellow_text(("item exits, num is %s"  % num))

            # Next page; a page without a pager or without a "pagedown"
            # entry ends the crawl.
            pager = selector.find("ul", {"class": "pagination"})
            next_page = pager.find("li", {"class": "pagedown"}) if pager else None
            if next_page:
                source_url = next_page.find("a").attrs['href']
            else:
                print("exit")
                break
        clr.print_red_text("all done")