# Pick one random track from a fixed NetEase Cloud Music playlist and store
# today's song in the local `music` table (at most one row per day).
import sys
sys.path.append('..')
import random

from tools.spider import Spider
from tools.db import DB
from tools.public import *

# NetEase rejects the API call without a plausible appver cookie and Referer.
headers = {
    'Cookie': 'appver=1.5.0.75771;',
    'Referer': 'http://music.163.com/'
}
payload = {
    'id': '140330894',   # playlist id to sample a song from
    'updateTime': -1
}
url = 'http://music.163.com/api/playlist/detail'

if __name__ == '__main__':
    spider = Spider()
    data = spider.req(headers=headers, params=payload).crawl(url, pattern='json')
    if data['code'] == 200 and data['result'] is not None:
        tracks = data['result']['tracks']
        # Guard against an empty playlist (randint(0, -1) would raise), and
        # pick from the tracks actually returned: the API's `trackCount`
        # can exceed len(tracks), which made the old index-based pick
        # vulnerable to IndexError.
        if tracks:
            track = random.choice(tracks)
            db = DB()
            # Insert only once per calendar day.
            select = db.one("SELECT * FROM music WHERE date=?", (getToday(),))
            if select is None:
                insertData = (getToday(),
                              track['name'],
                              track['artists'][0]['name'],
                              track['album']['picUrl'],
                              track['mp3Url'])
                db.execute("INSERT INTO music VALUES (?, ?, ?, ?, ?)",
                           insertData)
# Scrape today's picture and quote from wufazhuce.com (the "ONE" magazine
# front page) and store them in the local `article` table (one row per day).
import sys
sys.path.append('..')
from tools.spider import Spider
from tools.db import DB
from tools.public import *
from lxml import etree

url = 'http://wufazhuce.com/'

if __name__ == '__main__':
    spider = Spider()
    html = spider.crawl(url)
    selector = etree.HTML(html)
    # The active carousel entry holds today's image and its quote.
    pics = selector.xpath('//div[@class="carousel-inner"]/div[@class="item active"]/a/img/@src')
    texts = selector.xpath('//div[@class="carousel-inner"]/div[@class="item active"]/div[@class="fp-one-cita-wrapper"]/div[@class="fp-one-cita"]/a/text()')
    # Take the first match if any.  The old `len(x) == 1` test silently
    # dropped the value whenever the XPath matched more than one node, and
    # the image src used to shadow the page `url` variable.
    pic = pics[0] if pics else ""
    text = texts[0] if texts else ""
    db = DB()
    # Insert only once per calendar day.
    select = db.one("SELECT * FROM article WHERE date=?", (getToday(),))
    if select is None:
        insertData = (getToday(), pic, text)
        db.execute("INSERT INTO article VALUES (?, ?, ?)", insertData)
# One-off schema bootstrap: create the three tables used by the daily
# article/music/movie spiders.  IF NOT EXISTS makes the script idempotent,
# so re-running it no longer aborts with "table already exists".
import sys
sys.path.append('..')
from tools.db import DB

db = DB()
db.execute('CREATE TABLE IF NOT EXISTS article (date TEXT, pic TEXT, content TEXT)')
db.execute(
    'CREATE TABLE IF NOT EXISTS music (date TEXT, name TEXT, artist TEXT, pic TEXT, link TEXT)'
)
db.execute(
    'CREATE TABLE IF NOT EXISTS movie (date TEXT, name TEXT, pic TEXT, type TEXT, score INT, plot TEXT, link TEXT)'
)
# Schema bootstrap (duplicate of the sibling setup script): create the
# article/music/movie tables.  IF NOT EXISTS makes the script safe to
# re-run instead of raising "table already exists".
import sys
sys.path.append('..')
from tools.db import DB

db = DB()
db.execute('CREATE TABLE IF NOT EXISTS article (date TEXT, pic TEXT, content TEXT)')
db.execute('CREATE TABLE IF NOT EXISTS music (date TEXT, name TEXT, artist TEXT, pic TEXT, link TEXT)')
db.execute('CREATE TABLE IF NOT EXISTS movie (date TEXT, name TEXT, pic TEXT, type TEXT, score INT, plot TEXT, link TEXT)')
class SgPipeline(object):
    """Scrapy item pipeline (Python 2): classifies scraped goods items and
    upserts them into the `le_goods` / `le_cate_goods_index` tables.
    """

    conn = None
    db = None

    def install(self, db='sg'):
        # Lazily open the DB handle exactly once.  The original tested
        # `self.conn`, which is never assigned anywhere in this class, so a
        # fresh DB() was created for every processed item; testing `self.db`
        # gives the intended connect-once behavior.
        if self.db is None:
            self.db = DB(db)

    def add_cate_goods_index(self, cate_id, goods_id):
        # Link a goods row to a category; weight=0 is the default ordering.
        self.db.execute("INSERT INTO le_cate_goods_index SET cate_id=%s, goods_id=%s,weight=0", [cate_id, goods_id])

    '''
    Reference: item fields vs. le_goods columns.
    name,url,oldImg,descOldImg,cate,price,originalPrice,countBought,ExpiryTime,
    highlight,condition,description,address,postCode,merchant,phone
    `last_modified`, `goods_id`, `uid`, `img`, `deal_img`, `display_order`,
    `img_w`, `img_h`, `desc_bigpic`, `bigpic`, `small_pic`, `desc_oldimg`,
    `oldimg`, `name`, `seo_title`, `url`, `currency`, `original_price`,
    `price`, `cate_id`, `source`, `addtime`, `expiry_time`, `uptime`,
    `website_id`, `store_id`, `isdeal`, `ispublish`,isshow`,`highlight`,
    `conditions`, `description`, `merchant`, `phone`, `address`, `city`,
    `country`, `post`
    '''

    def process_item(self, item, spider):
        self.install(item['db'])
        Classifier = SimpleClassifier(item['db'])
        # Only handle goods items with a usable name; pass everything else on.
        if type(item) != SgGoodsItem:
            return item
        if item['name'] == False or item['name'] == '':
            return item
        img = small_pic = big_pic = old_pic = ''
        if len(item['images']) > 0:
            # Main image: 400px thumbnail of the first picture.
            img = '/uploaded/' + item['images'][0].replace('original', 'thumb400')
            # Pipe-separated lists of small (100px) and full-size pictures.
            for src in item['images']:
                small_pic = small_pic + '/uploaded/' + src.replace('original', 'thumb100') + '|'
                big_pic = big_pic + '/uploaded/' + src + '|'
            small_pic = small_pic.strip('|')
            big_pic = big_pic.strip('|')
        if len(item['oldImg']) > 0:
            old_pic = '|'.join(item['oldImg'])
        # Python 2: normalize text values to UTF-8 byte strings.
        # NOTE(review): calling .encode on a `str` implicitly decodes it as
        # ASCII first and can raise for non-ASCII bytes — pre-existing
        # behavior, left unchanged.
        for i in item:
            if type(item[i]) == unicode or type(item[i]) == str:
                item[i] = item[i].encode('utf-8')
        if item['goods']:
            # Existing goods row: refresh cate_id and update in place.
            goods_cate_id = 0
            if item['goods']['cate_id'] < 1:
                # No category yet — classify by name and index the matches.
                classlist = Classifier.findCateAndTags(item['name'], 4)
                if classlist['cate']:
                    goods_cate_id = classlist['cate']
                    self.add_cate_goods_index(classlist['cate'], item['goods']['goods_id'])
                    for cate_id in classlist['cates']:
                        self.add_cate_goods_index(cate_id, item['goods']['goods_id'])
            else:
                goods_cate_id = item['goods']['cate_id']
            if item['goods']['price'] != item['price'] or item['goods']['original_price'] != item['originalPrice'] or item['goods']['name'] != item['name']:
                # Name or pricing changed — full update.
                res = self.db.execute("UPDATE le_goods SET isshow=1,name=%s,price=%s,original_price=%s, uptime=%s,expiry_time=%s,site_id=%s,cate_id=%s WHERE goods_id=%s", [item['name'], item['price'], item['originalPrice'], int(time.time()), item['ExpiryTime'], item['site_id'], goods_cate_id, item['goods']['goods_id']])
            else:
                # Unchanged — just bump timestamps/visibility.
                res = self.db.execute("UPDATE le_goods SET isshow=1,uptime=%s,expiry_time=%s,site_id=%s,cate_id=%s WHERE goods_id=%s", [int(time.time()), item['ExpiryTime'], item['site_id'], goods_cate_id, item['goods']['goods_id']])
            print(res._last_executed)
        else:
            # New goods row: classify, insert, then index its categories.
            classlist = Classifier.findCateAndTags(item['name'], 4)
            goods_cate_id = classlist['cate']
            res = self.db.execute("INSERT INTO le_goods SET `uid`=%s,`site_id`=%s,`img`=%s, `deal_img`=%s,`display_order`=%s,`desc_bigpic`=%s, `oldimg`=%s, `small_pic`=%s,`desc_oldimg`=%s,`bigpic`=%s, `name`=%s, `seo_title`=%s, `url`=%s, `currency`=%s,`original_price`=%s, `price`=%s, `cate_id`=%s, `source`=%s, `addtime`=%s,`expiry_time`=%s, `uptime`=%s, `website_id`=%s,`isdeal`=%s,`ispublish`=%s,`isshow`=%s,`highlight`=%s, `conditions`=%s, `description`=%s, `merchant`=%s,`phone`=%s, `address`=%s,`city`=%s, `country`=%s, `post`=%s", [1, item['site_id'], img, img, 0, '', old_pic, small_pic, '', big_pic, item['name'], get_seo_title(item['name']), item['url'], 'SGD', item['originalPrice'], item['price'], goods_cate_id, 'reptile', time.time(), item['ExpiryTime'], time.time(), item['website_id'], 1, 1, 1, item['highlight'], item['condition'], item['description'], item['merchant'], item['phone'], item['address'], 1, 1, item['postCode']])
            goods_id = res.lastrowid
            if classlist['cate'] > 0:
                self.add_cate_goods_index(classlist['cate'], goods_id)
                for cate_id in classlist['cates']:
                    self.add_cate_goods_index(cate_id, goods_id)
        return item
# movie's type type = selector.xpath("//div[@id='info']/span[@property='v:genre']/text()") type = '/'.join(type) # movie's score score = selector.xpath("//div[@id='interest_sectl']//strong[@class='ll rating_num']/text()") score = score[0] if len(score) == 1 else "Unknown" # movie's plot plot = selector.xpath("//div[@class='related-info']//span[@property='v:summary']/text()") plot = '\n'.join(plot) cursor.close() conn.close() return { 'name' : name, 'pic' : pic, 'type' : type, 'score': score, 'plot' : plot, 'link' : movieUrl } if __name__ == '__main__': movie = getData() db = DB() select = db.one("SELECT * FROM movie WHERE date=?", (getToday(),)) if select is None: insertData = (getToday(), movie['name'], movie['pic'], movie['type'], movie['score'], movie['plot'], movie['link']) db.execute("INSERT INTO movie VALUES (?, ?, ?, ?, ?, ?, ?)", insertData)
score = selector.xpath( "//div[@id='interest_sectl']//strong[@class='ll rating_num']/text()") score = score[0] if len(score) == 1 else "Unknown" # movie's plot plot = selector.xpath( "//div[@class='related-info']//span[@property='v:summary']/text()") plot = '\n'.join(plot) cursor.close() conn.close() return { 'name': name, 'pic': pic, 'type': type, 'score': score, 'plot': plot, 'link': movieUrl } if __name__ == '__main__': movie = getData() db = DB() select = db.one("SELECT * FROM movie WHERE date=?", (getToday(), )) if select is None: insertData = (getToday(), movie['name'], movie['pic'], movie['type'], movie['score'], movie['plot'], movie['link']) db.execute("INSERT INTO movie VALUES (?, ?, ?, ?, ?, ?, ?)", insertData)