class Worker(threading.Thread): def __init__(self, redisDao, mysqlDao): threading.Thread.__init__(self) self._queue = queue self._mysqlDao = mysqlDao def run(self): while True: print (self.getName()) if self._queue.empty(): break ret_json = self._queue.get() ret = simplejson.loads(ret_json) id = ret[0] category_id = ret[1] content_url = ret[2] headers = Headers().getHeaders() print(content_url) try: req = requests.get(content_url, headers=headers, timeout=60) if req.status_code == 200: html = req.content selector = etree.HTML(html) titles = selector.xpath('//*[@class="fl"]/a[1]/text()') nicks = selector.xpath('//*[@class="fl"]/span[1]/a[1]/text()') play_urls = selector.xpath('//*[@class="redBtn"]/a[1]/@href') contents = [] contents_li = selector.xpath('//*[@class="movStaff line_BSld"]/li') for c_li in contents_li: c_temp = c_li.xpath('descendant::text()') if len(c_temp) > 0: contents.append(c_temp) imgs = selector.xpath('//*[@class="imgBAyy db"]/descendant::img[1]/@src') title = play_url = content = img = nick = '' if len(titles) > 0: title = titles[0] if len(nicks) > 0: nick = nicks[0] title = title + ',' + nick if len(play_urls) > 0: play_url = play_urls[0] if len(imgs) > 0: img = imgs[0] content = simplejson.dumps(contents) created_at = time.strftime('%Y-%m-%d %H:%M:%S') if img != '': # 存入content sql = 'insert ignore into m1905_content (`category_id`,`title`,`content`,`play_url`,`img`,`url`,`created_at`) VALUES (%s,%s,%s,%s,%s,%s,%s)' values = (category_id, title, content, play_url, img, content_url, created_at) print(title) self._mysqlDao.executeValues(sql, values) except: self._mysqlDao = MysqlDao() if img != '': # url置1 sql = 'update m1905_url set `status`=1 where `id`=' + str(id) self._mysqlDao.execute(sql)
def getUrl(url, page): url = url + str(page) + '.html' print(url) mysqlDao = MysqlDao() try: n = 1 while True: try: headers = Headers.getHeaders() req = requests.get(url, headers=headers, timeout=10) break except Exception, e: print Exception, ":", e print('sleep') time.sleep(n * 10) n = n + 1 if req.status_code == 200: html = req.content selector = etree.HTML(html) url_contents = selector.xpath('//div[@class="box3"]/descendant::a/@href') for url_content in url_contents: sql = 'insert ignore into loldytt_url (`category_id`,`url`,`status`,created_at) VALUES (%s,%s,%s,%s)' created_at = time.strftime('%Y-%m-%d %H:%M:%S') values = (category_id, url_content, 0, created_at) print(values) mysqlDao.executeValues(sql, values) mysqlDao.close()
def run(self):
    """Load-test worker: insert 1000 random integers into `yingshi_test`."""
    mysqlDao = MysqlDao()
    for _ in xrange(0, 1000):
        v = random.randint(1, 10000)
        sql = 'insert into yingshi_test (`name`) values (%s)'
        print(sql)
        # Fix: DB-API query parameters must be a sequence; the original
        # passed the bare int, which breaks under MySQLdb-style drivers.
        mysqlDao.executeValues(sql, (v,))
    mysqlDao.close()
def getUrl(url, category_id):
    # Fetch a loldytt chart page with infinite retry and growing back-off.
    # NOTE(review): the fetched response `req` is never used in the visible
    # code -- this looks like the head of a longer function; confirm the
    # remainder against the full file. `category_id` is also unused here.
    print(url)
    # Presumably page 26 of this chart is broken upstream -- the code remaps
    # it to page 25; verify the reason against the site.
    if url == 'http://www.loldytt.com/Zuixinhanju/chart/26.html':
        url = 'http://www.loldytt.com/Zuixinhanju/chart/25.html'
    mysqlDao = MysqlDao()  # NOTE(review): opened but never closed in this view
    n = 1
    while True:
        try:
            headers = Headers.getHeaders()
            req = requests.get(url, headers=headers, timeout=10)
            break  # success: stop retrying
        except Exception, e:
            print Exception, ":", e
            print('sleep')
            time.sleep(n * 10)  # back-off grows by 10s per failure
            n = n + 1
from headers import Headers from lxml import etree from mysqlpooldao import MysqlDao reload(sys) sys.setdefaultencoding('utf8') while True: try: url = 'http://top.baidu.com/' headers = Headers.getHeaders() req = requests.get(url, headers=headers, timeout=30) if req.status_code == 200: html = req.content.decode('gb2312', 'ignore') selector = etree.HTML(html) words = selector.xpath( '//div[@id="box-cont"]/descendant::a/@title') for word in words: print(word) mysqlDao = MysqlDao() sql = 'insert ignore into allsearch_key_word (`word`,`parent_id`,`status`,`created_at`) VALUES (%s,%s,%s,%s)' created_at = time.strftime('%Y-%m-%d %H:%M:%S') values = (word, 0, 0, created_at) mysqlDao.executeValues(sql, values) mysqlDao.close() else: print('code error') except: pass time.sleep(21600)
# NOTE(review): the lines above `if __name__` are the tail of a page-fetch
# routine (likely getContentUrl, whose `def` line is outside this chunk);
# formatted here as written -- confirm against the full file.
req = requests.get(url, headers=headers, timeout=60)
if req.status_code == 200:
    html = req.content
    selector = etree.HTML(html)
    # Collect links to the detail pages, then reverse so they are queued
    # oldest-first.
    content_urls = selector.xpath('//ul[@class="inqList pt18"]/li/a/@href')
    content_urls.reverse()
    for content_url in content_urls:
        # Links are site-relative; prefix the site root.
        content_url = Config.url_main + content_url
        created_at = time.strftime('%Y-%m-%d %H:%M:%S')
        # insert ignore: duplicate URLs are silently skipped.
        sql = 'insert ignore into m1905_url (`category_id`,`url`,`status`,`created_at`) VALUES (%s,%s,%s,%s)'
        values = (category_id, content_url, 0, created_at)
        mysqlDao.executeValues(sql, values)


if __name__ == '__main__':
    # Walk every m1905 category and queue all of its listing pages,
    # counting down from the last page to page 1.
    mysqlDao = MysqlDao()
    sql = 'select `id`,`url` FROM m1905_category'
    categorys = mysqlDao.execute(sql)
    print(categorys)
    for category in categorys:
        category_id = category[0]
        category_url = category[1]
        last_page = getLastPage(category_url)
        while True:
            if last_page < 1:
                break
            url = category_url + 'o0d0p%s.html' % (last_page)
            last_page = last_page - 1
            print(url)
            getContentUrl(url, category_id, mysqlDao)
    # NOTE(review): close() placement is ambiguous in the mangled source; it
    # most plausibly runs once after all categories are processed.
    mysqlDao.close()
def run(self): while True: print(self.getName()) ret_json = self._redisDao.lpop('kansogou') if ret_json == None: break ret = simplejson.loads(ret_json) id = ret[0] category_id = ret[1] content_url = ret[2] img = ret[3] headers = Headers().getHeaders() print(content_url) try: req = requests.get(content_url, headers=headers, timeout=30) if req.status_code == 200: html = req.content selector = etree.HTML(html) # 电影 if category_id == 1: titles = selector.xpath( '//*[@class="title txt-overflow"]/a[1]/text()') play_urls = selector.xpath( '//*[@class="title txt-overflow"]/a[1]/@href') contents = selector.xpath( '//*[@class="video-info"]/descendant::text()') # 电视剧 if category_id == 2: titles = selector.xpath('//*[@class="tt-mnc"]/text()') play_urls = selector.xpath( '//*[@class="tt-mnc"]/@href') contents = selector.xpath( '//*[@class="lines"]/descendant::text()') # 综艺 if category_id == 3: titles = selector.xpath( '//*[@class="info"]/h1[1]/a[1]/text()') play_urls = selector.xpath( '//*[@class="info"]/h1[1]/a[1]/@href') contents = selector.xpath( '//*[@class="info"]/descendant::span/descendant::text()' ) # 动漫 if category_id == 4: titles = selector.xpath( '//*[@class="title"]/a[1]/text()') play_urls = selector.xpath( '//*[@class="title"]/a[1]/@href') contents = selector.xpath( '//*[@class="video-info"]/descendant::text()') title = play_url = content = '' if len(titles) > 0: title = titles[0] if len(play_urls) > 0: play_url = Config.url_main + play_urls[0] content = simplejson.dumps(contents) created_at = time.strftime('%Y-%m-%d %H:%M:%S') # 存入content sql = 'insert ignore into kansogou_content (`category_id`,`title`,`content`,`play_url`,`img`,`url`,`created_at`) VALUES (%s,%s,%s,%s,%s,%s,%s)' values = (category_id, title, content, play_url, img, content_url, created_at) print(title) self._mysqlDao.executeValues(sql, values) except: self._mysqlDao = MysqlDao() # url置1 sql = 'update kansogou_url set `status`=1 where `id`=' + str(id) self._mysqlDao.execute(sql)
created_at = time.strftime('%Y-%m-%d %H:%M:%S') # 存入content sql = 'insert ignore into kansogou_content (`category_id`,`title`,`content`,`play_url`,`img`,`url`,`created_at`) VALUES (%s,%s,%s,%s,%s,%s,%s)' values = (category_id, title, content, play_url, img, content_url, created_at) print(title) self._mysqlDao.executeValues(sql, values) except: self._mysqlDao = MysqlDao() # url置1 sql = 'update kansogou_url set `status`=1 where `id`=' + str(id) self._mysqlDao.execute(sql) if __name__ == '__main__': mysqlDao = MysqlDao() redisDao = RedisDao() while True: sql = 'select `id`,`category_id`,`url`,`img` from kansogou_url WHERE `status`=0 limit 0,100' ret = mysqlDao.execute(sql) # 如果取出来为空,程序结束 if len(ret) == 0: break # 将mysql的数据存入redis队列 for r in ret: r_json = simplejson.dumps(r) redisDao.rpush('kansogou', r_json) # 开始多线程 worker_num = 1 threads = [] for x in xrange(0, worker_num):
def run(self): while True: print(self.name) mysqlDao = MysqlDao() sql = 'select * from loldytt_url WHERE `status`=0 limit 0,1' ret = mysqlDao.execute(sql) if len(ret) == 0: mysqlDao.close() """ 不用睡眠直接退出等crontab唤醒 """ print('game over') sys.exit() else: res = ret[0] id = res[0] category_id = res[1] url = res[2] sql = 'update loldytt_url set `status`=2 where `id`=' + str(id) mysqlDao.execute(sql) headers = Headers.getHeaders() n = 0 while n < 5: req = requests.get(url, headers=headers) req.encoding = "gbk" if req.status_code == 200: html = req.text.encode(encoding="utf-8", errors="ignore").decode( "utf-8", errors="ignore") try: selector = etree.HTML(html) except: print 333 titles = selector.xpath( '//div[contains(@class,"lm")]/h1/a/text()') if len(titles) > 0: break n = n + 1 if len(titles) > 0: title = titles[0] else: continue casts = selector.xpath( '//div[contains(@class,"zhuyan")]/ul[1]/li/text()') imgs = selector.xpath( '//div[contains(@class,"haibao")]/a[1]/img/@src') cast = '' img = '' content = '' if len(casts) > 0: cast = casts[0].split(':')[1] if len(imgs) > 0: img = imgs[0] contents = selector.xpath( '//div[@class="neirong"]/descendant::text()') if len(contents) > 0: content = simplejson.dumps(contents) created_at = time.strftime('%Y-%m-%d %H:%M:%S') xunlei_download_keys = selector.xpath( '//*[contains(@id,"jishu")]/descendant::a[contains(@href,"thunder")]/text()' ) xunlei_download_values = selector.xpath( '//*[contains(@id,"jishu")]/descendant::a[contains(@href,"thunder")]/@href' ) bt_download_keys = selector.xpath( '//*[contains(@id,"bt")]/descendant::a[contains(@href,"thunder")]/text()' ) bt_download_values = selector.xpath( '//*[contains(@id,"bt")]/descendant::a[contains(@href,"thunder")]/@href' ) magnet_download_keys = selector.xpath( '//a[contains(@href,"magnet")]/text()') magnet_download_values = selector.xpath( '//a[contains(@href,"magnet")]/@href') xunlei_download = [] bt_download = [] magnet_download = [] try: xn = 0 for x in 
xunlei_download_keys: xunlei_download.append({ xunlei_download_keys[xn]: xunlei_download_values[xn] }) xn = xn + 1 bn = 0 for b in bt_download_keys: bt_download.append( {bt_download_keys[bn]: bt_download_values[bn]}) bn = bn + 1 mn = 0 for m in magnet_download_keys: magnet_download.append({ magnet_download_keys[mn]: magnet_download_values[mn] }) mn = mn + 1 except Exception, e: print Exception, ":", e xunlei_download_json = simplejson.dumps(xunlei_download) bt_download_json = simplejson.dumps(bt_download) magnet_download_json = simplejson.dumps(magnet_download) sql_pattern = 'insert ignore INTO `loldytt_content`(`category_id`, `title`,`cast`,`img`,`xunlei_download`, `bt_download`, `magnet_download`, `content`, `url`,`created_at`) VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)' sql_values = (category_id, title, cast, img, xunlei_download_json, bt_download_json, magnet_download_json, content, url, created_at) print(title) mysqlDao.executeValues(sql_pattern, sql_values) sql = 'update loldytt_url set `status`=1 where `id`=' + str(id) mysqlDao.execute(sql) mysqlDao.close()
def run(self):
    """Worker loop for the ygdy8 scraper.

    Claims unprocessed rows from `ygdy8_url` (status 0 -> 2), delegates
    page scraping to self.getContent, stores the result into
    `ygdy8_content` and marks the row done (status=1). Exits when the
    table is drained.
    """
    while True:
        print(self.name)
        mysqlDao = MysqlDao()
        sql = 'select * from ygdy8_url WHERE `status`=0 limit 0,1'
        ret = mysqlDao.execute(sql)
        if len(ret) > 0:
            res = ret[0]
            id = res[0]
            # Claim the row (status=2 = in progress).
            sql = 'update ygdy8_url set `status`=2 where `id`=' + str(id)
            mysqlDao.execute(sql)
            category_id = res[1]
            url = res[2]
            sql_values = self.getContent(url, category_id)
            # Identity check (`is not None`) instead of the original
            # `!= None` comparison (PEP 8).
            if sql_values is not None:
                sql_pattern = 'insert ignore INTO `ygdy8_content`(`category_id`,`name`, `content`, `img`,`created_at`, `url`) VALUES( %s, %s, %s, %s, %s, %s)'
                mysqlDao.executeValues(sql_pattern, sql_values)
                # Mark the row done.
                sql = 'update ygdy8_url set `status`=1 where `id`=' + str(id)
                mysqlDao.execute(sql)
            mysqlDao.close()
        else:
            mysqlDao.close()
            break
def run(self):
    """Worker loop for the allsearch keyword expander.

    Takes one unprocessed keyword (status=0), fetches related suggestions
    via self.getSearch and inserts them back as child keywords, then marks
    the parent keyword done. Sleeps an hour when the table is drained.
    """
    mysqlDao = MysqlDao()
    while True:
        print(self.name)
        sql = 'select * from allsearch_key_word WHERE `status`=0 limit 0,1'
        ret = mysqlDao.execute(sql)
        if len(ret) > 0:
            res = ret[0]
            id = res[0]
            # Claim the row (status=2 = in progress).
            sql = 'update allsearch_key_word set `status`=2 where `id`=' + str(
                id)
            mysqlDao.execute(sql)
            word = res[1]
            sql_values = self.getSearch(word)
            for sql_value in sql_values:
                created_at = time.strftime('%Y-%m-%d %H:%M:%S')
                values = (sql_value, id, 0, created_at)
                sql = 'insert ignore into allsearch_key_word (`word`,`parent_id`,`status`,`created_at`) VALUES (%s,%s,%s,%s)'
                mysqlDao.executeValues(sql, values)
            # Fix: mark done with status=1. The original set status=2 again,
            # leaving rows permanently "in progress"; every other worker in
            # this project uses 1 = done, 2 = claimed.
            sql = 'update allsearch_key_word set `status`=1 where `id`=' + str(
                id)
            mysqlDao.execute(sql)
        else:
            print(self.name + 'sleep')
            time.sleep(3600)
    # NOTE(review): unreachable (the loop never breaks); kept from the
    # original for fidelity.
    mysqlDao.close()
def run(self):
    """Worker loop for the bttiantang scraper.

    Claims unprocessed rows from `bttiantang_url` (status 0 -> 2), scrapes
    the movie detail page (names, tags, staff, download links) and stores
    the record into `bttiantang_content`, then marks the row done
    (status=1). Exits when the table is drained (crontab restarts it).
    """
    while True:
        print(self.name)
        mysqlDao = MysqlDao()
        sql = 'select * from bttiantang_url WHERE `status`=0 limit 0,1'
        ret = mysqlDao.execute(sql)
        res = []
        for r in ret:
            res = r
        print(res)
        if len(res) == 0:
            print('sleep')
            mysqlDao.close()
            # No sleeping: exit and let crontab wake us up again.
            print('game over')
            sys.exit()
        else:
            id = res[0]
            url = res[1]
            # Claim the row (status=2 = in progress).
            sql = 'update bttiantang_url set `status`=2 where `id`=' + str(
                id)
            mysqlDao.execute(sql)
            headers = Headers.getHeaders()
            n = 0
            # Fix: the original left `contents` unbound when all five
            # attempts failed, raising NameError after the loop.
            contents = []
            while n < 5:
                # Fix: added a timeout -- the original could hang forever;
                # every other fetch in this project uses one.
                req = requests.get(url, headers=headers, timeout=30)
                if req.status_code == 200:
                    html = req.content
                    selector = etree.HTML(html)
                    contents = selector.xpath(
                        '//ul[contains(@class,"moviedteail_list")]')
                    if len(contents) > 0:
                        break
                n = n + 1
            if len(contents) > 0:
                content = contents[0]
            else:
                # Fix: close the per-iteration connection before retrying
                # with the next row (the original leaked it here).
                mysqlDao.close()
                continue

            def join_or_empty(items):
                # Comma-join xpath results; empty string when nothing
                # matched. Replaces eight duplicated if/else blocks.
                if len(items) > 0:
                    return ",".join(items)
                return ""

            names_chn = selector.xpath(
                '//div[contains(@class,"moviedteail_tt")]/h1/text()')
            names_eng = selector.xpath(
                '//div[contains(@class,"moviedteail_tt")]/span/text()')
            name_chn = ''
            name_eng = ''
            if len(names_chn) > 0:
                name_chn = names_chn[0]
            if len(names_eng) > 0:
                name_eng = names_eng[0]
            # Alternate titles ("又名"), poster images, tags, region, year,
            # staff and imdb id, each pulled from the labelled <li> rows.
            names_nick_new = join_or_empty(
                content.xpath('li[contains(text(),"%s")]/a/text()' % (u'又名')))
            imgs = simplejson.dumps(
                selector.xpath(
                    '//div[contains(@class,"moviedteail_img")]/a/img/@src'))
            tags_new = join_or_empty(
                content.xpath('li[contains(text(),"%s")]/a/text()' % (u'标签')))
            areas_new = join_or_empty(
                content.xpath('li[contains(text(),"%s")]/a/text()' % (u'地区')))
            years_new = join_or_empty(
                content.xpath('li[contains(text(),"%s")]/a/text()' % (u'年份')))
            directors_new = join_or_empty(
                content.xpath('li[contains(text(),"%s")]/a/text()' % (u'导演')))
            writers_new = join_or_empty(
                content.xpath('li[contains(text(),"%s")]/a/text()' % (u'编剧')))
            casts_new = join_or_empty(
                content.xpath('li[contains(text(),"%s")]/a/text()' % (u'主演')))
            imdbs_new = join_or_empty(
                content.xpath('li[contains(text(),"%s")]/a/text()' % (u'imdb')))
            details = self.getDetails(
                content.xpath('li[contains(text(),"%s")]/a/@href' % (u'详情')))
            if len(details) > 0:
                details_new = details[0]
            else:
                details_new = ""
            created_at = time.strftime('%Y-%m-%d %H:%M:%S')
            downloads = selector.xpath('//div[contains(@class,"tinfo")]')
            download = []
            for d in downloads:
                try:
                    dn_text = d.xpath('a[1]/@title')[0]
                    dn_url = d.xpath('a[1]/@href')[0]
                    download.append({dn_text: dn_url})
                except IndexError:
                    # Entry without an <a> link: skip it (was a bare except).
                    pass
            download_json = simplejson.dumps(download)
            sql_pattern = 'insert ignore INTO `bttiantang_content`(`names_chn`, `names_eng`,`names_nick`,`imgs`,`tags`, `areas`, `years`, `directors`, `writers`,`casts`, `imdbs`,`details`, `download`,`created_at`, `url`) VALUES(%s, %s, %s,%s,%s,%s, %s, %s,%s, %s,%s, %s,%s, %s, %s)'
            sql_values = (name_chn, name_eng, names_nick_new, imgs, tags_new,
                          areas_new, years_new, directors_new, writers_new,
                          casts_new, imdbs_new, details_new, download_json,
                          created_at, url)
            mysqlDao.executeValues(sql_pattern, sql_values)
            # Mark the row done.
            sql = 'update bttiantang_url set `status`=1 where `id`=' + str(
                id)
            mysqlDao.execute(sql)
            mysqlDao.close()