Beispiel #1
0
class Worker(threading.Thread):
    """Thread that scrapes m1905 detail pages and stores them in MySQL.

    Every queue item is a JSON array ``[id, category_id, url]``.  The page at
    ``url`` is downloaded, a few fields are extracted with XPath and a row is
    inserted into ``m1905_content``.  When a poster image was found, the
    source row in ``m1905_url`` is flagged with ``status = 1``.
    """

    def __init__(self, redisDao, mysqlDao):
        """Create the worker.

        :param redisDao: accepted for interface compatibility; not used here.
        :param mysqlDao: DAO object providing ``execute`` / ``executeValues``.
        """
        threading.Thread.__init__(self)
        # NOTE(review): the work queue is taken from a name ``queue`` that is
        # resolved outside this class (presumably a module-level queue
        # object) rather than from a constructor argument -- confirm.
        self._queue = queue
        self._mysqlDao = mysqlDao

    def run(self):
        """Drain the queue, scraping and storing one page per item.

        Returns as soon as the queue reports empty.
        """
        while True:
            print(self.name)
            if self._queue.empty():
                break
            ret = simplejson.loads(self._queue.get())
            url_id = ret[0]
            category_id = ret[1]
            content_url = ret[2]
            headers = Headers().getHeaders()
            print(content_url)
            # Reset per record: ``img`` is inspected after the try block, so
            # it must exist even when the request fails or is not a 200, and
            # it must never carry over from the previous record.
            img = ''
            try:
                req = requests.get(content_url, headers=headers, timeout=60)
                if req.status_code == 200:
                    selector = etree.HTML(req.content)
                    titles = selector.xpath('//*[@class="fl"]/a[1]/text()')
                    nicks = selector.xpath('//*[@class="fl"]/span[1]/a[1]/text()')
                    play_urls = selector.xpath('//*[@class="redBtn"]/a[1]/@href')
                    imgs = selector.xpath('//*[@class="imgBAyy db"]/descendant::img[1]/@src')
                    # One list of text nodes per non-empty <li> of the staff list.
                    contents = []
                    for c_li in selector.xpath('//*[@class="movStaff line_BSld"]/li'):
                        texts = c_li.xpath('descendant::text()')
                        if texts:
                            contents.append(texts)
                    # Stored title is "<title>,<nick>"; missing parts stay empty.
                    title = (titles[0] if titles else '') + ',' + (nicks[0] if nicks else '')
                    play_url = play_urls[0] if play_urls else ''
                    if imgs:
                        img = imgs[0]
                    content = simplejson.dumps(contents)
                    created_at = time.strftime('%Y-%m-%d %H:%M:%S')
                    if img != '':
                        # store the page in the content table
                        sql = 'insert ignore into m1905_content (`category_id`,`title`,`content`,`play_url`,`img`,`url`,`created_at`) VALUES (%s,%s,%s,%s,%s,%s,%s)'
                        values = (category_id, title, content, play_url, img, content_url, created_at)
                        print(title)
                        self._mysqlDao.executeValues(sql, values)
            except Exception as e:
                # Report instead of swallowing silently, then start over with
                # a fresh DAO as the original code did.
                print('%s: %s' % (type(e).__name__, e))
                self._mysqlDao = MysqlDao()
            if img != '':
                # flag the source URL as processed (status = 1)
                sql = 'update m1905_url set `status`=1 where `id`=' + str(url_id)
                self._mysqlDao.execute(sql)
Beispiel #2
0
def getUrl(url, page):
    # Fetch the listing page "<url><page>.html" and store every link found
    # under div.box3 in the loldytt_url table with status 0.
    # NOTE(review): `category_id`, used when building `values` below, is
    # neither a parameter nor assigned in this function; presumably a
    # module-level global -- confirm.
    url = url + str(page) + '.html'
    print(url)
    mysqlDao = MysqlDao()
    # NOTE(review): no except/finally clause for this outer `try` is visible
    # in this excerpt; the snippet appears to be cut off after the last line.
    try:
        n = 1
        # Retry the request until it succeeds; the pause grows by ten
        # seconds with every failed attempt (10s, 20s, 30s, ...).
        while True:
            try:
                headers = Headers.getHeaders()
                req = requests.get(url, headers=headers, timeout=10)
                break
            except Exception, e:
                print Exception, ":", e
                print('sleep')
                time.sleep(n * 10)
                n = n + 1
        # Non-200 responses are skipped without any message.
        if req.status_code == 200:
            html = req.content
            selector = etree.HTML(html)
            url_contents = selector.xpath('//div[@class="box3"]/descendant::a/@href')
            for url_content in url_contents:
                # `insert ignore` is used, so rows rejected by the table
                # (e.g. duplicates, if a unique key exists) do not raise.
                sql = 'insert ignore into loldytt_url (`category_id`,`url`,`status`,created_at) VALUES (%s,%s,%s,%s)'
                created_at = time.strftime('%Y-%m-%d %H:%M:%S')
                values = (category_id, url_content, 0, created_at)
                print(values)
                mysqlDao.executeValues(sql, values)
            # The DAO is closed only on the 200 path.
            mysqlDao.close()
Beispiel #3
0
 def run(self):
     """Insert 1000 random integers (1..10000) into ``yingshi_test``.

     The DAO is closed even when an insert raises.
     """
     mysqlDao = MysqlDao()
     sql = 'insert into yingshi_test (`name`) values (%s)'
     try:
         for _ in xrange(0, 1000):
             v = random.randint(1, 10000)
             print(sql)
             # Parameters go in as a sequence, like every other
             # executeValues call in this code base; a bare scalar is not a
             # valid parameter container for all MySQL drivers.
             mysqlDao.executeValues(sql, (v,))
     finally:
         mysqlDao.close()
Beispiel #4
0
def getUrl(url, category_id):
    print(url)
    if url == 'http://www.loldytt.com/Zuixinhanju/chart/26.html':
        url = 'http://www.loldytt.com/Zuixinhanju/chart/25.html'
    mysqlDao = MysqlDao()
    n = 1
    while True:
        try:
            headers = Headers.getHeaders()
            req = requests.get(url, headers=headers, timeout=10)
            break
        except Exception, e:
            print Exception, ":", e
            print('sleep')
            time.sleep(n * 10)
            n = n + 1
Beispiel #5
0
from headers import Headers
from lxml import etree
from mysqlpooldao import MysqlDao

# Python 2 only: make utf8 the interpreter-wide default string encoding.
reload(sys)
sys.setdefaultencoding('utf8')

# Every six hours, read the link titles from Baidu's "top" page and store
# each one as a root keyword (parent_id 0, status 0) in allsearch_key_word.
while True:
    try:
        url = 'http://top.baidu.com/'
        headers = Headers.getHeaders()
        req = requests.get(url, headers=headers, timeout=30)
        if req.status_code == 200:
            html = req.content.decode('gb2312', 'ignore')
            selector = etree.HTML(html)
            words = selector.xpath(
                '//div[@id="box-cont"]/descendant::a/@title')
            sql = 'insert ignore into allsearch_key_word (`word`,`parent_id`,`status`,`created_at`) VALUES (%s,%s,%s,%s)'
            # One DAO per polling round instead of one per word; the
            # finally clause releases it even if an insert fails.
            mysqlDao = MysqlDao()
            try:
                for word in words:
                    print(word)
                    created_at = time.strftime('%Y-%m-%d %H:%M:%S')
                    values = (word, 0, 0, created_at)
                    mysqlDao.executeValues(sql, values)
            finally:
                mysqlDao.close()
        else:
            print('code error')
    except Exception as e:
        # Keep the loop alive (best effort), but say what went wrong
        # instead of discarding the error.
        print('%s: %s' % (type(e).__name__, e))
    time.sleep(21600)
Beispiel #6
0
    req = requests.get(url, headers=headers, timeout=60)
    if req.status_code == 200:
        html = req.content
        selector = etree.HTML(html)
        content_urls = selector.xpath('//ul[@class="inqList pt18"]/li/a/@href')
        content_urls.reverse()
        for content_url in content_urls:
            content_url = Config.url_main + content_url
            created_at = time.strftime('%Y-%m-%d %H:%M:%S')
            sql = 'insert ignore into m1905_url (`category_id`,`url`,`status`,`created_at`) VALUES (%s,%s,%s,%s)'
            values = (category_id, content_url, 0, created_at)
            mysqlDao.executeValues(sql, values)


if __name__ == '__main__':
    # For every m1905 category, walk its listing pages from the last page
    # down to page 1 and hand each page URL to getContentUrl.
    mysqlDao = MysqlDao()
    sql = 'select `id`,`url` FROM m1905_category'
    categorys = mysqlDao.execute(sql)
    print(categorys)
    for category in categorys:
        category_id, category_url = category[0], category[1]
        last_page = getLastPage(category_url)
        # count down until the page number drops below 1
        while not last_page < 1:
            url = category_url + 'o0d0p%s.html' % (last_page)
            last_page -= 1
            print(url)
            getContentUrl(url, category_id, mysqlDao)
    mysqlDao.close()
 def run(self):
     """Pop URL records from the ``kansogou`` redis list and store each page.

     Every record is a JSON array ``[id, category_id, url, img]``.  The page
     is downloaded, title / play URL / description are extracted with the
     XPath set matching ``category_id`` and a row is inserted into
     ``kansogou_content``.  The source row in ``kansogou_url`` is set to
     ``status = 1`` for every record taken off the list, whether or not the
     page could be stored.  Returns when the list is empty.
     """
     # (titles, play URLs, description text) XPath expressions per category.
     xpaths_by_category = {
         # movies
         1: ('//*[@class="title txt-overflow"]/a[1]/text()',
             '//*[@class="title txt-overflow"]/a[1]/@href',
             '//*[@class="video-info"]/descendant::text()'),
         # TV series
         2: ('//*[@class="tt-mnc"]/text()',
             '//*[@class="tt-mnc"]/@href',
             '//*[@class="lines"]/descendant::text()'),
         # variety shows
         3: ('//*[@class="info"]/h1[1]/a[1]/text()',
             '//*[@class="info"]/h1[1]/a[1]/@href',
             '//*[@class="info"]/descendant::span/descendant::text()'),
         # animation
         4: ('//*[@class="title"]/a[1]/text()',
             '//*[@class="title"]/a[1]/@href',
             '//*[@class="video-info"]/descendant::text()'),
     }
     while True:
         print(self.getName())
         ret_json = self._redisDao.lpop('kansogou')
         if ret_json is None:
             break
         ret = simplejson.loads(ret_json)
         url_id = ret[0]
         category_id = ret[1]
         content_url = ret[2]
         img = ret[3]
         headers = Headers().getHeaders()
         print(content_url)
         try:
             req = requests.get(content_url, headers=headers, timeout=30)
             if req.status_code == 200:
                 xpaths = xpaths_by_category.get(category_id)
                 if xpaths is None:
                     # Previously an unknown category either raised a
                     # NameError or reused the lists scraped from the
                     # previous page; now nothing is stored for it.
                     print('unknown category_id: %s' % category_id)
                 else:
                     selector = etree.HTML(req.content)
                     titles, play_urls, contents = [
                         selector.xpath(xpath) for xpath in xpaths
                     ]
                     title = titles[0] if titles else ''
                     play_url = ''
                     if play_urls:
                         play_url = Config.url_main + play_urls[0]
                     content = simplejson.dumps(contents)
                     created_at = time.strftime('%Y-%m-%d %H:%M:%S')
                     # store the page in the content table
                     sql = 'insert ignore into kansogou_content (`category_id`,`title`,`content`,`play_url`,`img`,`url`,`created_at`) VALUES (%s,%s,%s,%s,%s,%s,%s)'
                     values = (category_id, title, content, play_url, img,
                               content_url, created_at)
                     print(title)
                     self._mysqlDao.executeValues(sql, values)
         except Exception as e:
             # Report the failure, then continue with a fresh DAO as before.
             print('%s: %s' % (type(e).__name__, e))
             self._mysqlDao = MysqlDao()
         # flag the source URL as processed (status = 1)
         sql = 'update kansogou_url set `status`=1 where `id`=' + str(url_id)
         self._mysqlDao.execute(sql)
                    created_at = time.strftime('%Y-%m-%d %H:%M:%S')
                    # store the page in the content table
                    sql = 'insert ignore into kansogou_content (`category_id`,`title`,`content`,`play_url`,`img`,`url`,`created_at`) VALUES (%s,%s,%s,%s,%s,%s,%s)'
                    values = (category_id, title, content, play_url, img,
                              content_url, created_at)
                    print(title)
                    self._mysqlDao.executeValues(sql, values)
            except:
                self._mysqlDao = MysqlDao()
            # flag the source URL as processed (status = 1)
            sql = 'update kansogou_url set `status`=1 where `id`=' + str(id)
            self._mysqlDao.execute(sql)


if __name__ == '__main__':
    # Driver: repeatedly load up to 100 unprocessed rows from kansogou_url,
    # push them onto the 'kansogou' redis list and start worker threads.
    mysqlDao = MysqlDao()
    redisDao = RedisDao()
    while True:
        sql = 'select `id`,`category_id`,`url`,`img` from kansogou_url WHERE `status`=0 limit 0,100'
        ret = mysqlDao.execute(sql)
        # nothing left to process: end the program
        if len(ret) == 0:
            break
        # push the MySQL rows onto the redis queue, one JSON array per row
        for r in ret:
            r_json = simplejson.dumps(r)
            redisDao.rpush('kansogou', r_json)
        # start the worker threads
        worker_num = 1
        threads = []
        # NOTE(review): the body of this loop is not visible in this
        # excerpt; the snippet is cut off here.
        for x in xrange(0, worker_num):
Beispiel #9
0
 def run(self):
     """Scrape pending ``loldytt_url`` rows into ``loldytt_content``.

     Each round takes one row with ``status = 0``, sets it to ``status = 2``
     while it is being worked on, downloads the page (up to five attempts),
     extracts title, cast, poster, description and the three kinds of
     download links, inserts the result and sets the row to ``status = 1``.
     A page for which no title could be obtained is left at ``status = 2``.
     When no pending row is left the thread exits through ``sys.exit()``.
     The DAO opened for a round is always closed again.
     """

     def pair_up(keys, values):
         # One single-entry dict per (link text, link target) pair; surplus
         # items on either side are dropped.
         return [{key: value} for key, value in zip(keys, values)]

     while True:
         print(self.name)
         mysqlDao = MysqlDao()
         try:
             sql = 'select * from loldytt_url WHERE `status`=0 limit 0,1'
             ret = mysqlDao.execute(sql)
             if len(ret) == 0:
                 # No sleeping here: exit and let crontab start us again.
                 print('game over')
                 sys.exit()
             res = ret[0]
             url_id = res[0]
             category_id = res[1]
             url = res[2]
             sql = 'update loldytt_url set `status`=2 where `id`=' + str(url_id)
             mysqlDao.execute(sql)
             headers = Headers.getHeaders()
             # Bound before the retry loop so that five failed attempts end
             # in a clean skip instead of a NameError.
             selector = None
             titles = []
             for _ in range(5):
                 try:
                     # A timeout keeps a stalled server from blocking the
                     # thread forever.
                     req = requests.get(url, headers=headers, timeout=30)
                 except requests.RequestException as e:
                     print('%s: %s' % (type(e).__name__, e))
                     continue
                 req.encoding = "gbk"
                 if req.status_code != 200:
                     continue
                 html = req.text.encode(encoding="utf-8",
                                        errors="ignore").decode(
                                            "utf-8", errors="ignore")
                 try:
                     selector = etree.HTML(html)
                 except Exception:
                     print(333)
                     continue
                 if selector is None:
                     continue
                 titles = selector.xpath(
                     '//div[contains(@class,"lm")]/h1/a/text()')
                 if titles:
                     break
             if not titles:
                 # No usable page: leave the row at status 2, try the next.
                 continue
             title = titles[0]
             casts = selector.xpath(
                 '//div[contains(@class,"zhuyan")]/ul[1]/li/text()')
             imgs = selector.xpath(
                 '//div[contains(@class,"haibao")]/a[1]/img/@src')
             cast = ''
             img = ''
             content = ''
             if casts:
                 # Keep the part after the first ':'; a line without a
                 # colon yields an empty cast instead of an IndexError.
                 cast_parts = casts[0].split(':')
                 if len(cast_parts) > 1:
                     cast = cast_parts[1]
             if imgs:
                 img = imgs[0]
             contents = selector.xpath(
                 '//div[@class="neirong"]/descendant::text()')
             if contents:
                 content = simplejson.dumps(contents)
             created_at = time.strftime('%Y-%m-%d %H:%M:%S')
             xunlei_download = pair_up(
                 selector.xpath(
                     '//*[contains(@id,"jishu")]/descendant::a[contains(@href,"thunder")]/text()'
                 ),
                 selector.xpath(
                     '//*[contains(@id,"jishu")]/descendant::a[contains(@href,"thunder")]/@href'
                 ))
             bt_download = pair_up(
                 selector.xpath(
                     '//*[contains(@id,"bt")]/descendant::a[contains(@href,"thunder")]/text()'
                 ),
                 selector.xpath(
                     '//*[contains(@id,"bt")]/descendant::a[contains(@href,"thunder")]/@href'
                 ))
             magnet_download = pair_up(
                 selector.xpath('//a[contains(@href,"magnet")]/text()'),
                 selector.xpath('//a[contains(@href,"magnet")]/@href'))
             xunlei_download_json = simplejson.dumps(xunlei_download)
             bt_download_json = simplejson.dumps(bt_download)
             magnet_download_json = simplejson.dumps(magnet_download)
             sql_pattern = 'insert ignore INTO `loldytt_content`(`category_id`, `title`,`cast`,`img`,`xunlei_download`, `bt_download`, `magnet_download`, `content`, `url`,`created_at`) VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
             sql_values = (category_id, title, cast, img,
                           xunlei_download_json, bt_download_json,
                           magnet_download_json, content, url, created_at)
             print(title)
             mysqlDao.executeValues(sql_pattern, sql_values)
             sql = 'update loldytt_url set `status`=1 where `id`=' + str(url_id)
             mysqlDao.execute(sql)
         finally:
             # Also runs on `continue`, sys.exit() and errors, so the
             # per-round DAO is never leaked.
             mysqlDao.close()
Beispiel #10
0
 def run(self):
     """Process pending ``ygdy8_url`` rows one at a time until none is left.

     A row is set to ``status = 2`` when picked up.  If ``getContent``
     returns values they are inserted into ``ygdy8_content`` and the row is
     set to ``status = 1``.  A fresh DAO is opened and closed per round.
     """
     select_sql = 'select * from ygdy8_url WHERE `status`=0 limit 0,1'
     insert_sql = 'insert ignore INTO `ygdy8_content`(`category_id`,`name`, `content`, `img`,`created_at`, `url`) VALUES( %s, %s, %s, %s, %s, %s)'
     while True:
         print(self.name)
         dao = MysqlDao()
         rows = dao.execute(select_sql)
         if not len(rows) > 0:
             # nothing pending: release the DAO and stop
             dao.close()
             break
         row = rows[0]
         url_id = row[0]
         # mark the row as being worked on
         dao.execute(
             'update ygdy8_url set `status`=2 where `id`=' + str(url_id))
         category_id = row[1]
         url = row[2]
         sql_values = self.getContent(url, category_id)
         if sql_values is not None:
             dao.executeValues(insert_sql, sql_values)
             # mark the row as done
             dao.execute(
                 'update ygdy8_url set `status`=1 where `id`=' + str(url_id))
         dao.close()
Beispiel #11
0
 def run(self):
     """Expand pending keywords in ``allsearch_key_word`` forever.

     Each round takes one row with ``status = 0``, sets it to ``status = 2``,
     asks ``getSearch`` for related words and inserts each of them as a child
     row (``parent_id`` = the row's id, ``status = 0``).  When nothing is
     pending the thread sleeps for an hour.  The loop has no regular exit;
     the DAO is closed when an exception ends it.
     """
     mysqlDao = MysqlDao()
     try:
         while True:
             print(self.name)
             sql = 'select * from allsearch_key_word WHERE `status`=0 limit 0,1'
             ret = mysqlDao.execute(sql)
             if len(ret) > 0:
                 res = ret[0]
                 word_id = res[0]
                 sql = 'update allsearch_key_word set `status`=2 where `id`=' + str(
                     word_id)
                 mysqlDao.execute(sql)
                 word = res[1]
                 sql_values = self.getSearch(word)
                 for sql_value in sql_values:
                     created_at = time.strftime('%Y-%m-%d %H:%M:%S')
                     values = (sql_value, word_id, 0, created_at)
                     sql = 'insert ignore into allsearch_key_word (`word`,`parent_id`,`status`,`created_at`) VALUES (%s,%s,%s,%s)'
                     mysqlDao.executeValues(sql, values)
                 # NOTE(review): this writes status 2 a second time; sibling
                 # crawlers write 1 once a row is finished.  Kept as-is
                 # because the intended final status cannot be told from
                 # here -- confirm.
                 sql = 'update allsearch_key_word set `status`=2 where `id`=' + str(
                     word_id)
                 mysqlDao.execute(sql)
             else:
                 print(self.name + 'sleep')
                 time.sleep(3600)
     finally:
         # The close() after the endless loop could never run; in a finally
         # clause it at least runs when the loop dies with an exception.
         mysqlDao.close()
Beispiel #12
0
 def run(self):
     """Scrape pending ``bttiantang_url`` rows into ``bttiantang_content``.

     Each round takes one row with ``status = 0``, sets it to ``status = 2``
     while it is being worked on, downloads the page (up to five attempts),
     extracts names, poster images, the labelled detail fields and the
     download links, inserts the result and sets the row to ``status = 1``.
     A page without the detail list is left at ``status = 2``.  When no
     pending row is left the thread exits through ``sys.exit()``.  The DAO
     opened for a round is always closed again.
     """

     def linked_text(node, label):
         # Comma-joined link texts of the <li> under `node` whose own text
         # contains `label`; an empty string when there is none.
         return ",".join(
             node.xpath('li[contains(text(),"%s")]/a/text()' % label))

     while True:
         print(self.name)
         mysqlDao = MysqlDao()
         try:
             sql = 'select * from bttiantang_url WHERE `status`=0 limit 0,1'
             ret = mysqlDao.execute(sql)
             # keep the last returned row (at most one because of the limit)
             res = []
             for r in ret:
                 res = r
             print(res)
             if len(res) == 0:
                 print('sleep')
                 # No sleeping here: exit and let crontab start us again.
                 print('game over')
                 sys.exit()
             url_id = res[0]
             url = res[1]
             sql = 'update bttiantang_url set `status`=2 where `id`=' + str(
                 url_id)
             mysqlDao.execute(sql)
             headers = Headers.getHeaders()
             # Bound before the retry loop so that five failed attempts end
             # in a clean skip instead of a NameError.
             selector = None
             contents = []
             for _ in range(5):
                 try:
                     # A timeout keeps a stalled server from blocking the
                     # thread forever.
                     req = requests.get(url, headers=headers, timeout=30)
                 except requests.RequestException as e:
                     print('%s: %s' % (type(e).__name__, e))
                     continue
                 if req.status_code != 200:
                     continue
                 selector = etree.HTML(req.content)
                 if selector is None:
                     continue
                 contents = selector.xpath(
                     '//ul[contains(@class,"moviedteail_list")]')
                 if contents:
                     break
             if not contents:
                 # No usable page: leave the row at status 2, try the next.
                 continue
             content = contents[0]
             names_chn = selector.xpath(
                 '//div[contains(@class,"moviedteail_tt")]/h1/text()')
             names_eng = selector.xpath(
                 '//div[contains(@class,"moviedteail_tt")]/span/text()')
             name_chn = names_chn[0] if names_chn else ''
             name_eng = names_eng[0] if names_eng else ''
             names_nick_new = linked_text(content, u'又名')
             imgs = simplejson.dumps(
                 selector.xpath(
                     '//div[contains(@class,"moviedteail_img")]/a/img/@src')
             )
             tags_new = linked_text(content, u'标签')
             areas_new = linked_text(content, u'地区')
             years_new = linked_text(content, u'年份')
             directors_new = linked_text(content, u'导演')
             writers_new = linked_text(content, u'编剧')
             casts_new = linked_text(content, u'主演')
             imdbs_new = linked_text(content, u'imdb')
             details = self.getDetails(
                 content.xpath('li[contains(text(),"%s")]/a/@href' %
                               (u'详情')))
             details_new = details[0] if details else ""
             created_at = time.strftime('%Y-%m-%d %H:%M:%S')
             download = []
             for d in selector.xpath('//div[contains(@class,"tinfo")]'):
                 dn_texts = d.xpath('a[1]/@title')
                 dn_urls = d.xpath('a[1]/@href')
                 # entries lacking a title or a link are skipped
                 if dn_texts and dn_urls:
                     download.append({dn_texts[0]: dn_urls[0]})
             download_json = simplejson.dumps(download)
             sql_pattern = 'insert ignore INTO `bttiantang_content`(`names_chn`, `names_eng`,`names_nick`,`imgs`,`tags`, `areas`, `years`, `directors`, `writers`,`casts`, `imdbs`,`details`, `download`,`created_at`, `url`) VALUES(%s, %s, %s,%s,%s,%s, %s, %s,%s, %s,%s, %s,%s, %s, %s)'
             sql_values = (name_chn, name_eng, names_nick_new, imgs,
                           tags_new, areas_new, years_new, directors_new,
                           writers_new, casts_new, imdbs_new, details_new,
                           download_json, created_at, url)
             mysqlDao.executeValues(sql_pattern, sql_values)
             sql = 'update bttiantang_url set `status`=1 where `id`=' + str(
                 url_id)
             mysqlDao.execute(sql)
         finally:
             # Also runs on `continue`, sys.exit() and errors, so the
             # per-round DAO is never leaked.
             mysqlDao.close()