Code Example #1
File: getloldytturlnew.py  Project: xyyanxin/loldytt
def getUrl(url, page):
    url = url + str(page) + '.html'
    print(url)
    mysqlDao = MysqlDao()
    try:
        n = 1
        while True:
            try:
                headers = Headers.getHeaders()
                req = requests.get(url, headers=headers, timeout=10)
                break
            except Exception as e:
                print('Exception:', e)
                print('sleep')
                time.sleep(n * 10)
                n = n + 1
        if req.status_code == 200:
            html = req.content
            selector = etree.HTML(html)
            url_contents = selector.xpath('//div[@class="box3"]/descendant::a/@href')
            for url_content in url_contents:
                sql = 'insert ignore into loldytt_url (`category_id`,`url`,`status`,created_at) VALUES (%s,%s,%s,%s)'
                created_at = time.strftime('%Y-%m-%d %H:%M:%S')
                # `category_id` is not defined in this function; the original
                # script presumably reads it from module scope
                values = (category_id, url_content, 0, created_at)
                print(values)
                mysqlDao.executeValues(sql, values)
            mysqlDao.close()
    except Exception as e:
        print('Exception:', e)
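
Every example on this page leans on two project-local helpers that the excerpts never show: Headers.getHeaders() and MysqlDao. A minimal sketch of what they might look like, assuming pymysql for the database layer; the connection parameters and the fixed User-Agent are placeholders, not values from the repo:

import pymysql


class Headers(object):
    @staticmethod
    def getHeaders():
        # The real helper presumably rotates User-Agents; this one is fixed.
        return {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}


class MysqlDao(object):
    def __init__(self):
        # Placeholder credentials and schema; adjust to the actual environment.
        self.conn = pymysql.connect(host='localhost', user='root', password='',
                                    db='spider', charset='utf8mb4')

    def execute(self, sql):
        with self.conn.cursor() as cur:
            cur.execute(sql)
            self.conn.commit()
            return cur.fetchall()

    def executeValues(self, sql, values):
        with self.conn.cursor() as cur:
            cur.execute(sql, values)
            self.conn.commit()

    def close(self):
        self.conn.close()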
Code Example #2
 def getContent(self, url, category_id):
     headers = Headers.getHeaders()
     sleep_time = 1
     while True:
         try:
             req = requests.get(url, headers=headers, timeout=30)
             break
         except Exception:  # avoid a bare except that would also swallow KeyboardInterrupt
             print('sleep10')
             time.sleep(10 * sleep_time)
             sleep_time = sleep_time + 1
     if req.status_code == 200:
         html = req.content.decode('gb2312', 'ignore')
         selector = etree.HTML(html)
         root_path = selector.xpath('//div[contains(@id,"Zoom")]')
         names = selector.xpath(
             '//div[contains(@class,"title_all")]/h1/font/text()')
         if len(root_path) > 0:
             contents = simplejson.dumps(
                 root_path[0].xpath('descendant::text()'))
             imgs = simplejson.dumps(
                 root_path[0].xpath('descendant::img/@src'))
             if len(names) > 0:
                 name = names[0]
             else:
                 name = ''
             print(name)
             created_at = time.strftime('%Y-%m-%d %H:%M:%S')
             sql_values = (category_id, name, contents, imgs, created_at,
                           url)
             return sql_values
     # implicitly returns None when the request fails
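
getContent() returns a tuple shaped for a parameterized INSERT, or None when the fetch or parse fails. A sketch of a plausible caller; the table name, column list, and the `spider` instance are illustrative assumptions, not code from the repo:

# Illustrative caller; `spider` is a hypothetical instance of the class above,
# and the table/column names are guesses at the schema.
url = 'http://www.loldytt.com/Zuixinhanju/chart/25.html'  # placeholder detail URL
dao = MysqlDao()
sql = ('insert ignore into loldytt_content '
       '(`category_id`,`name`,`contents`,`imgs`,`created_at`,`url`) '
       'VALUES (%s,%s,%s,%s,%s,%s)')
sql_values = spider.getContent(url, 1)
if sql_values is not None:  # None means the request or the XPath lookup failed
    dao.executeValues(sql, sql_values)
dao.close()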
Code Example #3
 def bing(self, word):
     ret = []
     headers = Headers.getHeaders()
     url = 'http://global.bing.com/search?q=' + word
     req = requests.get(url, headers=headers, timeout=30)
     if req.status_code == 200:
         html = req.content
         selector = etree.HTML(html)
         words = selector.xpath(
             '//li[@class="b_ans"]/ul/li/a/descendant::text()')
         ret.extend(words)
     return ret
Code Example #4
 def baidu(self, word):
     ret = []
     url = 'http://m.baidu.com/s?word=' + word
     headers = Headers.getHeaders()
     req = requests.get(url, headers=headers, timeout=30)
     if req.status_code == 200:
         html = req.content
         selector = etree.HTML(html)
         words = selector.xpath('//div[@class="rw-list"]/a/text()')
         ret.extend(words)
     print('baidu', ret)
     return ret
Code Example #5
File: getloldytturl.py  Project: xyyanxin/loldytt
def getUrlLast(url):
    n = 1
    print(url)
    while True:
        try:
            headers = Headers.getHeaders()
            req = requests.get(url, headers=headers, timeout=10)
            break
        except Exception as e:
            print('Exception:', e)
            print('sleep')
            time.sleep(n * 10)
            n = n + 1
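
Examples #1, #2, #5 and #10 all inline the same retry loop: on failure, sleep n * 10 seconds and grow n. A sketch of how it could be factored out; get_with_retry is our name for it, not something in the repo:

import time

import requests

from headers import Headers  # project-local helper, as in the examples above


def get_with_retry(url, timeout=10):
    """Fetch url, sleeping n * 10 seconds after the n-th failure, as the loops above do."""
    n = 1
    while True:
        try:
            headers = Headers.getHeaders()
            return requests.get(url, headers=headers, timeout=timeout)
        except Exception as e:
            print('Exception:', e)
            print('sleep')
            time.sleep(n * 10)
            n = n + 1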
Code Example #6
 def so(self, word):
     ret = []
     headers = Headers.getHeaders()
     headers['Referer'] = 'https://www.so.com/'
     url = 'http://www.so.com/s?q=' + word
     req = requests.get(url, headers=headers, timeout=30)
     if req.status_code == 200:
         html = req.content
         selector = etree.HTML(html)
         words = selector.xpath('//div[@id="rs"]/table/tr/th/a/text()')
         ret.extend(words)
     print('so', ret)
     return ret
Code Example #7
 def sogou(self, word):
     ret = []
     headers = Headers.getHeaders()
     headers['Referer'] = 'https://www.sogou.com/'
     url = 'http://m.sogou.com/web/searchList.jsp?pg=webSearchList&v=2&keyword=' + word
     req = requests.get(url, headers=headers, timeout=30)
     if req.status_code == 200:
         html = req.content
         selector = etree.HTML(html)
         words = selector.xpath('//div[@class="bc relate"]/a/text()')
         ret.extend(words)
     print('sogou', ret)
     return ret
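
bing(), baidu(), so() and sogou() differ only in the URL prefix, the XPath for the suggestion links, and an optional Referer header. A sketch of one table-driven version; the ENGINES mapping is distilled from the four methods and is our construction, not code from the repo:

import requests
from lxml import etree

from headers import Headers  # project-local helper, as in the examples above

# (url prefix, suggestion XPath, optional Referer) per engine
ENGINES = {
    'bing': ('http://global.bing.com/search?q=',
             '//li[@class="b_ans"]/ul/li/a/descendant::text()', None),
    'baidu': ('http://m.baidu.com/s?word=',
              '//div[@class="rw-list"]/a/text()', None),
    'so': ('http://www.so.com/s?q=',
           '//div[@id="rs"]/table/tr/th/a/text()', 'https://www.so.com/'),
    'sogou': ('http://m.sogou.com/web/searchList.jsp?pg=webSearchList&v=2&keyword=',
              '//div[@class="bc relate"]/a/text()', 'https://www.sogou.com/'),
}


def suggest(engine, word):
    base_url, xpath, referer = ENGINES[engine]
    headers = Headers.getHeaders()
    if referer:
        headers['Referer'] = referer
    req = requests.get(base_url + word, headers=headers, timeout=30)
    if req.status_code == 200:
        return etree.HTML(req.content).xpath(xpath)
    return []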
Code Example #8
def getContentUrl(url, category_id, mysqlDao):
    headers = Headers.getHeaders()
    req = requests.get(url, headers=headers, timeout=60)
    if req.status_code == 200:
        html = req.content
        selector = etree.HTML(html)
        content_urls = selector.xpath('//ul[@class="inqList pt18"]/li/a/@href')
        content_urls.reverse()
        for content_url in content_urls:
            content_url = Config.url_main + content_url
            created_at = time.strftime('%Y-%m-%d %H:%M:%S')
            sql = 'insert ignore into m1905_url (`category_id`,`url`,`status`,`created_at`) VALUES (%s,%s,%s,%s)'
            values = (category_id, content_url, 0, created_at)
            mysqlDao.executeValues(sql, values)
Code Example #9
def getLastPage(url):
    last_page = 10
    headers = Headers.getHeaders()
    req = requests.get(url, headers=headers, timeout=30)
    if req.status_code == 200:
        html = req.content
        selector = etree.HTML(html)
        movie_count_text = selector.xpath(
            '//div[@class="termsBox"]/div[1]/text()')
        if len(movie_count_text) > 0:
            # keep only the digit characters, then ceiling-divide by 30 titles per page
            digits = ''.join(ch for ch in movie_count_text[0] if ch.isdigit())
            movie_count = int(digits)
            last_page = movie_count // 30
            if movie_count % 30 > 0:
                last_page = last_page + 1
    return last_page
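
The tail of getLastPage() is just ceiling division of movie_count by the 30 titles shown per page; the same result in one line:

movie_count = 61                      # e.g. 61 titles at 30 per page
last_page = (movie_count + 29) // 30  # -> 3, same as the // and % steps above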
Code Example #10
File: getloldytturl.py  Project: xyyanxin/loldytt
def getUrl(url, category_id):
    print(url)
    # special-case: page 26 of this chart is apparently broken upstream,
    # so fall back to page 25
    if url == 'http://www.loldytt.com/Zuixinhanju/chart/26.html':
        url = 'http://www.loldytt.com/Zuixinhanju/chart/25.html'
    mysqlDao = MysqlDao()
    n = 1
    while True:
        try:
            headers = Headers.getHeaders()
            req = requests.get(url, headers=headers, timeout=10)
            break
        except Exception as e:
            print('Exception:', e)
            print('sleep')
            time.sleep(n * 10)
            n = n + 1
Code Example #11
def baiduzhidaosearch(keyword, page):
    ret = {
        'code': 1002,
        'msg': 'failure',
        'data': []
    }
    try:
        page = int(page) * 10
        print(111)
        keyword_u = keyword.encode('utf-8')
        print(chardet.detect(keyword_u))
        # url = 'http://zhidao.baidu.com/search?word=%s&ie=gbk&site=-1&sites=0&date=0&pn=%s' % (keyword.encode('utf-8 ').decode('gbk','ignore'), page)
        url = u'http://zhidao.baidu.com/search?ct=17&pn=%s&tn=ikaslist&rn=10&word=%s' % (page, keyword)
        #print(url)
        print(222)
        headers = Headers.getHeaders()
        proxies = Proxies.get_proxies()
        req = requests.get(url, headers=headers, timeout=60, proxies=proxies)
        if req.status_code == 200:
            ret['code'] = 1001
            ret['msg'] = 'success'
            id = []
            title = []
            req.encoding = 'gbk'
            html = req.text.encode(encoding="utf-8", errors="ignore").decode("utf-8", errors="ignore")
            selector = etree.HTML(html)
            urls = selector.xpath('//div[@class="list"]/dl/dt[1]/a/@href')
            for u in urls:
                match_obj = re.search(r'question/(.*?).html', u, re.M | re.I)
                if match_obj:  # a non-matching href would otherwise raise AttributeError
                    id.append(match_obj.group(1))
            titles = selector.xpath('//div[@class="list"]/dl/dt[1]/a')
            for t in titles:
                # encoding='unicode' makes tostring() return str, so re.search below works
                title.append(etree.tostring(t, encoding='unicode', method='html'))
            for cid, t in zip(id, title):
                m = re.search(r'"ti">(.*?)</a>', t, re.M | re.I)
                if m:
                    ret['data'].append({'cid': cid, 'title': m.group(1)})
    except Exception as e:
        print(e)
    return simplejson.dumps(ret)
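
baiduzhidaosearch() returns a JSON string; a sketch of a plausible caller (the keyword is arbitrary):

import simplejson

raw = baiduzhidaosearch(u'python', 1)  # page 1 -> pn=10
result = simplejson.loads(raw)
if result['code'] == 1001:
    for item in result['data']:
        print(item['cid'], item['title'])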
Code Example #12
File: getloldytturlnew.py  Project: xyyanxin/loldytt
def getPageCount(url, first_page):
    num = 0
    headers = Headers.getHeaders()
    try:
        url = url + str(first_page) + '.html'
        req = requests.get(url, headers=headers, timeout=30)
        if req.status_code == 200:
            html = req.content
            selector = etree.HTML(html)
            page_counts = selector.xpath('//div[@class="pagebox"]/span/text()')
            if len(page_counts) > 0:
                page_count = page_counts[0]
                # the pager text looks like '...N部...M页'; keep the digits
                # that sit between '部' and '页'
                page_count_list = page_count.split(u'部')
                if len(page_count_list) >= 2:
                    tail = page_count_list[1]
                    pages = tail.split(u'页')
                    if len(pages) > 0:
                        page = pages[0]
                        if page.isdigit():
                            num = int(page)
    except Exception as e:
        print('Exception:', e)
    return num
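
The nested split(u'部') / split(u'页') walk pulls out the digits between '部' and '页' in the pager text; a regular-expression sketch of the same parse (the sample string is an assumption about the page's format):

import re


def parse_page_count(text):
    # e.g. u'共1234部 共42页' -> 42; returns 0 when the pattern is absent.
    m = re.search(r'部.*?(\d+)页', text)
    return int(m.group(1)) if m else 0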
Code Example #13
Keyword search
'''
import sys
import time
import requests
from headers import Headers
from lxml import etree
from mysqlpooldao import MysqlDao

# Python 2's reload(sys) / sys.setdefaultencoding('utf8') hack is not needed on
# Python 3, where str is Unicode by default.

while True:
    try:
        url = 'http://top.baidu.com/'
        headers = Headers.getHeaders()
        req = requests.get(url, headers=headers, timeout=30)
        if req.status_code == 200:
            html = req.content.decode('gb2312', 'ignore')
            selector = etree.HTML(html)
            words = selector.xpath(
                '//div[@id="box-cont"]/descendant::a/@title')
            for word in words:
                print(word)
                mysqlDao = MysqlDao()
                sql = 'insert ignore into allsearch_key_word (`word`,`parent_id`,`status`,`created_at`) VALUES (%s,%s,%s,%s)'
                created_at = time.strftime('%Y-%m-%d %H:%M:%S')
                values = (word, 0, 0, created_at)
                mysqlDao.executeValues(sql, values)
                mysqlDao.close()
        else:
            pass  # the excerpt is truncated here
    except Exception as e:
        print('Exception:', e)
Code Example #14
File: getloldyttcontent.py  Project: xyyanxin/loldytt
 def run(self):
     while True:
         print(self.name)
         mysqlDao = MysqlDao()
         sql = 'select * from loldytt_url WHERE `status`=0 limit 0,1'
         ret = mysqlDao.execute(sql)
         if len(ret) == 0:
             mysqlDao.close()
             """
             不用睡眠直接退出等crontab唤醒
             """
             print('game over')
             sys.exit()
         else:
             res = ret[0]
             id = res[0]
             category_id = res[1]
             url = res[2]
             sql = 'update loldytt_url set `status`=2 where `id`=' + str(id)
             mysqlDao.execute(sql)
             headers = Headers.getHeaders()
             n = 0
             titles = []  # ensure `titles` exists even if all five attempts fail
             while n < 5:
                 req = requests.get(url, headers=headers)
                 req.encoding = "gbk"
                 if req.status_code == 200:
                     html = req.text.encode(encoding="utf-8",
                                            errors="ignore").decode(
                                                "utf-8", errors="ignore")
                     try:
                         selector = etree.HTML(html)
                     except Exception:
                         print(333)
                         n = n + 1
                         continue  # bad HTML: retry rather than use an undefined selector
                     titles = selector.xpath(
                         '//div[contains(@class,"lm")]/h1/a/text()')
                     if len(titles) > 0:
                         break
                 n = n + 1
             if len(titles) > 0:
                 title = titles[0]
             else:
                 continue
             casts = selector.xpath(
                 '//div[contains(@class,"zhuyan")]/ul[1]/li/text()')
             imgs = selector.xpath(
                 '//div[contains(@class,"haibao")]/a[1]/img/@src')
             cast = ''
             img = ''
             content = ''
             if len(casts) > 0:
                 cast = casts[0].split(':')[1]
             if len(imgs) > 0:
                 img = imgs[0]
             contents = selector.xpath(
                 '//div[@class="neirong"]/descendant::text()')
             if len(contents) > 0:
                 content = simplejson.dumps(contents)
             created_at = time.strftime('%Y-%m-%d %H:%M:%S')
             xunlei_download_keys = selector.xpath(
                 '//*[contains(@id,"jishu")]/descendant::a[contains(@href,"thunder")]/text()'
             )
             xunlei_download_values = selector.xpath(
                 '//*[contains(@id,"jishu")]/descendant::a[contains(@href,"thunder")]/@href'
             )
             bt_download_keys = selector.xpath(
                 '//*[contains(@id,"bt")]/descendant::a[contains(@href,"thunder")]/text()'
             )
             bt_download_values = selector.xpath(
                 '//*[contains(@id,"bt")]/descendant::a[contains(@href,"thunder")]/@href'
             )
             magnet_download_keys = selector.xpath(
                 '//a[contains(@href,"magnet")]/text()')
             magnet_download_values = selector.xpath(
                 '//a[contains(@href,"magnet")]/@href')
             xunlei_download = []
             bt_download = []
             magnet_download = []
             try:
                 for k, v in zip(xunlei_download_keys, xunlei_download_values):
                     xunlei_download.append({k: v})
                 for k, v in zip(bt_download_keys, bt_download_values):
                     bt_download.append({k: v})
                 for k, v in zip(magnet_download_keys, magnet_download_values):
                     magnet_download.append({k: v})
             except Exception as e:
                 print('Exception:', e)
             xunlei_download_json = simplejson.dumps(xunlei_download)
             bt_download_json = simplejson.dumps(bt_download)
             magnet_download_json = simplejson.dumps(magnet_download)
             sql_pattern = 'insert ignore INTO `loldytt_content`(`category_id`, `title`,`cast`,`img`,`xunlei_download`, `bt_download`, `magnet_download`, `content`, `url`,`created_at`) VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
             sql_values = (category_id, title, cast, img,
                           xunlei_download_json, bt_download_json,
                           magnet_download_json, content, url, created_at)
             print(title)
             mysqlDao.executeValues(sql_pattern, sql_values)
             sql = 'update loldytt_url set `status`=1 where `id`=' + str(id)
             mysqlDao.execute(sql)
             mysqlDao.close()
Code Example #15
 def run(self):
     while True:
         print(self.name)
         mysqlDao = MysqlDao()
         sql = 'select * from bttiantang_url WHERE `status`=0 limit 0,1'
         ret = mysqlDao.execute(sql)
         res = ret[-1] if len(ret) > 0 else []
         print(res)
         if len(res) == 0:
             print('sleep')
             # sql = 'update yingshi_bttiantang_url set `status`=0 WHERE `status`=2'
             # database.mysqlExecute(sql)
             mysqlDao.close()
             # time.sleep(21600)
             # continue
             """
             不用睡眠直接退出等crontab唤醒
             """
             print('game over')
             sys.exit()
         else:
             id = res[0]
             url = res[1]
             sql = 'update bttiantang_url set `status`=2 where `id`=' + str(
                 id)
             mysqlDao.execute(sql)
             headers = Headers.getHeaders()
             n = 0
             contents = []  # ensure `contents` exists even if all five attempts fail
             while n < 5:
                 req = requests.get(url, headers=headers)
                 if req.status_code == 200:
                     html = req.content
                     selector = etree.HTML(html)
                     contents = selector.xpath(
                         '//ul[contains(@class,"moviedteail_list")]')
                     if len(contents) > 0:
                         break
                 n = n + 1
             if len(contents) > 0:
                 content = contents[0]
             else:
                 continue
             names_chn = selector.xpath(
                 '//div[contains(@class,"moviedteail_tt")]/h1/text()')
             names_eng = selector.xpath(
                 '//div[contains(@class,"moviedteail_tt")]/span/text()')
             name_chn = ''
             name_eng = ''
             if len(names_chn) > 0:
                 name_chn = names_chn[0]
             if len(names_eng) > 0:
                 name_eng = names_eng[0]
             names_nick = content.xpath(
                 'li[contains(text(),"%s")]/a/text()' % (u'又名'))
             if len(names_nick) > 0:
                 names_nick_new = ",".join(names_nick)
             else:
                 names_nick_new = ""
             imgs = simplejson.dumps(
                 selector.xpath(
                     '//div[contains(@class,"moviedteail_img")]/a/img/@src')
             )
             tags = content.xpath('li[contains(text(),"%s")]/a/text()' %
                                  (u'标签'))
             if len(tags) > 0:
                 tags_new = ",".join(tags)
             else:
                 tags_new = ""
             areas = content.xpath('li[contains(text(),"%s")]/a/text()' %
                                   (u'地区'))
             if len(areas) > 0:
                 areas_new = ",".join(areas)
             else:
                 areas_new = ""
             years = content.xpath('li[contains(text(),"%s")]/a/text()' %
                                   (u'年份'))
             if len(years) > 0:
                 years_new = ",".join(years)
             else:
                 years_new = ""
             directors = content.xpath(
                 'li[contains(text(),"%s")]/a/text()' % (u'导演'))
             if len(directors) > 0:
                 directors_new = ",".join(directors)
             else:
                 directors_new = ""
             writers = content.xpath('li[contains(text(),"%s")]/a/text()' %
                                     (u'编剧'))
             if len(writers) > 0:
                 writers_new = ",".join(writers)
             else:
                 writers_new = ""
             casts = content.xpath('li[contains(text(),"%s")]/a/text()' %
                                   (u'主演'))
             if len(casts) > 0:
                 casts_new = ",".join(casts)
             else:
                 casts_new = ""
             imdbs = content.xpath('li[contains(text(),"%s")]/a/text()' %
                                   (u'imdb'))
             if len(imdbs) > 0:
                 imdbs_new = ",".join(imdbs)
             else:
                 imdbs_new = ""
             details = self.getDetails(
                 content.xpath('li[contains(text(),"%s")]/a/@href' %
                               (u'详情')))
             if len(details) > 0:
                 details_new = details[0]
             else:
                 details_new = ""
             created_at = time.strftime('%Y-%m-%d %H:%M:%S')
             downloads = selector.xpath('//div[contains(@class,"tinfo")]')
             download = []
             for d in downloads:
                 try:
                     dn_text = d.xpath('a[1]/@title')[0]
                     dn_url = d.xpath('a[1]/@href')[0]
                     download.append({dn_text: dn_url})
                 except:
                     pass
             download_json = simplejson.dumps(download)
             sql_pattern = 'insert ignore INTO `bttiantang_content`(`names_chn`, `names_eng`,`names_nick`,`imgs`,`tags`, `areas`, `years`, `directors`, `writers`,`casts`, `imdbs`,`details`, `download`,`created_at`, `url`) VALUES(%s, %s, %s,%s,%s,%s, %s, %s,%s, %s,%s, %s,%s, %s, %s)'
             sql_values = (name_chn, name_eng, names_nick_new, imgs,
                           tags_new, areas_new, years_new, directors_new,
                           writers_new, casts_new, imdbs_new, details_new,
                           download_json, created_at, url)
             mysqlDao.executeValues(sql_pattern, sql_values)
             sql = 'update bttiantang_url set `status`=1 where `id`=' + str(
                 id)
             mysqlDao.execute(sql)
             mysqlDao.close()
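
Both run() methods above have the shape of threading.Thread workers that claim one row at a time from a URL table (status 0 -> 2 -> 1) and stop when the table is drained. A minimal launcher sketch; the class name ContentSpider is ours, and run() stands in for either method above. Note that sys.exit() inside run() raises SystemExit in that thread only, so each worker ends independently:

import threading


class ContentSpider(threading.Thread):
    """Hypothetical wrapper; run() would be Code Example #14 or #15."""

    def run(self):
        pass  # body as in the examples above


if __name__ == '__main__':
    workers = [ContentSpider(name='spider-%d' % i) for i in range(4)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()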