Example #1
 def get_url(self):
     main_url = 'https://www.piaohua.com/html/%s/index.html' % self.__ftype
     ourl = openurl.OpenUrl(main_url)
     code, main_content = ourl.openurl()
     if code == 200:
         #soup = BeautifulSoup(main_content, 'lxml')
         #b = soup.find_all(text=re.compile("共(\d+)页"))[0]
         #pages = re.sub('\D', "", str(b.split('页')[0]))
         selecter = etree.HTML(main_content)
         pages = int(
             selecter.xpath('//li[@class="end"]/a')[0].attrib['href'].split(
                 "_")[1].split('.')[0])
     else:
         print("bad url: %s" % main_url)
         sys.exit(-1)
     redis_id = 0
     for page in range(1, pages):
         list_url = 'https://www.piaohua.com/html/%s/list_%d.html' % (
             self.__ftype, page)
         sub_ourl = openurl.OpenUrl(list_url)
         sub_code, sub_content = sub_ourl.openurl()
         if sub_code == 200:
             selector = etree.HTML(sub_content)
             for link in selector.xpath('//span/a'):
                 sub_url = link.attrib['href']
                 if sub_url.startswith('/html/' + self.__ftype):
                     fkey = self.__ftype + str(redis_id)
                     self.__redis_link.set(fkey, sub_url, ex=21600)
                     redis_id += 1
         time.sleep(0.5)
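Every snippet in this listing leans on a small misc.openurl wrapper that never appears here. Its interface can only be inferred from the call sites: a constructor taking a URL plus an optional page encoding (e.g. 'gb2312'), and an openurl() method (called run() in a few examples) that returns a (status_code, body) tuple. A minimal sketch under those assumptions, built on requests; the real helper may differ:

# Hypothetical reconstruction of misc/openurl.py; only the observed
# call sites (OpenUrl(url), OpenUrl(url, 'gb2312'), .openurl(), .run())
# are matched, the actual implementation is not shown in this listing.
import requests


class OpenUrl:

    def __init__(self, url, encoding=None):
        self.url = url
        self.encoding = encoding

    def openurl(self):
        '''Return (status_code, body), or (-1, '') on a network error.'''
        try:
            resp = requests.get(self.url, timeout=10)
        except requests.RequestException:
            return -1, ''
        if self.encoding:
            resp.encoding = self.encoding
        return resp.status_code, resp.text

    # some examples call run() instead of openurl()
    run = openurl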
Example #2
def main():
    print("欢迎使用 美剧天堂 爬取脚本")
    print("=" * 20)
    print("魔幻/科幻:1\n灵异/惊悚:2\n都市/感情:3\n犯罪/历史:4\n选秀/综艺:5\n动漫/卡通:6")
    print("=" * 20)
    ftype = input('请输入需要爬取的类型的代号:')
    start_url = "http://www.meijutt.com/file/list%s.html" % ftype
    ourl = openurl.OpenUrl(start_url, 'gb2312')
    code, doc = ourl.openurl()
    mylog = Logger(
        os.path.join(os.path.abspath(os.path.curdir), 'misc/spider_log.yaml'))
    logger = mylog.outputLog()
    if code == 200:
        selecter = etree.HTML(doc)
        pages = selecter.xpath(
            "//div[@class='page']/span/text()")[0].split()[0].split('/')[1]
        firstpage_links = selecter.xpath("//a[@class='B font_14']/@href")
        for firstpage_link in firstpage_links:
            name, download_links = get_downlink(firstpage_link)
            send_mysql(name, download_links, logger)
            time.sleep(0.5)

        for page in range(2, int(pages)):
            page_url = 'http://www.meijutt.com/file/list%s_%s.html' % (ftype,
                                                                       page)
            for link in page_link(page_url):
                name, download_links = get_downlink(link)
                if name != '' and download_links != '':
                    send_mysql(name, download_links, logger)
                    time.sleep(0.5)
    else:
        print("[%s] error..." % start_url)

    print("Done.")
Example #3
def spiderman():
    url = 'https://www.qiushibaike.com/8hr/page/1/'
    ourl = openurl.OpenUrl(url)
    code, doc = ourl.openurl()
    item = []
    if code == 200:
        selector = etree.HTML(doc)
        # 'posts' avoids being clobbered by the per-post 'content' string below
        posts = selector.xpath("//div[contains(@id,'qiushi_tag')]")
        for site in posts:
            result = {}
            try:
                imgUrl = site.xpath('./div/a/img/@src')[0]
                username = site.xpath('./div/a/img/@alt')[0]
                content = site.xpath('.//div[@class="content"]/span/text()')[0].strip()
                vote = site.xpath('.//i/text()')[0]
                comment = site.xpath('.//i/text()')[1]
            except IndexError:
                print("something failed..")
                continue

            result['imgUrl'] = imgUrl
            result['username'] = username
            result['content'] = content
            result['vote'] = vote
            result['comment'] = comment
            
            item.append(result)
    
    return item
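Note that the vote and comment counts in Example #3 come from positionally indexing all <i> text nodes under a post ([0] and [1]), so any extra <i> element in the markup silently shifts the values; the try/except only skips posts where an index is missing entirely.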
Example #4
def page_link(url):
    '''Return the url of each show listed on the page'''
    ourl = openurl.OpenUrl(url, 'gb2312')
    code, doc = ourl.run()
    if code == 200:
        selecter = etree.HTML(doc)
        return selecter.xpath("//a[@class='B font_14']/@href")
    else:
        return []
Example #5
 def gethtml(self, url):
     '''
        Fetch the html document for url
        and return its text (None on failure).
     '''
     ob_openurl = openurl.OpenUrl(url)
     code, html = ob_openurl.openurl()
     if code == 200:
         return html
     else:
         self.logger.error('open [%s] failed..' % url)
         return None
Example #6
def main():
    print("欢迎使用 美剧天堂 爬取脚本")
    pg_conn = pg_client.Mypostgres()
    for ftype in range(1, 7):
        start_url = "http://www.meijutt.tv/file/list{}.html".format(ftype)
        ourl = openurl.OpenUrl(start_url, 'gb2312')
        code, doc = ourl.run()
        if code == 200:
            selecter = etree.HTML(doc)
            pages = selecter.xpath(
                "//div[@class='page']/span/text()")[0].split()[0].split('/')[1]
            firstpage_links = selecter.xpath("//a[@class='B font_14']/@href")
            for firstpage_link in firstpage_links:
                name, download_links, status = get_downlink(firstpage_link)
                print(name, status)
                if name != 'null':
                    ret_status = get_status(pg_conn, name)
                    if not ret_status:
                        send_pg(pg_conn, [name, download_links, status])
                    elif status != ret_status:
                        update_pg(pg_conn, [name, download_links, status])

                time.sleep(0.5)

            for page in range(2, int(pages)):
                page_url = 'http://www.meijutt.tv/file/list{0}_{1}.html'.format(
                    ftype, page)
                for link in page_link(page_url):
                    name, download_links, status = get_downlink(link)
                    print(name)
                    if name != 'null':
                        ret_status = get_status(pg_conn, name)
                        if not ret_status:
                            send_pg(pg_conn, [name, download_links, status])
                        elif status != ret_status:
                            update_pg(pg_conn, [name, download_links, status])
                    time.sleep(0.5)
        else:
            print("[{}] error...".format(start_url))

    print("Done.")
Example #7
def downurl(allurl, logger):
    for url in allurl:
        info = []
        phase = re.sub(r'\D', "", url)
        ourl = openurl.OpenUrl(url)
        code, doc = ourl.openurl()
        if code == 200:
            selecter = etree.HTML(doc)
            try:
                down_link = selecter.xpath(
                    '//div[@class="buttons"]/a/@href')[0]
                passwd = selecter.xpath('//div[@class="buttons"]/a/text()')[0]
            except IndexError:
                logger.error('%s get info error...' % phase)
                continue
        else:
            # skip urls that failed to open; down_link/passwd would be unbound
            continue
        info.append(phase)
        info.append(down_link)
        info.append(passwd)
        send_mysql(info, logger)
Example #8
def get_links(year):
    start_url = 'http://www.runningman-fan.com/category/runningman%s' % year
    allurl_list = []
    for page in range(1, 20):  # adjust this bound to the actual page count
        full_url = start_url + '/page/%d' % page
        ourl = openurl.OpenUrl(full_url)
        code, doc = ourl.openurl()
        time.sleep(0.5)
        if code == 200:
            selecter = etree.HTML(doc)
            url_list = selecter.xpath('//h2[@class="entry-title"]/a/@href')
            title_list = selecter.xpath('//h2[@class="entry-title"]/a/text()')
            if not title_list:
                continue
            me = dict(zip(title_list, url_list))
            for title in title_list:
                if u'高清中字' in title:
                    allurl_list.append(me[title])
    return allurl_list
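dict(zip(title_list, url_list)) in Example #8 keys each link by its title, so two posts sharing a title on the same page would collapse to one entry; iterating zip(title_list, url_list) directly would preserve duplicates.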
Example #9
 def get_download_url(self):
     '''Main function'''
     redis_id = 0
     while True:
         fkey = self.__ftype + str(redis_id)
         line = self.__redis_link.get(fkey)
         redis_id += 1
         if line:
             # build the full URL
             url = 'https://www.piaohua.com' + line.decode()
             # fetch the HTML content
             ourl = openurl.OpenUrl(url)
             code, content = ourl.openurl()
             # initialize the list of download links
             list_down = []
             # check that the page opened correctly
             if code == 200:
                 # throttle requests to avoid anti-scraping measures
                 time.sleep(0.5)
                 # build the soup
                 soup = BeautifulSoup(content, 'lxml')
                 # get the movie name from the page title
                 name = soup.title.string.split('_')[0]
                 # get the href of every <a> tag, stripping whitespace
                 # such as \r to simplify later processing
                 for link in soup.find_all('a'):
                     url = link.get('href')
                     if url is not None and 'ftp' in url:
                         url = ''.join(url.split())
                         list_down.append(url)
                 # build the final string
                 if list_down:
                     str_down = '#'.join(list_down)
                     self.send_mysql(name, str_down)
                 else:
                     self.__logger.error(
                         "[ %s ] cannot find download link..." % name)
             else:
                 self.__logger.critical("bad url: [ %s ]" % url)
         else:
             break
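Examples #1 and #9 are two halves of one pipeline: get_url writes each detail-page path to Redis under sequential keys (self.__ftype + '0', '1', ...) with a six-hour TTL (ex=21600), and get_download_url replays those keys in order, stopping at the first missing key, then scrapes the ftp links from each page.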
Example #10
def get_downlink(url_part):
    str_down = ''
    url = 'http://www.meijutt.tv' + url_part
    ourl = openurl.OpenUrl(url, 'gb2312')
    code, doc = ourl.run()
    if code == 200:
        selecter = etree.HTML(doc)
        try:
            name = selecter.xpath("//div[@class='info-title']/h1/text()")[0]
            links = selecter.xpath(
                "//input[@name='down_url_list_0']/following-sibling::p/strong/a/@href"
            )
            status = selecter.xpath(
                '//div[@class="o_r_contact"]/ul/li[1]/font[1]/text()')[0]
        except Exception as e:
            print(e)
            return 'null', 'null', 'null'
        else:
            str_down = '#'.join(links)
        return name, str_down, status
    else:
        return 'null', 'null', 'null'
Example #11
def get_downlink(url_part):
    str_down = ''
    url = 'http://www.meijutt.com' + url_part
    ourl = openurl.OpenUrl(url, 'gb2312')
    code, doc = ourl.openurl()
    if code == 200:
        selecter = etree.HTML(doc)
        try:
            name = selecter.xpath("//div[@class='info-title']/h1/text()")[0]
            links = selecter.xpath(
                "//input[@name='down_url_list_0']/following-sibling::p/strong/a/@href"
            )
            if not name or not links:
                name = ''
                str_down = ''
        except IndexError:
            name = ''
            str_down = ''
        else:
            str_down = '#'.join(links)

        return name, str_down
    else:
        return '', ''
Example #12
# pylint: disable=no-member
# -*- encoding: utf-8 -*-
'''
@File    :   dytt.py
@Time    :   2019/07/09 17:05:57
@Author  :   Kellan Fan 
@Version :   1.0
@Contact :   [email protected]
@Desc    :   None
'''

# here put the import lib
from misc import openurl
from lxml import etree

url = 'https://www.dytt8.net/'
ourl = openurl.OpenUrl(url, 'gb2312')
code, doc = ourl.openurl()
if code == 200:
    selector = etree.HTML(doc)
    url_list = selector.xpath('//a/@href')
    for urls in url_list:
        if urls.startswith('/html'):
            print(urls)
Example #13
    code = mysql_conn.change_data(cmd)
    if code == 0:
        print('[%d] ok' % dic['aid'])
    else:
        print('[%d] error, message: [%s]' % (dic['aid'], code))


if __name__ == "__main__":
    if 'Windows' in platform.platform():
        mysql_conn = mysql_connect.MysqlConnect(
            os.path.join(os.path.abspath(os.path.curdir),
                         'python\\spider\\misc\\mysql_data.yaml'))
    elif 'Linux' in platform.platform():
        mysql_conn = mysql_connect.MysqlConnect(
            os.path.join(os.path.abspath(os.path.curdir),
                         'misc/mysql_data.yaml'))
    else:
        pass
    urls = [
        "http://api.bilibili.com/x/web-interface/archive/stat?aid={}".format(i)
        for i in range(20000, 40000)
    ]
    for url in urls:
        ourl = openurl.OpenUrl(url)
        code, doc = ourl.openurl()
        time.sleep(0.5)
        if code == 200:
            data = json.loads(doc)
            if data['code'] == 0:
                insert_data(data['data'])