def get_url(self):
    '''Collect detail-page paths for the selected category and cache them in Redis.'''
    main_url = 'https://www.piaohua.com/html/%s/index.html' % self.__ftype
    ourl = openurl.OpenUrl(main_url)
    code, main_content = ourl.openurl()
    if code == 200:
        #soup = BeautifulSoup(main_content, 'lxml')
        #b = soup.find_all(text=re.compile("共(\d+)页"))[0]
        #pages = re.sub('\D', "", str(b.split('页')[0]))
        # read the total page count from the "end" pagination link
        selecter = etree.HTML(main_content)
        pages = int(
            selecter.xpath('//li[@class="end"]/a')[0].attrib['href'].split(
                "_")[1].split('.')[0])
    else:
        print("bad url: %s" % main_url)
        sys.exit(-1)
    redis_id = 0
    for page in range(1, pages + 1):  # +1 so the last list page is included
        list_url = 'https://www.piaohua.com/html/%s/list_%d.html' % (
            self.__ftype, page)
        sub_ourl = openurl.OpenUrl(list_url)
        sub_code, sub_content = sub_ourl.openurl()
        if sub_code == 200:
            selector = etree.HTML(sub_content)
            for link in selector.xpath('//span/a'):
                sub_url = link.attrib['href']
                if sub_url.startswith('/html/' + self.__ftype):
                    # cache the path for 6 hours; get_download_url() reads it back
                    fkey = self.__ftype + str(redis_id)
                    self.__redis_link.set(fkey, sub_url, ex=21600)
                    redis_id += 1
        time.sleep(0.5)
def main():
    print("Welcome to the 美剧天堂 scraping script")
    print("=" * 20)
    print("Fantasy/Sci-fi: 1\nSupernatural/Thriller: 2\nUrban/Romance: 3\n"
          "Crime/History: 4\nTalent/Variety: 5\nAnime/Cartoon: 6")
    print("=" * 20)
    ftype = input('Enter the code of the category to scrape: ')
    start_url = "http://www.meijutt.com/file/list%s.html" % ftype
    ourl = openurl.OpenUrl(start_url, 'gb2312')
    code, doc = ourl.openurl()
    mylog = Logger(
        os.path.join(os.path.abspath(os.path.curdir), 'misc/spider_log.yaml'))
    logger = mylog.outputLog()
    if code == 200:
        selecter = etree.HTML(doc)
        # total page count parsed from the pager text
        pages = selecter.xpath(
            "//div[@class='page']/span/text()")[0].split()[0].split('/')[1]
        firstpage_links = selecter.xpath("//a[@class='B font_14']/@href")
        for firstpage_link in firstpage_links:
            name, download_links = get_downlink(firstpage_link)
            send_mysql(name, download_links, logger)
            time.sleep(0.5)
        for page in range(2, int(pages) + 1):  # +1 so the last page is included
            page_url = 'http://www.meijutt.com/file/list%s_%s.html' % (ftype, page)
            for link in page_link(page_url):
                name, download_links = get_downlink(link)
                if name != '' and download_links != '':
                    send_mysql(name, download_links, logger)
                time.sleep(0.5)
    else:
        print("[%s] error..." % start_url)
    print("Done.")
def spiderman():
    url = 'https://www.qiushibaike.com/8hr/page/1/'
    ourl = openurl.OpenUrl(url)
    code, doc = ourl.openurl()
    if code == 200:
        selector = etree.HTML(doc)
        posts = selector.xpath("//div[contains(@id,'qiushi_tag')]")
        item = []
        for site in posts:
            result = {}
            try:
                imgUrl = site.xpath('./div/a/img/@src')[0]
                username = site.xpath('./div/a/img/@alt')[0]
                content = site.xpath(
                    './/div[@class="content"]/span/text()')[0].strip()
                vote = site.xpath('.//i/text()')[0]
                comment = site.xpath('.//i/text()')[1]
            except IndexError:
                print("something failed..")
                continue
            result['imgUrl'] = imgUrl
            result['username'] = username
            result['content'] = content
            result['vote'] = vote
            result['comment'] = comment
            item.append(result)
        return item
    return []  # request failed; return an empty list instead of None
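# A minimal usage sketch, assuming spiderman() is defined as above: print each
# scraped post; the dict keys match the ones populated in the function.
if __name__ == "__main__":
    for post in spiderman():
        print(post['username'], post['vote'], post['comment'])
        print(post['content'])
        print(post['imgUrl'])
        print('-' * 20)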
def page_link(url):
    '''Return the detail-page URL of every show listed on the given index page.'''
    ourl = openurl.OpenUrl(url, 'gb2312')
    code, doc = ourl.run()
    if code == 200:
        selecter = etree.HTML(doc)
        return selecter.xpath("//a[@class='B font_14']/@href")
    else:
        return []
def gethtml(self, url):
    '''
    Fetch the HTML document at the given url.
    Returns the page content, or None when the request fails.
    '''
    ob_openurl = openurl.OpenUrl(url)
    code, html = ob_openurl.openurl()
    if code == 200:
        return html
    else:
        self.logger.error('open [%s] failed..' % url)
        return None
def main():
    print("Welcome to the 美剧天堂 scraping script")
    pg_conn = pg_client.Mypostgres()
    for ftype in range(1, 7):
        start_url = "http://www.meijutt.tv/file/list{}.html".format(ftype)
        ourl = openurl.OpenUrl(start_url, 'gb2312')
        code, doc = ourl.run()
        if code == 200:
            selecter = etree.HTML(doc)
            # total page count parsed from the pager text
            pages = selecter.xpath(
                "//div[@class='page']/span/text()")[0].split()[0].split('/')[1]
            firstpage_links = selecter.xpath("//a[@class='B font_14']/@href")
            for firstpage_link in firstpage_links:
                name, download_links, status = get_downlink(firstpage_link)
                print(name, status)
                if name != 'null':
                    ret_status = get_status(pg_conn, name)
                    if not ret_status:
                        send_pg(pg_conn, [name, download_links, status])
                    elif status != ret_status:
                        update_pg(pg_conn, [name, download_links, status])
                    else:
                        pass
                time.sleep(0.5)
            for page in range(2, int(pages) + 1):  # +1 so the last page is included
                page_url = 'http://www.meijutt.tv/file/list{0}_{1}.html'.format(
                    ftype, page)
                for link in page_link(page_url):
                    name, download_links, status = get_downlink(link)
                    print(name)
                    if name != 'null':
                        ret_status = get_status(pg_conn, name)
                        if not ret_status:
                            send_pg(pg_conn, [name, download_links, status])
                        elif status != ret_status:
                            update_pg(pg_conn, [name, download_links, status])
                        else:
                            pass
                    time.sleep(0.5)
        else:
            print("[{}] error...".format(start_url))
    print("Done.")
def downurl(allurl, logger):
    for url in allurl:
        info = []
        phase = re.sub(r'\D', "", url)  # keep only the digits (the episode number)
        ourl = openurl.OpenUrl(url)
        code, doc = ourl.openurl()
        if code == 200:
            selecter = etree.HTML(doc)
            try:
                down_link = selecter.xpath(
                    '//div[@class="buttons"]/a/@href')[0]
                passwd = selecter.xpath('//div[@class="buttons"]/a/text()')[0]
            except IndexError:
                logger.error('%s get info error...' % phase)
                continue
            info.append(phase)
            info.append(down_link)
            info.append(passwd)
            send_mysql(info, logger)
def get_links(year):
    start_url = 'http://www.runningman-fan.com/category/runningman%s' % year
    allurl_list = []
    for page in range(1, 20):  # adjust the upper bound to the actual number of pages
        full_url = start_url + '/page/%d' % page
        ourl = openurl.OpenUrl(full_url)
        code, doc = ourl.openurl()
        time.sleep(0.5)
        if code == 200:
            selecter = etree.HTML(doc)
            url_list = selecter.xpath('//h2[@class="entry-title"]/a/@href')
            title_list = selecter.xpath('//h2[@class="entry-title"]/a/text()')
            if not title_list:
                continue
            me = dict(zip(title_list, url_list))
            for title in title_list:
                if u'高清中字' in title:  # keep only HD episodes with Chinese subtitles
                    allurl_list.append(me[title])
    return allurl_list
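# A minimal usage sketch, assuming the category path accepts a bare year;
# the 2017 value below is illustrative only.
if __name__ == "__main__":
    for link in get_links(2017):
        print(link)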
def get_download_url(self):
    '''Main worker: read cached detail-page paths from Redis and extract download links.'''
    redis_id = 0
    while True:
        fkey = self.__ftype + str(redis_id)
        line = self.__redis_link.get(fkey)
        redis_id += 1
        if line:
            # build the detail-page url
            url = 'https://www.piaohua.com' + line.decode()
            # fetch the html content
            ourl = openurl.OpenUrl(url)
            code, content = ourl.openurl()
            # download links collected from this page
            list_down = []
            # check that the page opened correctly
            if code == 200:
                # throttle requests to avoid anti-crawler measures
                time.sleep(0.5)
                # build the soup
                soup = BeautifulSoup(content, 'lxml')
                # movie name
                name = soup.title.string.split('_')[0]
                # collect the href of every <a> tag, stripping whitespace such as \r
                # to avoid trouble later on
                for link in soup.find_all('a'):
                    url = link.get('href')
                    if url is not None and 'ftp' in url:
                        url = ''.join(url.split())
                        list_down.append(url)
                    else:
                        continue
                # build the final string
                if list_down != []:
                    str_down = '#'.join(list_down)
                    self.send_mysql(name, str_down)
                else:
                    self.__logger.error(
                        "[ %s ] cannot find download link..." % name)
            else:
                self.__logger.critical("bad url: [ %s ]" % url)
        else:
            break
def get_downlink(url_part):
    '''Return (name, '#'-joined download links, update status) for one show page,
    or ('null', 'null', 'null') when the page cannot be parsed.'''
    str_down = ''
    url = 'http://www.meijutt.tv' + url_part
    ourl = openurl.OpenUrl(url, 'gb2312')
    code, doc = ourl.run()
    if code == 200:
        selecter = etree.HTML(doc)
        try:
            name = selecter.xpath("//div[@class='info-title']/h1/text()")[0]
            links = selecter.xpath(
                "//input[@name='down_url_list_0']/following-sibling::p/strong/a/@href"
            )
            status = selecter.xpath(
                '//div[@class="o_r_contact"]/ul/li[1]/font[1]/text()')[0]
        except Exception as e:
            print(e)
            return 'null', 'null', 'null'
        else:
            str_down = '#'.join(links)
        return name, str_down, status
    else:
        return 'null', 'null', 'null'
def get_downlink(url_part):
    '''Return (name, '#'-joined download links) for one show page, or ('', '') on failure.'''
    str_down = ''
    url = 'http://www.meijutt.com' + url_part
    ourl = openurl.OpenUrl(url, 'gb2312')
    code, doc = ourl.openurl()
    if code == 200:
        selecter = etree.HTML(doc)
        try:
            name = selecter.xpath("//div[@class='info-title']/h1/text()")[0]
            links = selecter.xpath(
                "//input[@name='down_url_list_0']/following-sibling::p/strong/a/@href"
            )
            if not name or not links:
                name = ''
                str_down = ''
        except IndexError:
            name = ''
            str_down = ''
        else:
            str_down = '#'.join(links)
        return name, str_down
    else:
        return '', ''
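# A minimal usage sketch for the meijutt.com variant above; the url_part value
# is a hypothetical detail-page path used only for illustration.
if __name__ == "__main__":
    name, download_links = get_downlink('/content/meiju10000.html')
    if name:
        print(name)
        print(download_links.split('#'))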
# pylint: disable=no-member
# -*- encoding: utf-8 -*-
'''
@File    :   dytt.py
@Time    :   2019/07/09 17:05:57
@Author  :   Kellan Fan
@Version :   1.0
@Contact :   [email protected]
@Desc    :   None
'''

# here put the import lib
from misc import openurl
from lxml import etree

url = 'https://www.dytt8.net/'
ourl = openurl.OpenUrl(url, 'gb2312')
code, doc = ourl.openurl()
if code == 200:
    selector = etree.HTML(doc)
    url_list = selector.xpath('//a/@href')
    for urls in url_list:
        if urls.startswith('/html'):
            print(urls)
    # tail of the insert_data helper: report the result of the MySQL write
    code = mysql_conn.change_data(cmd)
    if code == 0:
        print('[%d] ok' % dic['aid'])
    else:
        print('[%d] error, message: [%s]' % (dic['aid'], code))


if __name__ == "__main__":
    # pick the config path that matches the platform the script runs on
    if 'Windows' in platform.platform():
        mysql_conn = mysql_connect.MysqlConnect(
            os.path.join(os.path.abspath(os.path.curdir),
                         'python\\spider\\misc\\mysql_data.yaml'))
    elif 'Linux' in platform.platform():
        mysql_conn = mysql_connect.MysqlConnect(
            os.path.join(os.path.abspath(os.path.curdir),
                         'misc/mysql_data.yaml'))
    else:
        pass
    urls = [
        "http://api.bilibili.com/x/web-interface/archive/stat?aid={}".format(i)
        for i in range(20000, 40000)
    ]
    for url in urls:
        ourl = openurl.OpenUrl(url)
        code, doc = ourl.openurl()
        time.sleep(0.5)
        if code == 200:
            data = json.loads(doc)
            if data['code'] == 0:
                insert_data(data['data'])