def us_news_crawler(self, code, date=None):
    """Crawl Sina Finance (新浪财经) news for US-listed stocks."""
    data = []
    for page in range(3):
        time.sleep(3)
        url = self.us_news_url % (page + 1, code)
        r = requests.get(url, headers=self.headers)
        r.encoding = 'gb2312'
        html = etree.HTML(r.text)
        news_title = html.xpath('//ul[@class="xb_list"][2]/li/a//text()')
        news_url = html.xpath('//ul[@class="xb_list"][2]/li/a/@href')
        news_tmp = html.xpath('//ul[@class="xb_list"][2]/li/span//text()')
        for i in range(len(news_title)):
            # span text looks like "source | publish time"
            parts = news_tmp[i].split(" | ")
            publish_date = self._get_time(parts[1])
            ds = {
                'title': news_title[i],
                'code': code,
                'url': news_url[i],
                'publish_date': publish_date,
                'timestamp': str_to_timestamp(publish_date),
                'source': parts[0],
                'content': self._get_content(news_url[i])[1],
                'site': '新浪财经',
                'type': 'US',
            }
            if ds['content'] != '':
                data.append(ds)
    return data
def parse_comment_info(self, url):
    """Crawl the users who commented directly on a post (name, info, time, profile URL)."""
    res = requests.get(url, headers=self.headers)
    response = res.json()
    count = response['data']['count']
    html = etree.HTML(response['data']['html'])
    # Commenters' display names
    name = html.xpath(
        "//div[@class='list_li S_line1 clearfix']/div[@class='WB_face W_fl']/a/img/@alt")
    # Comment text
    info = html.xpath(
        "//div[@node-type='replywrap']/div[@class='WB_text']/text()")
    info = "".join(info).replace(" ", "").split("\n")
    info.pop(0)
    # Comment timestamps
    comment_time = html.xpath("//div[@class='WB_from S_txt2']/text()")
    # Commenters' profile URLs
    name_url = html.xpath("//div[@class='WB_face W_fl']/a/@href")
    name_url = ["https:" + i for i in name_url]
    comment_info_list = []
    for i in range(len(name)):
        item = {}
        item["name"] = name[i]                   # commenter's screen name
        item["comment_info"] = info[i]           # comment text
        item["comment_time"] = comment_time[i]   # comment time
        item["comment_url"] = name_url[i]        # commenter's profile page
        comment_info_list.append(item)
    return count, comment_info_list
def hk_news_crawler(self, code, date=None):
    """Crawl Sina Finance (新浪财经) news for HK-listed stocks."""
    data = []
    for page in range(2):
        time.sleep(3)
        url = self.hk_news_url % (page, code)
        r = requests.get(url, headers=self.headers)
        r.encoding = 'gb2312'
        html = etree.HTML(r.text)
        news_title = html.xpath('//ul[@id="js_ggzx"]/li/a//text()')
        news_url = html.xpath('//ul[@id="js_ggzx"]/li/a/@href')
        news_date = html.xpath('//ul[@id="js_ggzx"]/li/span//text()')
        for i in range(len(news_title)):
            # fetch the article once; [0] is the source, [1] the body text
            src_content = self._get_content(news_url[i])
            ds = {
                'title': news_title[i],
                'code': code,
                'url': news_url[i],
                'publish_date': news_date[i],
                'timestamp': str_to_timestamp(news_date[i]),
                'source': src_content[0],
                'content': src_content[1],
                'site': '新浪财经',
                'type': 'HK',
            }
            if ds['content'] != '':
                data.append(ds)
    return data
def news_crawler(self, code, date=None):
    """
    Crawl Sina Finance (新浪财经) news for domestic (mainland China) stocks.
    :param code: stock code
    :param date: unused, kept for interface consistency
    :return: list of news dicts
    """
    page = 1
    data = []
    url = self.news_url % (code, page)
    r = requests.get(url, headers=self.headers)
    r.encoding = 'gb2312'
    html = etree.HTML(r.text)
    news_href = html.xpath('//div[@class="datelist"]/ul/a/@href')
    news_list = html.xpath('//div[@class="datelist"]/ul/a/text()')
    for i in range(len(news_list)):
        publish_date = self._get_date(news_href[i])
        # fetch the article once; [0] is the source, [1] the body text
        src_content = self._get_content(news_href[i])
        ds = {
            'title': news_list[i],
            'code': code,
            'url': news_href[i],
            'publish_date': publish_date,
            'timestamp': str_to_timestamp(publish_date),
            'source': src_content[0],
            'content': src_content[1],
            'site': '新浪财经',
            'type': 'CN',
        }
        if ds['content'] != '':
            data.append(ds)
    return data
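# The three crawlers above call str_to_timestamp(), whose definition is not part
# of this snippet. A minimal sketch of such a helper, assuming the publish dates
# are plain "YYYY-MM-DD HH:MM:SS" strings (an assumption, not the original code):
import time

def str_to_timestamp(date_str, fmt='%Y-%m-%d %H:%M:%S'):
    # Hypothetical stand-in: parse the date string and return a Unix timestamp.
    return int(time.mktime(time.strptime(date_str, fmt)))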
def xpath_parser(url, urldata):
    is_secret_cookie = ('cookie' in urldata
                        and urldata['cookie'] == 'secret_cookie')
    html = WebHelper.get_html(url, is_secret_cookie)
    proxylist = []
    if not html:
        return proxylist
    html = etree.HTML(html, parser=etree.HTMLParser(encoding='utf-8'))
    proxy_data = html.xpath(urldata['pattern'])
    for proxy in proxy_data:
        ip = proxy.xpath(urldata['position']['ip'])[0].text.strip()
        port = proxy.xpath(urldata['position']['port'])[0].text.strip()
        if urldata['position']['type'] != '':
            proxy_type = proxy.xpath(urldata['position']['type'])[0].text.strip()
        else:
            proxy_type = '高匿'  # default: high anonymity
        if urldata['position']['protocol'] != '':
            protocol = proxy.xpath(urldata['position']['protocol'])[0].text.strip()
        else:
            protocol = 'http'
        try:
            proxylist.append({
                'ip': ip,
                'port': int(port),
                'type': proxy_type,
                'protocol': protocol
            })
        except ValueError:
            # skip rows whose port is not a valid integer
            continue
    return proxylist
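# xpath_parser() expects a urldata mapping with a row pattern and per-field
# sub-XPaths. A hypothetical entry (the URL and XPath values are illustrative
# placeholders, not a real proxy-site definition):
example_urldata = {
    'urls': ['http://example.com/free-proxy-list'],
    'cookie': '',                                   # 'secret_cookie' enables the cookie path
    'pattern': '//table[@id="proxy-table"]/tbody/tr',
    'position': {
        'ip': './td[1]',
        'port': './td[2]',
        'type': './td[3]',        # anonymity column; '' falls back to '高匿'
        'protocol': './td[4]',    # protocol column; '' falls back to 'http'
    },
}
# proxies = xpath_parser(example_urldata['urls'][0], example_urldata)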
def show_reference2(self, post_id, convert=True):
    """Fetch the content of any post; if convert is True the post content is
    converted/normalised.

    This function obtains the post by scraping the web page directly, so it
    still has some issues: sage cannot be detected. On success the post content
    is returned; on failure an error message is returned."""
    payload = {'id': post_id}
    r = self.session.get(adao_show_reference2, params=payload,
                         timeout=self.timeout)
    html = etree.HTML(r.text)
    post = {'admin': '0', 'email': '', 'ext': '', 'img': '', 'sage': '0'}
    post_id2 = html.xpath('//div/@data-threads-id')[0]
    if post_id != post_id2:
        return "该串不存在"  # "this post does not exist"
    post['id'] = post_id
    image = html.xpath('/html/body/div/div/div/div/div/a/@href')
    if image:
        p = re.compile('/image/.+')
        post['img'], post['ext'] = os.path.splitext(
            p.search(image[0]).group()[7:])
    post['title'] = html.xpath('//span[@class="h-threads-info-title"]')[0].text
    post['name'] = html.xpath('//span[@class="h-threads-info-email"]')[0].text
    post['now'] = html.xpath('//span[@class="h-threads-info-createdat"]')[0].text
    post['userid'] = html.xpath('//span[@class="h-threads-info-uid"]')[0].text[3:]
    thread_id = html.xpath('//div[@class="h-threads-info"]/a/@href')[0]
    p = re.compile(r'/t/([0-9]+?)\?r=')
    post['thread_id'] = p.search(thread_id).group()[3:-3]
    content = html.xpath('//div[@class="h-threads-content"]')[0]
    post['content'] = etree.tostring(
        content, encoding='unicode').strip()[37:-12].strip()
    if convert:
        post['content'] = self.convert_content(post['content'])
    if not post['userid']:
        # an empty uid text means the post was made by an admin
        post['admin'] = '1'
        userid = etree.tostring(
            html.xpath('//span[@class="h-threads-info-uid"]')[0],
            encoding='unicode')
        post['userid'] = self.convert_content(userid)[3:]
    return post
def get_content_list(self, html_str):
    """Extract data from the page source."""
    html = etree.HTML(html_str)
    print(html)
    s1 = etree.tostring(html).decode(encoding='utf-8')
    print(s1)
    # Get the TV series names
    print("++++++++++++++++++++++++++++")
    ret1 = html.xpath("//link/@href")
    print(ret1)
def get_detail_url(url):
    response = requests.get(url, headers=HEADERS, timeout=120)
    # The page is GBK-encoded; "ignore" skips malformed bytes
    text = response.content.decode('gbk', 'ignore')
    html = etree.HTML(text)
    detail_urls = html.xpath("//table[@class='tbspan']//a/@href")
    detail_urls = map(lambda url: BASE_DOMAIN + url, detail_urls)
    return detail_urls
def _unit_fetch(self):
    try:
        resp = self.s.get(
            'http://gzlss.hrssgz.gov.cn/gzlss_web/business/tomain/main.xhtml')
        html = etree.HTML(resp.text)
        target = html.xpath('/html/body/div[1]/div[3]/span/font[1]')
        self.result_data.update({'姓名': target[0].text})
        self.result_identity.update(
            {'task_name': self.task_info['task_name']})
    except InvalidConditionError as e:
        raise PreconditionNotSatisfiedError(e)
def getHtml(self, url, xpathsearch=None):
    try:
        print url
        request = urllib2.Request(url)
        request.add_header('User-Agent', random.choice(BROWSERS))
        ul = urllib2.urlopen(request)
        info = ul.read()
        if not info:
            return ""
        # lxml.html has no module-level xpath(text, expr) helper; parse the
        # document first, then evaluate the XPath expression against it
        # (assumes `from lxml import html` at module level).
        r = html.fromstring(info).xpath(xpathsearch)
        return ''.join(r).strip()
    except Exception, e:
        print '[*] %s' % e
def parse_campsites_reservations(self, response):
    html_url = ('receives/result_rs_%s_%s.html'
                % (str(response.meta['PlaceId']), str(response.meta['FacilityId'])))
    campsite_page = codecs.decode(response.body, 'utf8')
    campsite_page_dict = json.loads(campsite_page)
    html_body = campsite_page_dict['d']
    html = HtmlResponse(url=html_url, encoding='utf-8', body=html_body)
    sites = html.xpath('//table/tr[@class="unitdata"]')
    if len(sites):
        self.save_reservations_html(html, str(response.meta['PlaceId']),
                                    str(response.meta['FacilityId']))
    all_reservations = []
    for site in sites:
        reservation_links = site.xpath('//td/@onclick').extract()
        reservations = self.parse_a_campsite_reservations(
            reservation_links, response.meta['PlaceId'],
            response.meta['FacilityId'])
        all_reservations = all_reservations + reservations
    while len(all_reservations):
        yield all_reservations.pop()
def goubanjia(urldata):
    html = WebHelper.get_html(urldata['urls'][0])
    html = etree.HTML(html)
    proxy_data = html.xpath('//table/tbody/tr[position()>=1]')
    ipAndPort_xpath = './td[1]'
    proxylist = []
    for data in proxy_data:
        ip = ''
        port = 0
        proxy_type = data.xpath('./td[2]/a/text()')[0]
        protocol = data.xpath('./td[3]/a/text()')[0]
        ip_port_data = data.xpath(ipAndPort_xpath)[0]
        for e in ip_port_data:
            e_style = e.attrib['style'] if 'style' in e.attrib else ''
            e_class = e.attrib['class'] if 'class' in e.attrib else ''
            if 'port' in e_class:
                if e.text:
                    # The real port is obfuscated in the element's second CSS
                    # class: each letter maps to its index in 'ABCDEFGHIZ', the
                    # digits are concatenated, and the result is divided by 8.
                    code = e_class.split(' ')[1]
                    digits = [str('ABCDEFGHIZ'.index(ch)) for ch in code]
                    port = int(''.join(digits)) >> 0x3
            elif 'none' not in e_style or e_style == '':
                if e.text:
                    ip += e.text  # visible fragments are concatenated into the IP
        proxylist.append({
            'ip': ip,
            'port': int(port),
            'type': proxy_type,
            'protocol': protocol
        })
    return proxylist
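# The port de-obfuscation above can be hard to follow inside the loop; the same
# arithmetic as a standalone sketch (decode_port is a hypothetical helper name,
# not part of the original code):
def decode_port(e_class):
    code = e_class.split(' ')[1]                                   # "port GEA" -> "GEA"
    digits = ''.join(str('ABCDEFGHIZ'.index(ch)) for ch in code)   # "GEA" -> "640"
    return int(digits) >> 0x3                                      # 640 >> 3 == 80

# decode_port('port GEA') == 80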
# Contents of ./text.html (truncated at the top):
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>

# Call the HTML class to construct an XPath parsing object;
# the etree module automatically repairs the incomplete HTML.
html = etree.parse('./text.html', etree.HTMLParser())
result = etree.tostring(html)
print(result.decode('utf-8'))

# Select all nodes
result = html.xpath('//*')
# Select direct child a nodes
result = html.xpath('//li/a')
# Select the parent node, then get its attribute
result = html.xpath('//a[@href="link4.html"]/../@class')
result = html.xpath('//a[@href="link4.html"]/parent::*/@class')
# Filter by attribute
result = html.xpath('//li[@class="item-0"]')
# Get text
result = html.xpath('//li[@class="item-0"]/a/text()')
# Fetch layer by layer: all text inside descendant nodes
result = html.xpath('//li[@class="item-0"]//text()')
# Get attributes
result = html.xpath('//li/a/@href')
# Attribute multi-value matching
# text = '<li class="li li-first"><a href="link.html">first time</a></li>'
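# The multi-value example above is truncated; it appears to lead into
# contains(): class="li li-first" has two classes, so an exact [@class="li"]
# match fails. A minimal sketch of how that example usually continues
# (reconstructed, not the original text):
from lxml import etree

text = '<li class="li li-first"><a href="link.html">first time</a></li>'
html = etree.HTML(text)
result = html.xpath('//li[contains(@class, "li")]/a/text()')
print(result)  # ['first time']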
def parse_detail_page(url):
    movie = {}
    response = requests.get(url, headers=HEADERS)
    # The site serves GBK; "ignore" skips malformed bytes
    text = response.content.decode('gbk', "ignore")
    html = etree.HTML(text)
    try:
        title = html.xpath(
            "//div[@class='title_all']//font[@color='#07519a']/text()")[0]
    except IndexError:
        return ''
    movie['title'] = title
    Zoome = html.xpath("//div[@id='Zoom']")[0]  # xpath returns a list; take the element
    imgs = Zoome.xpath(".//img/@src")
    if imgs:  # only proceed if the page carries images
        movie['cover'] = imgs[0]
        # screenshot = imgs[1]
        # movie['screenshot'] = screenshot  # not every movie has a screenshot, so skip for now

    def parse_info(info, rule):
        return info.replace(rule, "").strip()

    infos = Zoome.xpath(".//text()")
    for index, info in enumerate(infos):
        if info.startswith("◎年 代"):
            movie['year'] = parse_info(info, "◎年 代")
        if info.startswith("◎国 家"):
            movie['country'] = parse_info(info, "◎国 家")
        elif info.startswith("◎译 名"):
            movie['name_cn'] = parse_info(info, "◎译 名")
        elif info.startswith("◎片 名"):
            movie['name_en'] = parse_info(info, "◎片 名")
        elif info.startswith("◎产 地"):
            movie['country'] = parse_info(info, "◎产 地")
        elif info.startswith("◎类 别"):
            movie['category'] = parse_info(info, "◎类 别")
        elif info.startswith("◎语 言"):
            movie['language'] = parse_info(info, "◎语 言")
        elif info.startswith("◎字 幕"):
            movie['sub_title'] = parse_info(info, "◎字 幕")
        elif info.startswith("◎上映日期"):
            movie['release_time'] = parse_info(info, "◎上映日期")
        elif info.startswith("◎IMDb评分"):
            movie['imdb_score'] = parse_info(info, "◎IMDb评分")
        elif info.startswith("◎豆瓣评分"):
            movie['douban_score'] = parse_info(info, "◎豆瓣评分")
        elif info.startswith("◎文件格式"):
            movie['file_format'] = parse_info(info, "◎文件格式")
        elif info.startswith("◎视频尺寸"):
            movie['ratio'] = parse_info(info, "◎视频尺寸")
        elif info.startswith("◎片 长"):
            movie['length'] = parse_info(info, "◎片 长")
        elif info.startswith("◎导 演"):
            movie['director'] = parse_info(info, "◎导 演")
        elif info.startswith("◎主 演"):
            # Actor lines continue until the next "◎" field
            actors = [parse_info(info, "◎主 演")]
            for x in range(index + 1, len(infos)):
                actor = infos[x].strip()
                if actor.startswith("◎"):
                    break
                actors.append(actor)
            movie['actors'] = actors
        elif info.startswith("◎简 介"):
            # Synopsis lines continue until the download-link marker
            profiles = [parse_info(info, "◎简 介")]
            for x in range(index + 1, len(infos)):
                profile = infos[x].strip()
                if profile.startswith("【下载地址】"):
                    break
                profiles.append(profile)
            movie['profiles'] = profiles
    try:
        download_url = html.xpath("//td[@bgcolor='#fdfddf']/a/@href")[0]
    except IndexError:
        download_url = ''
    movie['download_url'] = download_url
    return movie
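# Hypothetical driver tying get_detail_url() and parse_detail_page() together;
# BASE_DOMAIN and HEADERS are assumed module-level constants of the original
# spider, and the list-page path below is illustrative only.
if __name__ == '__main__':
    list_url = BASE_DOMAIN + '/html/gndy/dyzz/list_23_1.html'
    for detail_url in get_detail_url(list_url):
        movie = parse_detail_page(detail_url)
        if movie:
            print(movie.get('title'), movie.get('download_url'))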