def us_news_crawler(self, code, date=None):
        # page = 1
        data = []
        for page in range(3):
            time.sleep(3)
            url = self.us_news_url % (page + 1, code)
            r = requests.get(url, headers=self.headers)
            r.encoding = 'gb2312'
            html = etree.HTML(r.text)
            news_title = html.xpath('//ul[@class=\"xb_list\"][2]/li/a//text()')
            news_url = html.xpath('//ul[@class=\"xb_list\"][2]/li/a/@href')
            news_tmp = html.xpath(
                '//ul[@class=\"xb_list\"][2]/li/span//text()')
            for i in range(len(news_title)):
                ds = {}
                ds['title'] = news_title[i]
                ds['code'] = code
                ds['url'] = news_url[i]
                source, publish_time = news_tmp[i].split(" | ")[:2]
                ds['publish_date'] = self._get_time(publish_time)
                ds['timestamp'] = str_to_timestamp(ds['publish_date'])
                ds['source'] = source
                ds['content'] = self._get_content(news_url[i])[1]
                ds['site'] = '新浪财经'
                ds['type'] = 'US'
                if ds['content'] != '':
                    data.append(ds)
                # else:
                #     print(ds['title'])
                #     print(ds['url'])

        return data
Example #2
 def parse_comment_info(self, url):  # scrape info about users who commented directly (name, info, time, info_url)
     res = requests.get(url, headers=self.headers)
     response = res.json()
     count = response['data']['count']
     html = etree.HTML(response['data']['html'])
     name = html.xpath(
         "//div[@class='list_li S_line1 clearfix']/div[@class='WB_face W_fl']/a/img/@alt"
     )  # commenters' names
     info = html.xpath(
         "//div[@node-type='replywrap']/div[@class='WB_text']/text()"
     )  # comment text
     info = "".join(info).replace(" ", "").split("\n")  # merge the text nodes into one entry per comment
     info.pop(0)  # drop the leading entry, which is not comment text
     comment_time = html.xpath(
         "//div[@class='WB_from S_txt2']/text()")  # comment timestamps
     name_url = html.xpath(
         "//div[@class='WB_face W_fl']/a/@href")  # commenters' profile urls
     name_url = ["https:" + i for i in name_url]
     comment_info_list = []
     for i in range(len(name)):
         item = {}
         item["name"] = name[i]  # 存储评论人的网名
         item["comment_info"] = info[i]  # 存储评论的信息
         item["comment_time"] = comment_time[i]  # 存储评论时间
         item["comment_url"] = name_url[i]  # 存储评论人的相关主页
         comment_info_list.append(item)
     return count, comment_info_list
 def hk_news_crawler(self, code, date=None):
     # page = 1
     data = []
     for page in range(2):
         time.sleep(3)
         url = self.hk_news_url % (page, code)
         r = requests.get(url, headers=self.headers)
         r.encoding = 'gb2312'
         html = etree.HTML(r.text)
         news_title = html.xpath('//ul[@id=\"js_ggzx\"]/li/a//text()')
         news_url = html.xpath('//ul[@id=\"js_ggzx\"]/li/a/@href')
         news_date = html.xpath('//ul[@id=\"js_ggzx\"]/li/span//text()')
         for i in range(len(news_title)):
             ds = {}
             ds['title'] = news_title[i]
             ds['code'] = code
             ds['url'] = news_url[i]
             ds['publish_date'] = news_date[i]
             ds['timestamp'] = str_to_timestamp(news_date[i])
              content_info = self._get_content(news_url[i])  # fetch the article only once
              ds['source'] = content_info[0]
              ds['content'] = content_info[1]
             ds['site'] = '新浪财经'
             ds['type'] = 'HK'
             if ds['content'] != '':
                 data.append(ds)
     return data
    def news_crawler(self, code, date=None):
        """
        采集 新浪财经 国内股票新闻数据
        :param code:
        :param date:
        :return:
        """
        page = 1
        data = []

        url = self.news_url % (code, page)
        # print(url)
        r = requests.get(url, headers=self.headers)
        r.encoding = 'gb2312'
        html = etree.HTML(r.text)
        news_href = html.xpath('//div[@class=\"datelist\"]/ul/a/@href')
        news_list = html.xpath('//div[@class=\"datelist\"]/ul/a/text()')

        for i in range(len(news_list)):
            ds = {}
            ds['title'] = news_list[i]
            ds['code'] = code
            ds['url'] = news_href[i]
            ds['publish_date'] = self._get_date(news_href[i])
            ds['timestamp'] = str_to_timestamp(ds['publish_date'])
            content_info = self._get_content(news_href[i])  # fetch the article only once
            ds['source'] = content_info[0]
            ds['content'] = content_info[1]
            ds['site'] = '新浪财经'
            ds['type'] = 'CN'
            if ds['content'] != '':
                data.append(ds)
        return data
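These Sina Finance crawlers call a str_to_timestamp helper that is not shown in the snippets. A minimal sketch of what such a helper might look like, assuming the publish dates are plain strings such as '2020-01-01 09:30:00' (both the helper body and the date format are assumptions):

import time

def str_to_timestamp(date_str, fmt='%Y-%m-%d %H:%M:%S'):
    # Hypothetical helper: converts a date string to a Unix timestamp in seconds.
    # The actual format expected by the crawlers above is not shown in the source.
    return int(time.mktime(time.strptime(date_str, fmt)))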
Example #5
    def xpath_parser(url, urldata):
        is_secret_cookie = 'cookie' in urldata and urldata['cookie'] == 'secret_cookie'
        html = WebHelper.get_html(url, is_secret_cookie)
        proxylist = []
        if not html:
            return proxylist
        html = etree.HTML(html, parser=etree.HTMLParser(encoding='utf-8'))
        proxy_data = html.xpath(urldata['pattern'])

        for proxy in proxy_data:
            ip = proxy.xpath(urldata['position']['ip'])[0].text.strip()
            port = proxy.xpath(urldata['position']['port'])[0].text.strip()
            proxy_type = ''
            protocol = ''
            if urldata['position']['type'] != '':
                proxy_type = proxy.xpath(
                    urldata['position']['type'])[0].text.strip()
            else:
                proxy_type = '高匿'  # default to "high anonymity" when the site lists no type
            if urldata['position']['protocol'] != '':
                protocol = proxy.xpath(
                    urldata['position']['protocol'])[0].text.strip()
            else:
                protocol = 'http'
            try:
                proxylist.append({
                    'ip': ip,
                    'port': int(port),
                    'type': proxy_type,
                    'protocol': protocol
                })
            except:  # skip rows whose port is not a valid integer
                continue
        return proxylist
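xpath_parser is driven entirely by a urldata configuration dict. The real entries live elsewhere in the project, but a hypothetical entry illustrating the keys the function reads ('cookie', 'pattern' and the 'position' sub-dict) could look like this; the site and XPath values below are placeholders, not taken from the project:

# Hypothetical urldata entry; every value is illustrative only.
urldata = {
    'cookie': '',  # set to 'secret_cookie' when the page needs the special cookie
    'pattern': '//table[@id="proxy_list"]/tbody/tr',  # one row element per proxy
    'position': {
        'ip': './td[1]',        # element holding the IP address
        'port': './td[2]',      # element holding the port
        'type': './td[3]',      # anonymity level; '' falls back to '高匿'
        'protocol': './td[4]',  # '' falls back to 'http'
    },
}
# proxies = SomeCrawler.xpath_parser('https://example.com/free-proxy-list', urldata)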
Example #6
    def show_reference2(self, post_id, convert=True):
        """获取任意串的内容,convert为True的话会转换串内容的格式
        该函数是直接通过解析网页获得任意串的内容,目前还有一些问题,sage没法判断
        获取成功就返回串的内容,获取失败的话会返回出错信息"""
        payload = {'id': post_id}
        r = self.session.get(adao_show_reference2,
                             params=payload,
                             timeout=self.timeout)
        html = etree.HTML(r.text)
        post = {'admin': '0', 'email': '', 'ext': '', 'img': '', 'sage': '0'}

        post_id2 = html.xpath('//div/@data-threads-id')[0]
        if post_id != post_id2:
            return "该串不存在"

        post['id'] = post_id

        image = html.xpath('/html/body/div/div/div/div/div/a/@href')
        if image:
            p = re.compile('/image/.+')
            post['img'], post['ext'] = os.path.splitext(
                p.search(image[0]).group()[7:])

        post['title'] = html.xpath(
            '//span[@class="h-threads-info-title"]')[0].text
        post['name'] = html.xpath(
            '//span[@class="h-threads-info-email"]')[0].text
        post['now'] = html.xpath(
            '//span[@class="h-threads-info-createdat"]')[0].text
        post['userid'] = html.xpath(
            '//span[@class="h-threads-info-uid"]')[0].text[3:]

        thread_id = html.xpath('//div[@class="h-threads-info"]/a/@href')[0]
        p = re.compile(r'/t/([0-9]+?)\?r=')
        post['thread_id'] = p.search(thread_id).group()[3:-3]

        content = html.xpath('//div[@class="h-threads-content"]')[0]
        post['content'] = etree.tostring(
            content, encoding='unicode').strip()[37:-12].strip()

        if convert:
            post['content'] = self.convert_content(post['content'])

        if not post['userid']:
            post['admin'] = '1'
            userid = etree.tostring(
                html.xpath('//span[@class="h-threads-info-uid"]')[0],
                encoding='unicode')
            post['userid'] = self.convert_content(userid)[3:]

        return post
Example #7
 def get_content_list(self, html_str):  # extract the data
     html = etree.HTML(html_str)
     print(html)
     s1 = etree.tostring(html).decode(encoding='utf-8')
     print(s1)
     # get the TV-series name
     print("++++++++++++++++++++++++++++")
     ret1 = html.xpath("//link/@href")
     print(ret1)
Example #8
def get_detail_url(url):
    response = requests.get(url, headers=HEADERS, timeout=120)
    # decode the raw bytes as GBK; 'ignore' skips characters that cannot be decoded
    text = response.content.decode('gbk', 'ignore')
    html = etree.HTML(text)
    detail_urls = html.xpath("//table[@class='tbspan']//a/@href")
    detail_urls = map(lambda url: BASE_DOMAIN + url, detail_urls)
    return detail_urls
 def _unit_fetch(self):
     try:
         resp = self.s.get(
             'http://gzlss.hrssgz.gov.cn/gzlss_web/business/tomain/main.xhtml'
         )
         html = etree.HTML(resp.text)
         target = html.xpath('/html/body/div[1]/div[3]/span/font[1]')
         self.result_data.update({'姓名': target[0].text})
         self.result_identity.update(
             {'task_name': self.task_info['task_name']})
     except InvalidConditionError as e:
         raise PreconditionNotSatisfiedError(e)
Example #10
    def getHtml(self, url, xpathsearch = None):
        try:
            print url
            request = urllib2.Request(url)
            request.add_header('User-Agent', random.choice(BROWSERS))
            ul = urllib2.urlopen(request)
            info = ''.join(ul.read())
            if info is None:
                return ""

            r = html.xpath(info, xpathsearch)
            return ''.join(r).strip()
        except Exception,e:
            print '[*] %s' % e  
Example #11
    def parse_campsites_reservations(self, response):
        html_url = ('receives/result_rs_%s_%s.html' % (str(response.meta['PlaceId']), str(response.meta['FacilityId'])))
        campsite_page = codecs.decode(response.body, 'utf8')
        campsite_page_dict = json.loads(campsite_page)
        html_body = campsite_page_dict['d']
        html = HtmlResponse(url=html_url, encoding='utf-8', body=html_body)
        sites = html.xpath('//table/tr[@class="unitdata"]')
        if len(sites):
            self.save_reservations_html(html, str(response.meta['PlaceId']), str(response.meta['FacilityId']))
        all_reservations = []
        for site in sites:
            reservation_links = site.xpath('.//td/@onclick').extract()  # relative path, so only this row's cells are matched
            reservations = self.parse_a_campsite_reservations(reservation_links, response.meta['PlaceId'], response.meta['FacilityId'])
            all_reservations = all_reservations + reservations

        while len(all_reservations):
            yield all_reservations.pop()
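parse_campsites_reservations expects PlaceId and FacilityId in response.meta and a JSON body whose 'd' field holds HTML. A sketch of how such a request might be built; the endpoint, payload shape and helper name are assumptions, only the meta keys are taken from the callback above:

import json
import scrapy

# Hypothetical helper; in practice this would be a method on the same spider class.
def build_reservation_request(spider, place_id, facility_id):
    return scrapy.Request(
        url='https://example.com/reservations/grid',            # placeholder endpoint
        method='POST',
        body=json.dumps({'PlaceId': place_id, 'FacilityId': facility_id}),
        headers={'Content-Type': 'application/json'},
        meta={'PlaceId': place_id, 'FacilityId': facility_id},  # read back via response.meta above
        callback=spider.parse_campsites_reservations,
    )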
Example #13
    def goubanjia(urldata):
        html = WebHelper.get_html(urldata['urls'][0])
        html = etree.HTML(html)
        proxy_data = html.xpath('//table/tbody/tr[position()>=1]')
        ipAndPort_xpath = './td[1]'
        type_xpath = './td[2]'
        protocol_xpath = './td[3]'
        proxylist = []
        for data in proxy_data:
            ip = ''
            port = 0
            proxy_type = data.xpath('./td[2]/a/text()')[0]
            protocol = data.xpath('./td[3]/a/text()')[0]

            ip_port_data = data.xpath(ipAndPort_xpath)[0]
            for e in ip_port_data:
                e_style = e.attrib.get('style', '')
                e_class = e.attrib.get('class', '')

                if 'port' in e_class:
                    # The port is hidden in the second class token (e.g. class="port GEA"):
                    # each letter's index in 'ABCDEFGHIZ' gives one digit, and the joined
                    # number shifted right by 3 is the real port.
                    if e.text:
                        letters = e_class.split(' ')[1]
                        digits = ''.join(str('ABCDEFGHIZ'.index(ch)) for ch in letters)
                        port = int(digits) >> 0x3
                elif 'none' not in e_style or e_style == '':
                    # Visible text fragments are concatenated into the IP address;
                    # fragments styled display:none are decoys and are skipped.
                    if e.text:
                        ip += e.text

            proxylist.append({
                'ip': ip,
                'port': int(port),
                'type': proxy_type,
                'protocol': protocol
            })

        return proxylist
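The port decoding above can be checked in isolation; a small standalone sketch of the same scheme, with an illustrative class value (the input 'port GEA' is made up for the example):

def decode_port(class_attr):
    # e.g. 'port GEA' -> letters 'GEA' -> indices 6, 4, 0 -> '640' -> 640 >> 3 == 80
    letters = class_attr.split(' ')[1]
    digits = ''.join(str('ABCDEFGHIZ'.index(ch)) for ch in letters)
    return int(digits) >> 0x3

print(decode_port('port GEA'))  # 80 (illustrative input)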
Example #14
<li class ="item-1"> <a href="link2.html" > second itme </a> </li>
<li class ="item-inactive"> <a href="link3.html" > third itme </a> </li>
<li class ="item-1"> <a href="link4.html" > forth itme </a> </li>
<li class ="item-0"> <a href="link5.html" > fifth itme </a>
</ul>
</div>

# parse the file into an XPath-queryable document object
# the etree HTMLParser automatically repairs broken HTML
html = etree.parse('./text.html', etree.HTMLParser())
result = etree.tostring(html)
print(result.decode('utf-8'))

# select all nodes
result = html.xpath('//*')
# select direct <a> children of <li> nodes
result = html.xpath('//li/a')
# select the parent node, then read one of its attributes
result = html.xpath('//a[@href="link4.html"]/../@class')
result = html.xpath('//a[@href="link4.html"]/parent::*/@class')
# filter by attribute
result = html.xpath('//li[@class="item-0"]')
# get text
result = html.xpath('//li[@class="item-0"]/a/text()')  # descend level by level
result = html.xpath('//li[@class="item-0"]//text()')  # all text inside descendant nodes
# get attributes
result = html.xpath('//li/a/@href')
# matching an attribute with multiple values
# (the sample text is replaced with a single <li> whose class attribute holds two values)
# text = '<li class="li li-first"><a href="link.html">first item</a></li>'
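The example breaks off here; the usual point is that an exact @class match no longer works once the attribute holds several values, and contains() is used instead. A minimal sketch of that idea (a reconstruction, not the original continuation):

from lxml import etree

text = '<li class="li li-first"><a href="link.html">first item</a></li>'
html = etree.HTML(text)
print(html.xpath('//li[@class="li"]/a/text()'))             # [] because the exact match fails
print(html.xpath('//li[contains(@class, "li")]/a/text()'))  # ['first item']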
Example #15
def parse_detail_page(url):
    movie = {}
    response = requests.get(url, headers=HEADERS)
    text = response.content.decode('gbk', 'ignore')  # decode GBK, ignoring undecodable bytes
    html = etree.HTML(text)

    try:
        title = html.xpath(
            "//div[@class='title_all']//font[@color='#07519a']/text()")[0]
    except:  # no recognizable title on this page; skip the entry
        return ''

    movie['title'] = title
    Zoome = html.xpath("//div[@id='Zoom']")[0]  # xpath returns a list; take the first match
    imgs = Zoome.xpath(".//img/@src")
    if imgs:  # continue only if the page contains images
        cover = imgs[0]
        # screenshot=imgs[1]
        movie['cover'] = cover

        # movie['screenshot'] = screenshot  # not every movie has a screenshot, so this field is skipped for now

        def parse_info(info, rule):
            return info.replace(rule, "").strip()

        infos = Zoome.xpath(".//text()")

        for index, info in enumerate(infos):
            if info.startswith("◎年  代"):
                info = parse_info(info, "◎年  代")
                movie['year'] = info

            if info.startswith("◎国  家"):
                info = parse_info(info, "◎国  家")
                movie['country'] = info

            elif info.startswith("◎译  名"):
                info = parse_info(info, "◎译  名")
                movie['name_cn'] = info
            elif info.startswith("◎片  名"):
                info = parse_info(info, "◎片  名")
                movie['name_en'] = info
            elif info.startswith("◎产  地"):
                info = parse_info(info, "◎产  地")
                movie['country'] = info
            elif info.startswith("◎类  别"):
                info = parse_info(info, "◎类  别")
                movie['category'] = info
            elif info.startswith("◎语  言"):
                info = parse_info(info, "◎语  言")
                movie['language'] = info
            elif info.startswith("◎字  幕"):
                info = parse_info(info, "◎字  幕")
                movie['sub_title'] = info
            elif info.startswith("◎上映日期"):
                info = parse_info(info, "◎上映日期")
                movie['release_time'] = info
            elif info.startswith("◎IMDb评分"):
                info = parse_info(info, "◎IMDb评分")
                movie['imdb_score'] = info
            elif info.startswith("◎豆瓣评分"):
                info = parse_info(info, "◎豆瓣评分")
                movie['douban_score'] = info
            elif info.startswith("◎文件格式"):
                info = parse_info(info, "◎文件格式")
                movie['file_format'] = info
            elif info.startswith("◎视频尺寸"):
                info = parse_info(info, "◎视频尺寸")
                movie['ratio'] = info
            elif info.startswith("◎片  长"):
                info = parse_info(info, "◎片  长")
                movie['length'] = info
            elif info.startswith("◎导  演"):
                info = parse_info(info, "◎导  演")
                movie['director'] = info
            elif info.startswith("◎主  演"):
                info = parse_info(info, "◎主  演")
                actors = [info]
                for x in range(index + 1, len(infos)):
                    actor = infos[x].strip()
                    if actor.startswith("◎"):
                        break
                    actors.append(actor)
                movie['actors'] = actors
            elif info.startswith("◎简  介"):
                info = parse_info(info, "◎简  介")
                profiles = [info]
                for x in range(index + 1, len(infos)):
                    profile = infos[x].strip()
                    if profile.startswith("【下载地址】"):
                        break
                    profiles.append(profile)
                movie['profiles'] = profiles

        try:
            download_url = html.xpath("//td[@bgcolor='#fdfddf']/a/@href")[0]

        except:
            download_url = ''

        movie['download_url'] = download_url
        return movie
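A minimal driver showing how get_detail_url and parse_detail_page might be wired together; BASE_DOMAIN, HEADERS and the list-page URL are assumptions, since they are defined elsewhere in the original project:

# Relies on get_detail_url (Example #8) and parse_detail_page (Example #15) above.
BASE_DOMAIN = 'https://www.example.com'   # assumption: the real domain is not shown
HEADERS = {'User-Agent': 'Mozilla/5.0'}   # assumption: a minimal header set

def spider():
    list_url = BASE_DOMAIN + '/list_1.html'   # hypothetical list-page URL
    for detail_url in get_detail_url(list_url):
        movie = parse_detail_page(detail_url)
        if movie:
            print(movie.get('title'), movie.get('download_url'))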