Example #1
def search_course_4(sess, *args: list):
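    # Query the 92daikan question bank once per question in args; returns a
    # list of answer lists (one per question), with errors reported in-band.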
    # API endpoint
    url = "http://www.92daikan.com/tiku.aspx"

    # Fetch the hidden ASP.NET form fields needed by the endpoint
    try:
        res = sess.get(url, verify=False)
        res.raise_for_status()
        selector = etree.HTML(res.text)
        # take the first match; an IndexError here is caught below
        viewstate = selector.xpath('//*[@id="__VIEWSTATE"]/@value')[0]
        viewstategenerator = selector.xpath(
            '//*[@id="__VIEWSTATEGENERATOR"]/@value')[0]
        eventvalidation = selector.xpath(
            '//*[@id="__EVENTVALIDATION"]/@value')[0]
    except (requests.exceptions.RequestException, IndexError) as e:
        result = []
        for each in args:
            answer = []
            answer.append({'topic': str(e), 'correct': ''})
            result.append(answer)
        return result

    # Form parameters
    result = []
    data = {}
    data['__VIEWSTATE'] = viewstate
    data['__VIEWSTATEGENERATOR'] = viewstategenerator
    data['__EVENTVALIDATION'] = eventvalidation
    data['ctl00$ContentPlaceHolder1$gen'] = '查询'  # '查询' = "search" (submit button value)
    for i in range(len(args)):
        data['ctl00$ContentPlaceHolder1$timu'] = args[i]

        # POST request
        logging.info("Post to 92daikan. Question %d" % i)
        try:
            res = sess.post(url, data=data, verify=False)
            res.raise_for_status()
        except requests.exceptions.RequestException as e:
            answer = []
            answer.append({'topic': str(e), 'correct': ''})
            result.append(answer)
            continue

        # Process the result
        logging.info("Processing result")
        answer = []
        selector = etree.HTML(res.text)
        temp = {}
        temp['topic'] = args[i]
        temp['correct'] = selector.xpath('//*[@id="daan"]/text()')[0]
        if temp['correct'] != '未找到答案':  # '未找到答案' = "answer not found"
            answer.append(temp)
        result.append(answer)

    logging.info("Return result: %s" % result)

    return result
Example #2
 def download_chart(self, image_dir, itemids, stime, etime):
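     # Download the monitoring chart image for the given item IDs and date
     # range, then name the PNG after the host/item title scraped from the
     # history page.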
     # This URL fetches the chart image. Note that the pie-chart URL differs
     # from this one, so check carefully!
     url = "http://company.monitor.com/chart.php"
     # Size of the line chart
     url_par = {"width": 1778, "height": 300, "itemids": itemids}
     # Convert the start and end dates from str to datetime
     stime = datetime.datetime.strptime(stime, "%Y-%m-%d")
     etime = datetime.datetime.strptime(etime, "%Y-%m-%d")
     # Compute the period in seconds
     period = int((etime - stime).total_seconds())
     url_par["period"] = period
     # Convert stime back to str
     stime = stime.strftime('%Y%m%d%H%M%S')
     url_par["stime"] = stime
     data = urllib.parse.urlencode(url_par).encode(encoding='UTF8')
     request = urllib.request.Request(url, data)
     response = self.urlOpener.open(request)
     image = response.read()
     html = requests.get('http://zabbix.uce.local/history.php?action=showgraph&itemids[]={}'.format(itemids)).text
     page = etree.HTML(html)
     hostname_itemname = page.xpath('//div[@class="header-title"]/h1/text()')[0].split(':')
     hostname = hostname_itemname[0]
     hostname_itemname.pop(0)
     itemname = '_'.join(hostname_itemname).replace('/', '_')
     imagename = "{}\\{}_{}_{}_({}).png".format(image_dir, hostname, stime, etime.strftime('%Y%m%d%H%M%S'), itemname)
     with open(imagename, 'wb') as f:
         f.write(image)
Example #3
 def parse_html(self, url):
     html = self.get_html(url).decode()
     parse_obj = etree.HTML(html)
     href_list = parse_obj.xpath(
         '//div[@class="all"]/ul[@class="archives"]/li/p[@class="url"]/a/@href'
     )
     print("href_list:", href_list)
     self.write_html(href_list)
Example #4
 def content(self):
     while True:
         # Pull HTML source from the response queue, one page at a time
         html = self.resQueue.get()
         parseHtml = etree.HTML(html)
         r_list = parseHtml.xpath('//div[@class="j-r-list-c-desc"]/a/text()')
         for r in r_list:
             print(r+"\n")
         self.resQueue.task_done()
Example #5
def parse(s, html, idx):
    result = {}

    tree = etree.HTML(html)
    try:
        result['lt'] = tree.xpath('//input[@name="lt"]/@value')[0]
        result['execution'] = tree.xpath('//input[@name="execution"]/@value')[0]
        result['path'] = tree.xpath('//form[@id="fm1"]/@action')[0]
    except IndexError:
        return None

    return result
Example #6
    def crawlerFeixiaohao(self):
        response = self.get_data('http://www.feixiaohao.com')

        html = etree.HTML(response.text)
        tbody = html.xpath('//*[@id="table"]/tbody/tr')
        for item in tbody:
            id = item.xpath('@id')[0]
            name = item.xpath('td[2]/a/img/@alt')[0]
            marketValue = item.xpath('td[3]/text()')[0]
            price = item.xpath('td[4]/a/text()')[0]
            circulation = item.xpath('td[5]/text()')[0]
            self.save_obj(id, name, name, price, circulation, marketValue, '',
                          '', '', 'FXH')
Example #7
    def getItemDetail(self, link, save_img_path):
        """从宝贝的详情链接里 爬取图片
        Arguments:
            link {String} -- [宝贝详情链接]
        """
        newDriver = webdriver.Chrome()
        newDriver.get(link)
        time.sleep(self.sleep_time)

        print(newDriver.title)

        img_dir_path = save_img_path + newDriver.title
        if self.mkdir(img_dir_path):
            print('Created item directory')

        html = newDriver.page_source.encode('utf-8')
        selector = etree.HTML(html)

        # Cover images
        J_ULThumb = selector.xpath("//div[@class='tb-gallery']/ul/li")
        index = 0
        for li in J_ULThumb:
            # Swap the thumbnail URL from 50x50 to 400x400
            if len(li.xpath("./div/a/img/@data-src")) < 1:
                continue
            small_pic = li.xpath("./div/a/img/@data-src")[0]
            common_pic = 'https:' + small_pic.replace('50x50', '400x400')
            thumb_title = 'cover' + str(index)
            print(thumb_title)
            # self.saveImg(img_dir_path, common_pic, thumb_title.decode('utf-8'))
            index += 1

        # Crawl every image in the description
        all_img = selector.xpath(
            "//div[@id='J_DivItemDesc']//descendant::img/@src")
        print(all_img)
        index = 0
        for img in all_img:
            # imglink = ''
            if img.startswith('http'):
                imglink = img
            else:
                imglink = 'https:' + img

            self.saveImg(img_dir_path, imglink, str(index))
            index += 1

        newDriver.quit()
Example #8
    def analysis_data(self, data):
        """
        Parse the data
        :return:
        """
        html_data = etree.HTML(data)
        # Grab the post nodes
        div_list = html_data.xpath('//div[@id="content-left"]/div')

        name_list = []
        # Iterate over the posts on the page
        for div in div_list:
            nick_name = div.xpath('.//h2/text()')[0]
            print(nick_name.strip())
            name_list.append(nick_name.strip())
        return name_list
Example #9
def search_course_2(sess, *args: list):
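    # If the caller passed a question string instead of a Session as the first
    # argument, shift it into args and create a fresh Session.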
    if not isinstance(sess, requests.Session):
        args = list(args)
        args.insert(0, sess)
        args = tuple(args)
        sess = requests.Session()

    # API endpoint
    url = "https://cx.poxiaobbs.com/index.php"

    # Request parameters
    data = {}
    result = []
    for i in range(len(args)):
        data['tm'] = args[i]

        # POST request
        logging.info("Post to poxiao bbs php. Question %d" % i)
        try:
            res = sess.post(url, data=data, verify=False)
            res.raise_for_status()
        except requests.exceptions.RequestException as e:
            answer = []
            answer.append({'topic': str(e), 'correct': ''})
            result.append(answer)
            continue

        # Process the result
        logging.info("Processing result")
        answer = []
        selector = etree.HTML(res.text)
        answer_div = selector.xpath('/html/body/div[1]/div[@class="ans"]')
        for each in answer_div:
            temp = {}
            answer_text = each.xpath('string(.)')\
                .strip().replace('  ', '').replace('\n', '')
            if "答案:" in answer_text:
                temp['topic'] = answer_text.split("答案:")[0]
                temp['correct'] = answer_text.split("答案:")[1]
                answer.append(temp)
        result.append(answer)

    logging.info("Return result: %s" % result)

    return result
Example #10
def login(usr, pwd, idx):
    s = requests.Session()

    r = s.get('https://passport.csdn.net/account/login',
              headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:41.0) Gecko/20100101 Firefox/41.0',
                       'Host': 'passport.csdn.net'})

    while True:
        res = parse(s, r.text, idx)
        if res is None:
            return False
        url = 'https://passport.csdn.net' + res['path']
        form = {'username': usr, 'password': pwd, '_eventId': 'submit',
                'execution': res['execution'], 'lt': res['lt']}
        if 'validateCode' in res:
            form['validateCode'] = res['validateCode']
        s.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:41.0) Gecko/20100101 Firefox/41.0',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Host': 'passport.csdn.net',
            'Origin': 'https://passport.csdn.net',
            'Referer': 'https://passport.csdn.net/account/login',
            'Upgrade-Insecure-Requests': '1',
            })
        r = s.post(url, data=form)

        tree = etree.HTML(r.text)
        err_strs = tree.xpath('//span[@id="error-message"]/text()')
        if len(err_strs) == 0:
            return True
        err_str = err_strs[0]
        print(err_str)

        validate_code_err = '验证码错误'  # "captcha error"
        usr_pass_err = '帐户名或登录密码不正确,请重新输入'  # "wrong account name or password"
        try_later_err = '登录失败连续超过5次,请10分钟后再试'  # "5+ consecutive failures, retry in 10 minutes"

        if err_str[:5] == validate_code_err[:5]:
            pass
        elif err_str[:5] == usr_pass_err[:5]:
            return False
        elif err_str[:5] == try_later_err[:5]:
            return False
            return False
        else:
            return True
Example #11
def downGubenFromEastmoney():
    """ 从东方财富下载总股本变动数据
    url: 
    """
    pass
    ts_code = '600000.SH'
    # startDate = '2019-04-01'
    bs.login()
    # from misc import usrlGubenEastmoney
    # urlGubenEastmoney('600000')
    gubenURL = urlGubenEastmoney(ts_code)
    # req = getreq(gubenURL, includeHeader=True)
    req = getreq(gubenURL)
    guben = urlopen(req).read()

    gubenTree = etree.HTML(guben)
    # //*[@id="lngbbd_Table"]/tbody/tr[1]/th[3]
    # gubenData = gubenTree.xpath('//tr')
    gubenData = gubenTree.xpath('''//html//body//div//div
                                //div//div//table//tr//td
                                //table//tr//td//table//tr//td''')
    date = [gubenData[i][0].text for i in range(0, len(gubenData), 2)]
    date = [datetime.strptime(d, '%Y%m%d') for d in date]
    #     print date
    totalshares = [
        gubenData[i + 1][0].text for i in range(0, len(gubenData), 2)
    ]
    #     print totalshares
    #     t = [i[:-2] for i in totalshares]
    #     print t
    try:
        totalshares = [float(i[:-2]) * 10000 for i in totalshares]
    except ValueError as e:
        # logging.error('ts_code:%s, %s', ts_code, e)
        print('ts_code:%s, %s' % (ts_code, e))
    #     print totalshares
    gubenDf = DataFrame({
        'ts_code': ts_code,
        'date': date,
        'totalshares': totalshares
    })
    return gubenDf
Example #12
def get_data(text):
    html = etree.HTML(text)
    # Extract the fields with XPath
    divs = html.xpath('//*[@id="resultList"]/div[@class="el"]')

    for div in divs:
        job_title = div.xpath('./p/span/a/@title')
        job_company = div.xpath('./span[1]/a/@title')
        job_address = div.xpath('./span[2]/text()')
        job_salary = div.xpath('./span[3]/text()')
        job_href = div.xpath('./p/span/a/@href')

        job_title = job_title[0] if len(job_title) > 0 else ''
        job_company = job_company[0] if len(job_company) > 0 else ''
        job_address = job_address[0] if len(job_address) > 0 else ''
        job_salary = job_salary[0] if len(job_salary) > 0 else ''
        job_href = job_href[0] if len(job_href) > 0 else ''

        job_info = []  # holds one row of data
        job_info.append(job_title)
        job_info.append(job_company)
        job_info.append(job_address)
        job_info.append(job_salary)

        # Fetch the job posting page and parse it for the full description
        #job_info.append(job_href)

        job_body, job_name = getADetails(job_href)

        #print(job_body)

        #print(job_name)

        job_info.append(job_body)
        job_info.append(job_name)
        write_excel(job_info, 'Simple_spider_NJ2.xls')  # write the row to the sheet
        job_info = []  # clear after each write
        time.sleep(1)
Example #13
    def analysis_data(self):
        """
        Parse the data
        :return:
        """
        while True:
            data = self.data_queue.get()

            html_data = etree.HTML(data)
            # Grab the post nodes
            div_list = html_data.xpath('//div[@id="content-left"]/div')

            name_list = []
            # Iterate over the posts on the page
            for div in div_list:
                nick_name = div.xpath('.//h2/text()')[0]
                print(nick_name.strip())
                name_list.append(nick_name.strip())
            # return name_list
            # Push the parsed names back onto the queue
            self.data_queue.put(name_list)

            self.data_queue.task_done()
Example #14
    def getItem(self):
        """爬取当前页面的每个宝贝,
           提取宝贝名字,价格,标题等信息
        """

        html = self.driver.page_source.encode('utf-8')
        selector = etree.HTML(html)
        itemList = selector.xpath("//div[@class='item3line1']")

        # Loop over every product on the page
        index = 0
        for item3line1 in itemList:
            dl = item3line1.xpath("./dl")
            for item in dl:
                link = 'https:' + item.xpath("./dt/a/@href")[0]
                photo = 'https:' + item.xpath("./dt/a/img/@src")[0]
                title = item.xpath("./dd/a/text()")[0]

                res = {'link': link, 'photo': photo, 'title': title}

                # Open the item detail page and crawl its images
                self.getItemDetail(link, '')

        # Pagination info
        pagination = selector.xpath(
            "//div[@class='pagination']/a[contains(@class, 'J_SearchAsync') and contains(@class, 'next')]/@href"
        )
        print(pagination)
        print('Preparing to switch pages')
        if len(pagination) == 0:
            print('No next page')

        else:
            print('Loading the next page')
            self.site_url = 'https:' + pagination[0]
            print(self.site_url)
            self.getPage()
Example #15
    def parse0(self, html_packed):
        """ 网贷新闻、平台动态、网贷专栏 """
        seeds = []
        try:
            tree = etree.HTML(html_packed['html'].decode('gbk', 'ignore'))
            # Pick the list-box id for the category ('网贷新闻' = news,
            # '平台动态' = platform updates, '网贷专栏' = columns)
            txt = 'listbox20' if html_packed['category'] == '网贷新闻' else (
                'listbox28'
                if html_packed['category'] == '平台动态' else 'listbox26')
            divs = tree.xpath(
                '//div[@id="%s"]//div[@class="mod-leftfixed mod-news clearfix"]'
                % txt)
            for div in divs:
                try:
                    seed = {
                        'spider':
                        'Spider_bbsp2peye.crawl0',
                        'category':
                        html_packed['category'],
                        'pubTime':
                        time.strftime('%Y-%m-%d %H:%M', time.localtime())
                    }
                    # URL
                    url = div.xpath(
                        './/div[@class="main"]//div[@class="hd"]/a/@href')
                    if len(url) == 0:
                        url = div.xpath(
                            './/div[@class="sub"]/div[@class="inner"]/a/@href')
                    if len(url) > 0:
                        seed['url'] = url[0]

                        # Description
                        description = ' '.join(
                            div.xpath(
                                './/div[@class="main"]//div[@class="bd"]//text()'
                            )).strip()
                        if len(description) > 0:
                            seed['description'] = description

                        # Cover image
                        coverImg = div.xpath(
                            './/div[@class="sub"]/div[@class="inner"]/a/img[@class="tn"]/@src'
                        )
                        if len(coverImg) > 0:
                            seed['coverImg'] = coverImg[0]

                        # Publication time
                        pubTime = div.xpath(
                            './/div[@class="main"]//div[@class="fd-left"]//span'
                        )
                        if len(pubTime) > 0:
                            pubTime = pubTime[-1].xpath('./text()')
                            if len(pubTime) > 0 and pubTime[0].strip(
                            ).startswith('20'):
                                seed['pubTime'] = pubTime[0].strip()

                        if seed['pubTime'] > html_packed['end'].replace(
                                '-0', '-'):
                            continue
                        elif seed['pubTime'] < html_packed['start'].replace(
                                '-0', '-'):
                            break
                        seeds.append(seed)
                except Exception as e:
                    self.logger.error(str(e))
            else:
                if len(divs) > 0:
                    nextURL = tree.xpath(
                        '//div[@class="mod-page"]/div[@class="c-page"]/a[@title="下一页"]/@href')  # '下一页' = "next page"
                    if len(nextURL) > 0:
                        seeds.append({
                            'url':
                            'http://news.p2peye.com' + nextURL[0]
                            if nextURL[0].startswith('/') else nextURL[0],
                            'category':
                            html_packed['category'],
                            'start':
                            html_packed['start'],
                            'end':
                            html_packed['end'],
                            'spider':
                            'Spider_bbsp2peye.crawl0',
                            'dont_filter':
                            True,
                        })
        except Exception as e:
            self.logger.error(str(e))
        return [], seeds
Example #16
    def parse2(self, html_packed):
        """ 曝光台 """
        seeds = []
        try:
            tree = etree.HTML(html_packed['html'].decode('gbk', 'ignore'))
            lis = tree.xpath('//ul[@role-parent="newloadmore"]/li')
            for li in lis:
                try:
                    seed = {
                        'spider':
                        'Spider_bbsp2peye.crawl0',
                        'start':
                        html_packed['start'],
                        'end':
                        html_packed['end'],
                        'category':
                        html_packed['category'],
                        'pubTime':
                        time.strftime('%Y-%m-%d %H:%M', time.localtime())
                    }

                    # URL
                    url = li.xpath('./a[@class="newlistbox"]/@href')
                    if len(url) > 0:
                        seed['url'] = 'http://www.p2peye.com' + url[0] if url[
                            0].startswith('/thread') else url[0]

                        # Description
                        description = ' '.join(
                            li.xpath(
                                './a/div[@class="synopsis"]/text()')).strip()
                        if len(description) > 0:
                            seed['description'] = description

                        # Publication time
                        pubTime = li.xpath(
                            './a/div/span[@class="time"]/text()')
                        if len(pubTime) > 0:
                            seed['pubTime'] = pubTime[0].strip()
                        if seed['pubTime'] > html_packed['end'].replace(
                                '-0', '-'):
                            continue
                        elif seed['pubTime'] < html_packed['start'].replace(
                                '-0', '-'):
                            break
                        seeds.append(seed)
                except Exception as e:
                    self.logger.error(str(e))
            else:
                if len(lis) > 0:
                    pageNum = re.findall(r'forum-\d+-(\d+)\.html',
                                         html_packed['url'])
                    if len(pageNum) > 0:
                        nextURL = html_packed['url'].replace(
                            '-%s.html' % pageNum[0],
                            '-%s.html' % (int(pageNum[0]) + 1))
                        seeds.append({
                            'url': nextURL,
                            'category': html_packed['category'],
                            'start': html_packed['start'],
                            'end': html_packed['end'],
                            'spider': 'Spider_bbsp2peye.crawl0',
                            'dont_filter': True,
                        })
        except Exception as e:
            self.logger.error(str(e))
        return [], seeds
Example #17
    def parse3(self, html_packed):
        """ 曝光帖子(曝光台帖子) """
        fields = []
        try:
            tree = etree.HTML(html_packed['html'].decode('gbk', 'ignore'))
            articleItem = ArticleItem()

            # Article ID
            aid = re.findall(r'p2peye\.com/(thread-.*?)\.html',
                             html_packed['url'])
            if len(aid) == 0:
                aid = re.findall(r'p2peye\.com/(article-.*?)\.html',
                                 html_packed['url'])
            if len(aid) > 0:
                articleItem.aid = aid[0]
                articleItem._id = 'p2peye-%s' % aid[0]

            # Article URL
            articleItem.url = html_packed['url']

            # Article title
            title = tree.xpath('//meta[@name="keywords"]/@content')
            if len(title) > 0:
                articleItem.title = title[0]

            # Article content
            content = tree.xpath('//div[@class="typeoption"]/table[@summary]')
            if len(content) > 0:
                articleItem.content = etree.tounicode(content[0])

            # Article description
            description = tree.xpath('//meta[@name="description"]/@content')
            if len(description) > 0:
                articleItem.description = description[0]
            elif 'description' in html_packed.keys():
                articleItem.description = html_packed['description']

            # Publication time
            pubTime = tree.xpath(
                '//meta[@property="og:release_date"]/@content')
            if len(pubTime) > 0:
                try:
                    articleItem.pubTime = int(
                        time.mktime(time.strptime(pubTime[0],
                                                  '%Y-%m-%d %H:%M')))
                except Exception as e:
                    pass
            else:
                pubTime = ' '.join(
                    tree.xpath(
                        '//div[@class="authi"]/em[contains(@id, "authorposton")]/text()'
                    ))
                pubTime = re.findall(r'20\d.*?\d*:\d*', pubTime)
                if len(pubTime) > 0:
                    try:
                        articleItem.pubTime = int(
                            time.mktime(
                                time.strptime(pubTime[0], '%Y-%m-%d %H:%M')))
                    except Exception as e:
                        pass

            # Author nickname
            author = tree.xpath(
                '//div[@class="pls favatar"]/div[@class="pi"]/div[@class="authi"]/a/text()'
            )
            if len(author) == 0:
                author = tree.xpath(
                    '//div[@class="pls favatar"]//div/strong/a[@class="xi2"]/text()'
                )
            if len(author) > 0:
                articleItem.authorNickname = author[0].strip()

            # Like count
            praise = tree.xpath(
                '//a[@id="recommend_add"]/i/span[@id="recommendv_add"]/text()')
            if len(praise) > 0:
                try:
                    articleItem.praiseCount = int(praise[0])
                except Exception as e:
                    pass

            # Dislike count
            refuseCount = tree.xpath(
                '//a[@id="recommend_subtract"]/i/span[@id="recommendv_subtract"]/text()'
            )
            if len(refuseCount) > 0:
                try:
                    articleItem.refuseCount = int(refuseCount[0])
                except Exception as e:
                    pass

            spans = tree.xpath('//td/div[@class="hm ptn"]/span[@class="xi1"]')
            if len(spans) == 2:
                # Read count
                try:
                    articleItem.readCount = int(spans[0].xpath('./text()')[0])
                except Exception as e:
                    pass

                # Comment count
                try:
                    articleItem.commentCount = int(spans[1].xpath('./text()')[0])
                except Exception as e:
                    pass

            # Share count
            shareCount = tree.xpath(
                '//a[@class="sharep"]/i/span[@id="sharenumber"]/text()')
            if len(shareCount) > 0:
                try:
                    articleItem.shareCount = int(shareCount[0])
                except Exception as e:
                    pass

            # Favorite count
            collectCount = tree.xpath(
                '//a[@id="k_favorite"]/i/span[@id="favoritenumber"]/text()')
            if len(collectCount) > 0:
                try:
                    articleItem.collectCount = int(collectCount[0])
                except Exception as e:
                    pass

            # Category
            if 'category' in html_packed.keys():
                articleItem.classification = html_packed['category']

            # Crawl source
            articleItem.crawlSource = '网贷天眼'  # site name (p2peye.com)

            # Crawl timestamp
            articleItem.crawlTimestamp = html_packed[
                'time_crawl'] if 'time_crawl' in html_packed.keys() else int(
                    time.time())

            field = dict(articleItem.__dict__)
            field['pipeline_dbType'] = 'mongo'
            fields.append(field)
        except Exception as e:
            self.logger.error(str(e))
        return fields, []
Example #18
    def parse1(self, html_packed):
        """ 文章(平台动态、网贷专栏、忘带新闻的帖子) """
        fields = []
        try:
            tree = etree.HTML(html_packed['html'].decode('gbk', 'ignore'))
            articleItem = ArticleItem()

            # Article ID
            aid = re.findall(r'p2peye\.com/(thread-.*?)\.html',
                             html_packed['url'])
            if len(aid) == 0:
                aid = re.findall(r'p2peye\.com/(article-.*?)\.html',
                                 html_packed['url'])
            if len(aid) > 0:
                articleItem.aid = aid[0]
                articleItem._id = 'p2peye-%s' % aid[0]

            # Article URL
            articleItem.url = html_packed['url']

            # Article title
            title = tree.xpath('//div[@id="ct"]//h1[@id="plat-title"]/text()')
            if len(title) == 0:
                title = tree.xpath('//meta[@name="keywords"]/@content')
            if len(title) > 0:
                articleItem.title = title[0].strip()

            # Article content
            content = tree.xpath('//div[@id="ct"]//td[@id="article_content"]')
            if len(content) > 0:
                articleItem.content = etree.tounicode(content[0])

            # Article description
            description = tree.xpath('//meta[@name="description"]/@content')
            if len(description) > 0:
                articleItem.description = ' '.join(description).strip()
            elif 'description' in html_packed.keys():
                articleItem.description = html_packed['description']

            # Cover thumbnail
            if 'coverImg' in html_packed.keys():
                articleItem.coverImg = html_packed['coverImg']

            txt = ' '.join(
                tree.xpath(
                    '//div[@id="ct"]//div[@class="c-a-inf"]//text()')).replace(
                        '\t', '').replace('\r',
                                          '').replace('\n',
                                                      '').replace('  ', '')
            # Publication time
            if 'pubTime' in html_packed.keys():
                try:
                    articleItem.pubTime = int(
                        time.mktime(
                            time.strptime(html_packed['pubTime'],
                                          '%Y-%m-%d %H:%M')))
                except Exception as e:
                    pass
            else:
                pubTime = re.findall(r'发布时间: ?(20.*?\d+:\d+)', txt)  # '发布时间' = "published at"
                if len(pubTime) > 0:
                    articleItem.pubTime = int(
                        time.mktime(time.strptime(pubTime[0],
                                                  '%Y-%m-%d %H:%M')))

            # Author
            authorNickname = re.findall(r'原作者:(.*?) ', txt)  # '原作者' = "original author"
            if len(authorNickname) == 0:
                authorNickname = re.findall(r'发布者:(.*?)\|', txt)  # '发布者' = "publisher"
            if len(authorNickname) > 0:
                articleItem.authorNickname = authorNickname[0].split(
                    '|')[0].split('来自')[0].strip()

            # Likes
            praiseCount = tree.xpath(
                '//div[@id="ct"]//div[@id="click_div"]//a[@title="给力"]/span/text()')  # '给力' = "like"
            if len(praiseCount) > 0:
                articleItem.praiseCount = int(praiseCount[0].strip())

            # Dislikes
            refuseCount = tree.xpath(
                '//div[@id="ct"]//div[@id="click_div"]//a[@title="没劲"]/span/text()')  # '没劲' = "lame"
            if len(refuseCount) > 0:
                articleItem.refuseCount = int(refuseCount[0].strip())

            # Read count
            readCount = re.findall(r'浏览量: ?(\d+)', txt)  # '浏览量' = "views"
            if len(readCount) > 0:
                articleItem.readCount = int(readCount[0])
            elif 'readCount' in html_packed.keys():
                articleItem.readCount = html_packed['readCount']

            # Comment count
            if 'commentCount' in html_packed.keys():
                articleItem.commentCount = html_packed['commentCount']

            # Category
            if 'category' in html_packed.keys():
                articleItem.classification = html_packed['category']

            # Article source
            source = re.findall(r'来自: ?(.*?)[ \|]', txt)  # '来自' = "from"
            if len(source) > 0:
                articleItem.source = source[0]

            # Crawl source
            articleItem.crawlSource = '网贷天眼'  # site name (p2peye.com)

            # Crawl timestamp
            articleItem.crawlTimestamp = html_packed['time_crawl']

            field = dict(articleItem.__dict__)
            field['pipeline_dbType'] = 'mongo'
            fields.append(field)

        except Exception as e:
            self.logger.error(str(e))
        return fields, []
Example #19
    def parse4(self, html_packed):
        """ 平台官方动态 """
        seeds = []
        try:
            tree = etree.HTML(html_packed['html'])
            lis = tree.xpath(
                '//div[@class="mod-list"]/ul/li[@class="item clearfix"]')
            for li in lis:
                try:
                    seed = {
                        'spider':
                        'Spider_bbsp2peye.crawl0',
                        'pubTime':
                        time.strftime('%Y-%m-%d %H:%M', time.localtime())
                    }

                    # URL
                    url = li.xpath('./div[@class="mc-hd"]/a/@href')
                    if len(url) > 0:
                        seed['url'] = url[0]

                        # Description
                        description = ' '.join(
                            li.xpath(
                                './div[@class="mc-bd"]/span/text()')).strip()
                        if len(description) > 0:
                            seed['description'] = description

                        commentCount = li.xpath(
                            './div//span[@class="ft-comment"]/text()')
                        try:
                            seed['commentCount'] = int(commentCount[0])
                        except Exception as e:
                            pass

                        readCount = li.xpath(
                            './div//span[@class="ft-see"]/text()')
                        try:
                            seed['readCount'] = int(readCount[0])
                        except Exception as e:
                            pass

                        # Publication time
                        pubTime = li.xpath(
                            './div/span[contains(@class, "time")]/text()')
                        if len(pubTime) > 0:
                            seed['pubTime'] = ':'.join(
                                pubTime[0].strip().split(':')[:-1]
                            )  # parsed as "2018-01-09 14:29:00"; keep only to the minute, i.e. "2018-01-09 14:29"
                        if seed['pubTime'] > html_packed['end']:
                            continue
                        elif seed['pubTime'] < html_packed['start']:
                            break
                        seed['category'] = '官方动态'  # "official updates"
                        seeds.append(seed)
                except Exception as e:
                    self.logger.error(str(e))
            else:
                if len(lis) > 0:
                    nextURL = tree.xpath(
                        '//div[contains(@class, "page")]/a[contains(text(), "下一页")]/@href')  # '下一页' = "next page"
                    if len(nextURL) > 0:
                        nextURL = nextURL[0]
                        if nextURL.startswith('/gfdt'):
                            nextURL = html_packed['url'].split(
                                '/gfdt')[0] + nextURL
                        seeds.append({
                            'url': nextURL,
                            'start': html_packed['start'],
                            'end': html_packed['end'],
                            'spider': 'Spider_bbsp2peye.crawl0',
                            'dont_filter': True,
                        })
        except Exception as e:
            self.logger.error(str(e))
        return [], seeds
Example #20
except Exception as e:
    print(e)
    driver.execute_script("window.stop()")

time.sleep(3)
driver.refresh()

try:
    WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.ID, 'page')))
except Exception:
    print('over')
    driver.execute_script("window.stop()")

html = driver.page_source

selector = etree.HTML(html)

list1 = selector.xpath('//div[contains(@class,"item-mod__trade-order")]')

try:
    for i in list1:
        order_id = i.xpath('table[1]/tbody/tr/td[1]/label/span[3]/text()')[0]  # order number
        order_time = i.xpath('table[1]/tbody/tr/td[1]/label/span[6]/text()')[0]  # order time
        price = i.xpath('table[2]/tbody/tr/td[2]/div/p/span[2]/text()')[0]  # price
        all_price = i.xpath('table[2]/tbody/tr/td[7]/div/div[1]/p/strong/span[2]/text()')[0]  # total price
        saler_title = i.xpath('table[2]/tbody/tr/td[5]/div/p[1]/a/text()')[0]  # product name
        name = i.xpath('table[2]/tbody/tr/td[5]/div/p[1]/a/text()')[0]  # buyer account name
        url = i.xpath('table[2]/tbody/tr/td[6]/div/div/p[1]/a/@href')[0]  # product detail URL
        url = 'https:' + url
        driver.get(url)
        time.sleep(3)
Example #21
# -*- coding: utf-8 -*-
# Author: Tom_Fish
import requests
from lxml import etree
headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36'
}
res = requests.get('http://kaijiang.500.com/shtml/dlt/18001.shtml',
                   headers=headers)
html = etree.HTML(res.text)
result = etree.tostring(html)
print(result)