def __init__(self, url):
    # Keep a scheme-less copy of the URL (str.lstrip strips characters, not a
    # prefix, so split on '://' instead).
    self.url = url.split('://', 1)[-1]
    self.host = urlsplit(url).netloc
    req = Request(url, headers={
        'content-type': 'text/html',
        'User-Agent': 'Mozilla/5.0'
    })
    res = urlopen(req)
    self.access = localtime()
    date = res.headers['Last-Modified']
    self.date = None if date is None else strptime(
        date, '%a, %d %b %Y %H:%M:%S %Z')
    page = res.read()
    html = HTML(page.decode(detect(page)['encoding']))
    author = html.xpath(
        '//meta[@name="author" or @property="author"][1]/@content')
    self.author = None if author == [] else author[0]
    site = html.xpath('//meta[@property="og:site_name"][1]/@content')
    self.site = None if site == [] else site[0]
    title = html.xpath('//meta[@property="og:title"][1]/@content')
    self.title = html.xpath(
        '//title[1]/text()')[0] if title == [] else title[0]
def parse_serp(self, html):
    elements = HTML(html)
    container = CSSSelector('div#isr_mc')(elements)[0]
    results = CSSSelector('div.rg_di')(container)
    for result in results:
        result_containers = CSSSelector('a.rg_l')(result)
        if not result_containers:
            continue
        result_container = result_containers[0]
        result_href = result_container.get('href')
        if not result_href:
            continue
        double_quoted_link = self.link_re.match(result_href).group(1)
        link = self.double_unquote(double_quoted_link)
        double_quoted_visible_link = self.visible_link_re.match(
            result_href).group(1)
        visible_link = self.double_unquote(double_quoted_visible_link)
        yield link, visible_link
def get_yiparts_detail(self):
    sql = 'select * from yiparts'
    results = self.db.find_all(sql)
    for res in results:
        url = 'http://www.yiparts.com/parts/{yiparts_name_en}/'.format(
            yiparts_name_en=res[2])
        print(url)
        response = self.download.get_html(url)
        doc = HTML(response)
        names = doc.xpath('//div[@id="sort2"]/div/div/a/span[2]/text()')
        name_ens = doc.xpath('//div[@id="sort2"]/div/div/a/@href')
        imgs = doc.xpath('//div[@id="sort2"]/div/div/a/span[1]/img/@src')
        for name, name_en, img in zip(names, name_ens, imgs):
            item_name = name.strip()
            item_name_en = name_en[11:-1]
            item_img = img
            sql = 'insert into yiparts_detail(pid, detail_name, detail_name_en, detail_img) VALUES ("{pid}", "{detail_name}", "{detail_name_en}", "{detail_img}")'.format(
                pid=res[0],
                detail_name=item_name,
                detail_name_en=item_name_en,
                detail_img=item_img)
            print(sql)
            self.db.save(sql)
def get_keyword(self, response):
    html = HTML(response.text)
    url_list = html.xpath('//a/@href')
    self.parse_keyword(response)
    exits_url = []
    for url in url_list:
        if re.match('^/.*?aspx$', url):
            two_url = 'http://www.aliwuxi.com' + url
        elif re.match('http://.*?aspx$', url):
            two_url = url
        else:
            continue
        if two_url in exits_url:
            continue
        else:
            exits_url.append(two_url)
        print(two_url)
        two_response = self.down.get_html(two_url)
        self.parse_keyword(two_response)
    self.kw_list = list(filter(None, self.kw_list))
    self.kw_list = list(set(self.kw_list))
    return self.kw_list
def deal(json_obj):
    # print(json.dumps(json_obj))
    for data in json_obj['pageRow']:
        if 'article_id' in data and data['article_id']:
            article_id = data['article_id']
            _type = 'perio'
        else:
            article_id = data['report_id']
            _type = 'tech'
        link = 'http://www.wanfangdata.com.cn/details/detail.do?_type={_type}&id={article_id}'.format(
            _type=_type, article_id=article_id)
        print(link)
        response = requests.get(link)
        # print(response.text)
        html = HTML(response.text)
        title = html.xpath(
            'string(//div[@class="left_con_top"]//div[@class="title"]/text())').strip()
        address_xpath_list = html.xpath(
            '//ul[@class="info"]//div[@class="info_right info_right_newline"]/a/text()')
        address_list = []
        flag = True
        for addressStr in address_xpath_list:
            addressStr = addressStr.replace(' ', '').replace(' ', '')
            # print(addressStr)
            searchRes = re.search(r'.*?(,|,)(.*?)(,|,)\d+$', addressStr)
            if searchRes:
                address = searchRes.group(2)
                if ',' in address:
                    address = address.split(',')[-1]
                save_res = title + ',' + address + '\n'
                print(save_res)
                flag = False
                with open('结果.csv', 'a', encoding='gbk') as f:
                    f.write(save_res)
        if flag:
            print('无匹配数据')
        print('暂停10秒')
        time.sleep(10)
def req_for_name(self, wechat_id):
    url = self.url.format(wechat_id)
    # response = WanDou().http_client(url=url, param=self.headers)
    resp1 = requests.get(
        url=r"http://h.wandouip.com/get/ip-list?pack=853&num=1&xy=1&type=2&lb=\r\n&mr=1&"
    )
    resp2 = resp1.json()["data"][0]
    # print(resp2)
    # resp1.close()
    time.sleep(2)
    try:
        response = requests.get(
            url=url,
            headers=self.headers,
            proxies={"http": "{}:{}".format(resp2["ip"], resp2["port"])})
    except Exception as e:
        print(1, e)
        self.logger.info("error ip: {}".format(resp2))
        time.sleep(5)
        return self.req_for_name(wechat_id)
    html = HTML(response.content.decode())
    # response.close()
    name = html.xpath('//p[@class="tit"]/a/text()')
    if name:
        # print(name)
        self.error_count = 0
        return name[0]
    else:
        self.error_count += 1
        if self.error_count == 5:
            self.logger.info("wechat id error: \"{}\"".format(wechat_id))
            return "None"
        else:
            time.sleep(2)
            # propagate the retried result instead of falling through to None
            return self.req_for_name(wechat_id)
def start():
    with open('url.txt') as f:
        results = f.readlines()
    for res in results:
        try:
            url = res.strip()
            print(url)
            response = requests.get(url)
            # print(response.text)
            html = HTML(response.text)
            comName = html.xpath(
                'string(//table[1]//tr[2]/td[2])').replace(
                    '\n', '').replace('\r', '').replace('\t', ' ').strip()
            comAddress = html.xpath(
                'string(//table[1]//tr[3]/td[2])').replace(
                    '\n', '').replace('\r', '').replace('\t', ' ').strip()
            positionName = html.xpath(
                'string(//table[2]//tr[2]/td[2])').replace(
                    '\n', '').replace('\r', '').replace('\t', ' ').strip()
            jobType = html.xpath(
                'string(//table[2]//tr[2]/td[4])').replace(
                    '\n', '').replace('\r', '').replace('\t', ' ').strip()
            zhize = html.xpath('string(//table[2]//tr[9]/td[2])').replace(
                '\n', '').replace('\r', '').replace('\t', ' ').strip()
            price = html.xpath('string(//table[2]//tr[5]/td[4])').replace(
                '\n', '').replace('\r', '').replace('\t', ' ').strip()
            save_res = (comName + '||' + comAddress + '||' + positionName +
                        '||' + jobType + '||' + zhize + '||' + price + '\n')
            save_res = save_res.replace(',', ',').replace('||', ',')
            print(save_res)
            with open('岗位信息.csv', 'a', encoding='gbk',
                      errors='ignore') as f:
                f.write(save_res)
        except:
            print('error...' + str(res))
            continue
def get_trie_data(self, product_url):
    """Fetch tyre detail data."""
    log_init().info(f'{product_url}数据请求中...')
    response = self._parse_url(product_url)
    html = HTML(response.text)
    # tyre name
    title = html.xpath('//*[@id="product_detail"]/div[2]/h1/text()')
    title = [i.strip() for i in title if i.strip()][0]
    properties = html.xpath(
        '//*[@id="product_detail"]/div[2]/div[1]/ul/li')
    # parse the tyre parameters
    reltus = {}
    for propertie in properties:
        TireBrand = propertie.xpath('.//text()')
        reltus[TireBrand[0].replace(':', '')] = TireBrand[1]
    TireBrand = self.is_null(reltus.get('轮胎品牌'))  # tyre brand
    Productspec = self.is_null(reltus.get('产品规格'))  # product specification
    Speedlevel = self.is_null(reltus.get('速度级别'))  # speed rating
    LoadIndex = self.is_null(reltus.get('载重指数'))  # load index
    ProductOrigin = self.is_null(reltus.get('产品产地'))  # place of origin
    Tyrecategory = self.is_null(reltus.get('轮胎类别'))  # tyre category
    Tirepattern = self.is_null(reltus.get('轮胎花纹'))  # tread pattern
    price = html.xpath(
        '//*[@id="product_detail"]/div[2]/div[2]/div[2]/strong/text()')[
            0]  # price
    data = [[
        title, TireBrand, Productspec, Speedlevel, LoadIndex, ProductOrigin,
        Tyrecategory, Tirepattern, price, product_url
    ]]
    log_init().info(f'{product_url}数据获取成功!')
    self.csv_save(data)
def deal(html):
    urls = html.xpath(
        '//ol[@class="article-list"]/li//a[@class="anchor article-content-title u-margin-xs-top u-margin-s-bottom"]/@href | //ol[@class="js-jl-aip-list article-list-items"]/li//a[@class="anchor article-content-title u-margin-xs-top u-margin-s-bottom"]/@href'
    )
    for url in urls:
        link = 'https://www.sciencedirect.com' + url
        print(link)
        response = requests.get(link, headers=headers)
        html = HTML(response.text)
        title = html.xpath('string(//h1/span/text())')
        keyword_list = html.xpath(
            '//div[@class="keywords-section"]/div[@class="keyword"]/span/text()'
        )
        # print(keyword_list)
        keywordStr = ','.join(keyword_list)
        qikanName = html.xpath(
            'string(//h2/a[@class="publication-title-link"]/text())')
        authors_a_list = html.xpath('//div[@id="author-group"]/a')
        authorsList = []
        for span in authors_a_list:
            nameList = span.xpath('.//span[@class="content"]//span/text()')
            nameStr = ' '.join(nameList)
            authorsList.append(nameStr)
        authorsStr = ','.join(authorsList)
        abstractList = html.xpath(
            '//p[@id="sp0010"]//text()|//p[@id="d1e631"]//text()|//div[@id="abstracts"]/div/div/p//text()'
        )
        abstract = ''.join(abstractList)
        save_res = (id + '||' + qikanName + '||' + title + '||' + authorsStr +
                    '||' + abstract + '||' + keywordStr + '||' + link)
        save_res = save_res.replace(',', ',').replace('\n', '').replace(
            '||', ',') + '\n'
        print(save_res)
        with open('sciencedirect.csv', 'a', encoding='gbk',
                  errors='ignore') as f:
            f.write(save_res)
def xpathpages(self, resp):
    """Rule-based parsing of the result table."""
    etre = HTML(resp)
    trlt = etre.xpath('//tr[1]/following-sibling::tr')
    try:
        for _ in trlt:
            item = {}
            item["name"] = "".join(_.xpath('.//td[1]//text()')).strip(" ")
            item["userID"] = "".join(
                _.xpath('.//td[2]//text()')).strip(" ")
            item["sex"] = "".join(_.xpath('.//td[3]//text()')).strip(" ")
            item["education"] = "".join(
                _.xpath('.//td[4]//text()')).strip(" ")
            item["_id"] = hashlib.md5(
                (item["name"] + item["userID"]).encode('utf-8')).hexdigest()
            GD_RC.save(item)
            self.log.info(f"数据{item['_id']}存入mongo")
    except Exception as e:
        print(e)
def get_canInfo(item, EntType):
    url = 'http://www.aepb.gov.cn:8080/WRYJG/STZXGK/STAuto_Data.aspx?NewsID={id}&zdlx={EntType}'.format(
        id=item['id'], EntType=EntType)
    response = requests.get(url, timeout=80)
    # print(response.text)
    html = HTML(response.text)
    jianceCodeList = html.xpath('//select[@id="DropPk"]/option/@value')
    jianceCodeNameList = html.xpath('//select[@id="DropPk"]/option/text()')
    jianceCodeObjList = []
    for jianceCode, jianceCodeName in zip(jianceCodeList, jianceCodeNameList):
        obj = {
            'jianceCode': jianceCode,
            'jianceCodeName': jianceCodeName,
        }
        jianceCodeObjList.append(obj)
    __VIEWSTATE = re.search('id="__VIEWSTATE" value="(.*?)"',
                            response.text).group(1)
    __VIEWSTATE = quote(__VIEWSTATE).replace('/', '%2F')
    # totalPage = int(re.search('当前第1/(\d+)页', response.text).group(1))
    # print(jianceCodeList)
    # print(__VIEWSTATE)
    return jianceCodeObjList, __VIEWSTATE
def wzws_cid_decrypt(text: Union[str, bytes]) -> str:
    """
    :param text: response text of the interstitial page that says
                 "请开启JavaScript并刷新该页" (enable JavaScript and refresh)
    :return: the redirect url; requesting it returns the wzws_cid cookie
    """
    base_url = "http://wenshu.court.gov.cn"
    custom_js = """
    window = {};
    document = {
        createElement: () => ({ style: "", appendChild: () => ({}), submit: () => ({}) }),
        body: { appendChild: obj => { window.location = obj.action } }
    };
    atob = str => Buffer.from(str, "base64").toString("binary");
    get_location = () => window.location;
    """
    html = HTML(text)
    js = html.xpath("//script/text()")[0]
    ctx = nodejs.compile(custom_js + js)
    location = ctx.call("get_location")
    redirect_url = parse.urljoin(base_url, location)
    return redirect_url
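# A minimal usage sketch (an assumption, not part of the original source):
# fetch a page from wenshu.court.gov.cn, and if the anti-bot interstitial is
# returned, run wzws_cid_decrypt() on it and follow the computed redirect so
# the session picks up the wzws_cid cookie before retrying the real request.
import requests

def fetch_with_wzws_cid(session: requests.Session, url: str) -> requests.Response:
    first = session.get(url)
    if "wzws_cid" not in session.cookies:
        # The interstitial embeds a script that builds the redirect URL.
        redirect_url = wzws_cid_decrypt(first.text)
        session.get(redirect_url)  # this response sets the wzws_cid cookie
        return session.get(url)
    return first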
def get_comment(movieId, name):
    for i in range(1, 20):  # starting page
        try:
            print('评论当前页:' + str(i))
            pageToken = i * 20
            start_url = 'https://movie.douban.com/subject/{movieId}/comments?start={pageToken}&limit=20&sort=new_score&status=P'
            url = start_url.format(movieId=movieId, pageToken=pageToken)
            response = requests.get(url, headers=headers)
            html = HTML(response.text)
            alldiv = html.xpath(
                '//div[@id="comments"]/div[@class="comment-item"]')
            # print(allText)
            for div in alldiv:
                commentId = div.xpath('string(./@data-cid)')
                commentInfo = div.xpath(
                    'string(.//span[@class="short"]/text())')
                commentAuthor = div.xpath(
                    'string(.//div[@class="avatar"]/a/@title)')
                commentVote = div.xpath('string(.//span[@class="votes"])')
                commentForMovie = name
                sql = ("insert into comments(commentId,commentInfo,commentAuthor,commentVote,commentForMovie)"
                       " VALUES ('%s', '%s', '%s', '%s', '%s')"
                       % (commentId, commentInfo, commentAuthor, commentVote, commentForMovie)
                       + " ON DUPLICATE KEY UPDATE commentVote='%s'" % (commentVote))
                print(sql)
                dbCli.save(sql)
            print('暂停10秒')
            time.sleep(10)
        except:
            print('comment,error..')
            continue
def parse_poi_detail(self, response):
    """
    Parse a tourist-attraction (POI) detail page.
    eg: https://place.qyer.com/poi/V2UJYVFkBzJTZVI9/
    """
    html = HTML(response.text)
    item = items.PoiDetailItem()
    item['raw'] = {'html': str(lzma.compress(response.body))}
    item['url'] = response.request.url
    item['id'] = response.request.meta.get('id')
    item['catename'] = response.request.meta.get('catename')
    item['head'] = utils.get_text_by_xpath(
        html, './/div[@class="qyer_head_crumb"]/span//text()')
    item['title'] = utils.get_text_by_xpath(
        html, './/div[@class="poi-largeTit"]/h1[@class="cn"]//text()')
    item['title_en'] = utils.get_text_by_xpath(
        html, './/div[@class="poi-largeTit"]/h1[@class="en"]//text()')
    item['rank'] = utils.get_text_by_xpath(
        html, './/div[@class="infos"]//ul/li[@class="rank"]/span//text()')
    item['poi_detail'] = utils.get_text_by_xpath(
        html, './/div[@class="compo-detail-info"]/div[@class="poi-detail"]//text()')
    item['poi_tips'] = utils.get_text_by_xpath(
        html, './/div[@class="compo-detail-info"]/ul[@class="poi-tips"]//text()')
    lis = html.xpath('.//div[@class="compo-detail-info"]/ul[@class="poi-tips"]/li')
    for li in lis:
        title = utils.get_text_by_xpath(li, './/span[@class="title"]/text()')
        content = utils.get_text_by_xpath(li, './/div[@class="content"]//text()')
        if '地址' in title:  # address
            item['address'] = content
        elif '到达方式' in title:  # how to get there
            item['arrive_method'] = content
        elif '开放时间' in title:  # opening hours
            item['open_time'] = content
        elif '门票' in title:  # tickets
            item['ticket'] = content
        elif '电话' in title:  # phone
            item['phone'] = content
        elif '网址' in title:  # website
            item['website'] = content
    item['poi_tip_content'] = utils.get_text_by_xpath(
        html, './/div[@class="compo-detail-info"]/div[@class="poi-tipContent"]//text()')
    yield item
async def func():
    async with contextlib.AsyncExitStack() as stack:
        if not RUN_IMMEDIATELY:
            await sleep(random() * 30 * 60)
        account_id = account['account-id']
        password = account['password']
        context, page = await new_context()
        stack.push_async_callback(context.close)
        stack.push_async_callback(page.close)
        logged = await login(page, account_id, password)
        if not logged:
            return
        html = HTML(await page.content())
        source = await get_script_source(html=html)
        if SCRIPT_SOURCE:
            if diff := '\n'.join(
                    get_diff(SCRIPT_SOURCE, source.splitlines())):
                await handle_page_changing(diff, source)
                return
        name = html.xpath("//input[@id='xm']/@value")[0]
        await page.evaluate(js_codes.submit())
        await sleep(2)
        async with page.expect_response('**/tbBcJkxx.zf') as response:
            await page.click("//button[text()='提交']")
            await sleep(5)
        response = await (await response.value).json()
        if response['status'] == 'success':
            await page.wait_for_selector("//div[text()='保存数据成功']",
                                         state='attached')
            LOGGER.warning(f'Success: {account_id} {name}')
        else:
            LOGGER.warning(
                f'Submit failed: {account_id} {name} <<<{source=}>>>')
            await notify('Submit failed', f'{account_id} {name}')
def parse(response):
    html = HTML(response.text)
    div_list = html.xpath('//div[@id="zdlist"]/div[@class="zddiv"]')
    # print(len(div_list))
    for div in div_list:
        link = div.xpath('string(.//div[@class="gsname"]/a/@href)')
        name = re.search(r'https://www.tianyancha.com/search\?key=(.*?)$',
                         link).group(1)
        # email = div.xpath('string(.//div[@class="other"]/text()[2])').replace('邮箱:','').strip()
        # phone = div.xpath('string(.//div[@class="other"]/text()[3])').replace('电话:','').strip()
        # reg = div.xpath('string(.//div[@class="other"]/text()[4])').replace('注册资本:','').strip()
        deatilText = tostring(div, encoding='utf8').decode('utf8')
        # print(deatilText)
        email = re.search('邮箱:(.*?)<br', deatilText)
        if email:
            email = email.group(1).strip()
        else:
            email = '|'
        phone = re.search('电话:(.*?)<br', deatilText)
        if phone:
            phone = phone.group(1).strip()
        else:
            phone = '|'
        reg = re.search('注册资本:(.*?)注册时间', deatilText)
        if reg:
            reg = reg.group(1).strip()
        else:
            reg = '|'
        save_res = name + '---' + email + '---' + phone + '---' + reg + '\n'
        print(save_res)
        with open('结果.txt', 'a') as f:
            f.write(save_res)
async def __get_proxies_from_sslproxies(self, session):
    urls = [
        'https://www.sslproxies.org/', 'https://www.us-proxy.org/',
        'https://free-proxy-list.net/',
        'https://free-proxy-list.net/uk-proxy.html',
        'https://free-proxy-list.net/anonymous-proxy.html'
    ]
    idx = 0
    proxies = self.get_https_proxy()
    for url in urls:
        i = 5
        while i > 0:
            await asyncio.sleep(3)
            try:
                if len(proxies) <= idx:
                    idx = 0
                res = await session.get(
                    url,
                    proxy='' if len(proxies) == 0 else proxies[idx],
                    timeout=10)
                html = HTML(await res.text())
                addresses = html.xpath(
                    '//*[@id="raw"]/div/div/div[2]/textarea/text()'
                )[0].split('\n')[3:]
                for adr in addresses:
                    await self.put_proxy('http://' + adr, 'sslproxies')
                break
            except Exception:
                i -= 1
                if idx + 1 > len(proxies):
                    proxies = self.get_https_proxy()
                idx += 1
                if idx >= len(proxies):
                    # wrap the proxy index back to the start of the list
                    idx = 0
                logger.exception(f"Parse {url} Fail")
                await asyncio.sleep(1)
def _get_data(self, html):
    html = HTML(html)
    li_list = html.xpath('//ul[@class="list05"]/li')
    data = dict()
    item_list = list()
    # synchronous version
    # for li in li_list:
    #     item = self._parse_detail(li)
    #     item_list.append(item)
    #     time.sleep(3)
    # asynchronous version using the coroutine pool
    coroutine_list = [
        self.pool.spawn(self._parse_detail, li) for li in li_list
    ]
    gevent.joinall(coroutine_list)
    for coroutine in coroutine_list:
        item_list.append(coroutine.value)
    data['item_list'] = item_list
    data['next_url'] = html.xpath(
        '//form[@name="pageform"]/div/a[@class="xyy"]/@href').pop()
    return data
def main(wf):
    parse = argparse.ArgumentParser()
    parse.add_argument('--app', dest='app')
    parse.add_argument('query', nargs='*', default=None)
    args = parse.parse_args()
    query = args.query[0]
    log.warn(query)
    if query:
        id = query.rsplit('/', 1)[-1].split('.')[0]
        url = 'http://soft.macx.cn/downloado.do?softid={}&cpus=2&urls=3'.format(
            id)
        r = web.get(url)
        r.raise_for_status()
        a = r.text
        node = HTML(a).find('.//a[@rel="facebox"][last()]')
        log.info(node.text)
        open = ['open']
        if args.app:
            open.extend(['-a', args.app])
        if node is not None and node.text == '浏览器直接下载':
            open.append(node.get('href'))
        else:
            open.append(url)
        call(open)
def get_weather(url, weather):
    """
    Fetch the weather.

    :param url: page URL.
    :param weather: object that stores the weather information.
    """
    while True:
        try:
            response = requests.get(
                url,
                headers={
                    'Host': 'www.weather.com.cn',
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                                  'Chrome/55.0.2883.87 Safari/537.36'
                },
                timeout=7)
        except Exception as e:
            print('【{}】抓取天气失败,失败信息:{},尝试重新加载……'.format(
                time.strftime('%Y-%m-%d %H:%M:%S'), e))
            continue
        if 200 == response.status_code:
            break
        print('【{}】抓取天气失败,状态码:{},尝试重新加载……'.format(
            time.strftime('%Y-%m-%d %H:%M:%S'), response.status_code))
    li = HTML(response.content.decode('utf8')).xpath(
        '//*[@id="today"]/div[1]/ul/li[1]')[0]
    weather['date'] = li.xpath('h1/text()')[0]
    weather['weather'] = li.xpath('p[1]/text()')[0]
    weather['temp'] = li.xpath('p[2]/span/text()')[0].replace('-', '—')
    weather['wind'] = li.xpath('p[3]/span/@title')[0]
    weather['wind_speed'] = li.xpath('p[3]/span/text()')[0].replace('-', '~')
    sun = li.xpath('p[last()]/span/text()')[0]
    weather['sun'] = sun[sun.find(' ') + 1:]
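# A minimal usage sketch for get_weather (an assumption, not from the original
# source): `weather` is just a plain dict that the function fills in place, and
# the city-page URL below (101010100, Beijing) is only an illustrative example.
if __name__ == '__main__':
    weather = {}
    get_weather('http://www.weather.com.cn/weather1d/101010100.shtml', weather)
    print(weather['date'], weather['weather'], weather['temp'], weather['wind'])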
def barcode(self, value, code='Code128', drawOpts=None, htmlAttrs=None):
    """
    Generate a <img /> tag with embedded barcode

    Params:
        - value: barcode value, must be valid for barcode type
        - code: barcode type, as per reportlab.graphics.barcode.getCodes()
        - drawOpts: options for the reportlab barcode
        - htmlAttrs: attributes for <img /> tag
    """
    drawOpts = (drawOpts or {})
    imgtype = drawOpts.pop('format', 'png')
    attrs = (htmlAttrs or {})
    drawOpts['value'] = value
    for k in ('width', 'height'):
        # Attempt to unify drawing and image sizes to prevent accidental
        # scaling, and reduce parameter duplication
        if k in drawOpts and k not in attrs:
            attrs[k] = "{0}px".format(drawOpts[k])
        elif k in attrs and k not in drawOpts:
            # reportlab expects a float
            value = str(attrs[k])
            if value.endswith("px"):
                value = value[:-2].strip()
            try:
                value = float(value)
            except ValueError:
                # Ignore values that we can't handle
                pass
            else:
                drawOpts[k] = value
    data = createBarcodeDrawing(code, **drawOpts).asString(imgtype)
    attrs['src'] = "data:image/{1};base64,{0}".format(
        data.encode('base64'),
        imgtype,
    )
    return HTML(Element('img', attrs))
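# A standalone sketch of the same idea as barcode() above (an assumption, not
# the original helper): render a barcode with reportlab and embed it in an
# <img /> tag as a base64 data: URI, using Python 3's base64 module instead of
# the Python 2 str.encode('base64') call.
import base64
from reportlab.graphics.barcode import createBarcodeDrawing

def barcode_data_uri(value, code='Code128', imgtype='png', **drawOpts):
    # createBarcodeDrawing() builds a Drawing; asString() rasterises it to bytes.
    data = createBarcodeDrawing(code, value=value, **drawOpts).asString(imgtype)
    return "data:image/{0};base64,{1}".format(
        imgtype, base64.b64encode(data).decode('ascii'))

# e.g. '<img src="%s" alt="order barcode" />' % barcode_data_uri('0123456789')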
def getFinalResult(self):
    try:
        self.resp.append(httpx.post(
            self.url, data=self.data, cookies=self.cookies))
    except:
        self.result = self.errmsg
        return None
    self.doc.append(HTML(self.resp[2].text))
    try:
        self.finalResult = self.doc[2].xpath(
            '//*[@id="lableft"]/text()')[0]
        df = self.findPat("账户余额:.*元", 5, -1, float)
        du = self.findPat("剩余电量:.*度", 5, -1, float)
        result = {
            "账户余额": df,
            "剩余电量": du
        }
        return result
    except IndexError:
        result = {
            "账户余额": -1.0,
            "剩余电量": -1.0
        }
        return result
def start():
    url = 'http://wx.jd120.com/HqReg-Register.action?code=023Xrq670fPK7F1153970JXj670Xrq6d&state=gh'
    response = requests.get(url)
    # print(response.text)
    html = HTML(response.text)
    urls = html.xpath('//div[@id="appointRegTabContent"]/div/ul/li/a/@href')
    titles = html.xpath('//div[@id="appointRegTabContent"]/div/ul/li/a/text()')
    # print(len(urls))
    # print(len(titles))
    item_list = []
    for url, title in zip(urls, titles):
        link = 'http://wx.jd120.com/' + url
        catName = title.strip()
        # print(link, title)
        obj = {
            'url': link,
            'catName': catName,
        }
        item_list.append(obj)
    # date_list = []
    # for i in range(10):
    #     addTime = i * 3600 * 24
    #     userTime = time.strftime('%Y-%m-%d', time.localtime(time.time() + addTime))
    #     date_list.append(userTime)
    # print(date_list)
    # for date in date_list:
    #     with open(date + '.csv', 'w', encoding='gbk') as f:
    #         pass
    #     print(date)
    allObj_list = []
    for item in item_list:
        url = item['url']
        catName = item['catName']
        print(url, catName)
        try:
            response = requests.get(url, timeout=15)
        except:
            print('请求失败')
            continue
        html = HTML(response.text)
        # print(response.text)
        td_list = html.xpath('//table[@class="table appoint-table"]//tr//td')
        for td in td_list:
            hrefValue_list = td.xpath('.//a/@href')
            # print(hrefValue_list)
            if len(hrefValue_list) >= 1:
                num = 1
                for hrefValue in hrefValue_list:
                    # print(hrefValue)
                    searchRes = re.search(
                        r'/HqReg-select_time.action\?workSessionId=.*?&dateId=(.*?)&.*?doctorId=(.*?)$',
                        hrefValue)
                    if searchRes:
                        if searchRes.group(2) != '':
                            dateName = searchRes.group(1)
                            doctorName = td.xpath(
                                'string(.//a[' + str(num) + ']/span/text())'
                            ).replace('\n', '').replace('\t', '').replace(
                                '\r', '').strip()
                            print(dateName, doctorName)
                            obj = {
                                'dateName': dateName,
                                'doctorName': doctorName,
                                'catName': catName,
                            }
                            allObj_list.append(obj)
                    num += 1
    saveFileDate_list = []
    for obj in allObj_list:
        if obj['dateName'] not in saveFileDate_list:
            with open(obj['dateName'] + '.csv', 'w', encoding='gbk') as f:
                save_res = obj['catName'] + ',' + obj['doctorName'] + '\n'
                f.write(save_res)
            saveFileDate_list.append(obj['dateName'])
        else:
            with open(obj['dateName'] + '.csv', 'a', encoding='gbk') as f:
                save_res = obj['catName'] + ',' + obj['doctorName'] + '\n'
                f.write(save_res)
def run(self):
    count = DB.find({"qcc_supplement": 0}).count()
    cookie_count = 0
    while count:
        # initialise the driver
        self.borser = self.chrome_driver()
        # minimise the window
        self.borser.minimize_window()
        # clear cache
        # self.borser.delete_all_cookies()
        mogodata = DB.find_one({"qcc_supplement": 0})
        # company_key = "佳木斯益隆煤矿机械制造有限公司"
        company_key = mogodata["companyName"]
        self.borser.get(url="https://www.qichacha.com/")
        self.borser.find_element_by_xpath(
            "//*[@id='index']/preceding-sibling::input").send_keys(
                company_key)
        self.borser.find_element_by_id("V3_Search_bt").click()
        action = ActionChains(self.borser)
        if "您的操作过于频繁,验证后再操作" in self.borser.page_source:
            self.czpf(action)
        elif "法人或股东" not in self.borser.page_source:
            self.smdl(action)
        cookie = self.borser.get_cookies()
        print(cookie)
        cookies = ""
        for items in cookie:
            jioncook = items["name"] + "=" + items["value"] + "; "
            cookies += jioncook
        print(cookies)
        time.sleep(2)
        HTMLTEXT = self.borser.page_source
        etre = HTML(HTMLTEXT)
        info_parmas = etre.xpath(
            '//*[contains(@id,"search-result")]//td[contains(@class,"imgtd")]/following-sibling::*[1]/a/@onclick'
        )
        # desktop (PC) page
        company_infos = etre.xpath(
            '//*[contains(@id,"search-result")]//td[contains(@class,"imgtd")]/following-sibling::*[1]/a/@href'
        )
        self.f.session.headers.update({
            "Cookie": cookies,
        })
        self.proxy = self.get_pros()
        item = mogodata
        for _ in range(len(info_parmas)):
            fol_result = (
                info_parmas[_].split("addSearchIndex")[1]).replace(
                    '(', '').replace(')', '').replace("'", '').replace(";", '')
            if mogodata["companyName"] == fol_result.split(',')[0]:
                fol_results = fol_result.split(',')
                company_info_url = company_infos[_]
                data = {
                    "search_key": fol_results[0],
                    "search_index": fol_results[1],
                    "search_url": '',
                    "company_name": fol_results[2],
                    "type": fol_results[-1],
                }
                # basic company information
                company_info_urls = self.server_auth(data, company_info_url)
                html_text = self.company_info_req(company_info_urls)
                self.company_info_parse(html_text, item, company_info_urls)
        self.borser.close()
        count -= 1
def parse_page(url):
    response = fetch(url)
    selector = HTML(response.text)
    href = selector.xpath(
        '//div[@class="list-article list-short"]/ul/li/a/@href')
    return [urljoin(BASE_URL, url) for url in href]
url = 'https://twitter.com/search?f=tweets&vertical=default&q=%23CIIE&src=typd'
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--proxy-server=' + '127.0.0.1:1087')
prefs = {"profile.managed_default_content_settings.images": 2}
chrome_options.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome(chrome_options=chrome_options)
driver.get(url)
time.sleep(3)
while True:
    try:
        driver.execute_script("window.scrollBy(0,5000)")
        time.sleep(5)
        # print(driver.page_source)
        html = HTML(driver.page_source)
        results = html.xpath('//div[@class="stream"]/ol/li')
        for res in results:
            try:
                detail_html_text = etree.tostring(res)
                detail_html = HTML(detail_html_text.decode())
                content_list = detail_html.xpath(
                    '//p[@class="TweetTextSize js-tweet-text tweet-text"]//text()'
                )
                content = ''.join(content_list).replace('\n', '').replace(
                    '\r', '').replace('\t', '').replace(',', ',').strip()
                commentCount = detail_html.xpath(
                    'string(//div[@class="ProfileTweet-action ProfileTweet-action--reply"]//span[@class="ProfileTweet-actionCountForPresentation"]//text())'
                )
                shareCount = detail_html.xpath(
                    'string(//div[@class="ProfileTweet-action ProfileTweet-action--retweet js-toggleState js-toggleRt"]//span[@class="ProfileTweet-actionCountForPresentation"]//text())'
                )
def parse_item(self, response):
    data_json = json.loads(response.body)
    if 'cards' in data_json.keys():
        for item in data_json['cards']:
            category = response.meta['category']
            title = item['item']['title']
            pic_url = item['item']['displayImages'][0][
                'urlTemplate'].replace(
                    'w=#{width}&h=#{height}&quality=#{quality}', '')
            describe = item['item']['trailText']
            app_name = '英国卫报'
            try:
                selector = HTML(item['item']['body'])
            except:
                return
            content = selector.xpath('//text()')
            content = ''.join(content)
            content = content.replace('\t', '').replace('\n', '').replace('\r', '')
            publishedDate = item['item']['webPublicationDate'].replace(
                'T', ' ').replace('Z', '')
            author = item['item']['byline']
            crawlTime = time.strftime("%Y-%m-%d %H:%M:%S",
                                      time.localtime(time.time()))
            home_url = response.url
            url = 'https://www.theguardian.com/' + item['item']['id']
            pic_more_url = []
            for pic in item['item']['bodyImages']:
                pic_more_url.append(pic['urlTemplate'].replace(
                    'w=#{width}&h=#{height}&quality=#{quality}', ''))
            print "app名称", app_name
            print "主图片url", pic_url
            print "子图片url", pic_more_url
            print "作者", author
            print "详情页地址", url
            print "所属类型", category
            print "标题", title
            print "描述", describe
            print "内容", content
            print "主url", home_url
            print "发布时间", publishedDate
            print "爬取时间", crawlTime
            print '\n\n'
            item = NewsItem()
            item['app_name'] = app_name
            item['pic_url'] = pic_url
            item['pic_more_url'] = pic_more_url
            item['author'] = author
            item['url'] = url
            item['category'] = category
            item['title'] = title
            item['describe'] = describe
            item['content'] = content
            item['home_url'] = home_url
            item['publishedDate'] = publishedDate
            item['crawlTime'] = crawlTime
            timeArray = time.strptime(publishedDate, "%Y-%m-%d %H:%M:%S")
            timenum = int(time.mktime(timeArray))
            if timenum >= self.timeStamp:
                self.count += 1
                item['count'] = self.count
                publishedDate = time.strftime(
                    "%Y-%m-%d %H:%M:%S", time.localtime(float(timenum)))
                item['publishedDate'] = publishedDate
                yield item
def parseData(urlList):
    urlW = open("/usr/product/qujiaozhi/url.txt", 'a')
    for u in urlList:
        url = u.get("href").strip()
        print url
        urlW.write(url)
        urlW.write("\n")
        h = HTML(getHtml(url).decode('gbk'))
        try:
            dTxt = h.xpath('//h3')
            name = dTxt[0].text.strip().split()[0] + " " + dTxt[0].text.strip().split()[1]  # name
            brand = dTxt[0].text.strip().split()[0]  # brand
        except Exception:
            errorTxt.write(url)
        # print brand
        # print name
        try:
            pCpgg = h.xpath('//p[@class="pCpgg"]')
            td = h.xpath('//td[@class="td2"]')
        except Exception:
            errorTxt.write(url)
        try:
            if td:
                price = list(td[0].itertext())[1].strip()
            else:
                price = list(pCpgg[0].itertext())[1].strip()  # price
            # print price
        except Exception:
            errorTxt.write(url)
        try:
            norms = list(pCpgg[-1].itertext())[1].strip()  # specification
            # print norms
        except Exception:
            errorTxt.write(url)
        try:
            spePs = h.xpath('//p[@class="speP"]/a')
            effect = ''
            for speP in spePs:
                effect += speP.text.strip() + " "  # effects
            # print effect
        except Exception:
            errorTxt.write(url)
        try:
            awrap = h.xpath('//div[@class="Awrap"]/ul/li/a')
            imgUrl = awrap[0].find("img").attrib.get("src")  # image URL
            # print imgUrl
        except Exception:
            errorTxt.write(url)
        try:
            troCon = h.xpath('//div[@class="troCon"]')
            des = list(troCon[0].itertext())
            description = ''
            for d in des:
                if len(d.strip()) > 20:
                    description += d.strip() + ""  # product description
            # print description
        except Exception:
            errorTxt.write(url)
        try:
            dTxt = h.xpath('//div[@class="dTxt"]/p/a')
            series = dTxt[1].text.strip()  # series
        except Exception:
            errorTxt.write(url)
        insertData(name, brand, price, norms, effect, imgUrl, description, series)
    db.set_character_set('utf8')
    cursor.execute('SET NAMES utf8;')
    cursor.execute('SET CHARACTER SET utf8;')
    cursor.execute('SET character_set_connection=utf8;')
    cursor.execute(sql)
    db.commit()
except MySQLdb.Error, e:
    print "Mysql Error %d: %s" % (e.args[0], e.args[1])
cursor.close()
db.close()


# urlHtml=getHtml("http://cosme.pclady.com.cn/products_list/br0_bs0_bi1_sm12_ef0_pb0_pe0_or0.html")
for i in range(58, 59):
    i = str(i)
    print i
    htmls = "http://cosme.pclady.com.cn/products_list/br0_bs0_bi1_sm12_ef0_pb0_pe0_or0_p" + i + ".html#productList"
    urlHtml = getHtml(htmls)
    try:
        html = HTML(urlHtml.decode('gbk'))
        urlList = html.xpath('//div[@class="dList"]/ul/li/i[@class="iPic"]/a')
        parseData(urlList)
    except Exception:
        errorTxt.write("\n")
        errorTxt.write(i)
        errorTxt.write("\n")
        continue
# html= HTML(urlHtml.decode('gbk'))
# urlList=html.xpath('//div[@class="dList"]/ul/li/i[@class="iPic"]/a')
# parseData(urlList)
async def start():
    writeList = [
        '时间', '榜单类型', '姓名', '总分数', '阅读人数', '阅读人数得分', '阅读人数排名', '互动数',
        '互动数得分', '互动数排名', '社会影响力', '社会影响力得分', '社会影响力排名', '爱慕值', '爱慕值得分',
        '爱慕值排名', '正能量', '正能量得分', '正能量排名', '搜索量', '搜索量得分', '搜索量排名', '提及量',
        '提及量得分', '提及量排名', '阅读数', '阅读数得分', '阅读数排名'
    ]
    # with open('微博数据.csv', 'w', encoding='gbk') as f:
    #     f.write('时间,榜单类型,姓名,总分数,阅读人数,阅读人数得分,阅读人数排名,互动数,互动数得分,互动数排名,社会影响力,社会影响力得分,社会影响力排名,爱慕值,爱慕值得分,爱慕值排名,正能量,正能量得分,正能量排名,搜索量,搜索量得分,搜索量排名,提及量,提及量得分,提及量排名,阅读数,阅读数得分,阅读数排名\n')
    date_list = get_date()
    print(date_list)
    url = "http://chart.weibo.com/aj/ranklist"
    rank_type_list = ['5', '3', '6']
    item_list = []
    for rank_type in rank_type_list:
        for dateObj in date_list:
            date = dateObj['date']
            period = dateObj['period']
            for pageToken in range(1, 5):
                # payload = "time_type={date}&rank_type={rank_type}&version=v1&_t=0"
                payload = "datatype=&page={pageToken}&pagesize=25&rank_type={rank_type}&time_type={date}&period={period}&version=v1&_t=0"
                data = payload.format(pageToken=pageToken,
                                      date=date,
                                      rank_type=rank_type,
                                      period=period)
                # data = 'date=2019%2f1%2f1&type=realTimeHotSearchList'
                print(data)
                try:
                    response = requests.request("POST",
                                                url,
                                                data=data,
                                                headers=headers,
                                                verify=False)
                    print(response.text)
                    json_obj = json.loads(response.text)
                    html = HTML(json_obj['data'])
                except:
                    print('errors..' + date + ',' + str(pageToken) + ',' +
                          rank_type + '\n')
                    with open('errors.txt', 'a') as f:
                        f.write('errors..' + date + ',' + str(pageToken) +
                                ',' + rank_type + '\n')
                    continue
                div_list = html.xpath(
                    '//div[@class="sr_ranking_type clearfix"]')
                for div in div_list:
                    if rank_type == '5':
                        bangdanType = '内地榜'
                    elif rank_type == '3':
                        bangdanType = '港澳台榜'
                    else:
                        bangdanType = '新星榜'
                    name = div.xpath(
                        'string(.//div[@class="sr_name S_func1"]/a/text())'
                    ).strip()
                    zongfenshu = div.xpath(
                        'string(.//div[@class="sr_text W_f16"]/span/b/text())')
                    len_li = div.xpath('.//ul/li')
                    item = {}
                    item['时间'] = date
                    item['榜单类型'] = bangdanType
                    item['姓名'] = name
                    item['总分数'] = zongfenshu
                    for liNum in range(1, len(len_li) + 1):
                        spanName = div.xpath(
                            'string(.//ul/li[' + str(liNum) +
                            ']//div[@class="propor sr_fl"]/span[@class="pro_txt"]/text())'
                        ).replace(':', '').strip()
                        if spanName == '互动量':
                            spanName = '互动数'
                        spanNameValue = div.xpath(
                            'string(.//ul/li[' + str(liNum) +
                            ']//div[@class="propor sr_fl"]/span[@class="pro_num"]/text())'
                        )
                        spanNamedefen = div.xpath(
                            'string(.//ul/li[' + str(liNum) +
                            ']//div[@class="civi score sr_fl"]/span/i[@class="ci_num"]/text())'
                        )
                        spanNamepaiming = div.xpath(
                            'string(.//ul/li[' + str(liNum) +
                            ']//div[@class="civi sr_fl"]/span/i[@class="ci_num"]/text())'
                        )
                        # print(spanName)
                        spanNamedefenName = spanName + '得分'
                        spanNamepaimingName = spanName + '排名'
                        item[spanName] = spanNameValue
                        item[spanNamedefenName] = spanNamedefen
                        item[spanNamepaimingName] = spanNamepaiming
                    # print(item)
                    for key in writeList:
                        if key not in item.keys():
                            item[key] = ''
                    item_list.append(item)
                    print(item)
                    # yuedurenshu = div.xpath('string(.//ul/li[1]//div[@class="propor sr_fl"]/span[@class="pro_num"]/text())')
                    #
                    # hudongshu = div.xpath('string(.//ul/li[2]//div[@class="propor sr_fl"]/span[@class="pro_num"]/text())')
                    # hudongshudefen = div.xpath('string(.//ul/li[2]//div[@class="civi score sr_fl"]/span/i[@class="ci_num"]/text())')
                    # hudongshupaiming = div.xpath('string(.//ul/li[2]//div[@class="civi sr_fl"]/span/i[@class="ci_num"]/text())')
                    #
                    # shehui = div.xpath('string(.//ul/li[3]//div[@class="propor sr_fl"]/span[@class="pro_num"]/text())')
                    # shehuidefen = div.xpath('string(.//ul/li[3]//div[@class="civi score sr_fl"]/span/i[@class="ci_num"]/text())')
                    # shehuipaiming = div.xpath('string(.//ul/li[3]//div[@class="civi sr_fl"]/span/i[@class="ci_num"]/text())')
                    #
                    # aiamu = div.xpath('string(.//ul/li[4]//div[@class="propor sr_fl"]/span[@class="pro_num"]/text())')
                    # aiamudefen = div.xpath('string(.//ul/li[4]//div[@class="civi score sr_fl"]/span/i[@class="ci_num"]/text())')
                    # aiamupaiming = div.xpath('string(.//ul/li[4]//div[@class="civi sr_fl"]/span/i[@class="ci_num"]/text())')
                    #
                    # zhengnengliang = div.xpath('string(.//ul/li[5]//div[@class="propor sr_fl"]/span[@class="pro_num"]/text())')
                    # zhengnengliangdefen = div.xpath('string(.//ul/li[5]//div[@class="civi score sr_fl"]/span/i[@class="ci_num"]/text())')
                    # zhengnengliangpaiming = div.xpath('string(.//ul/li[5]//div[@class="civi sr_fl"]/span/i[@class="ci_num"]/text())')
                    #
                    # save_res = date+','+bangdanType+','+name+','+zongfenshu+','+yuedurenshu+','+yuedurenshudefen+','+yuedurenshupaiming+','+hudongshu+','+hudongshudefen+','+hudongshupaiming+','+shehui+','+shehuidefen+','+shehuipaiming+','+aiamu+','+aiamudefen+','+aiamupaiming+','+zhengnengliang+','+zhengnengliangdefen+','+zhengnengliangpaiming+'\n'
                    # print(save_res)
                    # with open('微博数据.csv','a', encoding='gbk',errors='ignore') as f:
                    #     f.write(save_res)
                time.sleep(5)
    mongo_config = WriterConfig.WXLSXConfig('结果.xlsx', headers=writeList)
    with ProcessFactory.create_writer(mongo_config) as mongo_writer:
        mongo_writer.write(item_list)