def parse(city_url):  # parse one city's weather page
    response = requests.get(city_url)
    response.encoding = 'utf-8'
    html = etree.HTML(response.text)
    # everything below is extracted with xpath
    current_city = html.xpath("//div[@class='search_default']/em/text()")[0]
    print('当前城市:' + current_city)
    current_kongqi = html.xpath(
        "//div[@class='left']/div[@class='wea_alert clearfix']/ul/li/a/em/text()")[0]
    print('空气质量:' + current_kongqi)
    current_wendu = html.xpath(
        "//div[@class='left']/div[@class='wea_weather clearfix']/em/text()")[0]
    print('当前温度:' + current_wendu + '℃')
    current_weather = html.xpath("//div[@class='wea_weather clearfix']/b/text()")[0]
    print('天气状况:' + current_weather)
    current_shidu = html.xpath(
        "//div[@class='left']/div[@class='wea_about clearfix']/span/text()")[0]
    print('当前湿度:' + current_shidu)
    current_fengji = html.xpath(
        "//div[@class='left']/div[@class='wea_about clearfix']/em/text()")[0]
    print('当前风速:' + current_fengji)
    jingdian = html.xpath(
        "//div[@class='right']/div[@class='near'][2]/div[@class='item clearfix']/ul/li/a/text()")
    print('附近景点:')
    for j in jingdian:
        print('\t\t' + j)
    # return current_city, current_kongqi, current_wendu, current_weather, current_shidu, current_fengji

    # def rewrite(city, kongqi, wendu, weather, shidu, fengji):
    #     # if not os.path.exists(weather.csv):  # create the file if it does not exist yet
    #     # else:
    #     with open('te8578978888888888st.txt', 'w') as f:
    #         f.write('hello, python')
    #     print("数据写入完成")

    with open(path + "\\" + city_name + 'weather.csv', 'a') as f:
        # f.write(current_city, current_shidu)
        data1 = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        time1 = time.strftime('%H:%M:%S', time.localtime(time.time()))
        writer = csv.writer(f)
        # write the header row first
        # writer.writerow(["index", "csv_1", "csv_2"])
        # use writerows for multiple rows
        try:
            # f.write('hello, python')
            writer.writerows([[data1, current_city[:3], current_wendu, current_shidu,
                               current_weather, current_fengji]])
            print("csv数据写入完成\n{},{}{}{}{}{}{}".format(
                data1, time1, current_city[:3], current_wendu, current_shidu,
                current_weather, current_fengji))
            print('等待正在运行....')
        except:
            print('数据保存错误')
            # writer.writerows([[data1, time1, current_city[:3], current_wendu, current_shidu, current_weather, current_fengji]])
def parseHtml(file):
    print("解析HTML")
    html = etree.HTML(file)
    tag3 = html.xpath('/html/tr/td[1]/text()')
    print(tag3)
def parse_html(html, xpath):
    element = etree.HTML(html)
    return element.xpath(xpath)
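# A minimal usage sketch for parse_html above; the URL and the XPath are
# illustrative placeholders, not taken from any of the original scripts.
import requests

page = requests.get('http://example.com').text
titles = parse_html(page, '//title/text()')  # returns a list of matching nodes
print(titles)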
import urllib3
import csv
import requests
from collections import namedtuple
from lxml.html import etree

TWSE_EQUITIES_URL = 'http://isin.twse.com.tw/isin/C_public.jsp?strMode=2'
TPEX_EQUITIES_URL = 'http://isin.twse.com.tw/isin/C_public.jsp?strMode=4'

ROW = namedtuple(
    'Row', ['type', 'code', 'name', 'ISIN', 'start', 'market', 'group', 'CFI'])


def fetch_data(url):
    r = requests.get(url)
    return r


if __name__ == '__main__':
    data = fetch_data(TWSE_EQUITIES_URL)
    root = etree.HTML(data.text)
    trs = root.xpath('//tr')[1:]
def parse(self, current_city_url, html, city_name):
    file_name = f'租房{self.today_str}/{city_name}{self.today_str}房天下租房.xlsx'
    if not os.path.exists(file_name):
        wb = openpyxl.Workbook()
        ws = wb.worksheets[0]
        self.save_to_excel(ws, 0, self.excel_head)
        wb.save(file_name)
    wb = openpyxl.load_workbook(file_name)
    ws = wb.worksheets[0]
    next_url = True
    row_count = 1
    while next_url:
        html_eles = etree.HTML(html)
        # grab the "next page" link
        next_url = html_eles.xpath('//a[text()="下一页"]/@href')
        next_url = current_city_url + next_url[0][1:] if next_url else None
        # all rental entries under the houseList container
        house_eles = html_eles.xpath('//div[@class="houseList"]/dl')
        # walk every house entry and pull its fields
        for house_ele in house_eles:
            # house id (used for de-duplication)
            house_id = house_ele.xpath('./dd/p[1]/a/@href')
            if house_id:
                # if the image count is missing the entry is an ad, so the except branch skips it
                try:
                    house_id = house_id[0].split('/')[-1].split('.')[0]  # shortened id
                    # number of pictures
                    tupian = house_ele.xpath('.//span[@class="iconImg"]/text()')[0]
                    # price
                    price = house_ele.xpath('.//span[@class="price"]/text()')[0]
                    # renttype/shiting/mianji/chaoxiang
                    main_info = [
                        re.sub('\r|\n| |', '', field).replace('�O', '㎡')
                        for field in house_ele.xpath('./dd/p[2]//text()')
                        if field != '|'
                    ]
                    if len(main_info) != 4:
                        continue
                    renttype = main_info[0]
                    shiting = main_info[1]
                    mianji = main_info[2]
                    chaoxiang = main_info[3]
                    # district, street and estate name
                    position_info = [
                        field for field in house_ele.xpath('./dd/p[3]/a/span/text()')
                    ]
                    if len(position_info) != 3:
                        continue
                    xiaqu = position_info[0]
                    jiedao = position_info[1]
                    xiaoqu = position_info[2]
                    jiaotong = ''.join(
                        house_ele.xpath('.//span[@class="note subInfor"]//text()'))
                    jiaotong = jiaotong if jiaotong else '无'
                except:
                    pass
                else:
                    if row_count > 3000:
                        wb.save(file_name)
                        return
                    if house_id not in self.quchong[city_name]:
                        # print(house_id, tupian, price, renttype, shiting, mianji, chaoxiang, xiaqu, jiedao, xiaoqu, jiaotong)
                        print(f'正在爬取:{city_name}-->第{row_count}条租房信息')
                        # save the row
                        self.save_to_excel(ws, row_count, [
                            self.today_str, city_name, tupian, price, renttype,
                            shiting, mianji, chaoxiang, xiaqu, jiedao, xiaoqu,
                            jiaotong,
                        ])
                        row_count += 1
                        # remember the id so the same house is not scraped again
                        self.quchong[city_name].append(house_id)
                    else:
                        print('已存在')
        if next_url:
            html = self.get_html(next_url)
    wb.save(file_name)
def query_title(lession_id):
    url = "https://edu.51cto.com/center/course/lesson/index?type=wejob&id=" + lession_id
    html = etree.HTML(requests.get(url, headers=headers).text)
    title = html.xpath('//title')[0].text.replace(" ", "-")[:-8]
    return title
def get_parse(url):
    content = requests.get(url, headers=useragentutil.get_headers()).content
    parse = etree.HTML(content)
    return parse
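# A short hedged usage sketch for get_parse above. useragentutil is the
# project's own helper module, so this assumes it (and get_parse) are importable;
# the URL is a placeholder.
parse = get_parse('http://example.com')
links = parse.xpath('//a/@href')  # the returned element tree supports xpath directly
print(len(links), 'links found')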
def url_to_md_txt_hexo(url, tap):
    try:
        url = f'https://www.cnblogs.com/{url}'
        response = requests.get(url)
        # print(response.text)
        a = re.findall(
            '<div id="cnblogs_post_body" class="blogpost-body.*?">(.*?)<div id="MySignature"></div>',
            response.text, re.S)
        if not a:
            response_dome = BeautifulSoup(response.text, 'html.parser')
            response_dome_str = str(response_dome.div)
            a = re.findall(
                '<div class="postBody">(.*?)<div id="MySignature"></div>',
                response_dome_str, re.S)
        a = a[0]
        # strip leading/trailing whitespace
        a = a.strip()
        # drop the trailing </div>
        a = a[:-6]
        # strip whitespace once more
        a = a.strip()
        # headings: turn <h1>..<h6> into markdown headings
        a = re.sub('<h1>.*?\d*\. (?P<name>.*?)</h1>', '<h1>\g<name>\n\n</h1>', a)
        a = re.sub('<h1.*?>', '# ', a)
        a = re.sub('<h2>.*?\d*\.\d* (?P<name>.*?)</h2>', '<h2>\g<name>\n\n</h2>', a)
        a = re.sub('<h2.*?>', '## ', a)
        a = re.sub('<h3>.*?\d*\.\d*\.\d* (?P<name>.*?)</h3>', '<h3>\g<name>\n\n</h3>', a)
        a = re.sub('<h3.*?>', '### ', a)
        a = re.sub('<h4>.*?\d*\.\d*\.\d*\.\d* (?P<name>.*?)</h4>', '<h4>\g<name>\n\n</h4>', a)
        a = re.sub('<h4.*?>', '#### ', a)
        a = re.sub('<h5>.*?\d*\.\d*\.\d*\.\d*\.\d* (?P<name>.*?)</h5>', '<h5>\g<name>\n\n</h5>', a)
        a = re.sub('<h5.*?>', '##### ', a)
        a = re.sub('<h6>.*?\d*\.\d*\.\d*\.\d*\.\d*\.\d* (?P<name>.*?)</h6>', '<h6>\g<name>\n\n</h6>', a)
        a = re.sub('<h6.*?>', '###### ', a)
        a = re.sub('</h1>|</h2>|</h3>|</h4>|</h5>|</h6>|', "", a)
        # print(a)
        # fenced code blocks (triple backticks)
        if '<pre class=' in a:
            a = re.sub('<pre class="', '```', a)
            a = re.sub('"><code>', '\n', a)
        a = re.sub('<pre><code.*?>', '```\n', a)
        a = re.sub('</code></pre>', '\n```', a)
        # the other markup style cnblogs uses for code blocks
        a = re.sub('<div class="cnblogs_code".*?>', '```python', a)
        a = re.sub('</div>', '```', a)
        # inline code (single backtick)
        a = re.sub('<code.*?>|</code>', '`', a)
        # remaining tags
        # drop opening div tags
        a = re.sub('<div.*?>', '', a)
        # em tags
        a = re.sub('<em.*?>|</em>', ' ', a)
        # strong tags become bold
        a = re.sub('<strong>|</strong>', '**', a)
        # span tags
        a = re.sub('<span.*?>|</span>', '', a)
        # pre tags
        a = re.sub('<pre.*?>|</pre>', '', a)
        # p tags
        a = re.sub('<p.*?>|</p>', '', a)
        # br tags
        a = re.sub('<br/>', '\n', a)
        # unescape HTML entities in the body
        # double quote
        a = re.sub('&quot', '"', a)
        # single quote
        a = re.sub('&#39', "'", a)
        # > sign
        a = re.sub('&gt', '>', a)
        # < sign
        a = re.sub('&lt', '<', a)
        # ul and li
        a = re.sub('<ul.*?>|</ul>|</li>', '', a)
        a = re.sub('<li.*?>', '- ', a)
        # fix the semicolons left over from the entity replacements
        print(a)
        a = re.sub('<;', '<', a)
        a = re.sub('>;', '>', a)
        a = re.sub(';/', '/', a)
        # everything above converts the HTML to markdown
        # prepend the hexo front-matter header
        title_xpath = '//a[@id="cb_post_title_url"]/text()'
        response_html = etree.HTML(response.text)
        title = response_html.xpath(title_xpath)[0]
        data_xpath = '//*[@id="post-date"]/text()'
        data = response_html.xpath(data_xpath)[0]
        data_header = f'---\ntitle: {title} \ndate: {data} \ntags: {tap} \n\n\n---\n'
        a = data_header + a
        return a
    # some blogs use a different layout, so the patterns above may not find the content
    except:
        print('on')
        return False
# {"http": "111.155.116.220:8123"}, # {"https": "58.19.63.57:18118" }, # {"https": "183.159.85.234:3128" }, # {"https": "223.240.208.151:18118"}, # {"http": "117.68.193.19:18118" }, ] httpCol={} # 有效代理IP池 proxypool = [] #http://www.66ip.cn/ proxy_urls = ['http://www.ip3366.net/?stype=1&page={}'.format(n) for n in range(1,11)] for proxy_url in proxy_urls: #print(proxy_url) r = requests.get(url=proxy_url) if r.status_code==200: html = etree.HTML(r.text) selectors = html.xpath('//*[@id="list"]/table/tbody/tr') #print(selectors) for row in selectors: host = row.xpath('td[1]/text()')[0] port = row.xpath('td[2]/text()')[0] httpCol['http']=host+":"+port proxies.append(httpCol) httpCol={} r.close() #print(proxies) f = open("EffectiveIp.json", 'w') f.write('[')
def make_data(mode, url=""): """ :param mode: 模式,支持的模式有share like1 like2 tag :param url: 生成data需要用到url,share like1 需要的是用户主页的url,tag需要的是tag页的url。like2不会用到,因为信息在cookies种 :return: 初始data """ if (mode == "like1" or mode == "share" or mode == "tag") and url == "": print("{}模式生成data需要url参数".format(mode)) return {} base_data = {'callCount': '1', 'httpSessionId': '', 'scriptSessionId': '${scriptSessionId}187', 'c0-id': '0', "batchId": "472351"} get_num = 100 got_num = 0 if mode == "share" or mode == "like1": userId = "" user_page_parse = etree.HTML( requests.get(url, headers=useragentutil.get_headers()).content.decode("utf-8")) try: userId = user_page_parse.xpath("//body/iframe[@id='control_frame']/@src")[0].split("blogId=")[1] except: print("\n链接与模式不匹配") exit() data_parme = { 'c0-scriptName': 'BlogBean', "c0-methodName": "", 'c0-param0': 'number:' + str(userId), 'c0-param1': 'number:' + str(get_num), 'c0-param2': 'number:' + str(got_num), 'c0-param3': 'string:'} if mode == "like1": data_parme["c0-methodName"] = "queryLikePosts" else: data_parme["c0-methodName"] = "querySharePosts" elif mode == "like2": data_parme = {"c0-scriptName": "PostBean", "c0-methodName": "getFavTrackItem", "c0-param0": "number:" + str(get_num), "c0-param1": "number:" + str(got_num), } elif mode == "tag": # 参数8要拿时间戳 url_search = re.search("http[s]{0,1}://www.lofter.com/tag/(.*?)/(.*)", url) type = url_search.group(2) if type == "": type = "new" data_parme = {'c0-scriptName': 'TagBean', 'c0-methodName': 'search', 'c0-param0': 'string:' + url_search.group(1), 'c0-param1': 'number:0', 'c0-param2': 'string:', 'c0-param3': 'string:' + type, 'c0-param4': 'boolean:false', 'c0-param5': 'number:0', 'c0-param6': 'number:' + str(get_num), 'c0-param7': 'number:' + str(got_num), 'c0-param8': 'number:' + str(int(time.time() * 1000)), 'batchId': '870178'} else: print("data-模式错误") data_parme = {} data = {**base_data, **data_parme} return data
def fetch_questions(url, chapter_id):
    """
    Extract the questions for one knowledge point.
    :url string: knowledge-point url
    :chapter_id: knowledge-point id
    """
    print("题目提取开始:{0} {1}".format(chapter_id, url))
    # only process question pages such as https://m10.bjzjxf.com/Home/Index/qaq/1985
    if url.find("qaq") == -1:
        print("非题目页面,跳过")
        return
    if redis.exists(PROCESSED_URLS) and redis.sismember(PROCESSED_URLS, url):
        print("题目已处理,跳过")
        return
    res = http_request(url)
    html = etree.HTML(res)
    # the number of questions is embedded in the page text
    try:
        number = html.xpath("//div[@id='1']/text()")[0]
        number = int(re.findall(r"共(\d+)题", number)[0])
    except IndexError:
        redis.sadd(PROCESSED_URLS, url)
        print("此知识点没有题目")
        return
    items = []
    qhtml = html.xpath("//div[@class='dati']")[0]
    contents = qhtml.xpath("./b/text()")  # question bodies
    selects = qhtml.xpath("./ul")  # answer options
    answers = qhtml.xpath("./div[@class='answer']")  # answers
    if len(selects) != number or len(contents) != number or len(answers) != number:
        redis.sadd(PROCESSED_URLS, url)
        print("此知识点内容有误,请手动处理")
        redis.sadd("need_handle_urls", url)
        return
    for i in range(0, number):
        # extract the options
        select_list = selects[i].xpath("./li/text()")
        select_list = split_array(select_list, 2)
        select_list = [
            " ".join(x).replace("\u2003\u2002", "") for x in select_list
        ]
        # extract the answer
        answer_text = list(
            filter(lambda x: x != "您选择:", answers[i].xpath(".//text()")))
        item = {
            "title": qhtml.xpath("./div[@id={0}]/text()".format(i + 1)),
            "select": "\n".join(select_list),
            "content": contents[i],
            "answer": answer_text[0] + answer_text[1] + "\n".join(answer_text[2:]),
            "order": i + 1,
        }
        items.append(item)
    # insert the questions into the database
    for item in items:
        if isinstance(item["title"], list):
            item["title"] = item["title"].pop()
        row = db.select_one(
            "SELECT * FROM `tk_questions` WHERE `chapter_id`=%s and `title`=%s",
            (chapter_id, item["title"]))
        if row is not None:
            print("{0} 题目已存在,无法插入!".format(item["title"]))
            continue
        sql = """
            INSERT INTO `tk_questions` (
                `chapter_id`, `title`, `content`, `select`, `answer`, `order`)
            VALUES (
                %s, %s, %s, %s, %s, %s)
        """
        data = (chapter_id, item["title"], item["content"], item["select"],
                item["answer"], item["order"])
        db.insert(sql, data)
    redis.sadd(PROCESSED_URLS, url)
    print("此知识点题目提取完毕")
def infor_formater(favs_info, fav_str, mode, file_path, start_time, min_hot, print_level):
    # pull the fields out of the raw dump, mostly with regular expressions
    format_fav_info = []
    start_time_stamp = ""
    if start_time:
        start_time_stamp = time.mktime(time.strptime(start_time, "%Y-%m-%d"))
    for fav_info in favs_info:
        blog_info = {}
        # blog url
        try:
            url = re.search('s\d{1,5}.blogPageUrl="(.*?)"', fav_info).group(1)
        except:
            print("博客{} 信息丢失,跳过".format(favs_info.index(fav_info) + 1))
            continue
        blog_info["url"] = url
        if print_level:
            print("博客{} {}准备解析".format(favs_info.index(fav_info) + 1, url), end="\t")
        # time the blog was liked
        fav_timestamp = re.search('s\d{1,5}.opTime=(.*?);', fav_info).group(1)
        # in like2 mode, stop once posts are older than the configured start time
        if mode == "like2" and start_time:
            if int(fav_timestamp) / 1000 < start_time_stamp:
                print("已将指定时间内的博客解析结束")
                break
        blog_hot = int(re.search('s\d{1,5}.hot=(.*?);', fav_info).group(1))
        if mode == "tag" and blog_hot < min_hot:
            print("当前博客的热度小于设定热度,跳过")
            continue
        time_local2 = time.localtime(int(int(fav_timestamp) / 1000))
        fav_time = time.strftime("%Y-%m-%d", time_local2)
        blog_info["fav time"] = fav_time
        # author name
        author_name_search = re.search('s\d{1,5}.blogNickName="(.*?)"', fav_info)
        if author_name_search:
            author_name = author_name_search.group(1).encode('latin-1').decode(
                'unicode_escape', errors="replace")
        # if nothing matched, an earlier blog on this page belongs to the same author,
        # so find the info id and search for the author name in the preceding text
        else:
            info_id = re.search("s\d{1,5}.blogInfo=(s\d{1,5})", fav_info).group(1)
            test_names = re.findall(info_id + '.blogNickName="(.*?)"',
                                    fav_str.split('blogPageUrl="' + url + '"')[0])
            author_name = test_names[-1].encode('latin-1').decode(
                'unicode_escape', errors="replace")
        blog_info["author name"] = author_name
        # characters that are illegal in file names are replaced; ASCII parentheses become
        # full-width ones so duplicate-filename checks are not broken by splitting
        author_name_in_filename = author_name.replace("/", "&").replace("|", "&").replace("\r", " ").replace(
            "\\", "&").replace("<", "《").replace(">", "》").replace(":", ":").replace('"', '”').replace("?", "?") \
            .replace("*", "·").replace("\n", "").replace("(", "(").replace(")", ")").strip()
        blog_info["author name in filename"] = author_name_in_filename
        # author's lofter subdomain
        author_ip = re.search("http[s]{0,1}://(.*?).lofter.com", url).group(1)
        blog_info["author ip"] = author_ip
        # publish time
        public_timestamp = re.search('s\d{1,5}.publishTime=(.*?);', fav_info).group(1)
        time_local1 = time.localtime(int(int(public_timestamp) / 1000))
        public_time = time.strftime("%Y-%m-%d", time_local1)
        blog_info["public time"] = public_time
        # tags
        tags = re.search('s\d{1,5}.tag[s]{0,1}="(.*?)";', fav_info).group(1).strip().encode('utf-8').decode(
            'unicode_escape').split(",")
        if tags[0] == "":
            tags = []
        lower_tags = []
        for tag in tags:
            # lower-case and convert full-width spaces to half-width
            lower_tag = tag.lower().replace("　", " ").strip()
            lower_tags.append(lower_tag)
        blog_info["tags"] = lower_tags
        # title
        try:
            title = re.search('s\d{1,5}.title="(.*?)"', fav_info).group(1).encode('latin-1').decode(
                'unicode_escape', errors="ignore")
        except:
            title = ""
        title_in_filename = title.replace("/", "&").replace("|", "&").replace("\r", " ").replace("\\", "&") \
            .replace("<", "《").replace(">", "》").replace(":", ":").replace('"', '”').replace("?", "?") \
            .replace("*", "·").replace("\n", "").replace("(", "(").replace(")", ")").strip()
        blog_info["title"] = title
        blog_info["title in filename"] = title_in_filename
        # image urls
        img_urls = []
        urls_search = re.search('originPhotoLinks="(\[.*?\])"', fav_info)
        if urls_search:
            urls_str = urls_search.group(1).replace("\\", "").replace("false", "False").replace("true", "True")
            urls_infos = eval(urls_str)
            for url_info in urls_infos:
                # "raw" is the original image without any suffix, but it is not always
                # present, so fall back to "orign"
                try:
                    url = url_info["raw"]
                except:
                    url = url_info["orign"].split("?imageView")[0]
                if "netease" in url:
                    url = url_info["orign"].split("?imageView")[0]
                img_urls.append(url)
        blog_info["img urls"] = img_urls
        # post body
        tmp_content1 = re.search('s\d{1,5}.content="(.*?)";', fav_info).group(1)
        parse = etree.HTML(tmp_content1)
        # if tmp_content1:
        #     f = parse.xpath("//p//text()")
        #     tmp_content2 = "\n".join(f)
        #     content = tmp_content2.encode('latin-1').decode("unicode_escape", errors="ignore").strip()
        # else:
        #     content = ""
        # blog_info["content"] = content
        content = html2text.html2text(
            tmp_content1.encode('latin-1').decode("unicode_escape", errors="ignore"))
        blog_info["content"] = content
        # images embedded in the post body
        illustration = []
        if tmp_content1:
            # new url format
            img_src = parse.xpath("//img/@src")
            illustration = re.findall('"(http[s]{0,1}://imglf\d{0,1}.lf\d*.[0-9]{0,3}.net.*?)\?',
                                      "\n".join(img_src))
            if illustration == []:
                # old url format
                illustration = re.findall('"(http[s]{0,1}://imglf\d{0,1}.nosdn\d*.[0-9]{0,3}.net.*?)\?',
                                          "\n".join(img_src))
        blog_info["illustration"] = illustration
        # external links
        if tmp_content1:
            link_a = parse.xpath("//a/@href")
            external_link = list(map(lambda x: x.replace("\\", "").replace('"', ''), link_a))
        else:
            external_link = []
        blog_info["external link"] = external_link
        # long-form article
        l_content = ""
        l_cover = ""
        l_url = []
        l_img = []
        long_article = re.search('s\d{1,5}.compositeContent="(.*?)";s\d{1,5}', fav_info)
        try:
            if long_article:
                long_article1 = long_article.group(1)
                parse = etree.HTML(long_article.group(1))
                l_cover = re.search('s\d{1,5}.banner="(.*?)";', fav_info).group(1)
                l_url = parse.xpath("//a//@href")
                l_url = list(map(lambda x: x.replace("\\", "").replace('"', ''), l_url))
                l_img = parse.xpath("//img/@src")
                l_img = list(map(lambda x: x.replace("\\", "").replace('"', ''), l_img))
                l_content = re.sub('<[^<]+?>', '', long_article1).replace("　", " ").strip()
                l_content = l_content.encode('latin-1').decode("unicode_escape", errors="ignore").strip()
        except:
            # print("长文章 {} 被屏蔽,无法获取正文".format(url))
            pass
        blog_info["long article content"] = l_content
        blog_info["long article url"] = l_url
        blog_info["long article img"] = l_img
        blog_info["long article cover"] = l_cover
        # video_url_search = re.findall('"originUrl":""')
        # collect the parsed result
        format_fav_info.append(blog_info)
        if print_level:
            print("解析完成,具体信息:\n{}".format(blog_info))
            print("----" * 20)
        else:
            if favs_info.index(fav_info) % 100 == 0 or len(format_fav_info) == len(favs_info):
                print("解析进度 {}/{} 正在解析的博客链接 {}".format(
                    len(format_fav_info), len(favs_info), blog_info["url"]))
    # write the result to a file
    with open(file_path + "/format_blogs_info.json", "w", encoding="utf-8", errors="ignore") as op:
        op.write(json.dumps(format_fav_info, ensure_ascii=False, indent=4))
def parse(self, url):
    ret = requests.get(url, headers=self.headers)
    html = etree.HTML(ret.content.decode())
    page_count = html.xpath('//*[@id="J_bottomPage"]/span[2]/em[1]/b/text()')[0]
    count = int(page_count) + 1
    for i in range(1, count):
        print("开始爬取数据")
        data_url = url + '&page=' + str(i)
        data = requests.get(data_url, headers=self.headers)
        html = etree.HTML(data.content.decode())
        item_lis = html.xpath('//*[@id="plist"]/ul/li')
        data_list = []
        for i, item in enumerate(item_lis):
            dic = {}
            image = item.xpath('./div/div[1]/a/img/@src')
            desc = item.xpath("./div/div[4]/a/em/text()")[0].strip()
            data_sku = item.xpath('./div/@data-sku')[0]
            venderid = item.xpath('./div/@venderid')[0]
            get_sku_url = "https://p.3.cn/prices/mgets?skuIds=J_{}".format(data_sku)
            get_store_url = "https://rms.shop.jd.com/json/pop/shopInfo.action?ids={}".format(venderid)
            price_dic = requests.get(get_sku_url, headers=self.headers)
            stort_dic = requests.get(get_store_url, headers=self.headers)
            price = json.loads(price_dic.content.decode())[0]['p']
            store = json.loads(stort_dic.content.decode(encoding='GBK'))[0]['name']
            detail_url = item.xpath('./div/div[1]/a/@href')[0]
            detail_url = "https:{}".format(detail_url)
            detail_data = requests.get(detail_url, headers=self.headers)
            html = etree.HTML(detail_data.text)
            color = html.xpath('//*[@id="choose-attr-1"]/div/div/@data-value')
            version = html.xpath('//*[@id="choose-attr-2"]/div/div/@data-value')
            w_url = "https://c0.3.cn/stock?skuId={}&area=15_1243_3419_0&venderId={}&choseSuitSkuIds=&cat=9987,653,655".format(
                data_sku, venderid)
            d = requests.get(w_url, headers=self.headers)
            weight = json.loads(d.content.decode("GBK"))['stock'].get("weightValue")
            dic['id'] = data_sku
            dic['image'] = image
            dic['price'] = price
            dic['description'] = desc
            dic['store'] = store
            dic['url'] = detail_url
            dic['color'] = color
            dic['version'] = version
            dic['weight'] = weight
            data_list.append(dic)
            print('商品{}爬取完成'.format(data_sku))
        print("当前爬取的url是:", data_url)
        self.save_to_mongo(data_list)
def page_istrue(self, response):
    selector = etree.HTML(response)
    if selector.xpath('//div[@class="list3 clearfix"]'):
        return True
    else:
        return False
    socket.setdefaulttimeout(5)
    req = request.Request(url, headers=headers)
    try:
        page = request.urlopen(req).read()
    except:
        page = ''
    return page


if __name__ == '__main__':
    # fetch the page and parse it
    url = 'https://finance.sina.com.cn/money/forex/hq/USDCNY.shtml'
    page = get_url(url)
    print(page)
    tree = etree.HTML(page)
    # use xpath to parse the top-ten tradable shareholders
    stocktitle = tree.xpath(u"/*[@id='hotHorex']")
    print(stocktitle)
    # title = stocktitle[].text
    # getdate = title[title.find('(')+1:title.find(')')]
    # param = []
    # nodes = tree.xpath(u"/html/body/div[9]/div[32]/table")
    # for node in nodes:
    #     for data in node:
    #         stockhold = []
    #         for listdata in data:
job_info = {}
f = open("20200711-II.json", 'w', encoding='utf-8')
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'
}
localtime = time.asctime(time.localtime(time.time()))
for i in range(0, 9):
    url = 'https://www.liepin.com/zhaopin/?compkind=&dqs=250&pubTime=&pageSize=40&salary=&compTag=&sortFlag=15&degradeFlag=0&compIds=&subIndustry=&jobKind=&industries=&compscale=&key=%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90%E5%B8%88&siTag=bFGQTbwE_AAQSb-u11jrBw%7EE08QNgJtmOV680BaDaEpHQ&d_sfrom=search_prime&d_ckId=cacf3d164385361dba08f1766c63a3a1&d_curPage=' + str(
        i + 1
    ) + '&d_pageSize=40&d_headId=cacf3d164385361dba08f1766c63a3a1&curPage=' + str(i)
    # https://www.liepin.com/zhaopin/?compkind=&dqs=250&pubTime=&pageSize=40&salary=&compTag=&sortFlag=15&degradeFlag=0&compIds=&subIndustry=&jobKind=&industries=&compscale=&key=%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90%E5%B8%88&siTag=bFGQTbwE_AAQSb-u11jrBw%7EE08QNgJtmOV680BaDaEpHQ&d_sfrom=search_prime&d_ckId=cacf3d164385361dba08f1766c63a3a1&d_curPage=0&d_pageSize=40&d_headId=cacf3d164385361dba08f1766c63a3a1&curPage=1
    response = r.Request(url=url, headers=headers)
    data = r.urlopen(response).read().decode("utf-8")
    data1 = etree.HTML(data)
    for j in range(40):
        # data2 = data1.xpath("//div[@class='job-info']//a[@target='_blank']/@href".format(j))
        # print(data2)
        # for I in data2:
        try:
            job_title = data1.xpath("//div[@class='job-info']/h3/a/text()")[j]
        except:
            job_title = []
        try:
            job_title_url = data1.xpath(
                "//div[@class='company-info nohover']/p//a[@target='_blank']/@href")[j]
        except:
            job_title_url = []
def run(author_url, start_time, end_time, target_tags, tags_filter_mode, file_update_interval):
    author_page_parse = etree.HTML(
        requests.get(author_url,
                     headers=useragentutil.get_headers()).content.decode("utf-8"))
    # the id is a numeric parameter needed to fetch the archive page;
    # the "ip" is the author's lofter subdomain, chosen at registration
    author_id = author_page_parse.xpath(
        "//body/iframe[@id='control_frame']/@src")[0].split("blogId=")[1]
    author_ip = re.search(r"http[s]*://(.*).lofter.com/", author_url).group(1)
    try:
        author_name = author_page_parse.xpath("//title//text()")[0]
    except:
        author_name = input("解析作者名时出现异常,请手动输入\n")
    archive_url = author_url + "dwr/call/plaincall/ArchiveBean.getArchivePostByTime.dwr"
    query_num = 50
    data = make_data(author_id, query_num)
    head = make_head(author_url)
    try:
        print("作者名%s,lofter ip %s,主页链接 %s" % (author_name, author_ip, author_url))
    except:
        print("作者名中有异常符号,无法显示,lofter ip %s,主页链接 %s" % (author_ip, author_url))
    deal_file("init")
    dir_path = "./dir/author_img_file"
    # check how far blog parsing has progressed
    if is_file_in(dir_path + "/blogs_info.json") == "finished":
        print("所有博客已解析完毕,跳转至图片下载")
    elif is_file_in(dir_path + "/blogs_info.json"):
        blogs_info = get_file_contetn(dir_path + "/blogs_info.json")
        parsed_blogs_info = get_file_contetn(dir_path + "/blogs_info_parsed.json")
        print("读取到上次运行保存的博客信息:未解析博链接%d条,已解析链接%d条,接上次继续运行" %
              (len(blogs_info), len(parsed_blogs_info)))
        parse_blogs_info(blogs_info, parsed_blogs_info, author_name, author_ip,
                         target_tags, tags_filter_mode, file_update_interval)
    else:
        print("开始获取归档页面数据,链接 %s (不能直接点开)" % archive_url)
        blog_infos = parse_archive_page(url=archive_url, data=data, header=head,
                                        author_url=author_url, query_num=query_num,
                                        start_time=start_time, end_time=end_time)
        parsed_blogs_info = get_file_contetn(dir_path + "/blogs_info_parsed.json")
        file_update(dir_path + "/blogs_info.json", blog_infos)
        print("归档页面数据保存完毕,开始解析博客页面")
        parse_blogs_info(blog_infos, parsed_blogs_info, author_name, author_ip,
                         target_tags, tags_filter_mode, file_update_interval)
    print("博客解析完毕,开始图片下载")
    # check how far image saving has progressed
    if is_file_in(dir_path + "/imgs_info.json") == "finished":
        print("该作者首页的所有图片已保存完毕,无需操作")
    else:
        imgs_info = get_file_contetn(dir_path + "/imgs_info.json")
        imgs_info_saved = get_file_contetn(dir_path + "/imgs_info_saved.json")
        download_img(imgs_info, imgs_info_saved, author_name, author_ip,
                     file_update_interval)
    print("所有图片保存完毕")
    deal_file("del")
    print("程序运行结束")
# -*- coding: utf-8 -*-
"""
Created on 2019/1/18 16:41
@Author: Johnson
@Email:[email protected]
@File: 星座.py
"""
import requests
from lxml.html import etree
import json
import time  # import the required modules

# daily horoscope
response = requests.get('https://www.xzw.com/fortune/taurus/')
if not response.status_code == 200:
    print('星座运势请求错误:' + str(response.status_code))
sel = etree.HTML(response.text)
fortune = sel.xpath('//div[@class="c_box"]/div[@class="c_cont"]/p/span/text()')[0]
print(fortune)
def parse_search_page(self, res, first_date, last_date):
    """Parse the statement search page and build the search payload."""
    ys, ms, ds = first_date.split('-')
    ye, me, de = last_date.split('-')
    html = etree.HTML(res.get('msg'))
    build_pay_load = {}
    ch_date = lambda x: x if len(x) == 2 else '0%s' % x
    for e in html.xpath('//input[@type="hidden"]'):
        name = e.xpath('./@name')[0]
        try:
            value = e.xpath('./@value')[0]
        except:
            value = ""
        value = quote(value.encode('gbk'))
        if name == 'Begin_date':
            value = ''.join([ys, ms, ch_date(ds)])
        elif name == 'End_date':
            value = ''.join([ye, me, ch_date(de)])
        elif name == 'Qry_date':
            value = ys
        build_pay_load[name] = value
    base_payload = {
        'Corpor_id': '1',
        'Account_num': '4000010109200194412',
        'yearname1': ys,
        'dayname1': ds,
        'yearname2': ye,
        'monthname2': me,
        'dayname2': de,
        'monthname1': ms,
    }
    build_pay_load.update(base_payload)
    build_pay_load = '&'.join(
        ['%s=%s' % (k, v) for k, v in build_pay_load.items()])
    search_url = '%s/servlet/com.ibm.btt.cs.servlet.CSReqServlet' % self.base_url
    # query the statements
    res = self.deal_result(self.execute(
        'POST', search_url, data=build_pay_load,
        content_type='application/x-www-form-urlencoded'), err_type='icbc')
    if not res.get('succ'):
        return res
    # crawl the paginated results
    data_list = []
    for p in range(20, 200, 20):
        logger.info('begin_pos:----------------------------%s' % p)
        data, html = self.crawler_down_list(res.get('msg'))
        if not data:
            continue
        data_list.extend(data)
        payload = self.build_next_payload(html, p)
        if not payload:
            time.sleep(0.25)
            continue
        res = self.deal_result(
            self.execute('POST', search_url, data=payload,
                         content_type='application/x-www-form-urlencoded'))
        if res.get('succ'):
            time.sleep(0.25)
            continue
        time.sleep(0.25)
    else:
        return {'succ': True, 'data': data_list}
import requests
from lxml.html import etree
import json

url = "http://www.lovehhy.net/Joke/Detail/QSBK/"
uu = "http://www.foshannews.net/jtzb2016/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}
req = requests.get(uu, headers=headers).content.decode('utf-8', 'ignore')
# print(req)
rsp = etree.HTML(req)
# content = rsp.xpath('//div[@class="post_recommend_new"]//text()')
items = []
for i in rsp.xpath('//ul[@class="mbd dot f14"]/li/a'):
    cc = i.xpath('./@href')[0].strip(".")
    title = i.xpath('./@title')[0]
    cc = "http://www.foshannews.net/jtzb2016/" + cc
    eq = requests.get(cc, headers=headers).content.decode('utf-8', 'ignore')
    sp = etree.HTML(eq)
    cont = sp.xpath('//div[@class="TRS_Editor"]/p')
    result = {
        '标题': title,
        '标题网站': cc,
    }
headers = {
    "cookie": "__cfduid=dfa5a44a56e1f4818da6dc1c0442d32e61555031717; _"
              "ga=GA1.2.446599568.1555031722; trc_cookie_storage=taboola%2520global%253Auser-id%3Df47e0355-c5e3-4ac8-8d9c-69e65b8be1c0-tuct3a468dd; "
              "ShowSubtitleDetails=true; ShowSubtitlePreview=true; "
              "HearingImpaired=2; ForeignOnly=False; _gid=GA1.2.1534139390.1556500043; LanguageFilter=28; "
              "cookieconsent_dismissed=yes; cf_clearance=5c22147cf3e89737a1f9ac602ed6b8491cc6bc33-1556588618-31536000-150",
    "pragma": "no-cache",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
}

with open('url.txt', 'r', encoding='utf8') as f:
    urls = f.readlines()

url_list = []
for index, url in enumerate(urls):
    url = url.strip()
    session = requests.session()
    resp = session.get(url=url, headers=headers)
    root = etree.HTML(resp.text)
    result = root.xpath('//div[@class="download"]/a/@href')
    result = ["https://subscene.com" + short_url for short_url in result]
    url_list.extend(result)

with open('download_url.txt', 'w', encoding='utf8') as f:
    f.writelines("\n".join(url_list))
# @Time : 2020/1/7 10:41
# @Author : yangmingming
# @Site :
# @File : creawl_html.py
# @Software: PyCharm
import sys
from PyQt5.QtWidgets import *
from PyQt5.QtCore import *
from PyQt5.QtWebKitWidgets import *
from lxml.html import etree


class WebRender(QWebPage):
    def __init__(self, url):
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        self.loadFinished.connect(self.__loadFinished)
        self.mainFrame().load(QUrl(url))
        self.app.exec_()

    def __loadFinished(self, result):
        self.frame = self.mainFrame()
        self.app.quit()


url = "https://www.baidu.com"
r = WebRender(url)
html = r.frame.toHtml()
print(html)
page = etree.HTML(html.encode('utf-8'))
def parse_one(self, url, i):
    print("开始爬取第 {} 页数据".format(i))
    data_url = url + '&page=' + str(i)
    data = requests.get(data_url, headers=self.headers)
    html = etree.HTML(data.content.decode())
    item_lis = html.xpath('//*[@id="plist"]/ul/li')
    data_list = []
    for i, item in enumerate(item_lis):
        dic = {}
        image = item.xpath('./div/div[1]/a/img/@src')
        desc = item.xpath("./div/div[4]/a/em/text()")[0].strip()
        data_sku = item.xpath('./div/@data-sku')[0]
        venderid = item.xpath('./div/@venderid')[0]
        print("开始爬取商品 {} ".format(data_sku))
        get_sku_url = "https://p.3.cn/prices/mgets?skuIds=J_{}".format(data_sku)
        # ft = Redis_filter()
        # d = ft.get(get_sku_url)
        # if d:
        #     print('__________该商品已经存在!________跳过该商品')
        #     continue
        # ft.save(get_sku_url)
        get_store_url = "https://rms.shop.jd.com/json/pop/shopInfo.action?ids={}".format(venderid)
        price_dic = requests.get(get_sku_url, headers=self.headers)
        stort_dic = requests.get(get_store_url, headers=self.headers)
        price = json.loads(price_dic.content.decode())[0]['p']
        store = json.loads(stort_dic.text)[0]['name']
        detail_url = item.xpath('./div/div[1]/a/@href')[0]
        detail_url = "https:{}".format(detail_url)
        detail_data = requests.get(detail_url, headers=self.headers)
        html = etree.HTML(detail_data.text)
        color = html.xpath('//*[@id="choose-attr-1"]/div/div/@data-value')
        version = html.xpath('//*[@id="choose-attr-2"]/div/div/@data-value')
        w_url = "https://c0.3.cn/stock?skuId={}&area=15_1243_3419_0&venderId={}&choseSuitSkuIds=&cat=9987,653,655".format(
            data_sku, venderid)
        d = requests.get(w_url, headers=self.headers)
        weight = json.loads(d.content.decode("GBK"))['stock'].get("weightValue")
        dic['id'] = data_sku
        dic['image'] = image
        dic['price'] = price
        dic['description'] = desc
        dic['store'] = store
        dic['url'] = detail_url
        dic['color'] = color
        dic['version'] = version
        dic['weight'] = weight
        data_list.append(dic)
        print('商品{}爬取完成'.format(data_sku))
        ft = Redis_filter()
        d = ft.get(dic)
        if d:
            print('__________该商品已经存在!________跳过该商品')
            continue
        ft.save(dic)
        self.save_to_mongo(dic)
# Task:
'''
Use xpath to extract the user avatar, user name, gender, age
and the joke text.
'''
import requests
from lxml.html import etree

url = "https://www.qiushibaike.com/"
r = requests.get(url)
html = etree.HTML(r.content.decode())
all_qiushi = html.xpath("//div[@id='content-left']/div")
for one in all_qiushi:
    userImg = one.xpath("./div[1]/a[1]/img/@src")
    if userImg:
        userImg = "http:" + userImg[0]
        username = one.xpath("./div[1]/a[2]/h2/text()")[0]
        userage = one.xpath("./div[1]/div/text()")[0]
        usersex = one.xpath("./div[1]/div/@class")[0]
        usersex = usersex[14:-4]
    else:
        userImg = "https://static.qiushibaike.com/images/thumb/anony.png?v=b61e7f5162d14b7c0d5f419cd6649c87"
        username = "******"
        userage = "0"
        usersex = 'man'
    userQiushi = ''.join(one.xpath("./a[1]/div/span[1]/text()")).replace("\n", '')
    userQiushiImg = one.xpath("./div[@class='thumb']/a/img/@src")
    if userQiushiImg:
            print(details_url)
            sql = f'INSERT into book_toscrape(book_title,img_url,price,star_rating,availability,details_url) VALUES ("{books_name[i]}", "{img_url}", "{prices[i][1:]}", "{star_rating[i][12:]}","{availability}","{details_url}") '
            cursor.execute(sql)
            conn.commit()
        cursor.close()


url = "http://books.toscrape.com/"

if __name__ == '__main__':
    pool = Pool(processes=4)
    results = []
    url_list = [url]
    for i in url_list:
        response = requests.get(i)
        response.encoding = "utf8"
        if response.status_code == 200:
            print(f"{i}连接成功...")
            html_text = etree.HTML(response.text)
            next_url = html_text.xpath("//li[@class='next']/a/@href")
            result = pool.apply_async(Book_Toscrape(html_text, i))
            if next_url != []:
                if i == "http://books.toscrape.com/":
                    url1 = url + next_url[0]
                else:
                    url1 = "http://books.toscrape.com/catalogue/" + next_url[0]
                url_list.append(url1)
            else:
                exit()
    pool.close()
    pool.join()
def fetch_chapter(chapter_name, url):
    """
    Extract one scripture chapter.
    :chapter_name: chapter name
    :url string: chapter url
    """
    print("经文提取开始:{0} {1}".format(chapter_name, url))
    if redis.exists(PROCESSED_URLS) and redis.sismember(PROCESSED_URLS, url):
        print("经文已处理,跳过")
        return
    res = http_request(url)
    html = etree.HTML(res)
    # extract the volume name
    chapter_names = "|".join(redis.hkeys(CHAPTERS))
    volume = " ".join(html.xpath("//table[@class='content']//p[1]//text()"))
    if volume.isspace() or len(volume) == 0:
        # when the page has no <p> tags, fall back to matching against the full text
        volume = " ".join(html.xpath("//table[@class='content']//text()"))
    try:
        volume = re.search(r"({0})".format(chapter_names), volume).group()
    except Exception as e:
        volume = "小部"
    volume_id = redis.hget(CHAPTERS, volume)
    # extract the body text
    content = html.xpath("//table[@class='content']//p//text()")
    content = "\n".join(list(filter(lambda x: x != "\u3000\u3000", content)))
    [order, title] = re.search(r"(\d+-?\d?)\s+(.*)$", chapter_name).groups()
    item = {
        "title": title,
        "order": order,
        "chapter_id": volume_id,
        "content": ""
    }
    contents = []
    contents.append(content)
    # extract the remaining pages
    LINK_PAGE_BASE = "http://www.chilin.edu.hk/edu/report_section_detail.asp"
    links = html.xpath("//td[@class='subtitle'][1]")[0].xpath(
        "./following-sibling::td[1]//a//@href")
    links = list(map(lambda x: LINK_PAGE_BASE + x, links))
    for link in links:
        res = http_request(link)
        html = etree.HTML(res)
        content = html.xpath("//table[@class='content']//p//text()")
        if len(content) == 0:
            content = html.xpath("//table[@class='content']//td/text()")
            content = list(
                filter(
                    lambda x: x == "\u3000\u3000" or not x.replace("|", "").isspace(),
                    content))[1:]
        content = "\n".join(
            list(map(lambda x: "\n" if x == "\u3000\u3000" else x, content)))
        contents.append(content)
    item["content"] = "\n".join(contents)
    # insert the chapter into the database
    row = db.select_one(
        "SELECT * FROM `book_article` WHERE `chapter_id`=%s and `title`=%s",
        (volume_id, item["title"]))
    if row is not None:
        print("{0} 经文已存在,无法插入!".format(item["title"]))
        redis.sadd(PROCESSED_URLS, url)
        return
    sql = """
        INSERT INTO `book_article` (
            `chapter_id`, `title`, `content`, `order`)
        VALUES (
            %s, %s, %s, %s)
    """
    data = (volume_id, item["title"], item["content"], item["order"])
    db.insert(sql, data)
    redis.sadd(PROCESSED_URLS, url)
    print("此经文已插入完毕")
def parse_laws_data(self, response):
    """
    # number
    classes_num = scrapy.Field()
    # Chinese title
    chinese_title = scrapy.Field()
    # database name
    base_name = scrapy.Field()
    # issuing department
    issu_department = scrapy.Field()
    # level of legal effect
    level = scrapy.Field()
    # validity status
    timeliness = scrapy.Field()
    # promulgation date
    issu_date = scrapy.Field()
    # effective date
    doit_date = scrapy.Field()
    # content category
    content_classes = scrapy.Field()
    :param response:
    :return:
    """
    item = LawsItem()
    item['classes_num'] = 0
    item['chinese_title'] = response.xpath(
        './/div[@class="left_con_top"]/div[@class="title"]/text()'
    ).extract_first('暂无').replace('\r', '').replace('\t', '').replace('\n', '')
    list_info = response.xpath(
        '//div[@class="left_con_top"]/ul//li/div[1]/text()').extract()
    print(list_info)
    if '库别名称:' in list_info:
        panten = re.compile(
            '<div\sclass="info_left">库别名称:</div>.*?<div\sclass="info_right.*?">(.*?)</div>', re.S)
        data = re.findall(panten, response.text)
        dssss = etree.HTML(data[0])
        item['base_name'] = ','.join(dssss.xpath('//text()')).replace(
            '\t', '').replace(' ', '').replace('\r', '').replace('\n', '')
    if '颁布部门:' in list_info:
        panten = re.compile(
            '<div\sclass="info_left">颁布部门:</div>.*?<div\sclass="info_right.*?">(.*?)</div>', re.S)
        data = re.findall(panten, response.text)
        dssss = etree.HTML(data[0])
        item['issu_department'] = ','.join(dssss.xpath('//text()')).replace(
            '\t', '').replace(' ', '').replace('\r', '').replace('\n', '')
    if '效力级别:' in list_info:
        panten = re.compile(
            '<div\sclass="info_left">效力级别:</div>.*?<div\sclass="info_right.*?">(.*?)</div>', re.S)
        data = re.findall(panten, response.text)
        dssss = etree.HTML(data[0])
        item['level'] = ','.join(dssss.xpath('//text()')).replace(
            '\t', '').replace(' ', '').replace('\r', '').replace('\n', '')
    if '时效性:' in list_info:
        panten = re.compile(
            '<div\sclass="info_left">时效性:</div>.*?<div\sclass="info_right.*?">(.*?)</div>', re.S)
        data = re.findall(panten, response.text)
        dssss = etree.HTML(data[0])
        item['timeliness'] = ','.join(dssss.xpath('//text()')).replace(
            '\t', '').replace(' ', '').replace('\r', '').replace('\n', '')
    if '颁布日期:' in list_info:
        panten = re.compile(
            '<div\sclass="info_left">颁布日期:</div>.*?<div\sclass="info_right.*?">(.*?)</div>', re.S)
        data = re.findall(panten, response.text)
        dssss = etree.HTML(data[0])
        item['issu_date'] = ','.join(dssss.xpath('//text()')).replace(
            '\t', '').replace(' ', '').replace('\r', '').replace('\n', '')
    if '实施日期:' in list_info:
        panten = re.compile(
            '<div\sclass="info_left">实施日期:</div>.*?<div\sclass="info_right.*?">(.*?)</div>', re.S)
        data = re.findall(panten, response.text)
        dssss = etree.HTML(data[0])
        item['doit_date'] = ','.join(dssss.xpath('//text()')).replace(
            '\t', '').replace(' ', '').replace('\r', '').replace('\n', '')
    if '内容分类:' in list_info:
        panten = re.compile(
            '<div\sclass="info_left">内容分类:</div>.*?<div\sclass="info_right.*?">(.*?)</div>', re.S)
        data = re.findall(panten, response.text)
        dssss = etree.HTML(data[0])
        item['content_classes'] = ','.join(dssss.xpath('//text()')).replace(
            '\t', '').replace(' ', '').replace('\r', '').replace('\n', '')
    yield item
def getMovieById(request):
    print("############################")
    print("getMovieById")
    id = request.GET.get('id', '')
    print(id)
    m_list = Movie.objects.filter(m_id=id)
    if len(m_list) == 0:
        messages.error(request, '电影不存在!')
        return JsonResponse({'code': 0})
    res = []
    for i in m_list:
        x = Movie_()
        x.movieId = int(i.m_id)
        x.name = i.m_name
        x.actors = i.actor
        x.cover = i.imgurl
        x.directors = i.director
        x.genres = i.type + ' ' + i.actor
        x.officialSite = 'https://v.qq.com/'
        x.regions = i.area
        x.languages = i.language
        x.mins = i.length
        x.score = i.rate / 10.0
        x.tags = i.type
        x.year = ''
        try:
            if req.urlopen(i.imgurl).status != 200:
                i.imgurl = 'https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1589887458475&di=38b6dbf53b6505b7a5cb3764c1857313&imgtype=0&src=http%3A%2F%2Fimg3.doubanio.com%2Fview%2Fgroup_topic%2Flarge%2Fpublic%2Fp108048762.jpg'
        except:
            i.imgurl = 'https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1589887458475&di=38b6dbf53b6505b7a5cb3764c1857313&imgtype=0&src=http%3A%2F%2Fimg3.doubanio.com%2Fview%2Fgroup_topic%2Flarge%2Fpublic%2Fp108048762.jpg'
        try:
            response = requests.get('https://movie.douban.com/subject/' + i.m_id + '/',
                                    headers=headers)
            response.encoding = 'utf-8'
            soup = BeautifulSoup(response.text, "html.parser")
            x.releaseDate = soup.find("span", attrs={
                "property": "v:initialReleaseDate"
            }).get_text()
            x.releaseDate = x.releaseDate.split('(')[0]
            x.storyline = soup.find("span", attrs={
                "property": "v:summary"
            }).get_text()
            x.votes = int(
                soup.find("span", attrs={
                    "property": "v:votes"
                }).get_text())
            x.cover = etree.HTML(response.text).xpath('//img/@src')[0]
        except:
            x.storyline = '亲爱的用户,很抱歉未获取到相应数据。'
            x.releaseDate = '0000-00-00'
            x.votes = 2032805
            x.cover = i.imgurl
        x.actorIds = ''
        x.directorIds = ''
        res.append(x)
    # m_list = serializers.serialize("json", res)
    m_list = json.dumps(res, default=lambda obj: obj.__dict__)
    return JsonResponse({'code': 1, 'm_list': m_list})
def Spider(self):
    jobl = []
    for page in range(self.page):
        params = {
            "start": 90 * page,
            "pageSize": 90,
            "workExperience": -1,
            "education": -1,
            "companyType": -1,
            "employmentType": -1,
            "jobWelfareTag": -1,
            "kw": self.keyword,
            "kt": 3,
            "cityId": self.city,
            "salary": '0, 0'
        }
        req = requests.get(url=self.base_url, params=params, headers=get_header())
        cookie = req.cookies
        print(cookie)
        data = req.json()['data']['results']
        if len(data) != 0:
            for job in data:
                # print(job)
                jobd = {}
                jobd['ID'] = job.get('number')
                jobd['工作名称'] = job.get('jobName')
                jobd['招聘详细链接'] = job.get('positionURL')
                company = job.get('company')
                jobd['公司名称'] = company.get('name')
                jobd['公司ID'] = company.get('number')
                jobd['公司性质'] = company.get('type').get('name')
                jobd['公司规模'] = company.get('size').get('name')
                jobd['公司招聘主页'] = company.get('url')
                jobd['公司地点'] = job.get('city').get('display')
                jobd['薪资'] = job.get('salary')
                jobd['学历要求'] = job.get('eduLevel').get('name')
                try:
                    jobd['工作经历'] = job.get('workingExp').get('name')
                except:
                    jobd['工作经历'] = '经验不限'
                jobd['职位类型'] = job.get('emplType')
                jobd['公司福利'] = '、'.join(job.get('welfare')) or '无'
                jobd['工作发布标签'] = job.get('timeState')
                jobd['更新时间'] = job.get('updateDate')
                header = get_header()
                header['referer'] = job.get('positionURL')
                header['upgrade-insecure-requests'] = '1'
                header['cookie'] = config.ZHILIAN_COOKIE
                req1 = requests.get(job.get('positionURL'), headers=header)
                req1.encoding = 'utf-8'
                html = etree.HTML(req1.text)
                detail = ''.join(
                    html.xpath('//*[@class="describtion__detail-content"]//*/text()'))
                if not detail:
                    detail = ''.join(
                        html.xpath('//*[@class="describtion__detail-content"]/text()'))
                print(job.get('positionURL'))
                print(detail)
                jobd['职位描述'] = detail.strip()
                jobl.append(jobd)
        else:
            break
    return jobl
def prase2(txt):
    html0 = etree.HTML(txt)
    list1 = html0.xpath('//*[@id="content"]/p/text()')
    return list1
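# A minimal sketch of how prase2 might be driven, assuming the page is fetched
# with requests; the URL is a placeholder, not taken from the original script.
import requests

resp = requests.get('http://example.com/chapter/1')
resp.encoding = 'utf-8'
for paragraph in prase2(resp.text):  # each <p> text node under div#content
    print(paragraph)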