Example no. 1
def str_to_htm(s):
    return etree.HTML(s)
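# A small usage sketch for str_to_htm; the markup below is made up and
# "from lxml import etree" is assumed to be in scope:
doc = str_to_htm('<ul><li>a</li><li>b</li></ul>')
print(doc.xpath('//li/text()'))  # ['a', 'b']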
Example no. 2
def url_l(a):
    yifile = a
    yifile = etree.HTML(yifile)
    w = "http://sc.chinaz.com"
    url_list = []
    mz = []
    # Vector graphics
    sl = yifile.xpath(
        '//div//div[@class="nav"]/ul/li[@class="nos"]/a[1]/text()')[0]
    su = yifile.xpath(
        '//div//div[@class="nav"]/ul/li[@class="nos"]/a[1]/@href')[0]
    su = w + su
    url_list.append(su)
    mz.append(sl)
    # HD images
    gq = yifile.xpath(
        '//div//div[@class="nav"]/ul/li[@class="nos"]/a[2]/text()')[0]
    gu = yifile.xpath(
        '//div//div[@class="nav"]/ul/li[@class="nos"]/a[2]/@href')[0]
    gu = w + gu
    url_list.append(gu)
    mz.append(gq)
    # Icons
    tb = yifile.xpath(
        '//div//div[@class="nav"]/ul/li[@class="nos"]/a[3]/text()')[0]
    tu = yifile.xpath(
        '//div//div[@class="nav"]/ul/li[@class="nos"]/a[3]/@href')[0]
    tu = w + tu
    url_list.append(tu)
    mz.append(tb)
    # PSD assets
    psd = yifile.xpath(
        '//div//div[@class="nav"]/ul/li[@class="nos"]/a[4]/text()')[0]
    pu = yifile.xpath(
        '//div//div[@class="nav"]/ul/li[@class="nos"]/a[4]/@href')[0]
    pu = w + pu
    url_list.append(pu)
    mz.append(psd)
    # Fonts
    zt = yifile.xpath(
        '//div//div[@class="nav"]/ul/li[@class="nos no2"]/a[1]/text()')[0]
    zu = yifile.xpath(
        '//div//div[@class="nav"]/ul/li[@class="nos no2"]/a[1]/@href')[0]
    url_list.append(zu)
    mz.append(zt)
    # English fonts
    yw = yifile.xpath(
        '//div//div[@class="nav"]/ul/li[@class="nos no2"]/a[3]/text()')[0]
    ywu = yifile.xpath(
        '//div//div[@class="nav"]/ul/li[@class="nos no2"]/a[3]/@href')[0]
    url_list.append(ywu)
    mz.append(yw)
    # Sound effects
    yx = yifile.xpath(
        '//div//div[@class="nav"]/ul/li[@class="nos no2"]/a[4]/text()')[0]
    yxu = yifile.xpath(
        '//div//div[@class="nav"]/ul/li[@class="nos no2"]/a[4]/@href')[0]
    yxu = w + yxu
    url_list.append(yxu)
    mz.append(yx)
    # PPT templates
    ppt = yifile.xpath(
        '//div//div[@class="nav"]/ul/li[@class="nos no3"]/a[3]/text()')[0]
    ppu = yifile.xpath(
        '//div//div[@class="nav"]/ul/li[@class="nos no3"]/a[3]/@href')[0]
    ppu = w + ppu
    url_list.append(ppu)
    mz.append(ppt)
    # Resume templates
    jl = yifile.xpath(
        '//div//div[@class="nav"]/ul/li[@class="nos no3"]/a[4]/text()')[0]
    ju = yifile.xpath(
        '//div//div[@class="nav"]/ul/li[@class="nos no3"]/a[4]/@href')[0]
    ju = w + ju
    url_list.append(ju)
    mz.append(jl)
    # print(sl,su,gq,gu,tb,tu,psd,pu,zt,zu,yw,ywu,yx,yxu,ppt,ppu,jl,ju,url_list,mz)
    return [url_list, mz]
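# A more compact variant of url_l (a sketch only; it assumes the same nav
# markup and keeps the original behaviour of not prefixing the two font
# links with the site root):
def url_l_compact(a):
    tree = etree.HTML(a)
    w = "http://sc.chinaz.com"
    # (li class, a index, prefix with w?) for each category in nav order
    picks = [("nos", 1, True), ("nos", 2, True), ("nos", 3, True),
             ("nos", 4, True), ("nos no2", 1, False), ("nos no2", 3, False),
             ("nos no2", 4, True), ("nos no3", 3, True), ("nos no3", 4, True)]
    url_list, mz = [], []
    for cls, idx, prefix in picks:
        base = '//div//div[@class="nav"]/ul/li[@class="{}"]/a[{}]'.format(cls, idx)
        mz.append(tree.xpath(base + '/text()')[0])
        href = tree.xpath(base + '/@href')[0]
        url_list.append(w + href if prefix else href)
    return [url_list, mz]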
Example no. 3
def get_page_show_ret(mystr, bs64_str):
    '''
    Decode a string rendered with an obfuscated web font
    :param mystr:  the string to decode
    :param bs64_str:  the base64-encoded font data
    :return: the decoded string
    '''
    font = TTFont(BytesIO(base64.b64decode(bs64_str)))
    c = font['cmap'].tables[0].ttFont.tables['cmap'].tables[0].cmap
    ret_list = []
    for char in mystr:
        decode_num = ord(char)
        if decode_num in c:
            num = c[decode_num]
            num = int(num[-2:]) - 1
            ret_list.append(num)
        else:
            ret_list.append(char)
    ret_str_show = ''
    for num in ret_list:
        ret_str_show += str(num)
    return ret_str_show


if __name__ == '__main__':
    get_ip_from_db()
    html = requests.get(url=rent_url.format('领地OFFICE'),
                        headers=headers,
                        proxies=random.choice(proxies))
    bs64_str = re.findall("charset=utf-8;base64,(.*?)'\)", html.text)[0]
    selector = etree.HTML(html.content)
    price = selector.xpath('//*[@id="list-content"]/div[13]/div[2]/p/strong/b')
    res = get_page_show_ret(price[0].text, bs64_str)
    print(res)
Example no. 4
# import requests
from selenium import webdriver
from lxml import etree


header = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) \
                        AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'}

url = 'http://irm.cninfo.com.cn/ircs/interaction/viewQuestionForSzse.do?questionId=5651078'

# req = requests.get(url,headers=header)
# print(req.status_code)
# page = req.text

driver = webdriver.PhantomJS()
driver.get(url)

tree = etree.HTML(driver.page_source)

x_div = tree.xpath('//div[@class="msgCnt cntcolor"]')

for div in x_div:

    print(div.xpath('./text() | ./div/text()'))
Example no. 5
    def get_results(self, url):
        html = httpget(url)
        ehtml = etree.HTML(html)
        results = ehtml.xpath("//ul[@class='img']/li")
        return results
Example no. 6
def showurl(page):
    ht=etree.HTML(page)
    url=ht.xpath('//ul[@class="searchResultListUl"]//div[@class="searchResultJobinfo fr"]//a[@target="_blank"]/@href')
    return url
Example no. 7
    def load_get_html(self, url):
        if url is None:
            return
        # print(url)
        try:
            proxies = proxy_pool.proxies()
            response = requests.get(url=url, headers=self.headers, proxies=proxies).content.decode('utf-8')
            selector = etree.HTML(response)
        except Exception as e:
            print('load_get_html error:{}'.format(e))
        else:
            title = selector.xpath('//div[@class="title"]/h1/text()')
            if title != []:
                title = re.sub(r'\r|\n|\s','',title[0])
                try:
                    status = re.search(r'[\u4e00-\u9fa5]{2}公告$', title).group()
                except:
                    status = '公告'
            else:
                title = None
                status = '公告'
            # print(title)
            # print(status)

            _id = self.hash_to_md5(url)

            publish_date = selector.xpath('//div[@class="extra"]/text()')
            if publish_date != []:
                publish_date = re.search(r'(\d{4}\-\d+\-\d+)',''.join(publish_date)).group()
            else:
                publish_date = None
            # print(publish_date)
            area_name = '江苏-南京'
            # print(area_name)

            source = 'http://www.njgp.gov.cn/'

            table_ele_li = selector.xpath('//div[@class="cont"]/div')
            content_html = ''
            for table_ele in table_ele_li[1:4]:

                content_html += etree.tostring(table_ele, encoding="utf-8", pretty_print=True, method="html").decode('utf-8')

            result_dict = dict()
            result_dict['_id'] = _id
            result_dict['title'] = title
            result_dict['status'] = status
            result_dict['area_name'] = area_name
            result_dict['source'] = source

            result_dict['publish_date'] = publish_date

            result_dict['detail_url'] = url
            result_dict['content_html'] = str(content_html)
            result_dict['create_time'] = self.now_time()
            result_dict['zh_name'] = '南京市政府采购网'
            result_dict['en_name'] = 'Nanjing City Government Procurement'
            # print(result_dict)

            # print('List length = {}'.format(self.rq.r_len()))
            self.save_to_mongo(result_dict)
Example no. 8
    def parse(self, response):
        # Tmall
        if 'tmall' in str(response.meta['url']):
            content = response.text
            html = etree.HTML(content)
            """正则部分"""
            pattern_model = re.compile(r'型号</th><td>&nbsp;(.*?)</td>')
            pattern_productname = re.compile(r'>产品名称:(.+)<')
            pattern_shopname = re.compile(
                r'data-spm="d4918089"><strong>(.+?)</')
            pattern_brand = re.compile(r'品牌:&nbsp;(.+?);</')
            pattern_score = re.compile(r'shopdsr-score-con">(.+?)</')

            model = re.findall(pattern_model, content)[0]
            productname = re.findall(pattern_productname, content)[0]
            shopname = re.findall(pattern_shopname, content)[0]
            brand = re.findall(pattern_brand, content)[0]
            score = re.findall(pattern_score, content)
            """xpath部分"""
            title = html.xpath('//h1[@data-spm="1000983"]/a/text()')[0]
            """调用webdriver爬取,并设置代理"""
            browser = webdriver.Chrome()
            # options = webdriver.ChromeOptions()
            # options.add_argument('user-agent="Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1"')
            # options.add_argument('--proxy-server=http://122.114.31.177:8080')
            # browser = webdriver.Chrome(chrome_options=options)
            browser.get(response.meta['url'])
            # Scroll down
            js = "var q=document.documentElement.scrollTop=10000"
            browser.execute_script(js)
            time.sleep(2)
            html_sele = etree.HTML(browser.page_source)
            sellcount = html_sele.xpath(
                '//li[@data-label="月销量"]/div/span[2]/text()')[0]
            reviewcount = html_sele.xpath(
                '//li[@class="tm-ind-item tm-ind-reviewCount canClick tm-line3"]/div/span[2]/text()'
            )[0]
            newp = html_sele.xpath('//div[@class="tb-detail-hd"]/p/text()')[0]
            promoprice = html_sele.xpath(
                '//div[@class="tm-promo-price"]/span/text()')[0]
            button_comment = browser.find_element_by_xpath(
                '//ul[@class="tabbar tm-clear"]/li[3]')
            button_comment.click()
            time.sleep(2)
            """下拉后获取到评论小界面提取"""
            pattern_comment = re.compile(r'title="(.+)分')
            content = browser.page_source
            ratescore = re.findall(pattern_comment, content)[0]
            browser.quit()

            items = dict()
            items['model'] = model
            items['productname'] = productname
            items['shopname'] = shopname
            items['brand'] = brand
            items['score'] = score
            items['title'] = title
            items['sellcount'] = sellcount
            items['reviewcount'] = reviewcount
            items['newp'] = newp
            items['promoprice'] = promoprice
            items['ratescore'] = ratescore
            items['rank'] = response.meta['rank']
            items['url'] = response.meta['url']
            for item in items.items():
                print(item)
        else:
            items = dict()
            browser = webdriver.Chrome()
            browser.get(response.meta['url'])
            time.sleep(2)
            html = etree.HTML(browser.page_source)
            browser.quit()

            model = html.xpath('//ul[@class="attributes-list"]/li[4]/text()')
            brand = html.xpath('//ul[@class="attributes-list"]/li[3]/text()')
            shopname = html.xpath(
                '//div[@class="tb-shop-name"]/dl/dd/strong/a/text()')
            title = html.xpath('//h3[@class="tb-main-title"]/text()')
            sellcount = html.xpath(
                '//div[@class="tb-sell-counter"]/a/strong/text()')
            reviewcount = html.xpath(
                '//div[@class="tb-rate-counter"]/a/strong/text()')
            score = html.xpath('//dd[@class="tb-rate-lower"]/a/text()')
            promoprice = html.xpath(
                '//strong[@class="tb-promo-price"]/em[2]/text()')

            items['model'] = model
            items['productname'] = response.meta['productname']
            items['shopname'] = shopname
            items['brand'] = brand
            items['score'] = score
            items['title'] = title
            items['sellcount'] = sellcount
            items['reviewcount'] = reviewcount
            items['newp'] = ''
            items['promoprice'] = promoprice
            items['ratescore'] = ''
            items['rank'] = response.meta['rank']
            items['url'] = response.meta['url']
            for item in items.items():
                print(item)
Example no. 9
def crawling(url):
    """
    Grab cve specific information
    :param url: string
    :return xpth_list: list
    """
    xpth_list = []
    if url is None or url == "" or url.find("http") == -1:
        print("crawling, url:", url)
        return
    try:
        content = requests.get(url).content
    except requests.exceptions.ConnectionError:
        print('ConnectionError')
        return []
    except requests.exceptions.ChunkedEncodingError:
        print('ChunkedEncodingError')
        return []
    if content and len(content) > 1:
        html = etree.HTML(content)
        try:
            # if html.xpath(
            #         '/html/body/div[2]/div[2]/div/table/tr/td/div/div[1]/div[3]/div[2]/div[1]'
            #         '/div[2]/span/span/a/text()') == [
            #     "N/A"] or \
            #         html.xpath(
            #             '/html/body/div[2]/div[2]/div/table/tr/td/div/div[1]/div[2]/div[2]/div[1]'
            #             '/div[2]/span/span/a/text()') == [
            #     "N/A"] or \
            #         html.xpath(
            #             '/html/body/div[2]/div[2]/div[2]/table/tr/td/div/div[1]/div[4]/div[2]/div[1]/div[2]'
            #             '/span/span/a/text()') == ['N/A']:
            #     if html.xpath(
            #             "/html/body/div[2]/div[2]/div/table/tr/td/div/div[1]/div[2]/div[3]/div[1]/div[2]"
            #             "/span/span/a/text()") == [
            #         "N/A"] or \
            #             html.xpath(
            #                 "/html/body/div[2]/div[2]/div/table/tbody/tr/td/div/div[1]/div[2]/div[2]/div[1]"
            #                 "/div[2]/span/span/a/text()") == ["N/A"]:
            #         nvd_score = cve_level = cve_desc = repair_time = vector_value = attack_vector = \
            #             access_vector = attack_complexity = access_complexity = \
            #             privilege_required = user_interaction = scope = confidentiality = \
            #             integrity = availability = authentication = None
            #         print("No data on this vulnerability link, ", url)
            #         score_type = ""
            #         cve_desc = str(html.xpath('//*[@id="vulnDetailTableView"]/tr/td/div/div[1]/p[1]/text()')[0])
            #         if cve_desc:
            #             score_type = "v3.0"
            #     else:
            #         score_type = "v2.0"
            #         element = html.xpath('//*[@id="nistV2MetricHidden"]/@value')
            #         cve_desc = str(html.xpath('//*[@id="vulnDetailTableView"]/tr/td/div/div[1]/p[1]/text()')[0])
            #         repair_time = str(
            #             html.xpath('//*[@id="vulnDetailTableView"]/tr/td/div/div[2]/div/span[1]/text()')[0])
            #         if repair_time is not None:
            #             repair_time = datetime.strptime(repair_time, '%m/%d/%Y')
            #         html1 = etree.HTML(element[0])
            #         cve_level = str(html1.xpath('//*[@data-testid="vuln-cvssv2-base-score-severity"]/text()')
            #                         [0].strip()).capitalize()
            #         nvd_score = str(html1.xpath('//*[@data-testid="vuln-cvssv2-base-score"]/text()')[0].strip())
            #         vector_value = str(html1.xpath('//*[@data-testid="vuln-cvssv2-vector"]/text()')[0]). \
            #             replace("(", "").replace(")", "").strip()
            #         access_vector = str(html1.xpath('//*[@data-testid="vuln-cvssv2-av"]/text()')[0].strip())
            #         access_complexity = str(html1.xpath('//*[@data-testid="vuln-cvssv2-ac"]/text()')[0].strip())
            #         authentication = str(html1.xpath('//*[@data-testid="vuln-cvssv2-au"]/text()')[0].strip())
            #         confidentiality = str(html1.xpath('//*[@data-testid="vuln-cvssv3-c"]/text()')[0].strip())
            #         integrity = str(html1.xpath('//*[@data-testid="vuln-cvssv2-i"]/text()')[0].strip())
            #         availability = str(html1.xpath('//*[@data-testid="vuln-cvssv2-a"]/text()')[0].strip())
            #         attack_vector = attack_complexity = privilege_required = user_interaction = scope = None
            # elif html.xpath(
            #         '/html/body/div[2]/div[2]/div/table/tr/td/div/div[1]/div[3]/div[2]/div[1]/div[2]'
            #         '/span/span/a/text()') == [] and \
            #         html.xpath(
            #             '/html/body/div[2]/div[2]/div/table/tr/td/div/div[1]/div[2]/div[2]/div[1]'
            #             '/div[2]/span/span/a/text()') == []:
            #     nvd_score = cve_level = cve_desc = repair_time = vector_value = attack_vector = \
            #         access_vector = attack_complexity = access_complexity = \
            #         privilege_required = user_interaction = scope = confidentiality = integrity = \
            #         availability = authentication = None
            #     score_type = "v3.0"
            #     print("This vulnerability link not found, ", url)
            # else:
            #     score_type = "v3.0"
            #     cve_desc = str(html.xpath('//*[@id="vulnDetailTableView"]/tr/td/div/div[1]/p[1]/text()')[0])
            #     repair_time = html.xpath('//*[@id="vulnDetailTableView"]/tr/td/div/div[2]/div/span[1]/text()')[0]
            #     if repair_time is not None:
            #         repair_time = datetime.strptime(repair_time, '%m/%d/%Y')
            #     if html.xpath('//*[@id="nistV3MetricHidden"]/@value'):
            #         element = html.xpath('//*[@id="nistV3MetricHidden"]/@value')
            #     else:
            #         element = html.xpath('//*[@id="cnaV3MetricHidden"]/@value')
            #     html1 = etree.HTML(element[0])
            #     cve_level = str(html1.xpath('//*[@data-testid="vuln-cvssv3-base-score-severity"]/text()')[0]
            #                     .strip()).capitalize()
            #     nvd_score = str(html1.xpath('//*[@data-testid="vuln-cvssv3-base-score"]/text()')[0].strip())
            #     vector_value = str(html1.xpath('//*[@data-testid="vuln-cvssv3-vector"]/text()')[0]).replace("(", ''). \
            #         replace(')', '').strip()
            #     attack_vector = str(html1.xpath('//*[@data-testid="vuln-cvssv3-av"]/text()')[0].strip())
            #     attack_complexity = str(html1.xpath('//*[@data-testid="vuln-cvssv3-ac"]/text()')[0].strip())
            #     privilege_required = str(html1.xpath('//*[@data-testid="vuln-cvssv3-pr"]/text()')[0].strip())
            #     user_interaction = str(html1.xpath('//*[@data-testid="vuln-cvssv3-ui"]/text()')[0].strip())
            #     scope = str(html1.xpath('//*[@data-testid="vuln-cvssv3-s"]/text()')[0].strip())
            #     confidentiality = str(html1.xpath('//*[@data-testid="vuln-cvssv3-c"]/text()')[0].strip())
            #     integrity = str(html1.xpath('//*[@data-testid="vuln-cvssv3-i"]/text()')[0].strip())
            #     availability = str(html1.xpath('//*[@data-testid="vuln-cvssv3-a"]/text()')[0].strip())
            #     access_vector = access_complexity = authentication = None
            nvd_score = cve_level = cve_desc = repair_time = vector_value = attack_vector = \
                access_vector = attack_complexity = access_complexity = \
                privilege_required = user_interaction = scope = confidentiality = integrity = \
                availability = authentication = None
            cve_descx = html.xpath(
                '//*[@id="vulnDetailTableView"]/tr/td/div/div[1]/p[1]/text()')
            if cve_descx is not None and len(cve_descx) > 0:
                cve_desc = str(cve_descx[0])
            # repair_timex = html.xpath('//*[@id="vulnDetailTableView"]/tr/td/div/div[2]/div/span[1]/text()')
            repair_timex = html.xpath(
                '//*[@data-testid="vuln-published-on"]/text()')
            if repair_timex is not None and len(repair_timex) > 0:
                repair_time = str(repair_timex[0])
            if repair_time is not None and repair_time != "":
                repair_time = str(datetime.strptime(repair_time, '%m/%d/%Y'))
            score_type = "v3.0"
            if html.xpath('//*[@id="nistV3MetricHidden"]/@value'):
                element = html.xpath('//*[@id="nistV3MetricHidden"]/@value')
            else:
                element = html.xpath('//*[@id="cnaV3MetricHidden"]/@value')
            if element and len(element) > 0:
                html1 = etree.HTML(element[0])
                if html1 is not None:
                    cve_level = str(
                        html1.xpath(
                            '//*[@data-testid="vuln-cvssv3-base-score-severity"]/text()'
                        )[0].strip()).capitalize()
                    nvd_score = str(
                        html1.xpath(
                            '//*[@data-testid="vuln-cvssv3-base-score"]/text()'
                        )[0].strip())
                    vector_value = str(html1.xpath('//*[@data-testid="vuln-cvssv3-vector"]/text()')[0]).replace("(",
                                                                                                                ''). \
                        replace(')', '').strip()
                    attack_vector = str(
                        html1.xpath('//*[@data-testid="vuln-cvssv3-av"]/text()'
                                    )[0].strip())
                    attack_complexity = str(
                        html1.xpath('//*[@data-testid="vuln-cvssv3-ac"]/text()'
                                    )[0].strip())
                    privilege_required = str(
                        html1.xpath('//*[@data-testid="vuln-cvssv3-pr"]/text()'
                                    )[0].strip())
                    user_interaction = str(
                        html1.xpath('//*[@data-testid="vuln-cvssv3-ui"]/text()'
                                    )[0].strip())
                    scope = str(
                        html1.xpath('//*[@data-testid="vuln-cvssv3-s"]/text()')
                        [0].strip())
                    confidentiality = str(
                        html1.xpath('//*[@data-testid="vuln-cvssv3-c"]/text()')
                        [0].strip())
                    integrity = str(
                        html1.xpath('//*[@data-testid="vuln-cvssv3-i"]/text()')
                        [0].strip())
                    availability = str(
                        html1.xpath('//*[@data-testid="vuln-cvssv3-a"]/text()')
                        [0].strip())
                    access_vector = access_complexity = authentication = None
            else:
                element = html.xpath('//*[@id="nistV2MetricHidden"]/@value')
                if element and len(element) > 0:
                    html1 = etree.HTML(element[0])
                    if html1 is not None:
                        score_type = "v2.0"
                        cve_level = str(
                            html1.xpath(
                                '//*[@data-testid="vuln-cvssv2-base-score-severity"]/text()'
                            )[0].strip()).capitalize()
                        nvd_score = str(
                            html1.xpath(
                                '//*[@data-testid="vuln-cvssv2-base-score"]/text()'
                            )[0].strip())
                        vector_value = str(html1.xpath('//*[@data-testid="vuln-cvssv2-vector"]/text()')[0]). \
                            replace("(", "").replace(")", "").strip()
                        access_vector = str(
                            html1.xpath(
                                '//*[@data-testid="vuln-cvssv2-av"]/text()')
                            [0].strip())
                        access_complexity = str(
                            html1.xpath(
                                '//*[@data-testid="vuln-cvssv2-ac"]/text()')
                            [0].strip())
                        authentication = str(
                            html1.xpath(
                                '//*[@data-testid="vuln-cvssv2-au"]/text()')
                            [0].strip())
                        confidentiality = str(
                            html1.xpath(
                                '//*[@data-testid="vuln-cvssv2-c"]/text()')
                            [0].strip())
                        integrity = str(
                            html1.xpath(
                                '//*[@data-testid="vuln-cvssv2-i"]/text()')
                            [0].strip())
                        availability = str(
                            html1.xpath(
                                '//*[@data-testid="vuln-cvssv2-a"]/text()')
                            [0].strip())
                        attack_vector = attack_complexity = privilege_required = user_interaction = scope = None
            if cve_desc == 'N/A':
                cve_desc = None
            if repair_time == 'N/A':
                repair_time = None
            if nvd_score is None or nvd_score == "" or nvd_score == 'N/A':
                nvd_score = None
            print(
                "nvd_score:", nvd_score, "\n", "cve_level:", cve_level, "\n",
                "repair_time:", repair_time, "\n", "score_type:", score_type,
                "\n",
                "vector_value, attack_vector, access_vector,attack_complexity, \n"
                "access_complexity, privilege_required, user_interaction, scope,\n"
                "confidentiality, integrity, availability, authentication:\n",
                vector_value, attack_vector, access_vector, attack_complexity,
                access_complexity, privilege_required, user_interaction, scope,
                confidentiality, integrity, availability, authentication, "\n",
                "cve_desc:", cve_desc)
            xpth_list = [
                nvd_score, cve_level, cve_desc, repair_time, vector_value,
                attack_vector, access_vector, attack_complexity,
                access_complexity, privilege_required, user_interaction, scope,
                confidentiality, integrity, availability, authentication,
                score_type
            ]
        except IndexError as e:
            print("Subscript out of bounds", e)
        except UnboundLocalError as e:
            print("Tag not found", e)
    return xpth_list
Example no. 10
def doctor_line(url):  # Doctor detail page
    item4 = []
    res = download(url)
    if res == "下载失败":
        print(res)
        return res
    selector = etree.HTML(res.text)
    name = selector.xpath('//div[@class="detail word-break"]/h1/strong[@class="J_ExpertName"]')[0].text  # Name
    item4.append(name)
    if selector.xpath('//div[@class="detail word-break"]/h1/span'):
        position = ""
        for info in selector.xpath('//div[@class="detail word-break"]/h1/span'):
            position += info.text.lstrip().rstrip()  # Position
    else:
        position = None
    item4.append(position)
    if selector.xpath('//div[@class="detail word-break"]/div[@id="card-hospital"]/p'):
        hospitals = ""
        for info in selector.xpath('//div[@class="detail word-break"]/div[@id="card-hospital"]/p'):
            hospital = ""
            for info in info.xpath('a|span'):
                hospital += info.text.lstrip().rstrip()
            hospitals += hospital + "/ "
    else:
        hospitals = None
    item4.append(hospitals.rstrip(" / ") if hospitals else hospitals)
    if selector.xpath('//div[@class="detail word-break"]/div[@class="keys"]/a'):
        keys = ""
        for key in selector.xpath('//div[@class="keys"]/a'):
            keys += key.text.lstrip().rstrip() + " / "  # Keywords
    else:
        keys = None
    item4.append(keys)
    if selector.xpath('//div[@class="detail word-break"]/div[@class="goodat"]/a'):
        goodat = selector.xpath('//div[@class="detail word-break"]/div[@class="goodat"]/a')[0].attrib["data-description"]  # Specialties
    else:
        goodat = None
    item4.append(goodat)
    if selector.xpath('//div[@class="detail word-break"]/div[@class="about"]/a'):
        about = selector.xpath('//div[@class="detail word-break"]/div[@class="about"]/a')[0].attrib["data-description"]  # Doctor bio
    else:
        about = None
    item4.append(about)
    if selector.xpath('//div[@class="status"]/div[@class="data"]//strong'):
        evaluate = selector.xpath('//div[@class="status"]/div[@class="data"]//strong')[0].text  # Patient reviews
    else:
        evaluate = None
    item4.append(evaluate)
    if selector.xpath('//div[@class="status"]/div[@class="data"]//strong'):
        nr = selector.xpath('//div[@class="status"]/div[@class="data"]//strong')[1].text  # Appointments
    else:
        nr = None
    item4.append(nr)
    if selector.xpath('//div[@class="status"]/div[@class="data"]//strong'):
        ni = selector.xpath('//div[@class="status"]/div[@class="data"]//strong')[2].text  # Consultations
    else:
        ni = None
    item4.append(ni)
    if selector.xpath('//div[@class="consult-type"]/ul/li[1]//p[@class="current-price"]'):
        tit = selector.xpath('//div[@class="consult-type"]/ul/li[1]//p[@class="current-price"]')[0].text  # Text consultation price
    else:
        tit = None
    item4.append(tit)
    if selector.xpath('//div[@class="consult-type"]/ul/li[2]//p[@class="current-price"]'):
        shihua = selector.xpath('//div[@class="consult-type"]/ul/li[2]//p[@class="current-price"]')[0].text  # Audio/video consultation price
    else:
        shihua = None
    item4.append(shihua)
    item4.append(url)

    print("++++++++++++++++++完成一项")
    return item4
Example no. 11
def parse_html(html):
    s = etree.HTML(html)
    addr_info = s.xpath('//p[@class="result"]/text()')
    for n in addr_info:
        print(n)
Example no. 12
def Film():

    for a in range(0, 250, 25):
        url = 'https://movie.douban.com/top250?start={}&filter='.format(a)
        ua = UserAgent(use_cache_server=False)
        ip = ['175.148.79.101', '114.220.29.95', '222.190.217.156']

        print("开始爬取数据")
        html = requests.get(url,
                            proxies={'http': random.choice(ip)},
                            headers={'User-Agent': ua.random})

        selector = etree.HTML(html.text)
        infos = selector.xpath('//ol[1][@class="grid_view"]')

        for info in infos:
            number = info.xpath('//div[@class="pic"]/em/text()')
            name = info.xpath('//span[1][@class="title"]/text()')
            messages = info.xpath('//div[@class="bd"]/p/text()[2]')
            # print(messages)
            a = 0
            typelist = []
            yearlist = []
            peoplelist = []

            while a < len(messages):
                m = messages[a]

                year = m.split('/')[0]
                year = re.findall(r"\d+", year)
                years = ''.join(year)

                type = m.split('/')[-1]
                type = (type.split(' ')[1])
                type = (type.split('\n')[0])

                yearlist.append(years)
                typelist.append((type))
                a += 2
            # print(yearlist)
            # print(typelist)

            source = info.xpath('//span[@class="rating_num"]/text()')
            peoples = info.xpath('//div[@class="star"]/span[4]/text()')

            for i in peoples:
                people = re.findall(r"\d+", i)
                people = ''.join(people)
                peoplelist.append(people)

            time.sleep(random.randint(1, 3))

        list = []

        for (i1, i2, i3, i4, i5, i6) in zip(number, name, yearlist, typelist,
                                            source, peoplelist):
            list.append((i1, i2, i3, i4, i5, i6))
        for i in list:
            print(i)
        SQ = pymysql.connect(host='localhost',
                             port=3306,
                             user='******',
                             passwd='123456',
                             db='doubanfilm',
                             charset='utf8')

        L = SQ.cursor()
        L.executemany(
            "INSERT INTO doubanfilm(number, name, year, type, source, people) VALUES "
            "(%s,%s,%s,%s,%s,%s)", list)
        SQ.commit()

    print('Finished writing to the database')
Example no. 13
def second_parse(title):
    '''
    Fetch detailed information for each trending topic
    :param title:
    :return:
    '''
    for it in title:
        time.sleep(5)
        redian = {}
        redian["热点"] = it
        url = 'https://s.weibo.com/weibo?q=%23{}%23'.format(urllib.parse.quote(it))
        pattern = re.compile('[\u4e00-\u9fa5]+(.*?[\u4e00-\u9fa5])')
        print(url)
        try:

            source_code = requests.get(url, headers=headers)
            if source_code.status_code == 200:
                # with open('redian.html', 'wb') as fp:
                #     fp.write(source_code.content)
                tree = etree.HTML(source_code.text)
                # Scrape the topic's read count
                Reading_volume = tree.xpath('//span/text()')
                if len(Reading_volume) != 0:
                    Reading_volume = Reading_volume[0]
                if len(Reading_volume) != 0:
                    Reading_volume = pattern.findall(Reading_volume)
                if len(Reading_volume) > 0:
                    Reading_volume = Reading_volume[0]
                # if len(Reading_volume) > 0:
                #     Reading_volume = Reading_volume[0]
                redian["阅读量"] = Reading_volume
                # Scrape the topic's discussion count
                # discuss_amount = tree.xpath('//span/text()')[1]
                # discuss_amount = pattern.findall(discuss_amount)[0]
                # # if len(discuss_amount) > 0:
                # #     discuss_amount = discuss_amount[0]
                # redian["讨论数"] = discuss_amount
                # Record the scrape time
                redian["时间"] = time.asctime(time.localtime(time.time()))
                # Scrape the topic's lead paragraph
                content = tree.xpath('//div[@class="card-wrap"]/div/p/text()')
                if len(content) > 0:
                    content = content[0] + url
                else:
                    content.append(url)
                if isinstance(content, list):
                    redian["热点导语"] = content[0]
                else:
                    redian["热点导语"] = content

                print(redian)

                try:
                    with open(os_path, 'a+', encoding="utf-8") as fp:
                        fp.write(str(redian) + '\n')
                except IOError as err:
                    print("文件写入失败")
                amount, time_now = parse_readAmount_time(redian['时间'] + ' ' + redian['阅读量'])
                content, url_c = parse_url_content(redian["热点导语"])
                title = it
                # Execute the SQL statements
                try:
                    with connection.cursor() as cursor:
                        delSql = "DELETE FROM Hotspot WHERE title = %s"
                        sql = "INSERT INTO Hotspot (title, amount, daytime, content, url, keywords) VALUES (%s, %s, %s, %s, %s, %s)"
                        cursor.execute(delSql, (title,))
                        cursor.execute(sql, (title, amount, time_now, content, url_c, ''))
                    # 没有设置默认自动提交,需要主动提交,以保存所执行的语句
                    connection.commit()
                finally:
                    pass
        except RequestException:
            print("响应请求失败")
Example no. 14
from HtmlRetrival import HtmlRetrival
from lxml import etree

html_re = HtmlRetrival('http://bbs.qyer.com/thread-2631045-1.html')
content = html_re.get_content()

tags = {
    'title':
    '//h3[@class="b_tle"]',
    'content':
    '//td[@class="editor bbsDetailContainer"]//*[self::p or self::span or self::h1]'
}

tr = etree.HTML(content)
info = {}

f = open('template.txt', 'wb')

for tag in tags:
    info[tag] = []
    f.write(('\r\n\r\n' + tag + '\r\n\r\n').encode('utf-8'))
    eles = tr.xpath(tags[tag])
    for ele in eles:
        if ele is None or ele.text is None:
            continue
        info[tag].append(ele.text)
        f.write(ele.text.encode('utf-8') + b'\r\n')

f.close()
Example no. 15
    "User-Agent":
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
}
proxies = {
    "http": "socks5://127.0.0.1:1080",
    'https': 'socks5://127.0.0.1:1080'
}

base_url = 'https://gelbooru.com/index.php?page=post&s=list&tags=animated&pid={}'

for page_num in range(10):
    try:
        r_page = requests.get(base_url.format(page_num * 42))
    except requests.RequestException:
        continue
    html = etree.HTML(r_page.content)
    item_urls = html.xpath(
        "//div[@class='contain-push']/div[@class='thumbnail-preview']//a/@href"
    )
    for item_url in item_urls:
        r_item = requests.get("https:" + item_url)
        item_html = etree.HTML(r_item.content)
        video_url = item_html.xpath("//video/source/@src")
        if len(video_url) > 0:
            video_url = video_url[0]
            print('downloading: ', page_num, video_url)
            with open('./public/gelbooru/' + video_url.split('/')[-1],
                      'wb') as f:
                try:
                    f.write(requests.get(video_url).content)
                except Exception as e:
                    print('download failed:', e)
Example no. 16
import requests
import csv
from lxml import etree
import json
import re

headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}

r = requests.get('http://www.seputu.com/', headers=headers)

html = etree.HTML(r.text)
div_mulus = html.xpath('//*[@class="mulu"]')

rows = []
for div_mulu in div_mulus:
    # Title
    div_h2 = div_mulu.xpath('.//div[@class="mulu-title"]/center/h2/text()')
    if len(div_h2) > 0:
        # print(h2.string)
        h2_title = div_h2[0]
        a_s = div_mulu.xpath('./div[@class="box"]/ul/li/a')
        # list = []
        # Get the chapter title and url
        for a in a_s:
            # print(a)
            href = a.xpath('./@href')[0]
            box_title = a.xpath('./@title')[0]
            pattern = re.compile(r'\s*\[(.*)\]\s+(.*)')
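            # A possible continuation (assumed): split each title with the
            # compiled pattern, collect the rows, then write them with the
            # csv module imported above. The output filename is hypothetical.
            match = pattern.search(box_title)
            if match:
                date, chapter = match.group(1), match.group(2)
                rows.append([h2_title, chapter, href, date])

with open('chapters.csv', 'w', newline='', encoding='utf-8') as f:
    csv.writer(f).writerows(rows)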
Example no. 17
import requests
from lxml import etree
import re
import os
def getpic(url):
    name = url.split('/')[-1]
    print(name)
    picdata=requests.get(url)
    with open('D:\\mzsock\\' + name, 'wb') as f:
        f.write(picdata.content)
r=requests.get("http://mzsock.com/mv/")
r.encoding='utf-8'
r=r.text
ehtml=etree.HTML(r)
nurl=ehtml.xpath('//*/li/div/a[@class="img"]/@href')
for mz in nurl:
    mm=requests.get(mz)
    mm.encoding='utf-8'
    mm=mm.text
    mhtml=etree.HTML(mm)
    murl=mhtml.xpath('//*/a[@class="image_cx_cont"]/img/@src')

    for mmurl in murl:
        getpic(mmurl)
Example no. 18
from lxml import html
from lxml import etree
from collections import OrderedDict
import requests
from ezraLibrary import textOfDiv

link = "https://www.facebook.com/careers/jobs/a0I1200000JY01QEAT/"

response = requests.get(link)  #get page data from server, block redirects
sourceCode = response.content  #get string of source code from response
htmlElem = html.document_fromstring(sourceCode)  #make HTML element object

aDict = {}

root = etree.HTML(sourceCode)
#root.findall(".//table")
aDict = {}
for e1 in root.iterfind(".//div"):
    texto = textOfDiv(e1)
    palabras = texto.split()
    for palabra in palabras:
        aDict[palabra] = palabra

for item in aDict:
    print(item.encode('utf-8'))
Example no. 19
import requests
from lxml import etree

html_str=open("index.html",'r',encoding='utf-8').read()
# print(html_str)
# Parse the HTML source with lxml
html=etree.HTML(html_str) # soup=BeautifulSoup(r.text,"html.parser")
# print(html) # <Element html at 0x1db33b459c8>
# To inspect a node's source, use etree.tostring(node)
# print(etree.tostring(html,encoding='utf-8').decode("utf-8"))

# xpath('rule') returns a list; an empty list if nothing matches

# nodename selects a child node by name
# print(html.xpath("head")) # the head node under html
# print(html.xpath("body")) # the body node under html

# / at the start selects from the root; element/element selects the next level down
# print(html.xpath("/html/head")) # head under html under the root
# print(html.xpath("head/title")) # title under head
# print(html.xpath("body/div")) # every div under body (bs4 would only return the first)
# print(html.xpath("body/div")[1].xpath('ul/li')) # take the div at index 1, then li under its ul

# // searches for tags anywhere, regardless of depth
# print(html.xpath("//li")) # all li tags
# print(html.xpath("//li/text()")) # the text of all li tags
# print(html.xpath("body/div/ul/li")) # li under ul under div; no match if the hierarchy differs
# print(html.xpath("body/div//li")) # search for li anywhere inside the matching divs

# . starts the query from the current node
# Find every listing's name and floor area, e.g. with the sketch below
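# A minimal sketch of such a relative query; the class names "house",
# "name" and "area" are assumptions about index.html's markup:
for div in html.xpath('//div[@class="house"]'):
    name = div.xpath('./p[@class="name"]/text()')  # . starts from this div
    area = div.xpath('./p[@class="area"]/text()')
    print(name, area)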
Example no. 20
    def page_info(self, jq_url):
        self.driver.get(jq_url)
        time.sleep(random.randint(5, 10))
        html = etree.HTML(self.driver.page_source)

        # print(self.sheng_name, self.shi_name, self.jq_name, self.level)

        address = self.list_to_str(html.xpath('//p[@class="linetext"]/@title'))
        opentime = ''
        if html.xpath('//div[@class="hasdown-pre"]/p/text()'):
            opentime = self.list_to_str(
                html.xpath('//div[@class="hasdown-pre"]/p/text()'))
        else:
            opentime = self.list_to_str(
                html.xpath('//div[@class="sec-inner"]/dl[2]/dd/p/text()'))

        yang_dict = self.yangtu(
            html.xpath('//dl[@class="pic_tab_dl"]/dd/img/@src'))
        xiang_dict = self.xiangtu(
            html.xpath('//div[@id="introduction"]/div[2]//img/@to'))

        div_code = self.div_code(url=jq_url).replace('to=', 'src=')

        div_code += '景区:  {}</br>'.format(self.jq_name)
        div_code += '地址:  {}</br>'.format(address)
        div_code += '开放时间:  {}</br></br>'.format(opentime)

        # Create the directory
        self.addr_path = r'//192.168.100.173/移动库/旅游景区/驴妈妈/{}/{}/{}/'.format(
            self.sheng_name, self.shi_name, self.jq_name)

        self.make_dir(addr_path=self.addr_path)
        print('Directory created!')

        car_title = html.xpath(
            '//div[@class="nchTrafficDerc clearfix"]/div[1]/ul/li/b/text()')
        car_content = html.xpath(
            '//div[@class="nchTrafficDerc clearfix"]/div[@class="nchTrafficTab"]'
        )

        traffic_info = ''
        # print('交通指南:\n')
        for k, v in zip(car_title, car_content):
            content = ''
            for i in v.xpath('./div/p//text()'):
                content += '{} </br>'.format(i)
            traffic_info += '{}</br>\n{}</br>\n'.format(k, content)
            # print(traffic_info)
            pass

        div_code += traffic_info

        if self.insert_MongoDB(jq_name=self.jq_name,
                               yang_dict=yang_dict,
                               xiang_dict=xiang_dict,
                               div_code=div_code):
            print('Inserted into Mongo successfully\n')
        else:
            if os.path.isdir(self.addr_path):
                shutil.rmtree(self.addr_path)
                print('Directory removed!')

        pass
Example no. 21
     print "正在获取详情页面,url为"
     #url ="https://item.taobao.com/item.htm?id=538287375253&abtest=10&rn=07abc745561bdfad6f726eb186dd990e&sid=46f938ba6d759f6e420440bf98b6caea"
     #num_id = re.findall('id=[0-9]+&',url)[0].replace('id=','').replace('&','')
     #url = "https://detail.tmall.com/item.htm?id="+str(num_id)
     print url
     driver.get(url)
     driver.implicitly_wait(40)  # set the implicit wait timeout
     html = driver.page_source.encode('utf-8')
     driver.quit()
 except Exception, e:
     print "页面加载失败", e
     return 0
 try:
     print 'Parsing the page'
     try:
         selector = etree.HTML(
             html, parser=etree.HTMLParser(encoding='utf-8'))
     except Exception, e:
         print "页面加载失败", e
         return 0
     try:
         # This part collects the monthly sales data
         context = selector.xpath('//div[@class="tm-indcon"]')
         xiaoliang_date = u''
         for i in range(len(context)):
             temp_date = etree.tostring(
                 context[i], encoding="utf-8")  # .encode('utf-8')
             re_h = re.compile('</?\w+[^>]*>')  # strip all HTML tags
             s = re_h.sub('', temp_date) + ','
             xiaoliang_date += s
         list_date += xiaoliang_date + ';'
     except Exception, e:
Example no. 22
def info(url, animes, episodes, headers):
    r = requests.get(url, headers=headers)
    content1 = decode(r)

    tree = etree.HTML(content1)
    names = tree.xpath('//div[@class="detail con24 clear"]/dl/dd/h1/text()')
    if (names == []):
        return
    name = names[0]
    quarters = tree.xpath(
        '//div[@class="detail con24 clear"]/dl/dd/div[@class="d_label"][2]/a/text()'
    )
    if (quarters == []):
        return
    quarter = quarters[0]
    introductions = tree.xpath(
        '//div[@class="detail con24 clear"]/dl/dd/div[@class="d_label2"][3]/text()'
    )
    if (introductions == []):
        return
    introduction = introductions[0]
    times = tree.xpath(
        '//div[@class="detail con24 clear"]/dl/dd/div[@class="d_label2"][last()]/text()'
    )  # the third item in the scraped list
    if (times == []):
        return
    time = times[2]
    covers = tree.xpath(
        '//div[@class="detail con24 clear"]/dl/dt/img/@src')  # 封面的url
    if (covers == []):
        return
    cover = covers[0]
    a = anime(quarter=quarter,
              time=time,
              name=name,
              cover=cover,
              introduction=introduction)
    animes.put(a)
    e_as = tree.xpath(
        '//div[@class="time_pic list"]/div[1]/div/div/div/ul/li/a'
    )  # several similar divs at the second level, so search down from the first

    for e_a in e_as:
        e_url = e_a.xpath('string(@href)')
        e_r = requests.get(e_url, headers=headers)
        content2 = decode(e_r)
        e_tree = etree.HTML(content2)
        num = e_a.xpath('string(./em/span/text())')
        e_name = e_a.xpath('string(./em/text())')
        e_srcs = e_tree.xpath(
            '//div[@class="container clear"]/div[@class="clear"]/div[@class="player_main"]/iframe/@src'
        )
        e_src = ''
        if e_srcs != []:
            e_src = e_srcs[0]
        else:
            try:
                e_src = dynamic(e_url)
            except:
                print("Something wrong with dynamic!!\n")
                print("{} 第{}集没链接!\r\n".format(a.name, num))
        e = episode(num, e_name, e_src, a)
        episodes.put(e)
    r.close()
Example no. 23
        connection.close()
        print('Data added to MySQL successfully!')
    except TypeError :
        pass



if __name__ == '__main__':
    options = webdriver.ChromeOptions()
    options.add_argument("--no-sandbox")
    driver = webdriver.Chrome("/usr/bin/chromedriver", chrome_options=options)
    for url_str in Python_sel_Mysql():
        html = call_page(url_str)
        time.sleep(3)
        big_list = []
        selector = etree.HTML(html)
        jobs = selector.xpath("/html/body/div[3]/div[2]/div[2]/div/div[1]/h1/@title")
        salary = selector.xpath("/html/body/div[3]/div[2]/div[2]/div/div[1]/strong/text()")
        firms = selector.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/p[1]/a[1]/@title')
        big_list = list(zip(jobs, salary, firms))  # assumed intent: pair up the scraped fields

        print(big_list)

        # insertDB(content)
        print(datetime.datetime.now())
Example no. 24
def get_order_detail_data(driver, detail_url, address_list, good_order_dict):
    """
    获取商品详情页数据:商品时间,收货人姓名,手机号,收货地址,商品名称,商品金额,商品数量
    :param driver:
    :param detail_order_list:
    :param status_list:
    :return:
    """
    data_time_list = []
    driver.get("https:" + detail_url)
    refresh_page_xpath(driver, "//span[contains(text(),'您的位置')]")
    html_str = driver.page_source.encode("utf-8").decode()
    html = etree.HTML(html_str)
    baobei_ele = html.xpath("//th[contains(text(),'宝贝')]")
    shangpin_ele = html.xpath("//dd[contains(text(),'商品')]")
    if len(baobei_ele) == 0 and len(shangpin_ele) == 0:
        good_order_dict["transaction_time"] = "0000-00-00 00:00:00"
        good_order_dict["payment_time"] = "0000-00-00 00:00:00"
        good_order_dict["confirmation_time"] = "0000-00-00 00:00:00"
        good_order_dict["receiver_name"] = ""
        good_order_dict["receiver_phone"] = ""
        good_order_dict["receiver_address"] = ""
        good_order_dict["products"] = ""
        return good_order_dict
    times_list = re.findall(r"<span.*?>(\d+-\d+-\d+ \d+:\d+:\d+)</span>?",
                            html_str)
    times_list = set(times_list)
    for i in times_list:
        data_time = datetime.datetime.strptime(i, "%Y-%m-%d %H:%M:%S")
        data_time_list.append(data_time)
    data_time_list.sort()
    if len(data_time_list) >= 3:
        transaction_time = str(data_time_list[0])
        payment_time = str(data_time_list[1])
        confirmation_time = str(data_time_list[-1])
    else:
        transaction_time = str(data_time_list[-1])
        payment_time = "0000-00-00 00:00:00"
        confirmation_time = "0000-00-00 00:00:00"
    good_order_dict["transaction_time"] = transaction_time
    good_order_dict["payment_time"] = payment_time
    good_order_dict["confirmation_time"] = confirmation_time
    # Get the list of shipping addresses
    for addr_dict in address_list:
        receiver_name = addr_dict["receiver_name"]
        addr_list = html.xpath(
            "//span[contains(text(),'{}')]/text()".format(receiver_name))
        if len(addr_list) == 0:
            addr_list = html.xpath(
                "//dd[contains(text(),'{}')]/text()".format(receiver_name))
        if len(addr_list) == 0:
            addr_list = html.xpath(
                "//td[contains(text(),'{}')]/text()".format(receiver_name))
        if len(addr_list) > 0:
            break
    try:
        addr_list = addr_list[0]
    except Exception as e:
        addr_list = ""
    receiver_list = split_receiver_data(addr_list)
    try:
        receiver_name = receiver_list[0]
    except Exception as e:
        receiver_name = ""
    try:
        receiver_phone = receiver_list[1]
    except Exception as e:
        receiver_phone = ""
    try:
        receiver_address = receiver_list[2]
    except Exception as e:
        receiver_address = ""
    good_order_dict["receiver_name"] = receiver_name
    good_order_dict["receiver_phone"] = receiver_phone
    good_order_dict["receiver_address"] = receiver_address
    goods_list = get_good_data(driver, html)
    good_order_dict["products"] = goods_list
    return good_order_dict
Example no. 25
def we(j):
    filename = "素材.csv"
    t = 0
    for m in range(len(url_l(w())[0])):
        t += 1
        s = "html/" + url_l(w())[1][m] + ".html"
        # print(s)
        # with open(s, "r", encoding="utf-8")as file:
        #     mfile = file.read()
        sj = random.randint(1, 3)
        time.sleep(sj)
        urll = url_l(w())[0][m]
        mresponse = requests.get(urll, headers=sj_User_Agent())
        mresponse.encoding = "utf-8"
        mfile = etree.HTML(mresponse.text)

        def hh(m):
            if m + 1 == 1:
                # Vector graphics
                sl_url = mfile.xpath(
                    '//div//div[@class="text_left"]//div[@class="box picblock col3"]/div/a/img/@src2'
                )
                # for m in sl_url:
                #     return m
                return sl_url
            elif m + 1 == 2:
                # HD images
                tp_url = mfile.xpath(
                    '//div[@id="container"]/div[@class="box picblock col3"]/div/a/img/@src2'
                )
                # for m in tp_url:
                #     return m
                return tp_url
            elif m + 1 == 3:
                # Icons
                tb_url = mfile.xpath(
                    '//ul[@class="pngblock imgload"]/li/p/a/img/@src2')
                # for m in tb_url:
                #     return m
                return tb_url
            elif m + 1 == 4:
                # PSD assets
                psd_url = mfile.xpath(
                    '//div[@class="box col3 ws_block"]/a/img/@src')
                # for m in psd_url:
                #     return m
                return psd_url
            elif m + 1 == 5:
                # Fonts
                zt_url = mfile.xpath(
                    '//div//div[@class="index_font_list clearfix"]//li[@class="font"]/div/a/img/@src'
                )
                # for m in zt_url:
                #     return m
                return zt_url
            elif m + 1 == 6:
                # English fonts
                ywzt_url = mfile.xpath('//li[@class="font"]/div/a/img/@src')
                # for m in ywzt_url:
                #     return m
                return ywzt_url
            elif m + 1 == 7:
                # Sound effects
                yx_url = mfile.xpath(
                    '//div[@class="music_block"]/p[@class="n1"]/@thumb')
                # for m in yx_url:
                #     return m
                return yx_url
            elif m + 1 == 8:
                # PPT templates
                ppt_url = mfile.xpath(
                    '//div[@class="sc_warp  mt20"]/div[@id="main"]/div/div/a/img/@src'
                )
                # for m in ppt_url:
                #     return m
                return ppt_url
            elif m + 1 == 9:
                # Resume templates
                jl_url = mfile.xpath(
                    '//div[@class="sc_warp  mt20"]/div[@id="main"]/div/div/a/img/@src'
                )
                # for m in jl_url:
                #     return m
                return jl_url
            else:
                pass

        dic = {
            '矢量': hh(m),
            '高清图片': hh(m),
            '图标': hh(m),
            'PSD素材': hh(m),
            '字体': hh(m),
            '英文字体': hh(m),
            '音效': hh(m),
            'PPT模板': hh(m),
            '简历模板': hh(m)
        }
        # print(dic)
        list.append(dic)
        bccsv(filename, dic)
        hh = ",".join(hh(m))
        print(hh, type(hh))
        sql = "insert into sc (矢量,高清图片,图标,PSD素材,字体,英文字体,音效,PPT模板,简历模板) values ('%s','%s','%s','%s','%s','%s','%s','%s','%s')" % (
            hh, hh, hh, hh, hh, hh, str(hh), hh, hh)
        cursor.execute(sql)
        with open("素材.json", "w", encoding="utf-8") as json_file:
            json_file = json.dump({
                "total": t * j,
                "data": list
            },
                                  json_file,
                                  ensure_ascii=False,
                                  indent=4)
        print("爬取%s条" % (t))
Example no. 26
import requests
import csv
from lxml import etree
moviedata = []
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36 OPR/49.0.2725.47'
}
for i in range(0,250,25):
    url='https://movie.douban.com/top250?start='+str(i)
    response = requests.get(url, headers=headers)
    html = etree.HTML(response.text)
    movies = html.xpath('//*[@id="content"]/div/div[1]/ol/li')
    for movie in movies:
            # Rank
            count=movie.xpath('./div/div[1]/em/text()')[0]
            # Title
            title=movie.xpath('./div/div[2]/div[1]/a/span[1]/text()')[0]
            # Director and actors
            director_actor=movie.xpath('./div/div[2]/div[2]/p[1]/text()')[0].strip('\n').strip('\xa0')
            # Director
            director=director_actor[:director_actor.find('主演')].strip()
            # Actors
            actor = director_actor[director_actor.find('主演'):].strip()
            # Genre, region, year
            type_region_year=movie.xpath('./div/div[2]/div[2]/p[1]/text()')[1].strip('\n').strip('\xa0')
            # Year
            year=type_region_year.split('/')[0].strip()
            # Region
            region = type_region_year.split('/')[1].strip()
            # Genre
            type = type_region_year.split('/')[2].strip()
Example no. 27
def get_url_info(url_list, f_data):
    global dict_data
    # Read the saved data into the dict
    f_data.seek(0, 0)
    content = f_data.read()
    if content:
        dict_data = json.loads(content)

    # Get the config table id and assign it to the result table
    cur.execute("SELECT id FROM t_spider_conf WHERE domain = %s", spider_url)
    conf_id = cur.fetchone()
    conf_id = conf_id[0]

    # News item counter
    sum_i = 0

    # Get the news column name
    r = requests.get(url_list[0], headers=headers)
    r.encoding = 'UTF-8'
    html = etree.HTML(r.text)
    news_heading = html.xpath('//*[@id="bok_0"]/div[@class="zzj_3"]/text()')
    news_heading = ''.join(news_heading)

    # Create the folder
    # First check whether it exists; create it only if it does not
    now_dir = os.getcwd()
    new_dir = now_dir + '/' + news_heading
    dir_judge = os.path.exists(new_dir)
    if not dir_judge:
        os.mkdir(new_dir)
        # print(new_dir)

    html_filter, news_url, news_title, news_author, news_time = '', '', '', '', ''

    # Process every news item on every page
    for i, url in enumerate(url_list):

        for j in range(0, i_news):
            # Combine the news title and content and save them as a dict
            # temp_info = {}

            r = requests.get(url, headers=headers)
            r.encoding = 'UTF-8'
            html = etree.HTML(r.text)
            tips = 'Fetching column {}, page {}, item {} (item {} overall)......'.format(
                news_heading, i + 1, j + 1, sum_i + 1)
            print(tips)
            try:
                xpath_temp = '//*[@id="bok_0"]/div[@class="zzj_5"]/div[' + str(
                    1 + j * 2) + ']/a/'
                # temp_info['title'] = html.xpath(xpath_temp + 'span/text()')[0]
                news_title = html.xpath(xpath_temp + 'span/text()')[0]
                # The url of this news item
                news_url = html.xpath(xpath_temp + '@href')
                news_url = ''.join(news_url)
                # print(news_url)
                # Pass tips along to locate empty news content the crawler failed to fetch
                # temp_info['content'] = get_url_content(news_url, tips)
                # print(temp_info)
                print('新闻标题:{}'.format(news_title))

                # record this news item in the dict and the database; skip it if already stored
                judge = news_url in dict_data

                if not judge:
                    dict_data[news_url] = news_title

                    res = requests.get(news_url, headers=headers)
                    res.encoding = 'UTF-8'
                    raw_html = res.text

                    # handle pages that immediately redirect via a meta refresh
                    search_refresh = re.search(r'http-equiv="refresh".*\'',
                                               raw_html)
                    if search_refresh:
                        # print(search_refresh.group())
                        refresh_url = re.search(r'[a-zA-Z]+://[^\s]*\w',
                                                search_refresh.group())
                        refresh_url = refresh_url.group()

                        # skip SSL verification and silence the warning, otherwise requests keeps re-connecting
                        urllib3.disable_warnings(
                            urllib3.exceptions.InsecureRequestWarning)
                        refresh_res = requests.get(refresh_url,
                                                   headers=headers,
                                                   verify=False)
                        refresh_res.encoding = 'UTF-8'
                        # print(refresh_res)
                        raw_html = refresh_res.text
                        judge_identifier = not_found_judge(raw_html)
                        # continue only if the page is not a 404 not found
                        if judge_identifier:
                            # print(raw_html)
                            html_filter = sensitive_word_filter(raw_html)
                            # upgrade images to high resolution for the picture-news column
                            if news_heading == '郑州大学网上新闻(图片新闻)':
                                html_filter = img_update(html_filter)

                            timestamp = round(time.time())
                            html_file = new_dir + '/' + str(
                                timestamp) + '.html'
                            pdf_file = new_dir + '/' + str(timestamp) + '.pdf'

                            # insert the record; if the html triggers a DataError,
                            # re-encode it (dropping invalid characters) and retry
                            time_now = datetime.datetime.now().strftime(
                                '%Y-%m-%d %H:%M:%S')
                            try:
                                cur.execute(
                                    insert_result,
                                    (conf_id, 'detail', news_url, html_filter,
                                     html_file, pdf_file, time_now,
                                     news_heading, news_title, news_author,
                                     news_time))
                            except pymysql.err.DataError:
                                print("html编码错误或值错误!")
                                # strip characters that cannot be encoded, then decode
                                # back so later file writes still receive a str
                                html_filter = html_filter.encode(
                                    'UTF-8', errors='ignore').decode('UTF-8')
                                cur.execute(
                                    insert_result,
                                    (conf_id, 'detail', news_url, html_filter,
                                     html_file, pdf_file, time_now,
                                     news_heading, news_title, news_author,
                                     news_time))
                            finally:
                                conn.commit()
                                json_data = json.dumps(dict_data)
                                f_data.seek(0, 0)
                                f_data.write(json_data)
                                print('该新闻《{}》已爬取。'.format(news_title))

                            with open(html_file, 'w+', encoding='UTF-8') as f1:
                                f1.write(html_filter)
                            # convert the html page to pdf
                            pdfkit.from_url(refresh_url,
                                            pdf_file,
                                            configuration=confg)
                            # redirect targets live on different sites with different layouts, so no common xpath exists; news_author and news_time stay empty
                            print('该新闻《{}》pdf格式已转换成功。'.format(news_title))
                        else:
                            # store the 404 not found result in the database
                            html_filter = '404 not found'
                            time_now = datetime.datetime.now().strftime(
                                '%Y-%m-%d %H:%M:%S')
                            cur.execute(
                                insert_result,
                                (conf_id, 'detail', news_url, html_filter, '',
                                 '', time_now, news_heading, news_title,
                                 news_author, news_time))
                            conn.commit()
                            json_data = json.dumps(dict_data)
                            f_data.seek(0, 0)
                            f_data.write(json_data)
                            print('该新闻《{}》网页不存在, 以‘404 not found’为网页内容存入数据库。'.
                                  format(news_title))

                    # handle pages that do not redirect
                    else:
                        judge_identifier = not_found_judge(raw_html)
                        # continue only if the page is not a 404 not found
                        if judge_identifier:
                            html = etree.HTML(raw_html)
                            news_author = html.xpath(
                                '//*[@id="bok_0"]/div[@class="zzj_4"]/span[1]/text()'
                            )
                            news_time = html.xpath(
                                '//*[@id="bok_0"]/div[@class="zzj_4"]/span[3]/text()'
                            )

                            html_filter = sensitive_word_filter(raw_html)
                            # upgrade images to high resolution for the picture-news column
                            if news_heading == '郑州大学网上新闻(图片新闻)':
                                html_filter = img_update(html_filter)
                            # print(html_filter)

                            timestamp = round(time.time())
                            html_file = new_dir + '/' + str(
                                timestamp) + '.html'
                            pdf_file = new_dir + '/' + str(timestamp) + '.pdf'

                            # insert the record; if the html triggers a DataError,
                            # re-encode it (dropping invalid characters) and retry
                            time_now = datetime.datetime.now().strftime(
                                '%Y-%m-%d %H:%M:%S')
                            try:
                                cur.execute(
                                    insert_result,
                                    (conf_id, 'detail', news_url, html_filter,
                                     html_file, pdf_file, time_now,
                                     news_heading, news_title, news_author,
                                     news_time))
                            except pymysql.err.DataError:
                                print("html编码错误或值错误!")
                                # strip characters that cannot be encoded, then decode
                                # back so later file writes still receive a str
                                html_filter = html_filter.encode(
                                    'UTF-8', errors='ignore').decode('UTF-8')
                                cur.execute(
                                    insert_result,
                                    (conf_id, 'detail', news_url, html_filter,
                                     html_file, pdf_file, time_now,
                                     news_heading, news_title, news_author,
                                     news_time))
                            finally:
                                conn.commit()
                                json_data = json.dumps(dict_data)
                                f_data.seek(0, 0)
                                f_data.write(json_data)
                                print('该新闻《{}》已爬取。'.format(news_title))

                            # save the raw html that was crawled
                            with open(html_file, 'w+', encoding='UTF-8') as f1:
                                f1.write(html_filter)

                            # wkhtmltopdf cannot render the '黑体' font name, so swap the
                            # first occurrence for '宋体' before converting the page to pdf
                            err_index = html_filter.find('黑体')
                            if err_index != -1:
                                html_filter = (html_filter[:err_index] + '宋体' +
                                               html_filter[err_index + len('黑体'):])

                            # convert the html to pdf
                            pdfkit.from_string(html_filter,
                                               pdf_file,
                                               configuration=confg)
                            print('该新闻《{}》pdf格式已转换成功。'.format(news_title))
                        else:
                            # store the 404 not found result in the database
                            html_filter = '404 not found'
                            time_now = datetime.datetime.now().strftime(
                                '%Y-%m-%d %H:%M:%S')
                            cur.execute(
                                insert_result,
                                (conf_id, 'detail', news_url, html_filter, '',
                                 '', time_now, news_heading, news_title,
                                 news_author, news_time))
                            conn.commit()
                            json_data = json.dumps(dict_data)
                            f_data.seek(0, 0)
                            f_data.write(json_data)
                            print('该新闻《{}》网页不存在, 以‘404 not found’为网页内容存入数据库。'.
                                  format(news_title))
                else:
                    print('该新闻《{}》已爬取过且保存在数据库中!'.format(news_title))

            except IOError:
                print("Warning: wkhtmltopdf读取文件失败, 可能是网页无法打开或者图片/css样式丢失。")
            except IndexError:
                print("该栏目《{}》下的新闻已全部爬取完!".format(news_heading))
                break

            sum_i += 1
            time.sleep(sleep_time)
            # reset the per-item fields before the next iteration
            html_filter, news_url, news_title, news_author, news_time = '', '', '', '', ''

        time.sleep(sleep_time)
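get_url_info deals with pages that contain only a meta-refresh redirect by pulling the target URL out of the raw html and fetching it again with SSL verification turned off. A condensed, standalone sketch of that step; the function name follow_meta_refresh and the fallback return value are illustrative, not part of the original:

import re
import requests
import urllib3


def follow_meta_refresh(raw_html, headers):
    # look for an http-equiv="refresh" tag and pull out the URL it points to
    search_refresh = re.search(r'http-equiv="refresh".*\'', raw_html)
    if not search_refresh:
        return raw_html  # no redirect on this page, keep the original html
    refresh_url = re.search(r'[a-zA-Z]+://[^\s]*\w', search_refresh.group()).group()
    # skip SSL verification and silence the warning, as the original loop does
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    res = requests.get(refresh_url, headers=headers, verify=False)
    res.encoding = 'UTF-8'
    return res.text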
Esempio n. 28
0
 def classify(self, url):
     # Python 2 snippet: urllib2 was replaced by urllib.request in Python 3
     data = urllib2.urlopen(url)
     code = data.read()
     selector = etree.HTML(code)
     return selector.xpath('//h1[@class="title"]/a/text()')
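Since classify relies on the Python 2-only urllib2 module, here is a rough Python 3 equivalent; the surrounding class is not shown in the original, so self is kept only for symmetry:

 # Python 3 variant using urllib.request instead of urllib2
 def classify(self, url):
     from urllib import request
     code = request.urlopen(url).read()
     selector = etree.HTML(code)
     return selector.xpath('//h1[@class="title"]/a/text()')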
Esempio n. 29
0
 def get_comments(self, comments_url):
     resp = urllib.request.urlopen(comments_url)
     html_data = resp.read().decode('utf-8')
     # build the xpath selector from the downloaded html
     html = etree.HTML(html_data)
     return html.xpath('//*[@id="comments"]/div[1]/div[2]/p/span/text()')[0]
Esempio n. 30
0
def get_info_number(start_url):
    res = get_response(start_url)
    selector = etree.HTML(res.text)
    numbers = selector.xpath('//h2[@class="total fl"]/span/text()')[0]
    return numbers
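get_info_number returns the total listing count as a string; a typical follow-up is to turn it into the number of pages to crawl. An illustrative sketch, where start_url stands for whatever listing URL the crawl begins from and the 30-items-per-page figure is an assumption, not taken from the original code:

import math

total = int(get_info_number(start_url))   # e.g. "3421" -> 3421
pages = math.ceil(total / 30)             # assumed page size of 30 listings
print('about {} listing pages to crawl'.format(pages))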