Example #1
import urllib.parse

import requests
from bs4 import BeautifulSoup as bs

# writeurl2txt() is a project helper that appends a URL to a text file.


def getUrl():
    # type() here is a project-level helper (it shadows the builtin) that
    # returns the list of category keywords to search for.
    types = type()
    print(types)
    print(len(types))
    for diftype in types:
        diftype = diftype.strip()
        print(diftype * 3)  # debug output
        diftype = urllib.parse.quote(diftype)
        url_ = 'https://book.douban.com/subject_search?search_text=' + diftype + '&cat=1001&start='
        failed_url = []
        for j in range(50, 134):
            url = url_ + str(j * 15)
            print(url)
            res = requests.get(url)
            soup = bs(res.text, 'html.parser')
            try:
                # Each results page lists 15 books; collect every detail link.
                for i in range(15):
                    onepage_url = soup.select(
                        '.sc-bZQynM .item-root .detail .title a'
                    )[i].attrs['href']
                    print(onepage_url)
                    writeurl2txt('出版社txt.txt', onepage_url)
            except Exception as e:
                print(e)
                print('The listing may have fewer than 134 pages')
                failed_url.append(url)
                writeurl2txt('failedurl.txt', url)
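
Every example on this page calls a helper named writeurl2txt() that is not shown here. A minimal sketch of what it plausibly looks like, inferred only from the call sites above (the implementation is an assumption, not the author's original):

def writeurl2txt(filename, url):
    # Hypothetical reconstruction: append one URL per line; UTF-8 so the
    # Chinese file names and page links round-trip safely.
    with open(filename, 'a', encoding='utf-8') as f:
        f.write(url + '\n')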
Example #2
# next_url(), parser(), write2csv() and writeurl2txt() are project helpers.


def main():
    hrefs = next_url()
    for href in hrefs:
        result = parser(href)
        print(result)
        write2csv('csvFiles/香港教育特藏.csv', result)
        writeurl2txt('csvFiles/香港教育特藏.txt', href)
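
write2csv() is another unshown project helper. Examples 3 and 8 pass it a file name and a list of cell values, so a plausible sketch is a single-row append (an assumed implementation):

import csv

def write2csv(filename, row):
    # Hypothetical reconstruction: append one row per call. newline=''
    # avoids blank lines on Windows; utf-8-sig keeps the Chinese fields
    # readable when the CSV is opened in Excel.
    with open(filename, 'a', newline='', encoding='utf-8-sig') as f:
        csv.writer(f).writerow(row)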
Example #3
import re

import requests
from bs4 import BeautifulSoup as bs

# write2csv() and writeurl2txt() are project helpers.


def parser():
    for z in range(28, 30):
        next_url = 'http://idp.nlc.cn/database/search_results.a4d?uid=-9761261559;bst=' + str(
            1 + z * 50)
        res = requests.get(next_url)
        soup = bs(res.text, 'html.parser')
        trs = soup.select('#results tr')
        print('Processing page ' + str(z))
        for tr in trs:
            # Re-initialise the dict per row so fields from the previous
            # record cannot leak into the current one.
            result = {}
            picture_detail_url = 'http://idp.nlc.cn/database/' + tr.select(
                '.thumb a')[0].attrs['href'].strip()
            result['图片详情页链接'] = picture_detail_url
            picture_url = 'http://idp.nlc.cn' + tr.select(
                'img')[0].attrs['src'].strip()
            result['图片链接'] = picture_url
            detail_url = 'http://idp.nlc.cn/database/' + tr.select(
                '.resultdetails a')[0].attrs['href'].strip()
            result['详情页链接'] = detail_url
            # institution = tr.select('.resultdetails a')[0].text.strip()
            year = tr.select('.resultdetails a')[1].text.strip()
            result['未知信息'] = year
            details = tr.select('.resultdetails')[0].text.strip().replace(
                '\n', ' ').replace('\t', '')
            # yizhi = re.findall('.*?遺址:(.*?)語言/.*?', details)[0].strip()
            language = re.findall('.*?語言/文字: (.*?) 材料:.*?', details)[0].strip()
            result['语言'] = language
            # material = re.sub('.*?材料:', '', details).strip()
            try:
                res1 = requests.get(picture_detail_url, timeout=75)
            except requests.RequestException:
                writeurl2txt('failedurl.txt', picture_detail_url)
                continue
            soup1 = bs(res1.text, 'html.parser')
            # The detail page holds label/value pairs in #iteminfotable;
            # fold them straight into the result dict.
            duis = soup1.select('#iteminfotable tr')
            for dui in duis:
                label = dui.select('td')[0].text.strip()
                value = dui.select('td')[1].text.strip()
                result[label] = value
            print(result)
            write2csv('敦煌国际项目.csv', [
                result.get('图片详情页链接', ''),
                result.get('图片链接', ''),
                result.get('详情页链接', ''),
                result.get('未知信息', ''),
                result.get('语言', ''),
                result.get('收藏機構及版權:', ''),
                result.get('遺址:', ''),
                result.get('藏品形態:', ''),
                result.get('材料:', ''),
                result.get('尺寸 (h x w) 釐米:', '')
            ])
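
The language field above is sliced out of the flattened details text with a lazy regex. A quick standalone check against a made-up details string (the sample text is illustrative, not real scraped data):

import re

details = '遺址: 敦煌 語言/文字: 漢文 材料: 紙'  # illustrative sample
language = re.findall('.*?語言/文字: (.*?) 材料:.*?', details)[0].strip()
print(language)  # -> 漢文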
Example #4
import requests
from bs4 import BeautifulSoup as bs

# writeurl2txt() is a project helper.


def next_page():
    urls = []
    for i in range(18, 19):
        next_url = 'http://www.pharmnet.com.cn/tcm/zybb/index.cgi?p=' + str(
            i) + '&f=&terms=&s1=&cate1=&cate2='
        res = requests.get(next_url)
        soup = bs(res.text, 'html.parser')
        # Every other <a> in the listing table is a detail link, hence step 2.
        for j in range(0, 24, 2):
            url = 'http://www.pharmnet.com.cn' + soup.select(
                'table .border td .border a')[j].attrs['href'].strip()
            urls.append(url)
            writeurl2txt('中药图谱url.txt', url)
    return urls  # the list was built but never returned in the original
Example #5
import time
import urllib.parse

from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# type() and writeurl2txt() are project helpers; type() shadows the builtin
# and returns the list of category keywords.


def getUrl():
    types = type()
    print(types)
    print(len(types))
    for diftype in types:
        diftype = diftype.strip()
        print(diftype * 3)  # debug output
        diftype = urllib.parse.quote(diftype)
        url_ = 'https://book.douban.com/subject_search?search_text=' + diftype + '&cat=1001&start='
        failed_url = []
        for j in range(134):
            url = url_ + str(j * 15)
            print(url)
            # service_args = [
            #     '--proxy=114.237.228.195:28422',
            #     '--proxy-type=http',
            #     '--load-images=no',
            #     '--disk-cache=yes',
            #     '--ignore-ssl-errors=true'
            # ]
            chrome_options = Options()  # prepared for the Chrome fallback below
            chrome_options.add_argument('--headless')
            chrome_options.add_argument('--disable-gpu')
            # path = r'D:\Wangyuanyuan\工作\爬虫\chromedriver.exe'
            # driver = webdriver.Chrome(executable_path=path, chrome_options=chrome_options)

            # phantomjsdriver = r'D:\Wangyuanyuan\工作\爬虫\phantomjs.exe'
            # driver = webdriver.PhantomJS(phantomjsdriver, service_args=service_args)

            driver = webdriver.PhantomJS(r'D:\Wangyuanyuan\工作\爬虫\phantomjs.exe')
            driver.get(url)
            time.sleep(0.2)
            soup = bs(driver.page_source, 'html.parser')
            try:
                for i in range(15):
                    onepage_url = soup.select('.sc-bZQynM .item-root .detail .title a')[i].attrs['href']
                    print(onepage_url)
                    writeurl2txt('出版社txt.txt', onepage_url)
            except Exception as e:
                print(e)
                print('The listing may have fewer than 134 pages')
                failed_url.append(url)
                writeurl2txt('failedurl.txt', url)
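
Selenium has since dropped PhantomJS support, so the commented-out Chrome branch above is the modern route. A minimal headless-Chrome equivalent of the driver setup, using the current Selenium 4 API and assuming chromedriver is on PATH:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
driver = webdriver.Chrome(options=chrome_options)  # chromedriver found via PATH
driver.get(url)  # url as built in the loop above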
Example #6
import logging
import sys

import requests

# getIP(), dbdsParser(), write2csv() and writeurl2txt() are project helpers.


def main(rds):
    # Pop detail-page URLs from redis, request each one, and parse it.
    ip = getIP()
    flag = 1
    while flag:
        detailUrl = rds.spop('dbds')
        if not detailUrl:
            break  # the set is empty; stop instead of requesting None
        try:
            res = requests.get(url=detailUrl, proxies={'https': ip}, verify=False)
            # time.sleep(1)
        except Exception as e:
            rds.sadd('dbds', detailUrl)
            ip = getIP()
            if not ip:
                sys.exit('No proxy IPs left')
            print(f'Request failed: {e}; switched to IP {ip}')
            logging.info(f'Request failed: [{e}], url: {detailUrl}')
            continue

        if '检测到有异常' in res.text:
            ip = getIP()
            if not ip:
                sys.exit('No proxy IPs left')
            print('IP flagged as abnormal, switched to IP:', ip)
            rds.sadd('dbds', detailUrl)
            continue  # retry later rather than parsing the block page

        if '页面不存在' in res.text:
            continue

        try:
            result = dbdsParser(detailUrl, res.text)
        except Exception:
            writeurl2txt('data/解析错误的URL.txt', detailUrl)
        else:
            write2csv('data/豆瓣读书1030_2.csv', result)
            writeurl2txt('data/豆瓣读书存在的7位数URL.txt', detailUrl)
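
getIP() is an unshown helper; from its use it returns a fresh proxy address as a string and something falsy once the pool is exhausted. A hypothetical stub against an assumed local proxy-pool endpoint (the URL and response format are made up for illustration):

import requests

def getIP():
    # Hypothetical stub: any pool that answers with 'host:port' would fit.
    try:
        resp = requests.get('http://127.0.0.1:5010/get', timeout=10)
        return resp.text.strip() or None
    except requests.RequestException:
        return None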
Example #7
import time

import requests
from bs4 import BeautifulSoup

# getIP() and writeurl2txt() are project helpers.


def getAllUrlUseTag(rds):
    ip = getIP()
    flag = 1
    while flag:
        url = rds.spop('dbfl')
        try:
            # Send the captured session cookie as a raw Cookie header; the
            # original passed the whole string to cookies= as one cookie
            # named 'cookies', which does not do what was intended.
            res = requests.get(
                url=url,
                verify=False,
                proxies={'https': ip},
                headers={
                    'Cookie':
                    'bid=TX46Fh960Io; gr_user_id=9472f59e-3423-469c-a898-4d7be0efe16f; _vwo_uuid_v2=D945973C56E9DE5A89F4A407FF5B9F65B|8193048ef938ca0f9e21e82b5744da7a; __yadk_uid=IPSJiIkXJpASML3BRiVvfPmTQxziqRaY; viewed="2230208_25849649_1019210_6849293_6849290_20365152_2060130_6885810_25780889_3315384"; ct=y; ps=y; push_noty_num=0; push_doumail_num=0; dbcl2="179755333:lBCXZdA+b1Y"; __utmv=30149280.17975; ck=Ybkc; __utmc=30149280; __utmz=30149280.1539673041.4.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmc=81379588; __utmz=81379588.1539673041.4.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; gr_cs1_ffc94504-020a-4b55-a144-fc8e796f6f1c=user_id%3A1; _pk_ref.100001.3ac3=%5B%22%22%2C%22%22%2C1539679774%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DsuTcGShpmJjLainnnS6EuguD_DelMI8XRcQh3k6YmQ-S9Wsyxf3kOfuoYJfimrjL%26wd%3D%26eqid%3De2bd69540001c29e000000065bc58bc9%22%5D; _pk_ses.100001.3ac3=*; __utma=30149280.322353021.1539312805.1539677732.1539679774.6; __utma=81379588.2102712258.1539312976.1539677732.1539679774.6; ap_v=0,6.0; gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03=cf00eb62-9699-4cb3-a2cf-477014a9081e; gr_cs1_cf00eb62-9699-4cb3-a2cf-477014a9081e=user_id%3A1; gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03_cf00eb62-9699-4cb3-a2cf-477014a9081e=true; __utmb=81379588.10.10.1539679774; _pk_id.100001.3ac3=d01456c0712c87d8.1539312977.6.1539681674.1539677742.; douban-fav-remind=1; __utmb=30149280.94.4.1539681799685'
                })
        except Exception:
            ip = getIP()
            print(ip)
            rds.sadd('dbfl', url)
            continue
        if '检测到有异常请求' in res.text:
            print('Abnormal request detected')
            ip = getIP()
            print(ip)
            rds.sadd('dbfl', url)
            continue

        time.sleep(0.8)

        soup = BeautifulSoup(res.text, 'html.parser')
        a_tags = soup.select('#subject_list .nbg')
        for tag in a_tags:
            href = tag.attrs.get('href', '')
            writeurl2txt('豆瓣读书书籍URL.txt', href)
            rds.sadd('dbds', href)
        leftNums = rds.scard('dbfl')
        print('URLs left in redis:', leftNums)
        flag = leftNums
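
The rds handle passed into Examples 6 and 7 is a redis-py client, and the 'dbfl' set has to be seeded with tag-listing URLs before getAllUrlUseTag() runs. A usage sketch with the standard redis-py API (host and tags are illustrative):

import redis

rds = redis.Redis(host='localhost', port=6379, decode_responses=True)
for tag in ['小说', '历史']:  # illustrative tags
    rds.sadd('dbfl', 'https://book.douban.com/tag/' + tag)
getAllUrlUseTag(rds)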
Example #8
import re

import requests
from bs4 import BeautifulSoup as bs

# get_allurl(), write2csv() and writeurl2txt() are project helpers.


def parse():
    allurls = get_allurl()
    print(allurls)
    print(len(allurls))
    for url in allurls:
        print(url)
        if 'package' not in url:
            writeurl2txt('腾讯课堂url.txt', url)
            try:
                res = requests.get(url)
                soup = bs(res.text, 'html.parser')
                # Fresh dict per course so fields cannot leak between pages.
                result = {}
                result['封面图片'] = 'https://' + soup.select(
                    '.img-left--wrap img')[0].attrs['src'].strip()
                result['课程名称'] = soup.select('.title-main')[0].text.strip()
                try:
                    zuijinzaixue = soup.select(
                        '#js-statistics-apply')[0].text.strip()
                    result['最近在学人数'] = re.findall(r'\d+', zuijinzaixue)[0]
                    result['累计报名'] = soup.select(
                        '.js-apply-num')[0].text.strip()
                except IndexError:
                    # Paid courses show a purchase count instead.
                    result['购买人数'] = soup.select(
                        '#js-statistics-apply')[0].text.strip().replace(
                            '人 购买', '')
                result['好评度'] = soup.select('.rate-num')[0].text.strip()
                result['课程价格'] = soup.select(
                    '.course-price-info ')[0].text.strip().replace('¥', '')
                tnames = []
                for teacher in soup.select('.teacher-list .teacher-item'):
                    tname = teacher.select('.js-teacher-name')[0].text.strip()
                    tnames.append(tname)
                result['讲师姓名'] = ';'.join(tnames)
                result['课程介绍'] = soup.select('.tb-course td')[0].text.strip()
                result['授课机构名称'] = soup.select(
                    '.js-agency-name')[0].text.strip()
                result['机构好评度'] = soup.select(
                    '.tree-list span')[0].text.strip()
                result['机构课程数'] = soup.select(
                    '.tree-list span')[1].attrs['data-num'].strip()
                result['学习人次'] = soup.select(
                    '.tree-list span')[2].attrs['data-num'].strip()
                result['机构介绍'] = soup.select('.agency-summary')[0].text.strip()
                contacts = []
                for i in range(len(soup.select('.contact-list p'))):
                    contacts.append(
                        soup.select('.contact-list p')[i].text.strip())
                result['联系方式'] = ';'.join(contacts)
                result['页面链接'] = url
                print(result)
                write2csv('腾讯课堂.csv', [
                    result.get('页面链接', ''),
                    result.get('封面图片', ''),
                    result.get('课程名称', ''),
                    result.get('最近在学人数', ''),
                    result.get('累计报名', ''),
                    result.get('购买人数', ''),
                    result.get('好评度', ''),
                    result.get('课程价格', ''),
                    result.get('讲师姓名', ''),
                    result.get('课程介绍', ''),
                    result.get('授课机构名称', ''),
                    result.get('机构好评度', ''),
                    result.get('机构课程数', ''),
                    result.get('学习人次', ''),
                    result.get('机构介绍', ''),
                    result.get('联系方式', '')
                ])
            except Exception as e:
                print(e)