Code example #1
def main():
    urls = next_page()
    for url in urls:
        print(url)
        result = parser(url)
        print(result)
        write2csv('csvFiles/中药图谱.csv', result)
Code example #2
def main():
    hrefs = next_url()
    for href in hrefs:
        result = parser(href)
        print(result)
        write2csv('csvFiles/香港教育特藏.csv', result)
        writeurl2txt('csvFiles/香港教育特藏.txt',href)
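The write2csv and writeurl2txt helpers called throughout these examples are project-specific and are not shown in the excerpts. A minimal sketch of what they might look like, assuming write2csv appends one row (a list of field values) to a CSV file and writeurl2txt appends one URL per line; both signatures are assumptions inferred from the call sites:

import csv

def write2csv(filename, row):
    # Assumed behaviour: append a single row of field values to the CSV file.
    with open(filename, 'a', newline='', encoding='utf-8') as f:
        csv.writer(f).writerow(row)

def writeurl2txt(filename, url):
    # Assumed behaviour: append one URL per line to a plain-text log.
    with open(filename, 'a', encoding='utf-8') as f:
        f.write(url + '\n')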
Code example #3
def parse():
    for j in range(115, 163):
        try:
            url = 'http://www.xuetangx.com/courses?credential=0&page_type=0&cid=0&process=0&org=0&course_mode=0&page=' + str(j)
            res = requests.get(url)
            print(url)
            soup = bs(res.text, 'html.parser')
            for i in range(len(soup.select('#list_style .list_inner'))):
                result = {}  # fresh record per course so fields from a previous item do not carry over
                detail_url = 'http://www.xuetangx.com' + soup.select('.img a')[i].attrs['href'].strip()
                result['详情页面链接'] = detail_url
                result['封面图片'] = 'http://www.xuetangx.com' + soup.select('.img img')[i].attrs['src'].strip()
                result['课程标题'] = soup.select('.coursetitle')[i].text.strip()
                try:
                    try:
                        result['所属学科'] = ';'.join([soup.select('.coursename_ref')[i].select('.subject')[0].text.strip(),soup.select('.coursename_ref')[i].select('.subject')[1].text.strip()])
                    except:
                        result['所属学科'] = soup.select('.coursename_ref')[i].select('.subject')[0].text.strip()
                except:
                    result['所属学科'] = ''
                try:
                    result['简介'] = soup.select('.txt_all .txt')[i].text.strip().replace('简介', '').replace('\t', '').replace(
                        '\r\n', '').replace('\n', '')
                except:
                    result['简介'] = soup.select('.txt_all .ktxt')[i].text.strip().replace('简介', '').replace('\n', '')
                driver = webdriver.PhantomJS(r'D:\Wangyuanyuan\工作\爬虫\phantomjs.exe')
                driver.get(detail_url)
                time.sleep(0.1)
                soup1 = bs(driver.page_source, 'html.parser')
                driver.quit()  # release the PhantomJS process before the next course
                result['课程来源'] = soup1.select('.courseabout_text a')[0].text.strip()
                result['课程描述'] = soup1.select('.course_intro .text')[0].text.strip()
                result['开课时间'] = soup1.select('.illustrate span span')[0].text.strip().replace('.', '-')
                result['结课时间'] = soup1.select('.illustrate span span')[1].text.strip().replace('.', '-')
                result['报名人数'] = soup1.select('.illustrate span span')[5].text.strip()
                teachers = soup1.select('.teacher_info .cf')
                teacher_info = []
                for teacher in teachers:
                    one_teacher = ','.join([teacher.select('.teacher_text span')[0].text.strip(),
                                            teacher.select('.teacher_text span')[1].text.strip()])
                    teacher_info.append(one_teacher)
                result['教师信息'] = ';'.join(teacher_info)
                print(result)
                write2csv('学堂在线.csv',[
                        result.get('详情页面链接', ''),
                        result.get('封面图片', ''),
                        result.get('课程标题', ''),
                        result.get('所属学科', ''),
                        result.get('简介', ''),
                        result.get('课程来源', ''),
                        result.get('课程描述', ''),
                        result.get('开课时间', ''),
                        result.get('结课时间', ''),
                        result.get('报名人数', ''),
                        result.get('教师信息', '')
                    ])
        except Exception as e:
            print(e)
            print('*********网页犯病了*********')
        continue
Code example #4
def parse():
    all_urls = allUrls()
    print(all_urls)
    for url in all_urls:
        print(url)
        time.sleep(1.2)
        try:
            result = {}
            res = requests.get(url)
            soup = bs(res.text,'html.parser')
            # print(soup)
            for i in range(1,len(soup.select('.tblBrow')[0].select('td')),2):
                label = soup.select('.tblBrow')[0].select('td')[i].text.strip()
                value = soup.select('.tblBrow')[0].select('td')[i+1].text.replace('\r\n','').replace('\t','').replace('相关图书','').replace('\n','').replace('页','').strip()
                result[label] = value
            prizes = re.findall('¥(.*?)折扣价:¥(.*?)折扣:(.*?)节.*?',result['定价:'])
            result['定价:'] = prizes[0][0].strip()
            result['折扣价'] = prizes[0][1].strip()
            result['折扣'] = prizes[0][2].strip()
            result['图片'] = 'http://www.sinobook.com.cn' + soup.select('.tblBrow img')[0].attrs['src']
            for j in range(len(soup.select('.tblBrow')[1].select('.tdCaptionD'))):
                label = soup.select('.tblBrow')[1].select('.tdCaptionD')[j].text.strip()
                value = soup.select('.tblBrow')[1].select('.Text')[j].text.replace('\r\n','').replace('\t','').replace('\n','').replace('�','').strip()
                result[label] = value
            result['分类'] = '其他'
            result['页面链接'] = url
            # return result
            print(result)
            write2csv('中国高校教材图书网131313.csv', [
                    result.get('页面链接', ''),
                    result.get('书名:', ''),
                    result.get('图片', ''),
                    result.get('分类', ''),
                    result.get('ISBN:', ''),
                    result.get('条码:', ''),
                    result.get('作者:', ''),
                    result.get('装订:', ''),
                    result.get('印次:', ''),
                    result.get('开本:', ''),
                    result.get('定价:', ''),
                    result.get('折扣', ''),
                    result.get('折扣价', ''),
                    result.get('字数:', ''),
                    result.get('出版社:', ''),
                    result.get('页数:', ''),
                    result.get('发行编号:', ''),
                    result.get('每包册数:', ''),
                    result.get('出版日期:', ''),
                    result.get('内容简介:', ''),
                    result.get('作者简介:', ''),
                    result.get('章节目录:', ''),
                    result.get('精彩片段:', ''),
                    result.get('书\u3000\u3000评:',''),
                    result.get('其\u3000\u3000它:', '')
            ])
        except Exception as e:
            print(e)
            print('*********网页犯病了*********')
        continue
Code example #5
def parser():
    for z in range(28, 30):
        next_url = 'http://idp.nlc.cn/database/search_results.a4d?uid=-9761261559;bst=' + str(
            1 + z * 50)
        res = requests.get(next_url)
        soup = bs(res.text, 'html.parser')
        trs = soup.select('#results tr')
        print('正在处理第*******' + str(z) + '*********页')
        for tr in trs:
            result = {}  # fresh record per row so fields from the previous item do not carry over
            picture_detail_url = 'http://idp.nlc.cn/database/' + tr.select(
                '.thumb a')[0].attrs['href'].strip()
            result['图片详情页链接'] = picture_detail_url
            picture_url = 'http://idp.nlc.cn' + tr.select(
                'img')[0].attrs['src'].strip()
            result['图片链接'] = picture_url
            detail_url = 'http://idp.nlc.cn/database/' + tr.select(
                '.resultdetails a')[0].attrs['href'].strip()
            result['详情页链接'] = detail_url
            # institution = tr.select('.resultdetails a')[0].text.strip()
            year = tr.select('.resultdetails a')[1].text.strip()
            result['未知信息'] = year
            details = tr.select('.resultdetails')[0].text.strip().replace(
                '\n', ' ').replace('\t', '')
            # yizhi = re.findall('.*?遺址:(.*?)語言/.*?',details)[0].strip()
            language = re.findall('.*?語言/文字: (.*?) 材料:.*?', details)[0].strip()
            result['语言'] = language
            # material = re.sub('.*?材料:','',details).strip()
            try:
                res1 = requests.get(picture_detail_url, timeout=75)
            except:
                writeurl2txt('failedurl.txt', picture_detail_url)
                continue
            soup1 = bs(res1.text, 'html.parser')
            duis = soup1.select('#iteminfotable tr')
            print(1111111111)
            for dui in duis:
                label = dui.select('td')[0].text.strip()
                value = dui.select('td')[1].text.strip()
                result[label] = value
                print(222222222)
            print(result)
            write2csv('敦煌国际项目.csv', [
                result.get('图片详情页链接', ''),
                result.get('图片链接', ''),
                result.get('详情页链接', ''),
                result.get('未知信息', ''),
                result.get('语言', ''),
                result.get('收藏機構及版權:', ''),
                result.get('遺址:', ''),
                result.get('藏品形態:', ''),
                result.get('材料:', ''),
                result.get('尺寸 (h x w) 釐米:', '')
            ])
Code example #6
def parse():
    for i in range(14742, 72357):
        result = {}  # fresh record per book so leftover fields do not carry over between pages
        # for i in range(5,25):
        url = 'http://apabi.szlib.com/Product2.asp?lang=gb&type=&DocGroupID=2&DocID=' + str(
            i)
        try:
            res = requests.get(url)
            soup = bs(res.text, 'html.parser')
            picture_url = soup.select(
                'html body tr img')[0].attrs['src'].strip()
        except Exception as e:
            print(e)
            continue
        result['页面链接'] = url
        result['图片链接'] = picture_url
        trs = soup.select('html body tr table')[2].select('tr')
        for tr in trs:
            label = tr.select('td')[0].text.strip()
            value = tr.select('td')[1].text.replace('\n', '').replace(
                '\t', '').replace('\r', '').strip()
            result[label] = value
        print(result)
        write2csv('data/阿帕比电子图书.csv', [
            result.get('页面链接', ''),
            result.get('其它题名', ''),
            result.get('书名', ''),
            result.get('图片链接', ''),
            result.get('责任者', ''),
            result.get('主要责任关系', ''),
            result.get('主题/关键词', ''),
            result.get('摘要', ''),
            result.get('出版社', ''),
            result.get('出版地', ''),
            result.get('出版日期', ''),
            result.get('标识', ''),
            result.get('标识类型', ''),
            result.get('价格', ''),
            result.get('纸书价格', ''),
            result.get('责任编辑', ''),
            result.get('版次', ''),
            result.get('印次', ''),
            result.get('字数(千字)', ''),
            result.get('中图法分类号', ''),
            result.get('ISBN号', ''),
            result.get('附注', ''),
            result.get('外币价格', ''),
            result.get('相关文献与本文献的联系', ''),
            result.get('次要责任者', ''),
            result.get('次要责任关系', ''),
            result.get('Apabi分类号', ''),
        ])
Code example #7
def parse():
    urls = getUrl()
    for url in urls:
        try:
            result = {}
            res = requests.get(url)
            soup = bs(res.text,'html.parser')
            result['页面链接'] = url
            result['标题'] = soup.select('.course-detail__title')[0].text.strip()
            result['封面图片'] = soup.select('.course-detail-img img')[0].attrs['src'].strip()
            result['分类'] = soup.select('.breadcrumb-o li')[1].text.strip()
            result['价格'] = soup.select('.course-detail__price')[0].text.strip()
            try:
                result['课程来源'] = soup.select('.gray-dark')[0].text.strip()
                result['参与人数'] = soup.select('.gray-dark')[2].text.strip().replace('人已参与','')
            except:
                result['课程来源'] = ''
                result['参与人数'] = ''
            result['开课时间'] = soup.select('.panel-body p')[0].text.strip().replace('开始:', '')
            result['结课时间'] = soup.select('.panel-body p')[1].text.strip().replace('截止:', '')
            details = soup.select('.es-piece')
            for i in range(len(details)):
                label = details[i].select('.piece-header')[0].text.strip()
                value = details[i].select('.piece-body')[0].text.strip().replace('\n','').replace('\xa0','').replace('\r','')
                result[label] = value
            result['开课时间'] = result['开课时间'].replace('/','-')
            result['结课时间'] = result['结课时间'].replace('/', '-')
            names = []
            for j in range(len(soup.select('.row .media-body .link-dark'))):
                names.append(soup.select('.row .media-body .link-dark')[j].text.strip())
            result['教学老师'] = ';'.join(names)
            print(result)
            write2csv('中国高校外语慕课平台.csv',[
                              result.get('页面链接', ''),
                              result.get('标题',''),
                              result.get('封面图片', ''),
                              result.get('分类', ''),
                              result.get('价格', ''),
                              result.get('课程来源', ''),
                              result.get('参与人数', ''),
                              result.get('开课时间', ''),
                              result.get('结课时间', ''),
                              result.get('课程概述', ''),
                              result.get('课程介绍', ''),
                              result.get('课程目标', ''),
                              result.get('适合人群', ''),
                              result.get('教学老师', '')
                      ])
        except Exception as e:
            print(e)
            print('*********网页犯病了*********')
        continue
Code example #8
def test():
    test_data = get_test_data()
    x = test_data[0]
    y = test_data[1]

    # Recreate the model.
    model = DeepSEA()
    model.compile(optimizer=tf.keras.optimizers.SGD(momentum=0.9),
                  loss=tf.keras.losses.BinaryCrossentropy())
    model.build(input_shape=(None, 1000, 4))
    model.summary()

    # Load the weights of the old model. (The checkpoint contains both the model weights and the optimizer state.)
    # Because TensorFlow defers creating the variables of the model and the optimizer, the optimizer state is only
    # restored once the model has been trained at least once, e.g. model.train_on_batch(x[0:1], y[0:1]).
    model.load_weights('./result/model/ckpt')
    # model.load_weights('./result/model/bestmodel.h5')

    result = model.predict(x)  # shape = (455024, 919)

    np.savez('./result/test_result.npz', result=result, label=y)

    result = np.mean((result[0:227512], result[227512:]), axis=0)
    result_shape = np.shape(result)
    y = y[0:227512]

    fpr_list, tpr_list, auroc_list = [], [], []
    precision_list, recall_list, aupr_list = [], [], []
    for i in tqdm(range(result_shape[1]), ascii=True):
        fpr_temp, tpr_temp, auroc_temp = calculate_auroc(result[:, i], y[:, i])
        precision_temp, recall_temp, aupr_temp = calculate_aupr(
            result[:, i], y[:, i])

        fpr_list.append(fpr_temp)
        tpr_list.append(tpr_temp)
        precision_list.append(precision_temp)
        recall_list.append(recall_temp)
        auroc_list.append(auroc_temp)
        aupr_list.append(aupr_temp)

    plot_roc_curve(fpr_list, tpr_list, './result/')
    plot_pr_curve(precision_list, recall_list, './result/')

    header = np.array([['auroc', 'aupr']])
    content = np.stack((auroc_list, aupr_list), axis=1)
    content = np.concatenate((header, content), axis=0)
    write2csv(content, './result/result.csv')
    write2txt(content, './result/result.txt')
    avg_auroc = np.nanmean(auroc_list)
    avg_aupr = np.nanmean(aupr_list)
    print('AVG-AUROC:{:.3f}, AVG-AUPR:{:.3f}.\n'.format(avg_auroc, avg_aupr))
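calculate_auroc and calculate_aupr are not defined in this excerpt; judging by how their three return values are consumed above, a plausible sketch built on sklearn.metrics (an assumption, not the project's actual implementation) is:

from sklearn import metrics

def calculate_auroc(predictions, labels):
    # ROC curve points and area under the curve for a single target column.
    fpr, tpr, _ = metrics.roc_curve(labels, predictions)
    return fpr, tpr, metrics.auc(fpr, tpr)

def calculate_aupr(predictions, labels):
    # Precision-recall curve points and area under the curve for a single target column.
    precision, recall, _ = metrics.precision_recall_curve(labels, predictions)
    return precision, recall, metrics.auc(recall, precision)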
Code example #9
def main():
    url = 'http://mylib.nlc.cn/web/guest/search/searchresult?p_p_id=010403_WAR_system&p_p_lifecycle=0&p_p_state=normal&p_p_mode=view&p_p_col_id=column-1&p_p_col_count=1&_010403_WAR_system_struts.portlet.action=%2Fsecondarysearch%2Fsecondarysearch%2Fconditionssearch&_010403_WAR_system_struts.portlet.mode=view'
    for i in range(2,16):
        html = nexturl(url, i)
        print('正在爬取第%d页' % i)
        soup = bs(html,'html.parser')
        for j in range(len(soup.select('.result_item_first a'))):
            onepage_url = soup.select('.result_item_first a')[j].attrs['href'].strip()
            try:
                result = parser(onepage_url)
            except:
                time.sleep(8)
                result = parser(onepage_url)
            print(result)
            write2csv('国外汉学家1.csv', result)
Code example #10
def parser():
    result = {}
    ziduans = [
        '项目批准号', '项目类别', '学科分类', '项目名称', '立项时间', '项目负责人', '专业职务', '工作单位',
        '单位类别', '所在省区市', '所属系统', '成果名称', '成果形式', '成果等级', '结项时间', '结项证书号',
        '出版社', '出版时间', '作者', '获奖情况'
    ]
    for z in range(1, 3303):
        nexturl = 'http://fz.people.com.cn/skygb/sk/index.php/Index/seach?&p=' + str(z)
        print('**********正在打印第' + str(z) + '页***********')
        try:
            res = requests.post(nexturl)
        except:
            # retry once after a short pause if the request fails
            time.sleep(5)
            res = requests.post(nexturl)
        soup = bs(res.text, 'html.parser')
        tds = soup.select('.jc_a td')
        for i in range(0, len(tds), 20):
            for j in range(20):
                result[ziduans[j]] = tds[i + j].text.strip()
            print(result)
            write2csv('csvFiles/国家社科基金.csv', [
                result.get('项目批准号', ''),
                result.get('项目类别', ''),
                result.get('学科分类', ''),
                result.get('项目名称', ''),
                result.get('立项时间', ''),
                result.get('项目负责人', ''),
                result.get('专业职务', ''),
                result.get('工作单位', ''),
                result.get('单位类别', ''),
                result.get('所在省区市', ''),
                result.get('所属系统', ''),
                result.get('成果名称', ''),
                result.get('成果形式', ''),
                result.get('成果等级', ''),
                result.get('结项时间', ''),
                result.get('结项证书号', ''),
                result.get('出版社', ''),
                result.get('出版时间', ''),
                result.get('作者', ''),
                result.get('获奖情况', '')
            ])
Code example #11
def test():
    dataset_test = get_test_data(64)

    model = DanQ()
    loss_object = keras.losses.BinaryCrossentropy()
    optimizer = keras.optimizers.Adam()
    trainer = Trainer(model=model,
                      loss_object=loss_object,
                      optimizer=optimizer,
                      experiment_dir='./result/DanQ')

    result, label = trainer.test(dataset_test,
                                 test_steps=int(np.ceil(455024 / 64)),
                                 dis_show_bar=True)

    result = np.mean((result[0:227512], result[227512:]), axis=0)
    result_shape = np.shape(result)
    label = label[0:227512]

    fpr_list, tpr_list, auroc_list = [], [], []
    precision_list, recall_list, aupr_list = [], [], []
    for i in tqdm(range(result_shape[1]), ascii=True):
        fpr_temp, tpr_temp, auroc_temp = calculate_auroc(
            result[:, i], label[:, i])
        precision_temp, recall_temp, aupr_temp = calculate_aupr(
            result[:, i], label[:, i])

        fpr_list.append(fpr_temp)
        tpr_list.append(tpr_temp)
        precision_list.append(precision_temp)
        recall_list.append(recall_temp)
        auroc_list.append(auroc_temp)
        aupr_list.append(aupr_temp)

    plot_roc_curve(fpr_list, tpr_list, './result/DanQ/')
    plot_pr_curve(precision_list, recall_list, './result/DanQ/')

    header = np.array([['auroc', 'aupr']])
    content = np.stack((auroc_list, aupr_list), axis=1)
    content = np.concatenate((header, content), axis=0)
    write2csv(content, './result/DanQ/result.csv')
    write2txt(content, './result/DanQ/result.txt')
    avg_auroc = np.nanmean(auroc_list)
    avg_aupr = np.nanmean(aupr_list)
    print('AVG-AUROC:{:.3f}, AVG-AUPR:{:.3f}.\n'.format(avg_auroc, avg_aupr))
Code example #12
def main():
    url = 'http://mylib.nlc.cn/web/guest/zhonghuagujishanben?p_p_id=010453_WAR_system&p_p_lifecycle=0&p_p_state=normal&p_p_mode=view&p_p_col_id=column-3&p_p_col_pos=1&p_p_col_count=2&_010453_WAR_system_struts.portlet.action=%2Fsecondarysearch%2Fsecondarysearch%2FzhonghuagujiInitSearch&_010453_WAR_system_struts.portlet.mode=view'
    for i in range(325, 350):
        print('正在爬取第%d页' % i)
        html = nexturl(url, i)
        # print(html)
        # collect the links of every detail page on the current page
        soup = bs(html, 'html.parser')
        for j in range(len(soup.select('.result_item_img a'))):
            onepage_url = soup.select(
                '.result_item_img a')[j].attrs['href'].strip()
            try:
                result = parser(onepage_url)
            except:
                time.sleep(8)
                result = parser(onepage_url)
            print(result)
            write2csv('中华古籍善本3.csv', result)
Code example #13
    def parse_data(self, data):
        results = data['searchResults']['results']
        for result in results:
            item = dict()
            item['id'] = result['id']
            item['companyName'] = result['companyName']
            item['primaryUrl'] = result['primaryUrl']
            item['phone'] = result['phone']
            for i in range(len(result['addresses'])):
                if result['addresses'][i].get('country').get('name') is None:
                    item[f'country{i}'] = ''
                else:
                    item[f'country{i}'] = result['addresses'][i].get(
                        'country').get('name')

                if result['addresses'][i].get('state').get('name') is None:
                    item[f'state{i}'] = ''
                else:
                    item[f'state{i}'] = result['addresses'][i].get(
                        'state').get('name')

                if result['addresses'][i].get('city') is None:
                    item[f'city{i}'] = ''
                else:
                    item[f'city{i}'] = result['addresses'][i].get('city')

                if result['addresses'][i].get('address1') is None:
                    item[f'address{i}'] = ''
                else:
                    item[f'address{i}'] = result['addresses'][i].get(
                        'address1')
                item[f'address-{i}'] = item[f'country{i}'] + ' ' + item[f'state{i}'] + ' ' + item[f'city{i}'] + ' ' + \
                    item[f'address{i}']
            item['companyType'] = result['ownershipType'].get(
                'name') + ' ' + result['entityType'].get('name')
            item['industry'] = result['industry'].get('shortDescription')
            print(item)
            write2csv(self.payload.get('query'),
                      item,
                      fieldnames=self.fieldnames)
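Unlike the list-based calls in the other examples, this write2csv variant takes a dict plus an explicit fieldnames list. A minimal sketch of such a variant, assuming the first argument (the search query) names the output file and that csv.DictWriter is an acceptable backend; both are assumptions, since the project's own helper is not shown:

import csv
import os

def write2csv(query, item, fieldnames):
    # Hypothetical dict-based variant: one CSV per query, header written on first use.
    filename = f'{query}.csv'
    new_file = not os.path.exists(filename)
    with open(filename, 'a', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
        if new_file:
            writer.writeheader()
        writer.writerow(item)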
Code example #14
File: 豆瓣读书main.py Project: maitian94/gaoxiao
def main(rds):
    # Pop detail-page URLs from redis, request them, and parse the responses
    ip = getIP()
    flag = 1
    while flag:
        detailUrl = rds.spop('dbds')
        if not detailUrl:
            flag = 0
            continue  # the queue is exhausted; do not request a None URL
        try:
            res = requests.get(url=detailUrl, proxies={'https': ip}, verify=False)
            # time.sleep(1)
        except Exception as e:
            rds.sadd('dbds', detailUrl)
            ip = getIP()
            if not ip:
                sys.exit('IP用完了')
            print(f'请求出错,错误原因:\n{e}已更换IP:{ip}')
            logging.info(f'请求出错,错误原因:[{e}],链接:{detailUrl}')
            continue

        if '检测到有异常' in res.text:
            ip = getIP()
            if not ip:
                sys.exit('IP用完了')
            print('检测到IP有异常,已更换IP:', ip)
            rds.sadd('dbds', detailUrl)
            continue  # the URL was re-queued; skip parsing the blocked response

        if '页面不存在' in res.text:
            continue

        try:
            result = dbdsParser(detailUrl, res.text)
        except:
            writeurl2txt('data/解析错误的URL.txt',detailUrl)
        else:
            write2csv('data/豆瓣读书1030_2.csv', result)
            writeurl2txt('data/豆瓣读书存在的7位数URL.txt',detailUrl)
Code example #15
File: 腾讯课堂.py Project: maitian94/gaoxiao
def parse():
    allurls = get_allurl()
    print(allurls)
    print(len(allurls))
    for url in allurls:
        result = {}  # fresh record per course so stale fields do not carry over
        print(url)
        if 'package' not in url:
            writeurl2txt('腾讯课堂url.txt', url)
            try:
                res = requests.get(url)
                soup = bs(res.text, 'html.parser')
                result['封面图片'] = 'https://' + soup.select(
                    '.img-left--wrap img')[0].attrs['src'].strip()
                result['课程名称'] = soup.select('.title-main')[0].text.strip()
                try:
                    zuijinzaixue = soup.select(
                        '#js-statistics-apply')[0].text.strip()
                    result['最近在学人数'] = re.findall(r'\d+', zuijinzaixue)[0]
                    result['累计报名'] = soup.select(
                        '.js-apply-num')[0].text.strip()
                except:
                    result['购买人数'] = soup.select(
                        '#js-statistics-apply')[0].text.strip().replace(
                            '人 购买', '')
                result['好评度'] = soup.select('.rate-num')[0].text.strip()
                result['课程价格'] = soup.select(
                    '.course-price-info ')[0].text.strip().replace('¥', '')
                tnames = []
                for teacher in soup.select('.teacher-list .teacher-item'):
                    tname = teacher.select('.js-teacher-name')[0].text.strip()
                    tnames.append(tname)
                result['讲师姓名'] = ';'.join(tnames)
                result['课程介绍'] = soup.select('.tb-course td')[0].text.strip()
                result['授课机构名称'] = soup.select(
                    '.js-agency-name')[0].text.strip()
                result['机构好评度'] = soup.select(
                    '.tree-list span')[0].text.strip()
                result['机构课程数'] = soup.select(
                    '.tree-list span')[1].attrs['data-num'].strip()
                result['学习人次'] = soup.select(
                    '.tree-list span')[2].attrs['data-num'].strip()
                result['机构介绍'] = soup.select('.agency-summary')[0].text.strip()
                contacts = []
                for i in range(len(soup.select('.contact-list p'))):
                    contacts.append(
                        soup.select('.contact-list p')[i].text.strip())
                result['联系方式'] = ';'.join(contacts)
                result['页面链接'] = url
                print(result)
                write2csv('腾讯课堂.csv', [
                    result.get('页面链接', ''),
                    result.get('封面图片', ''),
                    result.get('课程名称', ''),
                    result.get('最近在学人数', ''),
                    result.get('累计报名', ''),
                    result.get('购买人数', ''),
                    result.get('好评度', ''),
                    result.get('课程价格', ''),
                    result.get('讲师姓名', ''),
                    result.get('课程介绍', ''),
                    result.get('授课机构名称', ''),
                    result.get('机构好评度', ''),
                    result.get('机构课程数', ''),
                    result.get('学习人次', ''),
                    result.get('机构介绍', ''),
                    result.get('联系方式', '')
                ])
            except Exception as e:
                print(e)
Code example #16
                evaluate_dataset.append([
                    title, context, qa['question'], ground_truths[0],
                    prediction, _f1, _exact_match
                ])

    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total

    return {'exact_match': exact_match, 'f1': f1}, evaluate_dataset


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('dataset_file', help='Dataset file', type=str)
    parser.add_argument('prediction_file', help='Prediction File', type=str)
    parser.add_argument('output', help='Export output', type=str)
    args = parser.parse_args()

    with open(args.dataset_file) as dataset_file:
        dataset_json = json.load(dataset_file)
        dataset = dataset_json['data']
    with open(args.prediction_file) as prediction_file:
        predictions = json.load(prediction_file)
    evaluate_info, evaluate_dataset = evaluate(dataset, predictions)
    print(json.dumps(evaluate_info))
    utils.write2csv(
        evaluate_dataset, args.output, constants.eval_header + [
            'exact match score: ' + str(evaluate_info['exact_match']),
            'f1 score: ' + str(evaluate_info['f1'])
        ])
Code example #17
File: 获取url.py Project: maitian94/gaoxiao
def main():
    urls = get_url()
    write2csv('香港教育文献url/香港教育url.txt', urls)
Code example #18
def main():
    urls = get_href()
    for url in urls:
        result = xglgparser(url)
        write2csv('csvFiles/香港理工大学文献.csv', result)
Code example #19
File: barnet.py Project: trbedwards/hack4good
		bxy = locationOfBuilding(b)
		distances[i] = calcDist(pxy,bxy)
		i+=1

	return distances

if __name__ == "__main__":

	data = readCsv('data/barnet_all.csv')
	postcodes = readPostcodes(data)
	# postcodes = postcodes[0:8]
	print "loading buildings..."
	gmldoc,buildings = loadBuildings('data/TQ28.gml')
	# buildings = loadBuildings('data/barnet8.gml')
	newCsv('data/barnet_all_PV.csv')

	for i,postcode in enumerate(postcodes):

		print(i, postcode)

		distances = calcDists(postcode,buildings) 
		closestIndex = np.argmin(distances)
		# print "closest index = ", closestIndex
		# print distances
		closestBuilding = buildings[np.argmin(distances)]
		buildingPos = locationOfBuilding(closestBuilding)
		savings, cost, years = calculatePV(closestBuilding)
		write2csv((postcode,buildingPos[0],buildingPos[1],savings,cost,years),'data/barnet_all_PV.csv')
		print(savings, cost, years)
		print("")
Code example #20
File: 网易公开课.py Project: maitian94/gaoxiao
def parse():
    urlslist = geturl()
    print(urlslist)
    print(len(urlslist))
    for url in urlslist:
        print(url)
        try:
            if 'movie' in url:
                failedurl = []
                result = {}
                res = requests.get(url)
                soup = bs(res.text, 'html.parser')
                result['页面链接'] = soup.select(
                    '.u-ptl-c a')[0].attrs['href'].strip()
                result['图片链接'] = soup.select(
                    '.u-ptl-c img')[0].attrs['src'].strip()
                result['标题'] = soup.select('.u-ptl-c a')[1].text.strip()
                for i in range(len(soup.select('.u-ptl-c p'))):
                    label = soup.select('.u-ptl-c p')[i].text.split(
                        ':', 1)[0].strip()
                    value = soup.select('.u-ptl-c p')[i].text.split(
                        ':', 1)[1].strip()
                    result[label] = value
                driver = webdriver.PhantomJS(
                    r'D:\Wangyuanyuan\工作\爬虫\phantomjs.exe')
                driver.get(url)
                soup = bs(driver.page_source, 'html.parser')
                driver.quit()  # release the PhantomJS process before moving on
                result['跟帖人数'] = soup.select('.tie-info a')[0].text.strip()
                result['参与人数'] = soup.select('.tie-info a')[1].text.strip()
                print(result)
                write2csv('网易公开课_movie.csv', [
                    result.get('页面链接', ''),
                    result.get('图片链接', ''),
                    result.get('标题', ''),
                    result.get('别名', ''),
                    result.get('学校', ''),
                    result.get('讲师', ''),
                    result.get('导演', ''),
                    result.get('制片国家/地区', ''),
                    result.get('集数', ''),
                    result.get('授课语言', ''),
                    result.get('类型', ''),
                    result.get('简介', ''),
                    result.get('课程简介', ''),
                    result.get('跟帖人数', ''),
                    result.get('参与人数', '')
                ])
            elif 'special' in url:
                result = {}
                print(url)
                failedurl = []
                driver = webdriver.PhantomJS(
                    r'D:\Wangyuanyuan\工作\爬虫\phantomjs.exe')
                driver.get(url)
                time.sleep(0.1)
                soup = bs(driver.page_source, 'html.parser')
                driver.quit()  # release the PhantomJS process before moving on
                result['课程标题'] = soup.select('.m-cdes h2')[0].text.strip()
                result['图片链接'] = soup.select(
                    '.m-cintro img')[0].attrs['src'].strip()
                jishu = soup.select('.m-cdes p')[0].text.strip()
                result['集数'] = re.findall(r'.*?(\d+).*?', jishu)[0].strip()
                result['课程介绍'] = soup.select('.m-cdes p')[2].text.strip()
                result['讲师图片'] = soup.select(
                    '.picText img')[0].attrs['src'].strip()
                details = soup.select('.picText')
                for detail in details:
                    for i in range(len(soup.select('.picText h6'))):
                        pp = detail.select('h6')[i].text
                        if pp:
                            try:
                                label = detail.select('h6')[i].text.split(
                                    ':', 1)[0].strip()
                                value = detail.select('h6')[i].text.split(
                                    ':', 1)[1].strip()
                                result[label] = value
                            except:
                                result[label] = ''
                result['学院介绍'] = soup.select('.cContent')[0].text.strip()
                result['跟帖人数'] = soup.select('.tie-info a')[0].text.strip()
                result['参与人数'] = soup.select('.tie-info a')[1].text.strip()
                result['页面链接'] = url
                # print(result)
                write2csv('网易公开课_special.csv', [
                    result.get('页面链接', ''),
                    result.get('图片链接', ''),
                    result.get('课程标题', ''),
                    result.get('集数', ''),
                    result.get('课程介绍', ''),
                    result.get('讲师图片', ''),
                    result.get('名称', ''),
                    result.get('讲师', ''),
                    result.get('介绍', ''),
                    result.get('职业', ''),
                    result.get('学位', ''),
                    result.get('学院介绍', '')
                ])
        except Exception as e:
            print(e)
            print('special网页结构不一样')
        continue
Code example #21
	args = parser.parse_args()
	
	if args.load_local:
		with open('ln.p', 'rb') as f:
			lecture_note_dataset = pickle.load(f)
	else:
		self_annot_source_data, self_annot_response_data = read_self_annot(args.self_annot)
		self_annot_dataset = build_lecture_note_dataset(self_annot_source_data, self_annot_response_data, args.data_dir, args.output, args.squash, gdrive=True, include_not_found=args.include_not_found)
		mturk_source_data = read_mturk_source(args.mturk_source)
		mturk_response_data = read_mturk_response(args.mturk_response)
		mturk_dataset = build_lecture_note_dataset(mturk_source_data, mturk_response_data, args.data_dir, args.output, args.squash, include_not_found=args.include_not_found)
		lecture_note_dataset = mturk_dataset + self_annot_dataset
		with open('ln.p', 'wb') as f:
			pickle.dump(lecture_note_dataset, f)
	if args.cross_validation_fold == 0:
		shuffle(lecture_note_dataset)
		train_dataset, dev_dataset = train_test_split(lecture_note_dataset, test_size=args.dev_size)
		utils.write2csv(lecture_note_dataset, args.output, constants.note_tsv_header)
		utils.write2csv(train_dataset, args.train_output, constants.note_tsv_header)
		utils.write2csv(dev_dataset, args.dev_output, constants.note_tsv_header)
	else:
		kf = KFold(n_splits=args.cross_validation_fold, shuffle=True)
		count = 1
		for train_index, test_index in kf.split(lecture_note_dataset):
			print("Train: ", train_index,"Test: ",test_index)
			train_dataset = [lecture_note_dataset[i] for i in train_index]
			dev_dataset = [lecture_note_dataset[i] for i in test_index]
			utils.write2csv(train_dataset, args.fold_dir + "/mturk_self_train_" + str(count) + ".csv", constants.note_tsv_header)
			utils.write2csv(dev_dataset, args.fold_dir + "/mturk_self_dev_" + str(count) + ".csv", constants.note_tsv_header)
			count += 1