def main():
    urls = next_page()
    for url in urls:
        print(url)
        result = parser(url)
        print(result)
        write2csv('csvFiles/中药图谱.csv', result)
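# Note: every snippet in this file writes its output through small I/O helpers
# (write2csv, writeurl2txt) that are defined elsewhere, and whose signatures differ
# between projects (some take the path first, some take the rows first, one accepts a
# fieldnames keyword). Below is a minimal sketch of the scraper-style variant used just
# above, write2csv(path, row) / writeurl2txt(path, url) -- names and behaviour are
# assumptions inferred from the call sites, not the repo's actual implementation:
import csv


def write2csv(filename, row):
    # Append one record per call; accepts either a list of values or a dict.
    with open(filename, 'a', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(list(row.values()) if isinstance(row, dict) else row)


def writeurl2txt(filename, url):
    # Append one URL per line to a plain-text log.
    with open(filename, 'a', encoding='utf-8') as f:
        f.write(url + '\n')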
def main():
    hrefs = next_url()
    for href in hrefs:
        result = parser(href)
        print(result)
        write2csv('csvFiles/香港教育特藏.csv', result)
        writeurl2txt('csvFiles/香港教育特藏.txt', href)
def parse():
    result = {}
    for j in range(115, 163):
        try:
            url = 'http://www.xuetangx.com/courses?credential=0&page_type=0&cid=0&process=0&org=0&course_mode=0&page=' + str(j)
            res = requests.get(url)
            print(url)
            soup = bs(res.text, 'html.parser')
            for i in range(len(soup.select('#list_style .list_inner'))):
                detail_url = 'http://www.xuetangx.com' + soup.select('.img a')[i].attrs['href'].strip()
                result['详情页面链接'] = detail_url
                result['封面图片'] = 'http://www.xuetangx.com' + soup.select('.img img')[i].attrs['src'].strip()
                result['课程标题'] = soup.select('.coursetitle')[i].text.strip()
                try:
                    try:
                        result['所属学科'] = ';'.join([
                            soup.select('.coursename_ref')[i].select('.subject')[0].text.strip(),
                            soup.select('.coursename_ref')[i].select('.subject')[1].text.strip()
                        ])
                    except:
                        result['所属学科'] = soup.select('.coursename_ref')[i].select('.subject')[0].text.strip()
                except:
                    result['所属学科'] = ''
                try:
                    result['简介'] = soup.select('.txt_all .txt')[i].text.strip().replace('简介', '').replace('\t', '').replace('\r\n', '').replace('\n', '')
                except:
                    result['简介'] = soup.select('.txt_all .ktxt')[i].text.strip().replace('简介', '').replace('\n', '')
                driver = webdriver.PhantomJS(r'D:\Wangyuanyuan\工作\爬虫\phantomjs.exe')
                driver.get(detail_url)
                time.sleep(0.1)
                soup1 = bs(driver.page_source, 'html.parser')
                result['课程来源'] = soup1.select('.courseabout_text a')[0].text.strip()
                result['课程描述'] = soup1.select('.course_intro .text')[0].text.strip()
                result['开课时间'] = soup1.select('.illustrate span span')[0].text.strip().replace('.', '-')
                result['结课时间'] = soup1.select('.illustrate span span')[1].text.strip().replace('.', '-')
                result['报名人数'] = soup1.select('.illustrate span span')[5].text.strip()
                teachers = soup1.select('.teacher_info .cf')
                teacher_info = []
                for teacher in teachers:
                    one_teacher = ','.join([
                        teacher.select('.teacher_text span')[0].text.strip(),
                        teacher.select('.teacher_text span')[1].text.strip()
                    ])
                    teacher_info.append(one_teacher)
                result['教师信息'] = ';'.join(teacher_info)
                print(result)
                write2csv('学堂在线.csv', [
                    result.get('详情页面链接', ''),
                    result.get('封面图片', ''),
                    result.get('课程标题', ''),
                    result.get('所属学科', ''),
                    result.get('简介', ''),
                    result.get('课程来源', ''),
                    result.get('课程描述', ''),
                    result.get('开课时间', ''),
                    result.get('结课时间', ''),
                    result.get('报名人数', ''),
                    result.get('教师信息', '')
                ])
        except Exception as e:
            print(e)
            print('*********网页犯病了*********')
            continue
def parse():
    all_urls = allUrls()
    print(all_urls)
    for url in all_urls:
        print(url)
        time.sleep(1.2)
        try:
            result = {}
            res = requests.get(url)
            soup = bs(res.text, 'html.parser')
            # print(soup)
            for i in range(1, len(soup.select('.tblBrow')[0].select('td')), 2):
                label = soup.select('.tblBrow')[0].select('td')[i].text.strip()
                value = soup.select('.tblBrow')[0].select('td')[i + 1].text.replace('\r\n', '').replace('\t', '').replace('相关图书', '').replace('\n', '').replace('页', '').strip()
                result[label] = value
            prizes = re.findall('¥(.*?)折扣价:¥(.*?)折扣:(.*?)节.*?', result['定价:'])
            result['定价:'] = prizes[0][0].strip()
            result['折扣价'] = prizes[0][1].strip()
            result['折扣'] = prizes[0][2].strip()
            result['图片'] = 'http://www.sinobook.com.cn' + soup.select('.tblBrow img')[0].attrs['src']
            for j in range(len(soup.select('.tblBrow')[1].select('.tdCaptionD'))):
                label = soup.select('.tblBrow')[1].select('.tdCaptionD')[j].text.strip()
                value = soup.select('.tblBrow')[1].select('.Text')[j].text.replace('\r\n', '').replace('\t', '').replace('\n', '').replace('�', '').strip()
                result[label] = value
            result['分类'] = '其他'
            result['页面链接'] = url
            # return result
            print(result)
            write2csv('中国高校教材图书网131313.csv', [
                result.get('页面链接', ''),
                result.get('书名:', ''),
                result.get('图片', ''),
                result.get('分类', ''),
                result.get('ISBN:', ''),
                result.get('条码:', ''),
                result.get('作者:', ''),
                result.get('装订:', ''),
                result.get('印次:', ''),
                result.get('开本:', ''),
                result.get('定价:', ''),
                result.get('折扣', ''),
                result.get('折扣价', ''),
                result.get('字数:', ''),
                result.get('出版社:', ''),
                result.get('页数:', ''),
                result.get('发行编号:', ''),
                result.get('每包册数:', ''),
                result.get('出版日期:', ''),
                result.get('内容简介:', ''),
                result.get('作者简介:', ''),
                result.get('章节目录:', ''),
                result.get('精彩片段:', ''),
                result.get('书\u3000\u3000评:', ''),
                result.get('其\u3000\u3000它:', '')
            ])
        except Exception as e:
            print(e)
            print('*********网页犯病了*********')
            continue
def parser():
    result = {}
    for z in range(28, 30):
        next_url = 'http://idp.nlc.cn/database/search_results.a4d?uid=-9761261559;bst=' + str(1 + z * 50)
        res = requests.get(next_url)
        soup = bs(res.text, 'html.parser')
        trs = soup.select('#results tr')
        print('正在处理第*******' + str(z) + '*********页')
        for tr in trs:
            picture_detail_url = 'http://idp.nlc.cn/database/' + tr.select('.thumb a')[0].attrs['href'].strip()
            result['图片详情页链接'] = picture_detail_url
            picture_url = 'http://idp.nlc.cn' + tr.select('img')[0].attrs['src'].strip()
            result['图片链接'] = picture_url
            detail_url = 'http://idp.nlc.cn/database/' + tr.select('.resultdetails a')[0].attrs['href'].strip()
            result['详情页链接'] = detail_url
            # institution = tr.select('.resultdetails a')[0].text.strip()
            year = tr.select('.resultdetails a')[1].text.strip()
            result['未知信息'] = year
            details = tr.select('.resultdetails')[0].text.strip().replace('\n', ' ').replace('\t', '')
            # yizhi = re.findall('.*?遺址:(.*?)語言/.*?', details)[0].strip()
            language = re.findall('.*?語言/文字: (.*?) 材料:.*?', details)[0].strip()
            result['语言'] = language
            # material = re.sub('.*?材料:', '', details).strip()
            try:
                res1 = requests.get(picture_detail_url, timeout=75)
            except:
                failed_urls = []
                failed_urls.append(picture_detail_url)
                writeurl2txt('failedurl.txt', picture_detail_url)
                continue
            soup1 = bs(res1.text, 'html.parser')
            duis = soup1.select('#iteminfotable tr')
            print(1111111111)
            for dui in duis:
                label = dui.select('td')[0].text.strip()
                value = dui.select('td')[1].text.strip()
                result[label] = value
            print(222222222)
            print(result)
            write2csv('敦煌国际项目.csv', [
                result.get('图片详情页链接', ''),
                result.get('图片链接', ''),
                result.get('详情页链接', ''),
                result.get('未知信息', ''),
                result.get('语言', ''),
                result.get('收藏機構及版權:', ''),
                result.get('遺址:', ''),
                result.get('藏品形態:', ''),
                result.get('材料:', ''),
                result.get('尺寸 (h x w) 釐米:', '')
            ])
def parse():
    result = {}
    # for i in range(5, 25):
    for i in range(14742, 72357):
        url = 'http://apabi.szlib.com/Product2.asp?lang=gb&type=&DocGroupID=2&DocID=' + str(i)
        try:
            res = requests.get(url)
            soup = bs(res.text, 'html.parser')
            picture_url = soup.select('html body tr img')[0].attrs['src'].strip()
        except Exception as e:
            print(e)
            continue
        result['页面链接'] = url
        result['图片链接'] = picture_url
        trs = soup.select('html body tr table')[2].select('tr')
        for tr in trs:
            label = tr.select('td')[0].text.strip()
            value = tr.select('td')[1].text.replace('\n', '').replace('\t', '').replace('\r', '').strip()
            result[label] = value
        print(result)
        write2csv('data/阿帕比电子图书.csv', [
            result.get('页面链接', ''),
            result.get('其它题名', ''),
            result.get('书名', ''),
            result.get('图片链接', ''),
            result.get('责任者', ''),
            result.get('主要责任关系', ''),
            result.get('主题/关键词', ''),
            result.get('摘要', ''),
            result.get('出版社', ''),
            result.get('出版地', ''),
            result.get('出版日期', ''),
            result.get('标识', ''),
            result.get('标识类型', ''),
            result.get('价格', ''),
            result.get('纸书价格', ''),
            result.get('责任编辑', ''),
            result.get('版次', ''),
            result.get('印次', ''),
            result.get('字数(千字)', ''),
            result.get('中图法分类号', ''),
            result.get('ISBN号', ''),
            result.get('附注', ''),
            result.get('外币价格', ''),
            result.get('相关文献与本文献的联系', ''),
            result.get('次要责任者', ''),
            result.get('次要责任关系', ''),
            result.get('Apabi分类号', ''),
        ])
def parse():
    urls = getUrl()
    for url in urls:
        try:
            result = {}
            res = requests.get(url)
            soup = bs(res.text, 'html.parser')
            result['页面链接'] = url
            result['标题'] = soup.select('.course-detail__title')[0].text.strip()
            result['封面图片'] = soup.select('.course-detail-img img')[0].attrs['src'].strip()
            result['分类'] = soup.select('.breadcrumb-o li')[1].text.strip()
            result['价格'] = soup.select('.course-detail__price')[0].text.strip()
            try:
                result['课程来源'] = soup.select('.gray-dark')[0].text.strip()
                result['参与人数'] = soup.select('.gray-dark')[2].text.strip().replace('人已参与', '')
            except:
                result['课程来源'] = ''
                result['参与人数'] = ''
            result['开课时间'] = soup.select('.panel-body p')[0].text.strip().replace('开始:', '')
            result['结课时间'] = soup.select('.panel-body p')[1].text.strip().replace('截止:', '')
            details = soup.select('.es-piece')
            for i in range(len(soup.select('.es-piece'))):
                label = details[i].select('.piece-header')[0].text.strip()
                value = details[i].select('.piece-body')[0].text.strip().replace('\n', '').replace('\xa0', '').replace('\r', '')
                result[label] = value
            result['开课时间'] = result['开课时间'].replace('/', '-')
            result['结课时间'] = result['结课时间'].replace('/', '-')
            names = []
            for j in range(len(soup.select('.row .media-body .link-dark'))):
                names.append(soup.select('.row .media-body .link-dark')[j].text.strip())
            result['教学老师'] = ';'.join(names)
            print(result)
            write2csv('中国高校外语慕课平台.csv', [
                result.get('页面链接', ''),
                result.get('标题', ''),
                result.get('封面图片', ''),
                result.get('分类', ''),
                result.get('价格', ''),
                result.get('课程来源', ''),
                result.get('参与人数', ''),
                result.get('开课时间', ''),
                result.get('结课时间', ''),
                result.get('课程概述', ''),
                result.get('课程介绍', ''),
                result.get('课程目标', ''),
                result.get('适合人群', ''),
                result.get('教学老师', '')
            ])
        except Exception as e:
            print(e)
            print('*********网页犯病了*********')
            continue
def test():
    test_data = get_test_data()
    x = test_data[0]
    y = test_data[1]

    # Recreate the model.
    model = DeepSEA()
    model.compile(optimizer=tf.keras.optimizers.SGD(momentum=0.9),
                  loss=tf.keras.losses.BinaryCrossentropy())
    model.build(input_shape=(None, 1000, 4))
    model.summary()

    # Load the weights of the old model. (The checkpoint contains both the model weights
    # and the optimizer state.) Because TensorFlow delays creating the variables of the
    # model and the optimizer, the optimizer state is only restored once the model has
    # been trained at least once, e.g. model.train_on_batch(x[0:1], y[0:1]).
    model.load_weights('./result/model/ckpt')
    # model.load_weights('./result/model/bestmodel.h5')

    result = model.predict(x)  # shape = (455024, 919)
    np.savez('./result/test_result.npz', result=result, label=y)

    result = np.mean((result[0:227512], result[227512:]), axis=0)
    result_shape = np.shape(result)
    y = y[0:227512]

    fpr_list, tpr_list, auroc_list = [], [], []
    precision_list, recall_list, aupr_list = [], [], []
    for i in tqdm(range(result_shape[1]), ascii=True):
        fpr_temp, tpr_temp, auroc_temp = calculate_auroc(result[:, i], y[:, i])
        precision_temp, recall_temp, aupr_temp = calculate_aupr(result[:, i], y[:, i])
        fpr_list.append(fpr_temp)
        tpr_list.append(tpr_temp)
        precision_list.append(precision_temp)
        recall_list.append(recall_temp)
        auroc_list.append(auroc_temp)
        aupr_list.append(aupr_temp)

    plot_roc_curve(fpr_list, tpr_list, './result/')
    plot_pr_curve(precision_list, recall_list, './result/')

    header = np.array([['auroc', 'aupr']])
    content = np.stack((auroc_list, aupr_list), axis=1)
    content = np.concatenate((header, content), axis=0)
    write2csv(content, './result/result.csv')
    write2txt(content, './result/result.txt')

    avg_auroc = np.nanmean(auroc_list)
    avg_aupr = np.nanmean(aupr_list)
    print('AVG-AUROC:{:.3f}, AVG-AUPR:{:.3f}.\n'.format(avg_auroc, avg_aupr))
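# calculate_auroc and calculate_aupr are not shown in this file. Below is a plausible
# sketch using scikit-learn, assuming each helper returns the full curve plus its scalar
# area, matching how the lists above are filled (fpr/tpr/auroc and precision/recall/aupr);
# this is an assumed implementation, not necessarily the one used in the repo:
from sklearn import metrics


def calculate_auroc(predictions, labels):
    # ROC curve and area under it for a single output column.
    fpr, tpr, _ = metrics.roc_curve(labels, predictions)
    auroc = metrics.auc(fpr, tpr)
    return fpr, tpr, auroc


def calculate_aupr(predictions, labels):
    # Precision-recall curve and area under it for a single output column.
    precision, recall, _ = metrics.precision_recall_curve(labels, predictions)
    aupr = metrics.auc(recall, precision)
    return precision, recall, aupr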
def main():
    url = 'http://mylib.nlc.cn/web/guest/search/searchresult?p_p_id=010403_WAR_system&p_p_lifecycle=0&p_p_state=normal&p_p_mode=view&p_p_col_id=column-1&p_p_col_count=1&_010403_WAR_system_struts.portlet.action=%2Fsecondarysearch%2Fsecondarysearch%2Fconditionssearch&_010403_WAR_system_struts.portlet.mode=view'
    for i in range(2, 16):
        html = nexturl(url, i)
        print('正在爬取第%d页' % i)
        soup = bs(html, 'html.parser')
        for j in range(len(soup.select('.result_item_first a'))):
            onepage_url = soup.select('.result_item_first a')[j].attrs['href'].strip()
            try:
                result = parser(onepage_url)
            except:
                time.sleep(8)
                result = parser(onepage_url)
            print(result)
            write2csv('国外汉学家1.csv', result)
def parser():
    result = {}
    ziduans = [
        '项目批准号', '项目类别', '学科分类', '项目名称', '立项时间', '项目负责人', '专业职务',
        '工作单位', '单位类别', '所在省区市', '所属系统', '成果名称', '成果形式', '成果等级',
        '结项时间', '结项证书号', '出版社', '出版时间', '作者', '获奖情况'
    ]
    for z in range(1, 3303):
        try:
            nexturl = 'http://fz.people.com.cn/skygb/sk/index.php/Index/seach?&p=' + str(z)
        except:
            time.sleep(5)
            nexturl = 'http://fz.people.com.cn/skygb/sk/index.php/Index/seach?&p=' + str(z)
        print('**********正在打印第' + str(z) + '页***********')
        res = requests.post(nexturl)
        soup = bs(res.text, 'html.parser')
        tds = soup.select('.jc_a td')
        for i in range(0, len(tds), 20):
            for j in range(20):
                result[ziduans[j]] = tds[i + j].text.strip()
            print(result)
            write2csv('csvFiles/国家社科基金.csv', [
                result.get('项目批准号', ''),
                result.get('项目类别', ''),
                result.get('学科分类', ''),
                result.get('项目名称', ''),
                result.get('立项时间', ''),
                result.get('项目负责人', ''),
                result.get('专业职务', ''),
                result.get('工作单位', ''),
                result.get('单位类别', ''),
                result.get('所在省区市', ''),
                result.get('所属系统', ''),
                result.get('成果名称', ''),
                result.get('成果形式', ''),
                result.get('成果等级', ''),
                result.get('结项时间', ''),
                result.get('结项证书号', ''),
                result.get('出版社', ''),
                result.get('出版时间', ''),
                result.get('作者', ''),
                result.get('获奖情况', '')
            ])
def test():
    dataset_test = get_test_data(64)

    model = DanQ()
    loss_object = keras.losses.BinaryCrossentropy()
    optimizer = keras.optimizers.Adam()

    trainer = Trainer(model=model,
                      loss_object=loss_object,
                      optimizer=optimizer,
                      experiment_dir='./result/DanQ')

    result, label = trainer.test(dataset_test,
                                 test_steps=int(np.ceil(455024 / 64)),
                                 dis_show_bar=True)

    result = np.mean((result[0:227512], result[227512:]), axis=0)
    result_shape = np.shape(result)
    label = label[0:227512]

    fpr_list, tpr_list, auroc_list = [], [], []
    precision_list, recall_list, aupr_list = [], [], []
    for i in tqdm(range(result_shape[1]), ascii=True):
        fpr_temp, tpr_temp, auroc_temp = calculate_auroc(result[:, i], label[:, i])
        precision_temp, recall_temp, aupr_temp = calculate_aupr(result[:, i], label[:, i])
        fpr_list.append(fpr_temp)
        tpr_list.append(tpr_temp)
        precision_list.append(precision_temp)
        recall_list.append(recall_temp)
        auroc_list.append(auroc_temp)
        aupr_list.append(aupr_temp)

    plot_roc_curve(fpr_list, tpr_list, './result/DanQ/')
    plot_pr_curve(precision_list, recall_list, './result/DanQ/')

    header = np.array([['auroc', 'aupr']])
    content = np.stack((auroc_list, aupr_list), axis=1)
    content = np.concatenate((header, content), axis=0)
    write2csv(content, './result/DanQ/result.csv')
    write2txt(content, './result/DanQ/result.txt')

    avg_auroc = np.nanmean(auroc_list)
    avg_aupr = np.nanmean(aupr_list)
    print('AVG-AUROC:{:.3f}, AVG-AUPR:{:.3f}.\n'.format(avg_auroc, avg_aupr))
def main():
    url = 'http://mylib.nlc.cn/web/guest/zhonghuagujishanben?p_p_id=010453_WAR_system&p_p_lifecycle=0&p_p_state=normal&p_p_mode=view&p_p_col_id=column-3&p_p_col_pos=1&p_p_col_count=2&_010453_WAR_system_struts.portlet.action=%2Fsecondarysearch%2Fsecondarysearch%2FzhonghuagujiInitSearch&_010453_WAR_system_struts.portlet.mode=view'
    for i in range(325, 350):
        print('正在爬取第%d页' % i)
        html = nexturl(url, i)
        # print(html)
        # Collect the links to all detail pages on the current results page.
        soup = bs(html, 'html.parser')
        for j in range(len(soup.select('.result_item_img a'))):
            onepage_url = soup.select('.result_item_img a')[j].attrs['href'].strip()
            try:
                result = parser(onepage_url)
            except:
                time.sleep(8)
                result = parser(onepage_url)
            print(result)
            write2csv('中华古籍善本3.csv', result)
def parse_data(self, data):
    results = data['searchResults']['results']
    for result in results:
        item = dict()
        item['id'] = result['id']
        item['companyName'] = result['companyName']
        item['primaryUrl'] = result['primaryUrl']
        item['phone'] = result['phone']
        for i in range(len(result['addresses'])):
            if result['addresses'][i].get('country').get('name') is None:
                item[f'country{i}'] = ''
            else:
                item[f'country{i}'] = result['addresses'][i].get('country').get('name')
            if result['addresses'][i].get('state').get('name') is None:
                item[f'state{i}'] = ''
            else:
                item[f'state{i}'] = result['addresses'][i].get('state').get('name')
            if result['addresses'][i].get('city') is None:
                item[f'city{i}'] = ''
            else:
                item[f'city{i}'] = result['addresses'][i].get('city')
            if result['addresses'][i].get('address1') is None:
                item[f'address{i}'] = ''
            else:
                item[f'address{i}'] = result['addresses'][i].get('address1')
            item[f'address-{i}'] = (item[f'country{i}'] + ' ' + item[f'state{i}'] + ' '
                                    + item[f'city{i}'] + ' ' + item[f'address{i}'])
        item['companyType'] = result['ownershipType'].get('name') + ' ' + result['entityType'].get('name')
        item['industry'] = result['industry'].get('shortDescription')
        print(item)
        write2csv(self.payload.get('query'), item, fieldnames=self.fieldnames)
def main(rds):
    # Pop detail-page URLs from rds (redis), then request and parse each one.
    ip = getIP()
    flag = 1
    while flag:
        detailUrl = rds.spop('dbds')
        if not detailUrl:
            flag = 0
            continue  # the queue is empty; let the while condition end the loop
        try:
            res = requests.get(url=detailUrl, proxies={'https': ip}, verify=False)
            # time.sleep(1)
        except Exception as e:
            rds.sadd('dbds', detailUrl)
            ip = getIP()
            if not ip:
                sys.exit('IP用完了')
            print(f'请求出错,错误原因:\n{e}已更换IP:{ip}')
            logging.info(f'请求出错,错误原因:[{e}],链接:{detailUrl}')
            continue
        if '检测到有异常' in res.text:
            ip = getIP()
            if not ip:
                sys.exit('IP用完了')
            print('检测到IP有异常,已更换IP:', ip)
            rds.sadd('dbds', detailUrl)
            continue  # retry this URL later with the new IP instead of parsing the block page
        if '页面不存在' in res.text:
            continue
        try:
            result = dbdsParser(detailUrl, res.text)
        except:
            writeurl2txt('data/解析错误的URL.txt', detailUrl)
        else:
            write2csv('data/豆瓣读书1030_2.csv', result)
            writeurl2txt('data/豆瓣读书存在的7位数URL.txt', detailUrl)
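# A minimal usage sketch for main(rds) above, assuming a local redis instance whose 'dbds'
# set has already been seeded with Douban detail-page URLs (host, port and db are
# assumptions; decode_responses=True makes spop return str rather than bytes):
import redis

if __name__ == '__main__':
    rds = redis.StrictRedis(host='localhost', port=6379, db=0, decode_responses=True)
    main(rds)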
def parse():
    allurls = get_allurl()
    print(allurls)
    print(len(allurls))
    result = {}
    for url in allurls:
        print(url)
        if 'package' not in url:
            writeurl2txt('腾讯课堂url.txt', url)
            try:
                res = requests.get(url)
                soup = bs(res.text, 'html.parser')
                result['封面图片'] = 'https://' + soup.select('.img-left--wrap img')[0].attrs['src'].strip()
                result['课程名称'] = soup.select('.title-main')[0].text.strip()
                try:
                    zuijinzaixue = soup.select('#js-statistics-apply')[0].text.strip()
                    result['最近在学人数'] = re.findall('\d+', zuijinzaixue)[0]
                    result['累计报名'] = soup.select('.js-apply-num')[0].text.strip()
                except:
                    result['购买人数'] = soup.select('#js-statistics-apply')[0].text.strip().replace('人 购买', '')
                result['好评度'] = soup.select('.rate-num')[0].text.strip()
                result['课程价格'] = soup.select('.course-price-info ')[0].text.strip().replace('¥', '')
                tnames = []
                for teacher in soup.select('.teacher-list .teacher-item'):
                    tname = teacher.select('.js-teacher-name')[0].text.strip()
                    tnames.append(tname)
                result['讲师姓名'] = ';'.join(tnames)
                result['课程介绍'] = soup.select('.tb-course td')[0].text.strip()
                result['授课机构名称'] = soup.select('.js-agency-name')[0].text.strip()
                result['机构好评度'] = soup.select('.tree-list span')[0].text.strip()
                result['机构课程数'] = soup.select('.tree-list span')[1].attrs['data-num'].strip()
                result['学习人次'] = soup.select('.tree-list span')[2].attrs['data-num'].strip()
                result['机构介绍'] = soup.select('.agency-summary')[0].text.strip()
                contacts = []
                for i in range(len(soup.select('.contact-list p'))):
                    contacts.append(soup.select('.contact-list p')[i].text.strip())
                result['联系方式'] = ';'.join(contacts)
                result['页面链接'] = url
                print(result)
                write2csv('腾讯课堂.csv', [
                    result.get('页面链接', ''),
                    result.get('封面图片', ''),
                    result.get('课程名称', ''),
                    result.get('最近在学人数', ''),
                    result.get('累计报名', ''),
                    result.get('购买人数', ''),
                    result.get('好评度', ''),
                    result.get('课程价格', ''),
                    result.get('讲师姓名', ''),
                    result.get('课程介绍', ''),
                    result.get('授课机构名称', ''),
                    result.get('机构好评度', ''),
                    result.get('机构课程数', ''),
                    result.get('学习人次', ''),
                    result.get('机构介绍', ''),
                    result.get('联系方式', '')
                ])
            except Exception as e:
                print(e)
        evaluate_dataset.append([
            title, context, qa['question'], ground_truths[0], prediction,
            _f1, _exact_match
        ])

    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total

    return {'exact_match': exact_match, 'f1': f1}, evaluate_dataset


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('dataset_file', help='Dataset file', type=str)
    parser.add_argument('prediction_file', help='Prediction File', type=str)
    parser.add_argument('output', help='Export output', type=str)
    args = parser.parse_args()

    with open(args.dataset_file) as dataset_file:
        dataset_json = json.load(dataset_file)
        dataset = dataset_json['data']
    with open(args.prediction_file) as prediction_file:
        predictions = json.load(prediction_file)

    evaluate_info, evaluate_dataset = evaluate(dataset, predictions)
    print(json.dumps(evaluate_info))
    utils.write2csv(
        evaluate_dataset, args.output, constants.eval_header + [
            'exact match score: ' + str(evaluate_info['exact_match']),
            'f1 score: ' + str(evaluate_info['f1'])
        ])
def main():
    urls = get_url()
    write2csv('香港教育文献url/香港教育url.txt', urls)
def main():
    urls = get_href()
    for url in urls:
        result = xglgparser(url)
        write2csv('csvFiles/香港理工大学文献.csv', result)
        bxy = locationOfBuilding(b)
        distances[i] = calcDist(pxy, bxy)
        i += 1
    return distances


if __name__ == "__main__":
    data = readCsv('data/barnet_all.csv')
    postcodes = readPostcodes(data)
    # postcodes = postcodes[0:8]
    print "loading buildings..."
    gmldoc, buildings = loadBuildings('data/TQ28.gml')
    # buildings = loadBuildings('data/barnet8.gml')
    newCsv('data/barnet_all_PV.csv')
    for i, postcode in enumerate(postcodes):
        print i, postcode
        distances = calcDists(postcode, buildings)
        closestIndex = np.argmin(distances)
        # print "closest index = ", closestIndex
        # print distances
        closestBuilding = buildings[np.argmin(distances)]
        buildingPos = locationOfBuilding(closestBuilding)
        savings, cost, years = calculatePV(closestBuilding)
        write2csv((postcode, buildingPos[0], buildingPos[1], savings, cost, years), 'data/barnet_all_PV.csv')
        print savings, cost, years
        print ""
def parse():
    urlslist = geturl()
    print(urlslist)
    print(len(urlslist))
    for url in urlslist:
        print(url)
        try:
            if 'movie' in url:
                failedurl = []
                result = {}
                res = requests.get(url)
                soup = bs(res.text, 'html.parser')
                result['页面链接'] = soup.select('.u-ptl-c a')[0].attrs['href'].strip()
                result['图片链接'] = soup.select('.u-ptl-c img')[0].attrs['src'].strip()
                result['标题'] = soup.select('.u-ptl-c a')[1].text.strip()
                for i in range(len(soup.select('.u-ptl-c p'))):
                    label = soup.select('.u-ptl-c p')[i].text.split(':', 1)[0].strip()
                    value = soup.select('.u-ptl-c p')[i].text.split(':', 1)[1].strip()
                    result[label] = value
                driver = webdriver.PhantomJS(r'D:\Wangyuanyuan\工作\爬虫\phantomjs.exe')
                driver.get(url)
                soup = bs(driver.page_source, 'html.parser')
                result['跟帖人数'] = soup.select('.tie-info a')[0].text.strip()
                result['参与人数'] = soup.select('.tie-info a')[1].text.strip()
                print(result)
                write2csv('网易公开课_movie.csv', [
                    result.get('页面链接', ''),
                    result.get('图片链接', ''),
                    result.get('标题', ''),
                    result.get('别名', ''),
                    result.get('学校', ''),
                    result.get('讲师', ''),
                    result.get('导演', ''),
                    result.get('制片国家/地区', ''),
                    result.get('集数', ''),
                    result.get('授课语言', ''),
                    result.get('类型', ''),
                    result.get('简介', ''),
                    result.get('课程简介', ''),
                    result.get('跟帖人数', ''),
                    result.get('参与人数', '')
                ])
            elif 'special' in url:
                result = {}
                print(url)
                failedurl = []
                driver = webdriver.PhantomJS(r'D:\Wangyuanyuan\工作\爬虫\phantomjs.exe')
                driver.get(url)
                time.sleep(0.1)
                soup = bs(driver.page_source, 'html.parser')
                result['课程标题'] = soup.select('.m-cdes h2')[0].text.strip()
                result['图片链接'] = soup.select('.m-cintro img')[0].attrs['src'].strip()
                jishu = soup.select('.m-cdes p')[0].text.strip()
                result['集数'] = re.findall('.*?(\d+).*?', jishu)[0].strip()
                result['课程介绍'] = soup.select('.m-cdes p')[2].text.strip()
                result['讲师图片'] = soup.select('.picText img')[0].attrs['src'].strip()
                details = soup.select('.picText')
                for detail in details:
                    for i in range(len(soup.select('.picText h6'))):
                        pp = detail.select('h6')[i].text
                        if pp:
                            try:
                                label = detail.select('h6')[i].text.split(':', 1)[0].strip()
                                value = detail.select('h6')[i].text.split(':', 1)[1].strip()
                                result[label] = value
                            except:
                                result[label] = ''
                result['学院介绍'] = soup.select('.cContent')[0].text.strip()
                result['跟帖人数'] = soup.select('.tie-info a')[0].text.strip()
                result['参与人数'] = soup.select('.tie-info a')[1].text.strip()
                result['页面链接'] = url
                # print(result)
                write2csv('网易公开课_special.csv', [
                    result.get('页面链接', ''),
                    result.get('图片链接', ''),
                    result.get('课程标题', ''),
                    result.get('集数', ''),
                    result.get('课程介绍', ''),
                    result.get('讲师图片', ''),
                    result.get('名称', ''),
                    result.get('讲师', ''),
                    result.get('介绍', ''),
                    result.get('职业', ''),
                    result.get('学位', ''),
                    result.get('学院介绍', '')
                ])
        except Exception as e:
            print(e)
            print('special网页结构不一样')
            continue
args = parser.parse_args()

if args.load_local:
    with open('ln.p', 'rb') as f:
        lecture_note_dataset = pickle.load(f)
else:
    self_annot_source_data, self_annot_response_data = read_self_annot(args.self_annot)
    self_annot_dataset = build_lecture_note_dataset(self_annot_source_data,
                                                    self_annot_response_data,
                                                    args.data_dir,
                                                    args.output,
                                                    args.squash,
                                                    gdrive=True,
                                                    include_not_found=args.include_not_found)
    mturk_source_data = read_mturk_source(args.mturk_source)
    mturk_response_data = read_mturk_response(args.mturk_response)
    mturk_dataset = build_lecture_note_dataset(mturk_source_data,
                                               mturk_response_data,
                                               args.data_dir,
                                               args.output,
                                               args.squash,
                                               include_not_found=args.include_not_found)
    lecture_note_dataset = mturk_dataset + self_annot_dataset
    with open('ln.p', 'wb') as f:
        pickle.dump(lecture_note_dataset, f)

if args.cross_validation_fold == 0:
    shuffle(lecture_note_dataset)
    train_dataset, dev_dataset = train_test_split(lecture_note_dataset, test_size=args.dev_size)
    utils.write2csv(lecture_note_dataset, args.output, constants.note_tsv_header)
    utils.write2csv(train_dataset, args.train_output, constants.note_tsv_header)
    utils.write2csv(dev_dataset, args.dev_output, constants.note_tsv_header)
else:
    kf = KFold(n_splits=args.cross_validation_fold, shuffle=True)
    count = 1
    for train_index, test_index in kf.split(lecture_note_dataset):
        print("Train: ", train_index, "Test: ", test_index)
        train_dataset = [lecture_note_dataset[i] for i in train_index]
        dev_dataset = [lecture_note_dataset[i] for i in test_index]
        utils.write2csv(train_dataset,
                        args.fold_dir + "/mturk_self_train_" + str(count) + ".csv",
                        constants.note_tsv_header)
        utils.write2csv(dev_dataset,
                        args.fold_dir + "/mturk_self_dev_" + str(count) + ".csv",
                        constants.note_tsv_header)
        count = count + 1