def main5():
    """Scrape the personal-info listing pages (个人信息页) and write every table
    row of each page to personal_info.csv.

    Uses the module-level ``mytemp.getObj`` fetch helper and the ``csv``/``time``
    imports from the top of the file.
    """
    f5 = open('personal_info.csv', 'w+', newline='', encoding='gb18030')
    csv_write5 = csv.writer(f5)
    try:
        for i in range(1, 13):
            print(i)
            url = ('http://zupu.syshenshi.com/ZR.aspx?pageindex=' + str(i) +
                   '&infoname=&mobile=&infotype=&infotype_name=&industry=&address=&iswaiqian=0&fuqin=&muqin=&peiou=&zinv=&muyuandizhi=&waiqiandizhi=&chushengriqi_begin=&chushengriqi_end=&qushiriqi_begin=&qushiriqi_end=&waiqian_begin=&waiqian_end=')
            # The site intermittently fails; retry up to three times with a
            # pause instead of the original's duplicated nested try/except.
            trlist = None
            for attempt in range(3):
                try:
                    bsObj = mytemp.getObj(url)
                    trlist = bsObj.find('table', class_='stdtable').find('tbody').find_all('tr')
                    break
                except Exception:
                    if attempt == 2:
                        raise  # third failure propagates, as in the original
                    time.sleep(10)
            for tr in trlist:
                row = []
                for td in tr.find_all('td'):
                    # Keep the link target alongside the cell text when present.
                    if td.find('a') is not None:
                        row.append(td.find('a').attrs['href'])
                    row.append(td.get_text())
                csv_write5.writerow(row)
    finally:
        f5.close()  # original leaked the handle; ensure the CSV is flushed
def getLink():
    """Crawl 6 listing pages of project cards and write one CSV row per card.

    Relies on the module-level ``filename`` (output name / category label) and
    ``url`` (listing URL prefix that takes a page number suffix).
    """
    # with-statement fixes the original's leaked file handle
    with open(filename + '.csv', 'w+', newline='', encoding='gb18030') as f1:
        csv_write = csv.writer(f1)
        for i in range(1, 7):
            newurl = url + str(i)
            bsObj = mytemp.getObj(newurl).find('div', class_='sousuoListBox clearfix')
            divlist = bsObj.find_all('div', class_='ssCardItem')
            print(len(divlist))
            for div in divlist:
                a = div.find('a', class_='siteCardICH3')
                # commas in the title would break downstream CSV consumers
                title = a.attrs['title'].replace(',', '_')
                href = a.attrs['href']
                keylist = div.find('div', class_='siteCardFLabelBox siteIlB_box').find_all('a')
                keySum = len(keylist)
                haveGet = div.find('div', class_='ftDiv').find('p', class_='ftP').get_text()
                support = div.find('div', class_='scDiv').find('p', class_='ftP').get_text()
                jindu = div.find('div', class_='thDiv').find('p', class_='ftP').get_text()
                row = [filename, title, href, keySum, haveGet, support, jindu]
                print(row)
                csv_write.writerow(row)
def get_detail(line):
    """Fetch a 1688 product page (``line[1]`` is the URL) and append image link,
    net weight, spec (规格) and price/amount tiers to *line*, writing the result
    via the module-level ``csv_write``.

    Pages missing the weight block are logged to the module-level ``csv_write1``
    but scraping of the rest of the page still continues (best-effort).
    """
    bs_obj = mytemp.getObj(line[1], True)
    img_link = bs_obj.find('div', class_='tab-pane').find('img').attrs['src']
    weight = ''
    try:
        spans = bs_obj.find('div', class_='attributes-item mod-info kuajing-attribues').find_all('span')
        for w in spans:
            text = w.get_text()
            if '产品净重' in text:
                weight = text.replace('产品净重', '').replace('\n', '')
                break
    except AttributeError:  # weight block absent on some product pages
        csv_write1.writerow(line)
        print(line)
    # The spec value sits in the <td> immediately after the label cell.
    guige = ''
    td_list = bs_obj.find('div', {'id': 'mod-detail-attributes'}).find_all('td')
    for m in range(len(td_list)):
        if '规格' in td_list[m].get_text():
            guige = td_list[m + 1].get_text()
            break
    row = line + [img_link, weight, guige]
    # Price tiers: pair each price cell with its quantity-range cell.
    price_box = bs_obj.find('div', {'id': 'mod-detail-price'})
    price_td_list = price_box.find('tr', class_='price').find_all('td')
    amount_td_list = price_box.find('tr', class_='amount').find_all('td')
    for n in range(1, len(price_td_list)):
        row.append(price_td_list[n].get_text() + ' ' + amount_td_list[n].get_text())
    print(row)
    csv_write.writerow(row)
def get_detail():
    """Re-scrape rows that failed previously (wedding_error.csv): fetch the
    mobile Dianping page, pull address and phone, append successes to
    wedding_final.csv and still-failing rows to wedding_error1.csv.

    Uses the module-level ``cook`` session cookie and ``mytemp.getObj``.
    """
    # with-statement fixes the original's leaked output handles
    with open('wedding_final.csv', 'a+', encoding='gb18030', newline='') as f2, \
         open('wedding_error1.csv', 'w+', encoding='gb18030', newline='') as f3:
        csv_write2 = csv.writer(f2)
        csv_write3 = csv.writer(f3)
        for line in open('wedding_error.csv', 'r', encoding='gb18030'):
            line = line.split(',')
            if line[0][0] == '"':  # skip quoted/garbled rows
                continue
            # Rewrite the desktop URL into its mobile-site form (https://m...).
            url = str('https://m' + line[1][11:]).strip().replace('\n', '')
            print(url)
            bsObj = mytemp.getObj(url, False, cook)
            try:
                address = bsObj.find('div', class_='J_address').get_text().strip()
                # trailing tab — presumably keeps spreadsheet tools from
                # mangling the phone number; verify before removing
                phone = bsObj.find('div', class_='J_phone').get_text().strip().replace(' ', '') + '\t'
            except AttributeError:  # expected blocks missing -> queue for another pass
                csv_write3.writerow(line)
                print(line)
                continue
            row = line + [address, phone]
            print(row)
            csv_write2.writerow(row)
def main3(ty):
    """Fetch each person's detail page listed in p_name<ty>.txt and append the
    parsed table cells (as a stringified list) to P_detail<ty>.txt."""
    # with-statement fixes the original's leaked output handle
    with open('P_detail' + str(ty) + '.txt', 'w+', newline='', encoding='utf-8') as f_detail:
        for line in open('p_name' + str(ty) + '.txt', 'r', encoding='utf-8'):
            # NOTE(review): eval() on file content is unsafe; the rows look like
            # Python list literals, so ast.literal_eval would be the safe choice.
            line = eval(line)
            if line[0] == '0':  # placeholder id -> nothing to fetch
                continue
            url = 'http://zupu.syshenshi.com/zuren_detail.aspx?id=' + line[0]
            print(url)
            # Retry flaky fetches with a pause, replacing the original's
            # duplicated nested try/except blocks.
            trlist = None
            for attempt in range(3):
                try:
                    bsObj = mytemp.getObj(url)
                    trlist = bsObj.find('table', class_='stdtable').find('tbody').find_all('tr')
                    break
                except Exception:
                    if attempt == 2:
                        raise
                    time.sleep(20)
            row = line + []
            for tr in trlist:
                for td in tr.find_all('td', class_='center'):
                    row.append(td.get_text().replace(" ", ''))
                for td in tr.find_all('td', class_='left'):
                    row.append(td.get_text().replace(" ", ''))
            # 31-cell rows are missing two middle columns; pad to realign.
            if len(row) == 31:
                row = row[:26] + ['', ''] + row[26:]
            f_detail.write(str(row) + '\n')
def main7(id, ty, page):
    """Look up the record keyed by *id* in page<page>/p_name<ty>.txt, fetch its
    detail page and return the parsed row (list of strings).

    Raises ValueError when *id* is not present in the index file.
    """
    line = []
    for line in open('./page' + str(page) + '/p_name' + str(ty) + '.txt', 'r', encoding='utf-8'):
        # NOTE(review): rows are list literals; ast.literal_eval would be safer
        line = eval(line)
        if line[0] == id:
            print('right')
            break
    if line[0] != id:
        # bug fix: the original `raise id` raised TypeError (a str is not an
        # exception); raise a proper, catchable exception instead
        raise ValueError('id not found in index: ' + str(id))
    print(line[0])
    url = 'http://zupu.syshenshi.com/zuren_detail.aspx?id=' + line[0]
    print(url)
    # Retry flaky fetches with a pause (replaces duplicated try/except nesting).
    trlist = None
    for attempt in range(3):
        try:
            bsObj = mytemp.getObj(url)
            trlist = bsObj.find('table', class_='stdtable').find('tbody').find_all('tr')
            break
        except Exception:
            if attempt == 2:
                raise
            time.sleep(20)
    row = line + []
    for tr in trlist:
        for td in tr.find_all('td', class_='center'):
            row.append(td.get_text().replace(" ", ''))
        for td in tr.find_all('td', class_='left'):
            row.append(td.get_text().replace(" ", ''))
    if len(row) == 31:  # pad rows missing two middle columns
        row = row[:26] + ['', ''] + row[26:]
    print(row)
    return row
def get_detail():
    """Fetch the phone number for every link in wh_xyk_link.csv and write
    [name, url, phone] rows to wh_xyk_final.csv."""
    # with-statement fixes the original's leaked output handle
    with open('wh_xyk_final.csv', 'w+', encoding='gb18030', newline='') as f2:
        csv_write2 = csv.writer(f2)
        for line in csv.reader(open('wh_xyk_link.csv', 'r', encoding='gb18030')):
            url = 'http://wh.bqqm.com' + line[1]
            bsObj = mytemp.getObj(url)
            phone = bsObj.find('div', class_='telli').find('span', class_='p').get_text()
            row = [line[0], url, phone]
            print(row)
            csv_write2.writerow(row)
def getDetail(line):
    """Scrape one crowdfunding project page (``line[2]`` is its URL) and append
    the detail metrics to *line*, writing the result via the module-level
    ``csv_write2``. Unfetchable or partial pages are logged to the module-level
    ``f3`` and skipped.
    """
    url = line[2]
    bsObj = mytemp.getObj(url)
    if bsObj is None:
        print(url)
        # bug fix: the original wrote the literal string 'url', not the URL
        f3.write(url + '\n')
        return
    try:
        target = bsObj.find('div', class_='xqRatioText clearfix').find('b').get_text()
    except AttributeError:
        print(url)
        f3.write(url + '\n')
        return
    concern = bsObj.find('div', class_='xqDetailLeft siteImgBox').find('a').get_text()
    refresh = bsObj.find('li', {'data-scrollto': 'zxjzBox'}).find('b').get_text()
    comment = bsObj.find('li', {'data-scrollto': 'plOuterBox'}).find('b').get_text()
    supportTime = bsObj.find('li', {'data-scrollto': 'zczOuterBox'}).find('b').get_text()
    try:
        footlist = bsObj.find('div', class_='zcjeOuterBox').find_all('div', class_='zcjeFooter')
    except AttributeError:
        print(url)
        f3.write(url + '\n')
        return
    # Most recent support time: last <b> of the last footer block.
    reTime = footlist[-1].find_all('b')[-1].get_text()
    pic = '有' if bsObj.find('div', {'id': 'xmxqBox'}).find('img') is not None else '无'
    video = '有' if bsObj.find('div', class_='play-box') is not None else '无'
    row = line + [target, concern, refresh, comment, supportTime, reTime, pic, video]
    print(row)
    csv_write2.writerow(row)
def get_link():
    """Walk 50 Dianping wedding listing pages and save [title, href] per shop.

    Uses module-level ``urlroot1``/``urlroot2`` URL parts and the ``cook`` cookie.
    """
    # with-statement fixes the original's leaked output handle
    with open('dzdp_wedding.csv', 'w+', encoding='gb18030', newline='') as f1:
        csv_write1 = csv.writer(f1)
        for i in range(1, 51):
            url = urlroot1 + str(i) + urlroot2
            bsObj = mytemp.getObj(url, False, cook)
            lilist = bsObj.find_all('div', class_='txt')
            print(len(lilist))
            for li in lilist:
                a = li.find('div', class_='tit').find('a')
                href = a.attrs['href']
                title = a.find('h4').get_text()
                row = [title, href]
                print(row)
                csv_write1.writerow(row)
def getcity():
    """Scrape the to8to.com home-page city selector and write [name, href] rows
    to city.csv."""
    # with-statement fixes the original's leaked output handle
    with open('city.csv', 'w+', encoding='gb18030', newline='') as f2:
        csv_write2 = csv.writer(f2)
        url = 'http://www.to8to.com/index.html'
        bsObj = mytemp.getObj(url, False, cook)
        divbox = bsObj.find('div', {'id': 'city_box'})
        for box in divbox.find_all('div', class_='cs_zs'):
            print(box)
            for city in box.find('div', class_='xzcs_dt').find_all('a'):
                href = city.attrs['href']
                cityname = city.get_text().strip()
                row = [cityname, href]
                print(row)
                csv_write2.writerow(row)
def getDetail(url):
    """Fetch an enf.com.cn company profile (*url* is site-relative) and write
    [url, name, email, website, address] via the module-level ``csv_write``."""
    url = 'https://www.enf.com.cn' + url
    print(url)
    bsObj = mytemp.getObj(url, True)
    div = bsObj.find('div', class_='enf-company-profile-info-main pull-left')
    h1 = div.find('h1', class_='blue-title').get_text().replace('\n', '').strip()
    try:
        email = div.find('td', itemprop='email').find('a').get_text()
    except AttributeError:  # some profiles publish no email
        email = ''
    href = div.find('a', itemprop='url').attrs['title']
    tablelist = div.find('div', class_='enf-company-profile-info-main-spec position-relative').find_all('table')
    # The address lives in the second cell of the last spec table.
    address = tablelist[-1].find_all('td')[1].get_text()
    row = [url, h1, email, href, address]
    csv_write.writerow(row)
    print(row)
def main1():
    """Crawl 708 forum listing pages and write [post_href, post_title,
    author_href, author_name] rows to qd1.csv.

    Uses module-level ``urlRoot`` and ``cookie``.
    """
    # with-statement fixes the original's leaked output handle
    with open('qd1.csv', 'w+', newline='', encoding='gb18030') as f:
        csv_write = csv.writer(f)
        for i in range(0, 708):
            newUrl = urlRoot + str(i) + '.html'
            print('正在爬取第' + str(i) + '页...')
            lilist = mytemp.getObj(newUrl, cookie).find('div', class_='post-list').find_all('li', class_='post-item clearfix')
            print(len(lilist))
            for li in lilist:
                # Last <a> in the title / author spans is the real link.
                a = li.find('h2').find_all('a')[-1]
                a1 = li.find('span', class_='author').find_all('a')[-1]
                if 'href' not in a.attrs:
                    print('position error')
                else:
                    wr = [a['href'], a.get_text(), a1['href'], a1.get_text()]
                    csv_write.writerow(wr)
def main1():
    """Fetch CAAC enterprise detail pages for each entry in fsop_link.txt and
    append the parsed fields to up_info.csv."""
    # with-statement fixes the original's leaked output handle
    with open('up_info.csv', 'a+', encoding='gb18030', newline='') as f2:
        csv_write2 = csv.writer(f2)
        for line in open('fsop_link.txt', 'r', encoding='utf-8'):
            # NOTE(review): rows are list literals; ast.literal_eval would be safer
            line = eval(line)
            print(line[3])
            url = ('http://fsop.caac.gov.cn/g145/CARS/WebSiteQueryServlet'
                   '?method=loadEnterpriseDetail&enterpriseId=' + line[3])
            bsObj = mytemp.getObj(url)
            # NOTE(review): the unescaped '(' here open capture groups rather
            # than matching literal parentheses of "$.trim(...)"; kept as-is
            # because downstream data already uses whatever this captures —
            # verify intent before tightening the pattern.
            t = re.search(r'\$.trim((.+?))}', str(bsObj)).group(1)
            divlist = bsObj.find('div', 'am-u-md-10').find_all('div', class_='am-g am-margin-top')
            address = divlist[2].find('div', class_='am-u-sm-8 am-u-md-8 am-u-end').get_text().replace('\t', '').replace('\n', '').replace('\xa0', '').replace('\r', '').strip()
            expirated = divlist[3].find('div', class_='am-u-sm-8 am-u-md-8 am-u-end').get_text().replace('\t', '').replace('\n', '').replace('\r', '').strip().split(' ')[0]
            scan = divlist[4].find('div', class_='am-u-sm-8 am-u-md-8 am-u-end')
            Scannedname = scan.get_text().replace('\t', '').replace('\n', '').replace('\r', '').strip()
            try:
                Scannedurl = 'http://fsop.caac.gov.cn' + scan.find('a').attrs['href']
            except AttributeError:  # no scanned-certificate link on the page
                Scannedurl = ''
            Limited = divlist[5].find('div', class_='am-u-sm-8 am-u-md-8 am-u-end').get_text().replace('\t', '').replace('\n', '').replace('\r', '').strip()
            row = [url, t] + line[:3] + [address, expirated, Scannedname, Scannedurl, Limited]
            csv_write2.writerow(row)
def getdetail():
    """For every city in city.csv, page through its to8to company list and write
    [city, company, phone, link] rows to company.csv.

    A city's pagination ends when the 'no results' marker appears or when the
    company-list block goes missing.
    """
    # with-statement fixes the original's leaked output handle
    with open('company.csv', 'w+', encoding='gb18030', newline='') as f1:
        csv_write1 = csv.writer(f1)
        for line in csv.reader(open('city.csv', 'r', encoding='gb18030')):
            time.sleep(3)  # be polite between cities
            city_name = line[0]
            print(city_name)
            if city_name == '昆明':  # presumably already scraped; skip
                continue
            url_root1 = line[1] + 'company/list_'
            i = 1
            while True:
                print(i)
                url = url_root1 + str(i) + '.html'
                bsObj = mytemp.getObj(url, False, cook)
                if bsObj.find('div', class_='xgt_meitu_searchNone') is not None:
                    break  # 'no results' page -> done with this city
                div = bsObj.find('div', class_='default__company__list')
                try:
                    lilist = div.find_all('li', class_='company-data ')
                except AttributeError:  # list block missing / layout changed
                    print([city_name, i])
                    break
                for li in lilist:
                    href = li.find('a').attrs['href']
                    phone = ''
                    try:
                        phone = li.find('p', class_='company__phone').get_text().strip()
                    except AttributeError:
                        pass  # phone not listed for this company
                    name = li.find('p', class_='company__name').find('span', class_='name').get_text().strip()
                    row = [city_name, name, phone, href]
                    print(row)
                    csv_write1.writerow(row)
                i = i + 1
def main1():
    """Collect the master genealogy (族谱) index: walk listing pages until an
    empty one appears, recording [name, href] per entry in zupu.txt."""
    # with-statement fixes the original's leaked output handle
    with open('zupu.txt', 'w+', encoding='utf-8') as f1:
        for i in range(1, 100):
            url = 'http://zupu.syshenshi.com/Index.aspx?pageindex=' + str(i) + '&infoname='
            bsObj = mytemp.getObj(url)
            trlist = bsObj.find('table', class_='stdtable').find('tbody').find_all('tr')
            le = len(trlist)
            print(le)
            if le == 0:  # an empty page marks the end of the index
                break
            for tr in trlist:
                td = tr.find('td')
                name = td.get_text()
                href = td.find('a').attrs['href']
                row = [name, href]
                print(row)
                f1.write(str(row) + '\n')
def get_link():
    """Walk 50 atobo.com.cn listing pages (page 1 uses a different URL form)
    and write [title, link] rows to company_link.csv."""
    # with-statement fixes the original's leaked output handle
    with open('company_link.csv', 'w+', encoding='gb18030', newline='') as f1:
        csv_write = csv.writer(f1)
        for i in range(1, 51):
            if i == 1:
                url = 'https://www.atobo.com.cn/Companys/s-p26-s579/'
            else:
                url = 'https://www.atobo.com.cn/Companys/s-p26-s579-y' + str(i) + '/'
            bsObj = mytemp.getObj(url, False, cook)
            lilist = bsObj.find('div', class_='product_contextlist bplist').find_all('li', class_='product_box')
            for li in lilist:
                attrS = li.find('li', class_='pp_name').find('a').attrs
                row = [attrS['title'], attrS['href']]
                print(row)
                csv_write.writerow(row)
from bs4 import BeautifulSoup
import requests
import csv
import sys
from urllib.error import HTTPError

sys.path.append("..")  # make the shared fetch helper (mytemp) importable
import mytemp

# Smoke-test fetch of the jiangshi.org search-results page; the result is
# discarded — presumably a connectivity check. TODO confirm intent.
url = 'http://www.jiangshi.org/search/kw_NULL_order_1_costmin_0_costmax_0_area_0_page_1.html'
mytemp.getObj(url, )
attrS = li.find('li', class_='pp_name').find('a').attrs title = attrS['title'] link = attrS['href'] row = [title, link] print(row) csv_write.writerow(row) f2 = open('company_final.csv', 'a+', encoding='gb18030', newline='') csv_write2 = csv.writer(f2) f3 = open('company_error1.csv', 'w+', encoding='gb18030', newline='') csv_write3 = csv.writer(f3) for line in csv.reader(open('company_error.csv', 'r', encoding='gb18030')): url = line[1] print(url) bsObj = mytemp.getObj(url, True, cook).find('div', class_='card-context') try: ullist = bsObj.find_all('ul') except: time.sleep(2) bsObj = mytemp.getObj(url, True, cook).find('div', class_='card-context') try: ullist = bsObj.find_all('ul') except: csv_write3.writerow(line) continue name = '' tele = '' for ul in ullist: text = ul.get_text()
from urllib.error import HTTPError sys.path.append("..") import mytemp # lt=['https://jinbaohong.1688.com/page/offerlist.htm?spm=a2615.7691456.newlist.98.7feb58b2CWNKTL&showType=windows&tradenumFilter=false&sampleFilter=false&sellerRecommendFilter=false&videoFilter=false&mixFilter=false&privateFilter=false&mobileOfferFilter=%24mobileOfferFilter&groupFilter=false&sortType=wangpu_score&pageNum=',] # url_root1='https://dmqxtoy.1688.com/page/offerlist_96911367.htm?spm=a2615.7691456.newlist.101.29ea157aMi6TDa&tradenumFilter=false&sampleFilter=false&sellerRecommendFilter=false&videoFilter=false&mixFilter=false&privateFilter=false&mobileOfferFilter=%24mobileOfferFilter&groupFilter=false&sortType=wangpu_score&pageNum=' # url_root1='https://dmqxtoy1480648284896.1688.com/page/offerlist_90101145.htm?tradenumFilter=false&sampleFilter=false&sellerRecommendFilter=false&videoFilter=false&mixFilter=false&privateFilter=false&mobileOfferFilter=%24mobileOfferFilter&groupFilter=false&sortType=wangpu_score&pageNum=' url_root1 = 'https://dmqxtoy.1688.com/page/offerlist.htm?spm=a2615.7691456.newlist.167.67e35eeaiIPXBE&tradenumFilter=false&sampleFilter=false&sellerRecommendFilter=false&videoFilter=false&mixFilter=false&privateFilter=false&mobileOfferFilter=%24mobileOfferFilter&groupFilter=false&sortType=wangpu_score&pageNum=' cook = '_csrf_token=1537943057961; cna=h9/yEVZzcBsCAXrNB8OEFnAP; JSESSIONID=t2xYdgd-GQSaYYwNgVts1vi3uB-8KTUj4R-xUw9; cookie2=13979f139a3eb43841d37d6bc89a466b; hng=CN%7Czh-CN%7CCNY%7C156; t=8464a4877acc365c142612d5c3535d73; _tb_token_=e76e553eeabb5; __cn_logon__=false; 
_tmp_ck_0=8DDX%2FdG7Qp4rD7%2BxIUDnt9IQQps0gXg0HxVorQnE1U9h1%2BJEDnSwTlDtAa6rnpNYZgyqZ04NLkFhoSZa62xDsmigmkh2C96Zwc3vUnWDAhJ43V6s20UdSFlrW8%2FNWLYvHlj66I9JrtQip0ZR%2BRGLKKCEOXBffuleBNW%2FlS0fLs8OnUFoyVr41%2ByFFe5lUUpWm0PqHiBd%2BiTQ%2BKt8xZ%2B19oW%2B3O9I5Fgowiwd5ki0VANNbNmhmFrqb%2Fjp9ubGsMV6YsxqeJspp5sJ2OoGdhC1HaX3ei%2BzZp7K%2F2ZEmTBCHj6Lr%2Frb4LupMxN1ER%2FNrE1Rayz%2Bf5KJZR%2FhvH7p5zFUPNE1n6nNZD6KoWyAUEbhjPQYWxDmAE23JAQ0E6hEqETG; UM_distinctid=166148de0821ae-0ff23d8e76a24-b353461-100200-166148de08414f; alicnweb=touch_tb_at%3D1537948661631; isg=BEBALzn_D6g1YfOsTPGJT6ajEc7YuyZOwdqqALrREdvqNeBfYt1bIxZPSN1Qptxr' f1 = open('dmqxtoy_f.csv', 'w+', newline='', encoding='gb18030') csv_write = csv.writer(f1) page_error = [] for i in range(1, 10000): print(i) bs_obj = mytemp.getObj(url_root1 + str(i), True, cook) common = bs_obj.find('div', class_='common-column-230') if common == None: try: common = bs_obj.find('div', class_='no-content').get_text() print(page_error) break except: page_error.append(i) li_list = common.find('ul', class_='offer-list-row').find_all( 'li', class_='offer-list-row-offer') for li in li_list: attrS = li.find('a', class_='title-link').attrs title = attrS['title'] link = attrS['href']
def get_gps():
    """Second pass over attractions_msg_final.csv: for rows still missing an
    address, re-fetch the detail page to fill phone, address, traffic and the
    GPS map-image link, appending results to attractions_msg_final1.csv.
    """
    # with-statement fixes the original's leaked file handles
    with open('attractions_msg_final.csv', 'r', encoding='gb18030') as fRead, \
         open('attractions_msg_final1.csv', 'a+', newline='', encoding='gb18030') as f_attractions_msg:
        csv_write = csv.writer(f_attractions_msg)
        # Output column layout; the header row was written on an earlier run.
        headrow = ['城市', '城市链接', '景点', '景点链接', '缩略图链接', '评分', '简介', '百度百科',
                   '开放时间', '门票信息', '游玩时长', '最佳季节', '官网', '电话', '地址', '交通', 'gps_链接']
        for line in csv.reader(fRead):
            # Columns 8-11 were doubled upstream; keep only the first half.
            for i in range(8, 12):
                t = line[i]
                if t != '':
                    line[i] = t[:int(len(t) / 2)]
            if line[14] != '':  # address already filled -> nothing to do
                continue
            line = line[:16] + ['']
            newUrl = line[3]
            print(newUrl)
            # The detail block sometimes fails to render; retry a few times.
            div = None
            for _ in range(3):
                bsObj = mytemp.getObj(newUrl)
                div = bsObj.find('div', class_='c-list')
                if div is not None:
                    break
            if div is None:
                csv_write.writerow(line)
                continue
            # Phone: prefer the tel: link target, fall back to the cell text.
            for d in div.find_all('div', class_='c-gap-inner-top c-flexbox'):
                dlist = d.find_all('div')
                title = dlist[0].get_text().strip()
                if title != '电话':
                    continue
                try:
                    text = d.find('div', class_='c-span12').find('a').attrs['href'].replace('Tel:', '')
                except AttributeError:
                    text = d.find('div', class_='c-span12').get_text().strip()
                # leading '_' marker — presumably keeps spreadsheet tools from
                # reformatting the number; verify before removing
                line[13] = '_' + text
            try:
                gps_link = bsObj.find('div', class_='c-img c-img-item c-img-v').find('img').attrs['src']
            except AttributeError:
                gps_link = ''
                print('gps_link_error')
            try:
                address = bsObj.find('div', class_='map-address c-gap-top').find('p', class_='map-address-text').get_text()
            except AttributeError:
                address = ''
                print('address_error')
            try:
                traffic = bsObj.find('div', class_='map-traffic c-gap-top').find('div', class_='map-traffic-container').get_text().replace('交通', '').strip()
            except AttributeError:
                traffic = ''
                print('traffic_error')
            if line[15] == '':  # don't clobber traffic filled on a prior pass
                line[15] = traffic
            line[14] = address
            line[16] = gps_link
            csv_write.writerow(line)
def get_error_link1():
    """Retry the attraction detail pages recorded in
    attractions_get_error_link1.txt: look each link up in the first-pass CSV,
    re-scrape the labelled detail fields and append the row to
    attractions_msg_final.csv. Links that still fail go to error3.csv /
    error4.txt.
    """
    f_attractions_msg = open('attractions_msg_final.csv', 'a+', newline='', encoding='gb18030')
    csv_write = csv.writer(f_attractions_msg)
    headrow = ['城市', '城市链接', '景点', '景点链接', '缩略图链接', '评分', '简介', '百度百科',
               '开放时间', '门票信息', '游玩时长', '最佳季节', '官网', '电话', '地址']
    f_error3 = open('error3.csv', 'w+', encoding='gb18030', newline='')
    f_error2 = open('error4.txt', 'w+')
    # bug fix: the original did `csv.write1 = csv.writer(...)`, monkey-patching
    # an attribute onto the csv module instead of binding a local writer
    csv_write1 = csv.writer(f_error3)
    data1 = pd.read_csv('attractions_msg_first.csv', header=None, encoding='gb18030')
    for attractions_link in open('attractions_get_error_link1.txt', 'r'):
        # First 6 columns of the first-pass row for this link.
        r = data1[data1[3] == attractions_link.strip()].values[0][:6]
        row = list(r) + ['' for i in range(20)]
        newUrl = attractions_link
        print(newUrl)
        # bug fix: the original detected total failure with `if i == 4` after
        # `for i in range(3)`, which is unreachable; use an explicit flag.
        fetched = False
        for i in range(3):
            try:
                bsObj = mytemp.getObj(newUrl)
                div = bsObj.find('div', class_='c-list')
                # Baidu Baike summary appears under one of two layouts.
                try:
                    bdbk = div.find('div', class_='c-list-item c-line-bottom c-list-border').get_text().replace('百度百科', '')
                except AttributeError:
                    try:
                        bdbk = div.find('div', class_='c-color c-gap-bottom c-line-clamp3').get_text().replace('百度百科', '')
                    except AttributeError:
                        bdbk = ''
                fetched = True
                break
            except Exception:
                time.sleep(2)
        if not fetched:
            print('error1')
            csv_write1.writerow(r)
            continue
        try:
            divlist1 = div.find_all('div', class_='c-gap-inner-top c-flexbox c-line-bottom c-gap-inner-bottom')
            divlist2 = div.find_all('div', class_='c-gap-inner-top c-flexbox')
            for d in divlist1:
                dlist = d.find_all('div')
                title = dlist[0].get_text().strip()
                text = d.find('div', class_='c-span12').get_text().strip()
                if title not in headrow:  # grow the schema for unseen labels
                    headrow.append(title)
                row[headrow.index(title)] = text
            for d in divlist2:
                dlist = d.find_all('div')
                title = dlist[0].get_text().strip()
                try:
                    # prefer the tel: link target when present
                    text = d.find('div', class_='c-span12').find('a').attrs['href'].replace('Tel:', '')
                except AttributeError:
                    text = d.find('div', class_='c-span12').get_text().strip()
                if title not in headrow:
                    headrow.append(title)
                    print(headrow)
                row[headrow.index(title)] = text
        except Exception:
            print('error2')
            # bug fix: r is a pandas/NumPy slice, not a str — stringify it
            f_error2.write(str(r) + '\n')
        csv_write.writerow(row)
    print(headrow)
def get_detail():
    """First full-detail pass: for every attraction in attractions_msg_first.csv
    fetch its detail page, collect the labelled fields (Baike summary, open
    hours, tickets, phone via tel: link, ...) into a headrow-indexed row and
    append it to attractions_msg_final.csv.
    """
    # with-statement fixes the original's leaked file handles
    with open('attractions_msg_first.csv', 'r', encoding='gb18030') as fRead, \
         open('attractions_msg_final.csv', 'a+', newline='', encoding='gb18030') as f_attractions_msg:
        csv_write = csv.writer(f_attractions_msg)
        headrow = ['城市', '城市链接', '景点', '景点链接', '缩略图链接', '评分', '简介', '百度百科',
                   '开放时间', '门票信息', '游玩时长', '最佳季节', '官网', '电话', '地址']
        for line in csv.reader(fRead):
            line = line[:7]
            row = line + ['' for i in range(14)]
            newUrl = line[3]
            print(line[0], line[2])
            # The detail block sometimes fails to render; retry a few times.
            div = None
            for i in range(3):
                bsObj = mytemp.getObj(newUrl)
                div = bsObj.find('div', class_='c-list')
                if div is not None:
                    break
            if div is None:
                csv_write.writerow(line)
                continue
            # Baidu Baike summary appears under one of two layouts.
            try:
                bdbk = div.find('div', class_='c-list-item c-line-bottom c-list-border').get_text().replace('百度百科', '')
            except AttributeError:
                try:
                    bdbk = div.find('div', class_='c-color c-gap-bottom c-line-clamp3').get_text().replace('百度百科', '')
                except AttributeError:
                    bdbk = ''
            row[7] = bdbk
            divlist1 = div.find_all('div', class_='c-gap-inner-top c-flexbox c-line-bottom c-gap-inner-bottom')
            divlist2 = div.find_all('div', class_='c-gap-inner-top c-flexbox')
            for d in divlist1:
                dlist = d.find_all('div')
                title = dlist[0].get_text().strip()
                text = d.find('div', class_='c-span12').get_text().strip()
                if title not in headrow:  # grow the schema for unseen labels
                    headrow.append(title)
                row[headrow.index(title)] = text
            for d in divlist2:
                dlist = d.find_all('div')
                title = dlist[0].get_text().strip()
                try:
                    # prefer the tel: link target when present
                    text = d.find('div', class_='c-span12').find('a').attrs['href'].replace('Tel:', '')
                except AttributeError:
                    text = d.find('div', class_='c-span12').get_text().strip()
                if title not in headrow:
                    headrow.append(title)
                    print(headrow)
                row[headrow.index(title)] = text
            csv_write.writerow(row)
        print(headrow)
import requests
import csv
import time
import json
import re
import sys
from urllib.error import HTTPError

sys.path.append("..")  # make the shared fetch helper (mytemp) importable
import mytemp

# Tmall shop search-results page (Huggies store, page 2; keyword is GBK-encoded).
url='https://huggies.tmall.com/category-1038993307.htm?spm=a1z10.3-b-s.w4011-14533183956.292.787976ceiOvHAT&type=p&newHeader_b=s_from&searcy_type=item&from=.shop.pc_2_searchbutton&catId=1038993307&keyword=%BA%C3%C6%E6&pageNo=2&tsearch=y#anchor'
# Session cookie — presumably captured from a logged-in browser; expires over time.
cook='cna=h9/yEVZzcBsCAXrNB8OEFnAP; hng=CN%7Czh-CN%7CCNY%7C156; _m_h5_tk=64705fff8ba6e9dd50853f5f5b9929cb_1539873398765; _m_h5_tk_enc=5301f3c7015e41bb1026840101c64f24; t=8464a4877acc365c142612d5c3535d73; _tb_token_=753361be35e5a; cookie2=1a3195a39b1ad24bf437640212c57659; pnm_cku822=; cq=ccp%3D1; isg=BFRUCI2p88yHkGdJey2xbUrjJZLMozARvabeTO41QF9Z2fUjFr4_J6Mf3ZFkIbDv'
# Request headers prepared for requests-based fetching; currently only the
# cookie is used (getObj takes it directly below).
header={
    'Cookie':cook,
    # referer:https://detail.tmall.com/item.htm?spm=a1z10.3-b-s.w4011-14466283798.50.43547440H0DfBt&id=40545566432&rn=6fe19d846dcf80351798b26fa5bfdf95&abbucket=20&skuId=3854687709703'
}
# Fetch and dump the page; the item-list parsing step is still commented out.
bsObj=mytemp.getObj(url,False,cook)
print(bsObj)
# print(bsObj.find('div',class_='J_TItems'))