import csv
import re
import time

import requests
from bs4 import BeautifulSoup
from requests.exceptions import HTTPError

import util  # project-local helper module (not shown here) that wraps proxied requests


def getObj(url, proxyIs=False, cook=None, encode='utf-8'):
    # Fetch a page (optionally through a proxy and/or with cookie headers)
    # and return it as a BeautifulSoup object, retrying up to five times.
    for i in range(5):
        try:
            if cook is None:
                if not proxyIs:
                    html = requests.get(url, timeout=15)
                else:
                    html = util.build_proxy_request(url)
            else:
                if not proxyIs:
                    html = requests.get(url, headers=getHeaders(cook), verify=False)
                else:
                    header = getHeaders(cook)
                    # Pass the built header dict, not the getHeaders function itself.
                    html = util.build_proxy_request(url, None, header, None)
            html.encoding = encode
            break
        except HTTPError:
            time.sleep(10)
            print('retrying request')
    else:
        # All five attempts failed.
        print('HTTPError')
        return None
    try:
        bsObj = BeautifulSoup(html.text, 'html.parser')
    except AttributeError:
        print('AttributeError')
        return None
    return bsObj
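# getObj relies on a getHeaders() helper that is not included in this file.
# From the call sites it takes a cookie string and returns a header dict for
# requests.get; the sketch below is an assumption about its shape, not the
# project's actual implementation.
def getHeaders(cook):
    # Hypothetical header builder: a browser-like User-Agent plus the cookie.
    return {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
        'Cookie': cook,
    }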
def main2():
    # For each travelogue URL collected earlier, fetch the page and extract
    # the travel month, trip length, and the set of attractions mentioned.
    root2 = 'http://www.mafengwo.cn'
    f_error1 = open('logtime.txt', 'a+')
    f_error2 = open('logplace.txt', 'a+')
    f1 = open('qd1.csv', 'r', encoding='gb18030', newline='')
    f = open('qd2.csv', 'a+', newline='', encoding='gb18030')
    csv_write = csv.writer(f)
    # Strip stray NUL bytes that would break csv.reader.
    for info in csv.reader(line.replace('\0', '') for line in f1):
        newUrl = root2 + info[0]
        html = util.build_proxy_request(newUrl)
        obj = BeautifulSoup(html.text, 'html.parser')
        place = set()
        for p in obj.find_all('a', class_='_j_anchor'):
            place.add(p.get_text().strip())
        if not place:
            print(newUrl + ' has no attractions')
            # URLs are written back-to-back with no separator;
            # main3 re-splits this file on the 'http' prefix.
            f_error2.write(newUrl)
            continue
        try:
            trip_time = obj.find('li', class_='time').get_text().split('/')[1]
            day = obj.find('li', class_='day').get_text().split('/')[1]
        except (AttributeError, IndexError):
            print(newUrl + ' has no travel time')
            f_error1.write(newUrl)
            continue
        row = [newUrl, info[1], trip_time, day, ' '.join(place)]
        print(row)
        csv_write.writerow(row)
def main3():
    # Reprocess the URLs logged in logplace.txt (travelogues whose attraction
    # list was empty on the first pass): this time accept entries with no
    # travel time and check a second anchor class for attractions.
    f1 = open('logplace.txt', 'r', encoding='gb18030', newline='')
    f = open('qd2.csv', 'a+', newline='', encoding='gb18030')
    csv_write = csv.writer(f)
    text = f1.read()
    # The log holds URLs back-to-back with no separator, so split on the
    # 'http' prefix and re-attach it.
    textlist = text.split('http')
    print(len(textlist))
    for i in range(1, len(textlist)):
        newUrl = 'http' + textlist[i]
        html = util.build_proxy_request(newUrl)
        obj = BeautifulSoup(html.text, 'html.parser')
        try:
            trip_time = obj.find('li', class_='time').get_text().split('/')[1]
            day = obj.find('li', class_='day').get_text().split('/')[1]
        except (AttributeError, IndexError):
            trip_time = ''
            day = ''
        place = set()
        for p in obj.find_all('a', class_='_j_anchor'):
            place.add(p.get_text().strip())
        for p in obj.find_all('a', class_='_j_keyword_list'):
            place.add(p.get_text().strip())
        if not place:
            print(newUrl + ' has no attractions')
            continue
        row = [newUrl, trip_time, day, ' '.join(place)]
        print(row)
        csv_write.writerow(row)
def get_city_again():
    # Retry the city pages that failed in get_city_mag, with a fallback
    # selector for the Baidu Baike summary.
    f_city_msg = open('city_msg.csv', 'a+', newline='', encoding='gb18030')
    csv_write = csv.writer(f_city_msg)
    f_error = open('city_error2.txt', 'w+')
    for line in open('city_error1.txt', 'r'):
        try:
            newUrl = line.strip()  # drop the trailing newline before requesting
            city = re.search(r'word=(.+?)&', newUrl).group(1)
            req = util.build_proxy_request(newUrl)
            bsObj = BeautifulSoup(req.text, 'html.parser')
            div = bsObj.find('div', class_='ly-city')
            image_link = div.find(
                'div', class_='c-row-tile c-gap-bottom ly-city-info-position'
            ).find('img').attrs['src']
            try:
                # Strip the '适宜季节:' ("suitable season") and '建议游玩:'
                # ("suggested visit length") labels from the page text.
                suit_season = div.find(
                    'div', class_='c-line-clamp1 c-span6').get_text().replace(
                        '适宜季节:', '').replace('建议游玩:', '')
            except AttributeError:
                suit_season = ''
            try:
                bdbk = div.find('span', class_='c-color').get_text()
            except AttributeError:
                bdbk = div.find('div', class_='c-color c-line-clamp3').get_text()
            row = [city, newUrl, image_link, suit_season, bdbk]
            print(row)
            csv_write.writerow(row)
        except:
            print(newUrl)
            f_error.write(newUrl + '\n')
def get_city_mag():
    # Fetch the Baidu mobile city-overview card for every city in city.csv
    # and record its thumbnail link, suitable season, and Baidu Baike summary.
    urlroot1 = 'https://www.baidu.com/sf?openapi=1&dspName=iphone&from_sf=1&pd=city&ms=1&hide=1&apitn=tangram&top=%7B%22sfhs%22%3A2%7D&tfr=redis&resource_id=4324&word='
    urlroot2 = '&title=%E7%9B%AE%E7%9A%84%E5%9C%B0%E6%94%BB%E7%95%A5&city_name=&frsrcid=&frorder=&lid=&ext=%7B%22sf_tab_name%22%3A%22%E6%A6%82%E8%A7%88%22%7D&sa=sf_tab1'
    fRead = open('city.csv', 'r', encoding='gb18030')
    f_city_msg = open('city_msg.csv', 'w+', newline='', encoding='gb18030')
    f_error = open('city_error.txt', 'w+')
    csv_write = csv.writer(f_city_msg)
    # Header row: city, link, thumbnail link, suitable season, Baidu Baike.
    csv_write.writerow(['城市', '链接', '缩略图链接', '适宜季节', '百度百科', '', ''])
    for line in csv.reader(fRead):
        try:
            newUrl = urlroot1 + line[0] + urlroot2
            req = util.build_proxy_request(newUrl)
            bsObj = BeautifulSoup(req.text, 'html.parser')
            div = bsObj.find('div', class_='ly-city')
            image_link = div.find(
                'div', class_='c-row-tile c-gap-bottom ly-city-info-position'
            ).find('img').attrs['src']
            suit_season = div.find(
                'div', class_='c-line-clamp1 c-span6').get_text().replace('适宜季节:', '')
            bdbk = div.find('span', class_='c-color').get_text()
            row = [line[0], newUrl, image_link, suit_season, bdbk]
            print(row)
            csv_write.writerow(row)
        except:
            print(newUrl)
            f_error.write(newUrl + '\n')
def main1():
    # Crawl the travelogue list pages and record each post's URL and date.
    # urlRoot is a module-level constant defined elsewhere in the project.
    f = open('qdtime.csv', 'w+', newline='', encoding='gb18030')
    csv_write = csv.writer(f)
    for i in range(0, 720):  # 708
        newUrl = urlRoot + str(i) + '.html'
        for attempt in range(5):
            try:
                html = util.build_proxy_request(newUrl)
                obj = BeautifulSoup(html.text, 'html.parser')
                lilist = obj.find('div', class_='post-list').find_all(
                    'li', class_='post-item clearfix')
                break
            except:
                pass
        else:
            # All five attempts failed; skip this page instead of reusing a
            # stale lilist from the previous iteration.
            print('not found: ' + newUrl)
            continue
        print(len(lilist))
        for li in lilist:
            a = li.find('h2').find_all('a')[-1]
            post_date = li.find('span', class_='comment-date').get_text().split(' ')[0]
            if 'href' not in a.attrs:
                print('position error')
            else:
                wr = [a['href'], post_date]
                csv_write.writerow(wr)
def getlink():
    # Crawl the eBay toy listing pages and record each item's name, URL,
    # watch/sold count, and location. `url` is a module-level constant
    # defined elsewhere in the project.
    f = open('ebay_toys.csv', 'w+', newline='', encoding='gb18030')
    csv_write = csv.writer(f)
    f1 = open('log.txt', 'w+')
    for i in range(1, 100):
        newUrl = url + str(i)
        for j in range(5):
            try:
                req = util.build_proxy_request(newUrl)
                bsObj = BeautifulSoup(req.text, 'html.parser')
                lilist = bsObj.find('ul', class_='b-list__items_nofooter').find_all('li')
                break
            except:
                pass
        else:
            # No data could be fetched for this page; log it and move on.
            f1.writelines(str(i) + '\n')
            lilist = []
        for li in lilist:
            name = ''
            the_url = ''
            watch_sold = ''
            location = ''
            try:
                name = li.find('h3', class_='s-item__title').get_text()
                the_url = li.find('a', class_='s-item__link').attrs['href']
            except AttributeError:
                pass
            try:
                watch_sold = li.find('span', class_='NEGATIVE').get_text()
            except AttributeError:
                pass
            try:
                location = li.find(
                    'span', class_='s-item__location s-item__itemLocation').get_text()
            except AttributeError:
                pass
            row = [name, the_url, watch_sold, location]
            print(row)
            csv_write.writerow(row)
def getAttractions():
    # Fetch the Baidu mobile attractions tab for every city in city.csv and
    # record each attraction's name, link, thumbnail, score, and blurb.
    urlroot3 = 'https://www.baidu.com/sf?openapi=1&dspName=iphone&from_sf=1&pd=city&ms=1&hide=1&apitn=tangram&top=%7B%22sfhs%22%3A2%7D&tfr=redis&resource_id=4336&word='
    urlroot4 = '&title=%E7%9B%AE%E7%9A%84%E5%9C%B0%E6%94%BB%E7%95%A5&city_name=&frsrcid=&frorder=&lid=&ext=%7B%22sf_tab_name%22%3A%22%E6%99%AF%E7%82%B9%22%7D&sa=sf_tab1'
    fRead = open('city.csv', 'r', encoding='gb18030')
    f_attractions_msg = open('attractions_msg_first.csv', 'w+', newline='',
                             encoding='gb18030')
    f_error = open('attractions_get_error.txt', 'w+')
    csv_write = csv.writer(f_attractions_msg)
    # Header row: city, city link, attraction, attraction link, thumbnail
    # link, score, introduction.
    csv_write.writerow(
        ['城市', '城市链接', '景点', '景点链接', '缩略图链接', '评分', '简介', '', ''])
    for line in csv.reader(fRead):
        rowlist = []
        try:
            newUrl = urlroot3 + line[0] + urlroot4
            req = util.build_proxy_request(newUrl)
            bsObj = BeautifulSoup(req.text, 'html.parser')
            divlist = bsObj.find('div', class_='sfc-ly-scene-list-wrap').find_all(
                'div', class_='sfc-ly-scene-list-item c-container WA_LOG_SF')
            for div in divlist:
                name = div.find('span', class_='sfc-ly-scene-list-item-name').get_text()
                link = div.attrs['data-href']
                img_link = div.find('div', class_='c-img c-img-z').find('img').attrs['src']
                score = div.find('span', class_='sfc-ly-scene-list-item-score').get_text()
                introduction = div.find(
                    'p', class_='c-line-clamp2 sfc-ly-scene-list-item-desc').get_text()
                rowlist.append([line[0], newUrl, name, link, img_link, score, introduction])
            for r in rowlist:
                csv_write.writerow(r)
            print(line[0] + ' written')
        except:
            print(newUrl)
            f_error.write(newUrl + '\n')
def getObj(url, proxyIs=False, header=None, encode='utf-8'):
    # Simpler variant of getObj used by the eBay scripts: fetch a page
    # (optionally through a proxy) and return it as a BeautifulSoup object,
    # retrying up to five times on HTTPError.
    for i in range(5):
        try:
            if not proxyIs:
                html = requests.get(url, headers=header)
            else:
                html = util.build_proxy_request(url)
            html.encoding = encode
            break
        except HTTPError:
            print('retrying request')
    else:
        # All five attempts failed.
        print('HTTPError')
        return None
    try:
        bsObj = BeautifulSoup(html.text, 'html.parser')
    except AttributeError:
        print('AttributeError')
        return None
    return bsObj
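# Example call for getObj (the URL is one of the travelogue pages seen
# elsewhere in this project, used here only as a placeholder): fetch through
# the proxy helper and read the title if the fetch succeeded.
#
#   soup = getObj('http://www.mafengwo.cn/i/1373402.html', proxyIs=True)
#   if soup is not None:
#       print(soup.title.get_text())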
def crawl(url):
    # Fetch an eBay item page and return its reviews as
    # 'title:::content' strings.
    f3 = open('logrwid.txt', 'a+')  # append so earlier failures are kept
    review = []
    for j in range(5):
        try:
            req = util.build_proxy_request(url)
            bsObj = BeautifulSoup(req.text, 'html.parser')
            body = bsObj.find('div', {'id': 'BottomPanelDF'})
            break
        except:
            pass
    else:
        # No data could be fetched for this page; log it and give up.
        f3.writelines(url + '\n')
        return []
    try:
        divlist = body.find('div', {'id': 'rwid'}).find_all(
            'div', class_=' ebay-review-section')
    except AttributeError:
        return []
    for div in divlist:
        title = ''
        content = ''
        try:
            title = div.find('p', class_='review-item-title wrap-spaces').get_text()
        except AttributeError:
            pass
        try:
            content = div.find('p', class_='review-item-content wrap-spaces').get_text()
        except AttributeError:
            pass
        review.append(title + ':::' + content)
    return review
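# util.build_proxy_request() is the project-local fetch helper used by every
# function above; its real implementation is not included in this file. Below
# is a minimal sketch of what a compatible version could look like, assuming
# a single static proxy (the proxy address is a hypothetical placeholder).
# The positional parameters mirror how this file calls it:
# build_proxy_request(url) and build_proxy_request(url, None, header, None).
def build_proxy_request(url, data=None, headers=None, cookies=None):
    proxies = {
        'http': 'http://127.0.0.1:8888',   # placeholder proxy address
        'https': 'http://127.0.0.1:8888',
    }
    # data/cookies are accepted for call-site compatibility but unused here.
    return requests.get(url, headers=headers, proxies=proxies, timeout=15)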