def getRoomList(cursor, conn, url, city_name):
    qu = []
    qu_name = []
    newurl = ''
    if url.find('zufang') == -1:
        newurl = url + 'zufang/'
    else:
        newurl = url
    print(newurl)
    response = requests.get(newurl, headers=local_headers)
    response.encoding = getEncoding(newurl).get_encode2()
    soup = BeautifulSoup(response.text, 'html.parser')
    # The first <dl> under the filter options holds the district links
    areas = soup.find('div', id='filter-options').find_all('dl')[0].find_all('a')
    areas = areas[1:]  # skip the first, catch-all filter link
    for i in areas:
        qu_name.append(i.string)
        if i['href'].find('https:') == -1:
            qu.append(url + i['href'])
        else:
            qu.append(i['href'])
    # print(qu)
    print(qu_name)
    for i in qu:
        try:
            saveInfo(cursor, conn, i, 1, qu, city_name, qu_name)
            # print(qu.index(i))
        except Exception as e:
            print(e)
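# The functions in this module rely on names defined elsewhere: requests,
# BeautifulSoup, pymysql, re, the header dicts local_headers / ua_headers, and a
# getEncoding helper. The sketch below is only an assumption about what that
# shared setup roughly looks like (the real getEncoding presumably detects the
# page encoding itself); it is illustrative, not the original code.
import re
import urllib.parse

import pymysql
import requests
from bs4 import BeautifulSoup

local_headers = ua_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'  # placeholder UA string
}

class getEncoding:
    """Assumed shape of the encoding helper: fetch the URL once and report
    the encoding that requests detects for it."""
    def __init__(self, url):
        self.url = url

    def get_encode2(self):
        # apparent_encoding is requests' charset detection for the response body
        return requests.get(self.url, headers=local_headers).apparent_encoding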
def getCityList(self):
    print('Fetching city list')
    response = requests.get(self.url, headers=local_headers)
    response.encoding = getEncoding(self.url).get_encode2()
    soup = BeautifulSoup(response.text, 'html.parser')
    divs = soup.find('div', class_='all').find_all('ul', class_='clear')
    print('Fetched successfully')
    print('Saving to database')
    conn = pymysql.connect(host='localhost', user='******', password='',
                           db='lianjia', charset='utf8')
    cursor = conn.cursor()
    for i in divs:
        lis = i.find_all('li')
        for j in lis:
            print(j.a['href'])
            print(j.a.string)
            sql = "insert into city_list(id, city_name, city_link) values (null,'%s','%s')" % (
                j.a.string, j.a['href'])
            # print(sql)
            cursor.execute(sql)
            conn.commit()
    cursor.close()
    conn.close()
    print('Saved successfully')
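# Hedged driver sketch for the lianjia rental crawler: getCityList() fills the
# city_list table, and getRoomList() then walks each city's districts via
# saveInfo(). Reading the cities back out of MySQL like this is an assumption
# about how the pieces were originally wired together.
def crawlRooms():
    conn = pymysql.connect(host='localhost', user='******', password='',
                           db='lianjia', charset='utf8')
    cursor = conn.cursor()
    cursor.execute("select city_name, city_link from city_list")
    for city_name, city_link in cursor.fetchall():
        try:
            getRoomList(cursor, conn, city_link, city_name)
        except Exception as e:
            print(e)
    cursor.close()
    conn.close()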
def getJobInfo(page_url, pages_num, isFirst):
    if pages_num.count(page_url) != 0:
        # This page is already in the list, so stop recursing
        return False
    if isFirst == 1:
        pages_num = []
        url = page_url
    else:
        url = base + page_url + '&ka=page-next'
        pages_num.append(page_url)
    response = requests.get(url, headers=local_headers)
    response.encoding = getEncoding(url).get_encode2()
    soup = BeautifulSoup(response.text, 'html.parser')
    lis = soup.find('div', class_='job-list').find_all('li')
    for i_ in lis:
        # Job title
        job_name = i_.find(class_='job-title').string
        # Company name
        job_company = i_.find(class_='company-text').find('a').string
        # Job requirements
        job_require = i_.find('div', class_='info-primary').p.text
        # Company info
        job_company_info = i_.find('div', class_='company-text').p.text
        # Recruiter
        job_people = i_.find('div', class_='info-publis').h3.text
        # Job detail link
        job_link = i_.find('div', class_='info-primary').h3.a['href']
        # Salary
        job_money = i_.find('div', class_='info-primary').h3.find(class_='red').string
        # Salary lower bound
        job_min_money = job_money.split('-')[0]
        # Salary upper bound
        job_max_money = job_money.split('-')[1]
        print(job_name + page_url)
    # Follow the last pagination link to the next page
    # if soup.find('div', class_='page').find_all('a')[-1]['href'] == 'javascript:;':
    #     getJobInfo("", pages_num, 3)
    # else:
    getJobInfo(
        soup.find('div', class_='page').find_all('a')[-1]['href'], pages_num, 2)
def getpiclist(num, type, title, cursor, conn, link):
    link_ = link[:len(link) - 5]  # strip the trailing ".html"
    if num == 1:
        response = requests.get(link, headers=ua_headers)
    else:
        response = requests.get(link_ + "_" + str(num) + ".html", headers=ua_headers)
    # Reuse the site-wide encoding detected for base + tag
    response.encoding = getEncoding(base + tag).get_encode2()
    soup = BeautifulSoup(response.text, 'html.parser')
    # Read the page count and insert this page's image
    bottom_pages = soup.find_all('div', class_='pages')[0].find_all('li')
    bg = bottom_pages[0].a.string
    page = int(bg.replace('共', '').replace('页:', ''))
    pic = soup.find('div', id='big-pic').find('img')['src']
    # print(pic)
    # Run the insert statement
    sql = "insert into weiyi_images(id, type, title, url) values (null,'%s','%s','%s')" % (
        type, title, pic)
    print(sql)
    cursor.execute(sql)
    conn.commit()
    # Check whether there is a next page; if not, move on to the next tag
    if page == 1:
        return
    num += 1
    if num <= page:
        getpiclist(num, type, title, cursor, conn, link)
def test():
    response = requests.get(test_url, headers=local_headers)
    response.encoding = getEncoding(test_url).get_encode2()
    soup = BeautifulSoup(response.text, 'html.parser')
    dls = soup.find(
        class_='condition-district show-condition-district').find_all('a')
    for d in dls[1:]:
        print(d.string)
        print(base + d['href'] + '?' + d['ka'])
        # print(d)
    lis = soup.find('div', class_='job-list').find_all('li')
    i_ = lis[0]
    # Job title
    job_name = i_.find(class_='job-title').string
    # Company name
    job_company = i_.find(class_='company-text').find('a').string
    # Job requirements
    job_require = i_.find('div', class_='info-primary').p.text
    # Company info
    job_company_info = i_.find('div', class_='company-text').p.text
    # Recruiter
    job_people = i_.find('div', class_='info-publis').h3.text
    # Job detail link
    job_link = i_.find('div', class_='info-primary').h3.a['href']
    # Salary
    job_money = i_.find('div', class_='info-primary').h3.find(class_='red').string
    # Salary lower bound
    job_min_money = job_money.split('-')[0]
    # Salary upper bound
    job_max_money = job_money.split('-')[1]
def startSpider(self):
    conn = pymysql.connect(host='localhost', user='******', password='',
                           db='biquge', charset='utf8')
    cursor = conn.cursor()
    response = requests.get(base, headers=local_headers)
    response.encoding = getEncoding(base).get_encode2()
    soup = BeautifulSoup(response.text, 'html.parser')
    for i in ids:
        # Top-level category
        print(types[ids.index(i)])
        title_type = types[ids.index(i)]
        lis = soup.find(id=i).find_all('li')
        for j in lis:
            # Sub-category entry
            print(j.a['title'])
            print(j.a['href'])
            sql = "insert into bqg_list(id, type, name, book_link) values (null,'%s','%s','%s')" % (
                title_type, j.a['title'], j.a['href'])
            cursor.execute(sql)
            conn.commit()
    cursor.close()
    conn.close()
def downloadImage(url):
    # response = requests.get(url + '', headers=ua_headers)
    # img = response.content
    # with open('./a.jpg', 'wb') as f:
    #     f.write(img)
    response = requests.get(url)
    response.encoding = getEncoding(url).get_encode2()
    soup = BeautifulSoup(response.text, 'html.parser')
    print(soup)
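# A minimal, hedged sketch of what the commented-out branch above appears to be
# doing: fetch the raw bytes and write them to disk. The function name and the
# default path are assumptions; swap in whatever the surrounding module uses.
def downloadImageToFile(url, path='./a.jpg'):
    resp = requests.get(url, headers=ua_headers)  # assumes ua_headers is defined at module level
    with open(path, 'wb') as f:
        f.write(resp.content)  # resp.content is the undecoded image body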
def bookInfo():
    print()
    response = requests.get("https://www.qu.la/book/4140/2585313.html",
                            headers=local_headers)
    response.encoding = getEncoding(
        "https://www.qu.la/book/4140/2585313.html").get_encode2()
    soup = BeautifulSoup(response.text, "html.parser")
    text = soup.find(id="content").text.replace(" ", "").replace("<br />", "\n")
    print(text)
def init(adr):
    test_url = 'http://www.panduoduo.net/s/name/' + "变形金刚"
    response = requests.get(test_url, headers=local_headers)
    response.encoding = getEncoding(test_url).get_encode2()
    soup = BeautifulSoup(response.text, 'html.parser')
    rows = soup.select('.row')
    pattern = re.compile(r'/r/\d+')
    for i in rows:
        # Collect the /r/<id> detail path for each search result
        adress = pattern.search(str(i))
        if adress:
            adr.append(adress.group())
def getSpiderUrl():
    response = requests.get(test_url, headers=local_headers)
    response.encoding = getEncoding(test_url).get_encode2()
    soup = BeautifulSoup(response.text, 'html.parser')
    dls = soup.find(
        class_='condition-district show-condition-district').find_all('a')
    areas = []
    for d in dls[1:]:
        area = []
        area.append(d.string)
        area.append(base + d['href'] + '?' + d['ka'])
        area.append(base + d['href'])
        # print(d.string)
        # print(base + d['href'] + '?' + d['ka'])
        areas.append(area)
    return areas
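# Hedged usage sketch for the job crawler: getSpiderUrl() returns
# [area_name, url_with_ka, plain_url] triples, and getJobInfo() walks the
# paginated listing recursively. The driver below is an assumption about how
# the two are wired together, not code taken from the original project.
def crawlJobs():
    for area_name, area_url, _plain_url in getSpiderUrl():
        print('Crawling area: ' + area_name)
        try:
            getJobInfo(area_url, [], 1)  # isFirst=1 starts a fresh pages_num list
        except Exception as e:
            print(e)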
def getinfo(tag_, type_, num):
    print("Fetching category --- " + type_)
    conn = pymysql.connect(host='localhost', user='******', password='',
                           db='weiyi', charset='utf8')
    cursor = conn.cursor()
    if num == 1:
        response = requests.get("http://www.mmonly.cc" + tag_, headers=ua_headers)
    else:
        response = requests.get("http://www.mmonly.cc" + tag_ + str(num) + ".html",
                                headers=ua_headers)
    response.encoding = getEncoding(base + tag).get_encode2()
    soup = BeautifulSoup(response.text, 'html.parser')
    # Read the page count and process the first page
    pages = soup.find_all('div', class_='pages')[0].find_all('li')
    if len(pages) == 1:
        page = 1
    else:
        page = len(pages) - 2
    print("Category " + tag_ + " has " + str(page) + " pages")
    divs = soup.find_all('div', class_='item masonry_brick masonry-brick')
    for i in divs:
        a = i.find('a', class_='img_album_btn')
        if i.find('b'):
            title = i.find('b').string
        else:
            title = i.find('div', class_='title').find('a').string
        # print(title)
        # print(i.find_all('img')[0]['src'])
        try:
            getpiclist(1, type_, title, cursor, conn, a['href'])
        except Exception as e:
            print(e)
    # Check whether there is a next page; if not, move on to the next tag
    if page == 1:
        return
    num += 1
    if num <= page:
        getinfo(tag_, type_, num)
def saveInfo(cursor, conn, url, num, qu, city_name, qu_name):
    if num > 1:
        newurl = url + 'pg' + str(num) + '/'
    else:
        newurl = url
    print(newurl + ' - currently on page ' + str(num))
    response = requests.get(newurl, headers=local_headers)
    response.encoding = getEncoding(newurl).get_encode2()
    soup = BeautifulSoup(response.text, 'html.parser')
    try:
        # page-data looks like {"totalPage":N,"curPage":M}; take totalPage
        page = int(
            soup.find('div', class_='page-box house-lst-page-box')
            ['page-data'].split(',')[0].split(':')[1])
        print(str(page) + ' pages in total')
    except Exception as e:
        page = 1
    lis = soup.find(id='house-lst').find_all('li')
    for i in lis:
        div_ = i.find('div', class_='info-panel')
        # Monthly rent
        price = int(i.find('div', class_='price').span.string)
        info = i.find('div', class_='con').text
        size = int(
            i.find('span', class_='meters').text.replace('平米', '').strip())
        style = i.find('span', class_='zone').text.strip()
        adress = i.find('span', class_='region').text.strip()
        sql = "INSERT INTO city_room_list(id,city_name,city_area,adress,style,size,price,info) VALUES (null,'%s','%s','%s','%s',%d,%d,'%s')" % (
            city_name, qu_name[qu.index(url)], adress, style, size, price, info)
        print(sql)
        cursor.execute(sql)
        conn.commit()
        # print(adress)
    if page == num:
        return
    num += 1
    if num <= page:
        saveInfo(cursor, conn, url, num, qu, city_name, qu_name)
def catchTag():
    response = requests.get(base + tag, headers=ua_headers)
    response.encoding = getEncoding(base + tag).get_encode2()
    soup = BeautifulSoup(response.text, 'html.parser')
    # Grab the column headings
    lists_tag = soup.find_all('h2')
    for i in lists_tag:
        tag_title.append(i.string)
    lists = soup.find_all('div', class_='TagList')
    for i in lists:
        if i != '':
            links = i.find_all('a')
            tag_list = []
            for j in links:
                temp_ = []
                temp_.append(j['href'])
                temp_.append(j['title'])
                tag_list.append(temp_)
            tag_dic[tag_title[lists.index(i)]] = tag_list
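# Hedged driver sketch for the image crawler: catchTag() fills the module-level
# tag_title / tag_dic globals, and getinfo() walks each tag page and stores the
# albums via getpiclist(). How the original wired these together is not shown,
# so the loop below is an assumption.
def crawlImages():
    catchTag()
    for column, tag_list in tag_dic.items():
        for href, title in tag_list:
            try:
                getinfo(href, title, 1)  # start at page 1 of each tag
            except Exception as e:
                print(e)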
def getbaidu(adr):
    for i in adr:
        response = requests.get('http://www.panduoduo.net' + i, headers=local_headers)
        response.encoding = getEncoding('http://www.panduoduo.net' + i).get_encode2()
        bs = BeautifulSoup(response.text, 'html.parser')
        bs1 = bs.select('.dbutton2')
        href = re.compile(r'http\%(\%|\d|\w|\/\/|\/|\.)*')
        b = href.search(str(bs1))
        name = str(bs.select('.center'))
        text1 = re.compile(r'\<h1\sclass\=\"center"\>[\d|\w|\D|\W]*\</h1\>')
        text2 = text1.search(name)
        rag1 = re.compile(r'\>[\d|\w|\D|\W]*\<')
        if text2:
            text3 = rag1.search(text2.group())
            if text3:
                print(text3.group())
        if b:
            # Decode the percent-encoded link (requires `import urllib.parse`)
            text = urllib.parse.unquote(str(b.group()))
            print(text)
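# Hedged usage sketch tying init() and getbaidu() together: init() collects the
# /r/<id> detail paths for the hard-coded search keyword, and getbaidu() resolves
# each one to its percent-decoded download link. The driver itself is an assumption.
def searchPan():
    adr = []
    init(adr)
    getbaidu(adr)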
def getRoomList2(url):
    qu = []
    qu_name = []
    newurl = ''
    if url.find('zufang') == -1:
        newurl = url + 'zufang/'
    else:
        newurl = url
    print(newurl)
    response = requests.get(newurl, headers=local_headers)
    response.encoding = getEncoding(newurl).get_encode2()
    soup = BeautifulSoup(response.text, 'html.parser')
    areas = soup.find('div', id='filter-options').find_all('dl')[0].find_all('a')
    areas = areas[1:]  # skip the first, catch-all filter link
    for i in areas:
        qu_name.append(i.string)
        if i['href'].find('https:') == -1:
            qu.append(url + i['href'])
        else:
            qu.append(i['href'])
    # print(qu)
    print(qu_name)
def titleSpider(self):
    conn = pymysql.connect(host='localhost', user='******', password='',
                           db='biquge', charset='utf8')
    cursor = conn.cursor()
    # sql = "select * from bqg_list"
    # cursor.execute(sql)
    # books = cursor.fetchall()
    # print(books)
    books = ((99, '女生频道排行', '俏汉宠农妻:这个娘子好辣', '/book/42186/'),
             (100, '女生频道排行', '重生七十年代:军嫂,有点田', '/book/60629/'),
             (101, '女生频道排行', '绝宠妖妃:邪王,太闷骚!', '/book/28906/'),
             (102, '女生频道排行', '鬼帝狂妻:纨绔大小姐', '/book/40082/'),
             (103, '女生频道排行', '帝国总裁,宠翻天!', '/book/42973/'),
             (104, '女生频道排行', '萌宝来袭:总裁爹地,太给力!', '/book/45213/'),
             (105, '女生频道排行', '空间重生:盛宠神医商女', '/book/34335/'),
             (106, '完本小说总排行', '太古神王', '/book/4140/'),
             (107, '完本小说总排行', '大主宰', '/book/176/'),
             (108, '完本小说总排行', '雪鹰领主', '/book/5094/'),
             (109, '完本小说总排行', '择天记', '/book/168/'),
             (110, '完本小说总排行', '全职高手', '/book/32/'),
             (111, '完本小说总排行', '完美世界', '/book/14/'),
             (112, '完本小说总排行', '遮天', '/book/394/'),
             (113, '完本小说总排行', '逍遥小书生', '/book/23934/'),
             (114, '完本小说总排行', '绝世武神', '/book/322/'),
             (115, '完本小说总排行', '异世灵武天下', '/book/199/'),
             (116, '完本小说总排行', '不朽凡人', '/book/18049/'),
             (117, '完本小说总排行', '掠天记', '/book/4295/'),
             (118, '完本小说总排行', '最强兵王', '/book/4511/'),
             (119, '完本小说总排行', '都市无上仙医', '/book/3802/'),
             (120, '完本小说总排行', '斗破苍穹', '/book/390/'))
    for book in books:
        print("Current book: " + book[2])
        response = requests.get(base_book + book[3], headers=local_headers)
        response.encoding = getEncoding(base_book + book[3]).get_encode2()
        soup = BeautifulSoup(response.text, "html.parser")
        dds = soup.find(id='list').find_all('dd')
        for i in dds:
            try:
                zu = i.find('a')['href'].split('/')
                if len(zu) < 2:
                    print("Skipping " + i.a['href'])
                else:
                    print("Current chapter: " + i.find('a').string)
                    title = i.find('a').string.replace('\\', "")
                    sql = "insert into bqg_chapter(chapterid, id, name, chapter, chapter_link) values (null,%d,'%s','%s','%s')" % (
                        book[0], book[2], title, i.find('a')['href'])
                    cursor.execute(sql)
                    conn.commit()
            except Exception as e:
                print(e)
    cursor.close()
    conn.close()
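# Hedged usage sketch for the novel crawler: startSpider() stores the
# category/book index into bqg_list, titleSpider() stores each book's chapter
# list into bqg_chapter, and bookInfo() shows how a single chapter body is
# extracted. The class name and call order below are assumptions, since the
# original class wrapper is not shown here.
if __name__ == '__main__':
    spider = BqgSpider()   # hypothetical class exposing startSpider/titleSpider
    spider.startSpider()   # build the book index
    spider.titleSpider()   # fetch chapter lists for the sample books
    bookInfo()             # print one chapter body as a smoke test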