def get_content(htmml, page):
    """Parse one page of article HTML and persist each entry via save_txt.

    htmml -- raw HTML text of the listing page (note: name kept for callers)
    page  -- page number, embedded in the saved record for traceability
    """
    # Record template: page, author, sex, age, thumbs-up count, comment count, body.
    output = """page {}, author: {}, sex: {}, age: {}, thumb: {}, comment: {}\n{}\n-----\n"""
    soup = BeautifulSoup(htmml, 'html.parser')
    container = soup.find(name='div', attrs={'class': 'col1 old-style-col1'})
    for article in container.find_all('div', class_='article'):
        my_print(article)
        author = article.find('h2').string
        content = article.find('div', class_='content').find('span').get_text()
        stats = article.find('div', class_='stats')
        vote = stats.find('span', class_='stats-vote').find('i', class_='number').string
        comment = stats.find('span', class_='stats-comments').find('i', class_='number').string
        # Gender/age live in an optional "articleGender" div; its CSS class
        # encodes the gender, its text content the age.
        author_info = article.find('div', class_='articleGender')
        gender = ''
        age = ''
        if author_info is not None:
            classes = author_info['class']
            if 'womenIcon' in classes:
                gender = 'women'
            elif 'manIcon' in classes:
                gender = 'man'
            age = author_info.string
        save_txt(output.format(page, author, gender, age, vote, comment, content))
def get_cookie(self):
    """Prime a session against the Lagou job-list page and return its cookies.

    The JSON search API rejects requests without cookies obtained from the
    regular HTML listing page, so we GET that page first inside a Session
    and hand back the cookie jar for the subsequent POST.
    """
    url = "https://www.lagou.com/jobs/list_{}?labelWords=&fromSearch=true&suginput=".format(self.job)
    s = requests.Session()
    # Fix: log the URL actually being requested (was self.url, which is the
    # JSON API endpoint, not this listing page).
    bs_myprint.my_print(url)
    h = self.get_headers()
    bs_myprint.my_print(h)
    # NOTE(review): sending a body with GET is unusual; requests allows it
    # and the original relied on it, so it is preserved.
    s.get(url, data=self.data, headers=h, timeout=5)
    return s.cookies
def get_pic_list(url):
    """Walk an index page and download every photo set it links to.

    Each 'list4-box' entry holds a relative link; it is resolved against the
    site root (get_url_ref minus its trailing slash) before being fetched.
    """
    soup = BeautifulSoup(download_page(url), 'html.parser')
    base = get_url_ref(url)[:-1]  # drop trailing '/' so base + href joins cleanly
    for box in soup.find_all('div', class_='list4-box'):
        href = box.find('li', class_='title').find('a').get('href')
        full_link = base + href
        my_print(full_link)
        get_pic(full_link)
        time.sleep(1)  # throttle so the site is not hammered
def main():
    """Drive the crawl: feed page URLs to worker threads, one at a time.

    Keeps at most one live worker (len(threads) < 1); pages wait in `queue`
    until a slot frees up.
    """
    queue = [i for i in range(1, 2)]
    threads = []
    while len(queue) > 0 or len(threads) > 0:
        # Fix: the original removed items from `threads` while iterating it,
        # which skips elements; rebuild the list of live threads instead.
        threads = [t for t in threads if t.is_alive()]
        while len(threads) < 1 and len(queue) > 0:
            cur_page = queue.pop(0)
            url = 'https://www.51tietu.net/xiezhen/{}'.format(cur_page)
            thread = threading.Thread(target=get_pic_list, args=(url, ))
            # Fix: setDaemon() is deprecated since Python 3.10.
            thread.daemon = True
            thread.start()
            my_print('down load page {}'.format(cur_page))
            threads.append(thread)
        # Fix: without a pause the outer loop busy-waits at 100% CPU while
        # a worker is still alive.
        time.sleep(0.1)
def get_pic(link):
    """Download every image of one photo-set page into RESULT_DIR/<title>/.

    Images are numbered sequentially (<i>.jpg); already-present files are
    skipped so the crawl can resume.
    """
    html = download_page(link)
    soup = BeautifulSoup(html, 'html.parser')
    pic_box = soup.find('div', class_='pic-box')
    title = soup.find('h1').string.strip()
    # Strip punctuation so the title is safe as a directory name.
    title = title.translate(str.maketrans('', '', string.punctuation))
    create_dir(RESULT_DIR + '{}'.format(title))
    my_print(RESULT_DIR + '{}'.format(title))
    headers = get_headers(link)
    # Fix: build the retrying session once, not once per image.
    session = requests.Session()
    session.mount('https://', HTTPAdapter(max_retries=3))
    i = 0
    for pic in pic_box.find_all('img'):
        # Fix: use a fresh name instead of shadowing the `link` parameter.
        src = pic.get('src')
        path = RESULT_DIR + '{}/'.format(title) + str(i) + '.jpg'
        if os.path.exists(path):
            i = i + 1
            continue
        my_print(src, DEBUG)
        try:
            r = session.get(src, headers=headers, timeout=5)
            with open(path, 'wb') as f:
                i = i + 1
                f.write(r.content)
                time.sleep(1)  # throttle between downloads
        except requests.exceptions.ConnectionError as e:
            print('url failed')
def get_json_page(self, page):
    """POST to the Lagou search API and return (job info list, HR info list).

    page -- 1-based result page number (stored on self for later use)
    Returns None when the API answers with a non-empty 'msg', which signals
    an anti-crawler rejection rather than real results.
    """
    self.page = page
    data = self.data
    html = requests.post(self.url, data=data, headers=self.get_headers(),
                         cookies=self.get_cookie(), timeout=3)
    bs_myprint.my_print(html.status_code)
    result_json = json.loads(html.text)
    # Fix: the original tested `msg is 'null'` — identity comparison with a
    # string literal (a SyntaxWarning and implementation-dependent); the
    # clause was also unreachable after `is not None`, so it is dropped.
    # This now matches get_json()'s check.
    if result_json['msg'] is not None:
        bs_myprint.my_print(result_json)
        return None
    list_con = result_json['content']['positionResult']['result']
    list_hr = result_json['content']['hrInfoMap']  # dict keyed by position id
    # One row per position, fields in fixed report order; '无' = missing.
    fields = ('companyShortName', 'companyFullName', 'industryField',
              'financeStage', 'companySize', 'salary', 'city', 'education')
    info_list = []
    for k in list_con:
        info = [k.get(f, '无') for f in fields]
        info_list.append(info)
        bs_myprint.my_print(info)
    info_hr = []
    for value in list_hr.values():
        hr = [value.get('realName', '无'), value.get('userId', '无')]
        # portrait is a path fragment; prefix the CDN host when present
        if value.get('portrait', '无') is not None:
            hr.append('https://www.lgstatic.com/' + value.get('portrait', '无'))
        info_hr.append(hr)
        bs_myprint.my_print(hr)
    return info_list, info_hr
def get_json(url, page, lang_name):
    """POST a Lagou job search and return (job info list, HR info list).

    url       -- JSON search API endpoint
    page      -- 1-based result page number
    lang_name -- search keyword ('kd' form field)
    Returns None when the API answers with a non-empty 'msg' (anti-crawler
    rejection). Relies on module-level `headers` and `get_cookie()`.
    """
    data = {'first': 'true', 'pn': str(page), 'kd': lang_name}
    # (dead commented-out session/cookie experiments removed)
    html = requests.post(url, data=data, headers=headers,
                         cookies=get_cookie(), timeout=5)
    result_json = json.loads(html.text)
    if result_json['msg'] is not None:
        bs_myprint.my_print(result_json)
        return None
    list_con = result_json['content']['positionResult']['result']
    list_hr = result_json['content']['hrInfoMap']  # dict keyed by position id
    # One row per position, fields in fixed report order; '无' = missing.
    fields = ('companyShortName', 'companyFullName', 'industryField',
              'financeStage', 'companySize', 'salary', 'city', 'education')
    info_list = []
    for k in list_con:
        info = [k.get(f, '无') for f in fields]
        info_list.append(info)
        bs_myprint.my_print(info)
    info_hr = []
    for value in list_hr.values():
        hr = [value.get('realName', '无'), value.get('userId', '无')]
        # portrait is a path fragment; prefix the CDN host when present
        if value.get('portrait', '无') is not None:
            hr.append('https://www.lgstatic.com/' + value.get('portrait', '无'))
        info_hr.append(hr)
        bs_myprint.my_print(hr)
    return info_list, info_hr