def download_pic(tel: str):
    """Download the shop-info GIF for *tel* from 114best.com, binarize it
    into a black/white PNG and return the PNG path.

    :param tel: phone number to query
    :return: path of the saved PNG, or None when no image is found or any
             step fails (best-effort scraping).
    """
    url = 'http://www.114best.com/dh/114.aspx?w=' + tel
    bsObj = spider_util.open_url_return_bsobj(url, 5, 20, from_encoding='gbk')
    try:
        img_tag = bsObj.select_one('#span_gsd img')
        if img_tag is None:
            return None
        gif_link = urljoin(url, img_tag.get('src'))
        local_filename = 'D:\\tmp\\wj\\temp.gif'
        spider_util.download(gif_link, local_filename, 3, 10)
        # 256-entry lookup table: pixels below the threshold become black
        # (0), everything else white (1).
        threshold = 200
        table = [0 if i < threshold else 1 for i in range(256)]
        # BUG FIX: the original opened the file with a bare open()/close()
        # pair, leaking the handle whenever Image/point/save raised.
        with open(local_filename, 'rb') as fp:
            gray = Image.open(fp).convert('L')  # grayscale first
            photo = gray.point(table, '1')      # 图片二值化 (binarize)
            black_pic = 'D:\\tmp\\wj\\temppng\\' + str(uuid.uuid1()) + '.png'
            photo.save(black_pic)
        os.remove(local_filename)  # 删除gif文件 (delete the temporary gif)
        return black_pic
    except Exception as e:
        # Best-effort: any scraping/IO failure yields no picture.
        print('发生异常行为', repr(e))
        return None
def doRequest(StockCode=''):
    """Fetch the financial-analysis page for *StockCode* and pass each
    data table (paired with its index) to ``table_handle``.

    :param StockCode: stock code inserted into the askci.com URL
    """
    url = f'http://s.askci.com/stock/financialanalysis/{StockCode}'
    bsobj = spider_util.open_url_return_bsobj(url)
    title_tags = bsobj.select('.right_f_c_tltie')
    table_tags = bsobj.select('.right_f_d_table.mg_tone table')
    # One table per title; the title text itself was computed but unused in
    # the original, so only the count of title tags drives the loop.
    for i, _ in enumerate(title_tags):
        table_handle(table_tags[i], i, StockCode)
def open(self, url: str, func, self_rotation=5, timeout=5, data=None, from_encoding="utf-8"):
    """Fetch *url* as a parsed page and return ``func`` applied to it.

    NOTE(review): this method name shadows the builtin ``open`` inside the
    class body — confirm no method of this class needs the builtin.

    :param url: address to fetch
    :param func: callback applied to the BeautifulSoup object
    :param self_rotation: retry count forwarded to spider_util
    :param timeout: per-request timeout in seconds
    :param data: optional request payload
    :param from_encoding: page encoding hint
    """
    soup = spider_util.open_url_return_bsobj(
        url, self_rotation, timeout, data, from_encoding
    )
    return func(soup)
def query_mobile_phone_location(tel: str):
    """Look up the location and card type of mobile number *tel* on
    zou114.com.

    :param tel: mobile number to query
    :return: '<location> ;卡类型:<type>' or '' when the page lacks the
             expected tags.
    """
    url = 'http://www.zou114.com/shouji/?mobile=' + tel
    # 20秒超时 对于申明编码为gb2312但使用了gbk中的字符时,BeautifulSoup会把编码识别为windows-1252
    bsObj = spider_util.open_url_return_bsobj(url, 5, 20, from_encoding='gbk')
    div = bsObj.select_one('.nrbnr')
    # BUG FIX: select_one may return None; guard before find_all.
    if div is None:
        return ''
    tags = div.find_all('font', {'color': 'red'})
    # BUG FIX: find_all returns a (possibly empty) list, never None, so the
    # original `is not None` check could not prevent the IndexError below.
    if len(tags) >= 2:
        return tags[0].get_text() + ' ;卡类型:' + tags[1].get_text()
    return ''
def query_telphone_location_114best(tel: str):
    """Look up the location of landline number *tel* on 114best.com.

    :param tel: phone number to query
    :return: concatenated text of the matching <font> tags ('' when none).
    """
    url = 'http://www.114best.com/dh/114.aspx?w=' + tel
    # 20秒超时 对于申明编码为gb2312但使用了gbk中的字符时,BeautifulSoup会把编码识别为windows-1252
    bsObj = spider_util.open_url_return_bsobj(url, 5, 20, from_encoding='gbk')
    tags = bsObj.find_all('font', {'size': 4})
    # BUG FIX: find_all returns an empty list, never None, so the original
    # `is None` fallback to the color selector was dead code.
    if not tags:
        tags = bsObj.find_all('font', {'color': '#008080'})
    text = ''
    for tag in tags:
        text = text + ' ' + tag.get_text().strip()
    return text
def request_by_urllib(self, url: str, func, self_rotation=5, timeout=5, data=None, from_encoding="utf-8"):
    """Fetch *url* via urllib, apply *func* to the parsed page, and append
    the result to ``self.data``.

    :param url: address to fetch
    :param func: callback applied to the BeautifulSoup object
    :param self_rotation: retry count forwarded to spider_util
    :param timeout: per-request timeout in seconds
    :param data: optional request payload
    :param from_encoding: page encoding hint
    :return: None (result is accumulated in ``self.data``)
    """
    soup = spider_util.open_url_return_bsobj(
        url, self_rotation, timeout, data, from_encoding
    )
    self.data.append(func(soup))
def query_location2(df: DataFrame):
    """Fill the '号码归属地' column of *df* in place by scraping 00cha.com
    for each row's '格式化电话' value.

    :param df: DataFrame with a '格式化电话' column; mutated in place.
    """
    df['号码归属地'] = None
    length = len(df)
    for i in range(length):
        format_tel = df.at[i, '格式化电话']
        if format_tel is None or format_tel == '':
            continue
        url = 'https://www.00cha.com/114.asp?t=' + format_tel
        # 20秒超时 对于申明编码为gb2312但使用了gbk中的字符时,BeautifulSoup会把编码识别为windows-1252
        bsObj = spider_util.open_url_return_bsobj(url, 5, 20, from_encoding='gbk')
        tags = bsObj.find_all('font', {'size': 4})
        # BUG FIX: find_all returns an empty list, never None, so the
        # original `is None` fallback was dead code.
        if not tags:
            tags = bsObj.find_all('font', {'color': '#008080'})
        # BUG FIX: the original initialized text = None and then evaluated
        # `text + ' ' + ...`, raising TypeError on the first tag. Keep None
        # when no tags matched so the cell stays empty.
        text = None
        for tag in tags:
            text = ('' if text is None else text) + ' ' + tag.get_text().strip()
        if format_tel.startswith('0769'):
            # Hard-coded override for the 0769 area code — TODO confirm why
            # the site result is replaced for these numbers.
            text = '广东 东莞'
        df.at[i, '号码归属地'] = text
        spider_util.log_progress(i, length)
def _parse_salary(salary):
    """Split a 51job salary string such as '1-1.5万/月' into
    (toplimit, lowerlimit, money_unit, time_unit); all None when the
    string contains no '/'."""
    if '/' not in salary:
        return None, None, None, None
    parts = salary.split('/')
    money_range = parts[0]
    money_unit = money_range[-1]      # e.g. 万 / 千
    money_range = money_range[:-1]
    money_toplimit = money_range
    money_lowerlimit = money_range
    if '-' in money_range:            # 分割薪水上下限 (split lower/upper bound)
        money_lowerlimit = money_range.split('-')[0]
        money_toplimit = money_range.split('-')[1]
    return money_toplimit, money_lowerlimit, money_unit, parts[1]


def get_list():
    """Crawl 51job listing pages 1..1999, dedupe job URLs through a Redis
    set, and return the scraped rows as a DataFrame.

    :return: DataFrame with one row per previously-unseen job posting.
    """
    data = []
    r = RedisUtil().get_redis_instance()
    for i in range(1, 2000):
        url = 'https://search.51job.com/list/040000,000000,0000,00,9,99,%2520,2,{page}.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='.format(
            page=i)
        bsobj = spider_util.open_url_return_bsobj(url)
        # First .el element is the header row — skip it.
        div_tags = bsobj.select('#resultList .el')[1:]
        for div in div_tags:
            job = div.select_one('a').get_text().strip()
            job_url = div.select_one('a').get('href')
            # sadd returns 0 when the url is already in the set, meaning
            # this posting was scraped before.
            redis_result = r.sadd(redis_job_set, job_url)
            if redis_result == 0:
                continue
            company = div.select_one('.t2 a').get_text().strip()
            address = div.select_one('.t3').get_text().strip()
            salary = div.select_one('.t4').get_text().strip()
            money_toplimit, money_lowerlimit, money_unit, time_unit = _parse_salary(salary)
            push_time = div.select_one('.t5').get_text().strip()
            item = {'职位名': job, '公司名': company, '工作地点': address,
                    '薪资': salary, '发布时间': push_time, '职位详细URL': job_url,
                    '金额上限': money_toplimit, '金额下限': money_lowerlimit,
                    '时间单位': time_unit, '金额单位': money_unit}
            data.append(item)
        spider_util.log_progress(i, 2000, start_from_zero=False)
    return DataFrame(data)