def get_data(title, scope, page_size=20, page_num=0):
    """Fetch place data from the Baidu Map Place API.

    :param title: search keyword
    :param scope: region to search within
    :param page_size: number of results per page, default 20, max 20
    :param page_num: page number (zero-based)
    :return: appends place records to the module-level result list
    """
    global _total_data
    ak = choice(AK)  # pick a random AK; keys must be obtained from the Baidu API console
    address = f'http://api.map.baidu.com/place/v2/search?query={title}&region={scope}&output=json&ak={ak}' \
              f'&page_size={page_size}&page_num={page_num}'
    r = requests.get(address, headers={'User-Agent': random()})
    json = r.json()
    status = json.get('status')
    if status == 401:
        # Concurrency quota exceeded: retry the same page, then stop this call
        get_data(title, scope, page_num=page_num)
        return None
    elif str(status).startswith('3'):
        raise BaiDuMapError('This module is deprecated and no longer usable')
    print(json)
    results = json['results']        # main payload
    total = json['total']            # total number of records
    page = ceil(total / page_size)   # round up to get the page count
    current_page = page_num + 1      # current page equals the previous page plus one
    for result in results:
        name = result['name']
        locations = result['location']
        location = str(locations['lng']) + '|' + str(locations['lat'])
        address = result.get('province') + result.get('city') + result.get('area') + result.get('address')
        _total_data.append(Data(name, location, address))
    if current_page < page:
        get_data(title, scope, page_num=current_page)
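# A minimal usage sketch with hypothetical stand-ins for the module-level
# names the function relies on: AK (your Baidu API keys), _total_data (the
# accumulator list), Data (a record type), random (a random User-Agent
# helper), and BaiDuMapError. None of these definitions are the project's
# real ones.
from collections import namedtuple
from math import ceil
from random import choice

import requests

Data = namedtuple('Data', ['name', 'location', 'address'])
AK = ['your-baidu-ak']   # placeholder; obtain real keys from the Baidu API console
_total_data = []

class BaiDuMapError(Exception):
    pass

def random():  # stand-in for the project's random-UA helper
    return 'Mozilla/5.0'

get_data('咖啡馆', '北京')   # fills _total_data across all result pages
for record in _total_data:
    print(record.name, record.location, record.address)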
def movie(name):
    # The target site expects GB2312-encoded form data and serves GBK pages
    param = dict(searchword=name.encode('gb2312'))
    response = requests.post(url + 'search.asp', data=param, headers={'User-Agent': random()})
    response.encoding = 'GBK'
    data = response.text
    # Unpack the single result-list block, then pull (link, title) pairs out of it
    find, = re.findall(pattern=r'<div class="list mb">(.+)</div>', string=data, flags=re.S)
    message = re.findall(pattern='a href="(.+)" title="(.+)" class', string=find)
    return message
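# A usage sketch. `url` is assumed to be the module-level base URL of the
# movie site (one exposing a search.asp endpoint) and `random` the project's
# random-UA helper; both values below are hypothetical stand-ins.
import re
import requests

url = 'http://www.example-movie-site.com/'   # hypothetical base URL

for link, title in movie('泰坦尼克号'):
    print(title, '->', link)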
def Load_BaiDuBaiKe(name):
    """Download the content of a Baidu Baike entry.

    :param name: name of the Baidu Baike entry
    :return: the entry's HTML text
    """
    url = f'https://baike.baidu.com/item/{quote(name)}'
    response = requests.get(url, headers={'User-Agent': random()})
    data = response.content.decode('utf-8')
    return data
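# Usage sketch: fetch the raw HTML of an entry; parsing it further is up to
# the caller. Assumes requests and the random-UA helper are imported as above.
html = Load_BaiDuBaiKe('Python')
print(len(html), 'characters of HTML')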
def __init__(self, url, my_app_id, my_app_secret):
    # Ensure the temporary working directory exists
    if not os.path.exists(rec_tmp_dir):
        os.mkdir(rec_tmp_dir)
    self.url = url
    self.my_appId = my_app_id
    self.my_appSecret = my_app_secret
    # Fill the shared request body/headers with the ShowAPI credentials
    body["showapi_appid"] = my_app_id
    body["showapi_sign"] = my_app_secret
    headers["User-Agent"] = random()
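# Hypothetical instantiation sketch: the enclosing class is not shown in this
# excerpt, so `ShowApiClient` below is a placeholder name, and the endpoint is
# illustrative. The credentials come from your ShowAPI account.
client = ShowApiClient(
    url='http://route.showapi.com/',   # assumed endpoint; check the ShowAPI docs
    my_app_id='your-showapi-appid',
    my_app_secret='your-showapi-sign',
)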
def get_7day_weather(self):
    """Fetch the 7-day weather forecast."""
    total = []
    for code in self.codes:
        url = f'http://www.weather.com.cn/weather/{code}.shtml'
        response = requests.get(url, headers={'User-Agent': random()})
        response.encoding = 'utf-8'
        gd = _GetData()
        gd.feed(response.text)
        total.append({gd.addr: gd.data_7d})
    return total
def fetch_gitee(package, name, project='logo'):
    """Fetch unstructured (binary) data from Gitee.

    :param package: package name
    :param name: file name
    :param project: project name
    :return: raw bytes of the file
    """
    url = GITEE.format(project=project, package=package, name=name)
    response = requests.get(url=url, headers={'User-Agent': random()})  # randomized User-Agent
    content = response.content
    return content
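# Usage sketch. GITEE is assumed to be a module-level raw-content URL
# template; the value below illustrates its expected shape and is not the
# project's real one.
GITEE = 'https://gitee.com/someone/{project}/raw/master/{package}/{name}'   # hypothetical

logo = fetch_gitee(package='img', name='python.png')
with open('python.png', 'wb') as fh:
    fh.write(logo)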
def get_15day_weather(self):
    """Fetch the 15-day weather forecast (the 7-day data extended with days 8-15)."""
    t_7d = self.get_7day_weather()
    for code in self.codes:
        url = f'http://www.weather.com.cn/weather15d/{code}.shtml'
        response = requests.get(url, headers={'User-Agent': random()})
        response.encoding = 'utf-8'
        gd = _GetData()
        gd.feed(response.text)
        for t in t_7d:
            if gd.addr in t:
                t[gd.addr].extend(gd.data_7d)
    return t_7d
def get_today_weather(self):
    """Fetch today's weather forecast."""
    total = []
    for code in self.codes:
        url = f'http://www.weather.com.cn/weather1d/{code}.shtml'
        response = requests.get(url, headers={'User-Agent': random()})
        response.encoding = 'utf-8'
        data = response.text
        # Today's summary is embedded in a hidden input on the 1-day page
        d = re.findall(
            r'<input type="hidden" id="hidden_title" value="(.+)" />', data)
        gd = _GetData()
        gd.feed(data)
        total.append({gd.addr: d})
    return total
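# A usage sketch covering the three forecast methods above. The enclosing
# class is not shown in this excerpt, so `Weather` and its `codes` attribute
# (a list of weather.com.cn city codes, e.g. '101010100' for Beijing) are
# assumptions.
weather = Weather(codes=['101010100'])
print(weather.get_today_weather())   # today's summary per city
print(weather.get_7day_weather())    # 7-day forecast per city
print(weather.get_15day_weather())   # 7-day data extended through day 15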
def __init__(self, search):
    """Initialize with a fuzzy search, the first step of the lookup.

    :param search: keyword to search for
    """
    self.header = {'User-Agent': random()}  # set a random UA
    # Percent-encode the keyword for the URL; note that Sogou expects GBK, not UTF-8
    url_word = quote(search.encode('GBK'))
    url = 'https://pinyin.sogou.com/dict/search/search_list/%s/normal/' % url_word  # search URL
    response = requests.get(url=url, headers=self.header)
    # Match the page numbers on the result listing
    match = re.findall(url[24:] + '(.{1,3})">', response.text)
    # Take the largest page number; default to 1 when no pagination is found
    max_page = max(map(int, match)) if match else 1
    m = []  # collected (id, name) pairs for the download links
    for page in range(1, max_page + 1):
        response = requests.get(url=url + str(page), headers=self.header)
        match = re.findall(r'id=(.+)&name=(.+)"', response.text)  # match download links
        m.extend(match)
    # Template for a dictionary download link
    load_url = 'https://pinyin.sogou.com/d/dict/download_cell.php?id={0}&name={1}'
    # Map each matched (id, name) pair into a concrete download URL
    self.load_url = map(lambda x: load_url.format(x[0], x[1]), m)
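# Usage sketch. The enclosing class is not shown in this excerpt, so
# `SogouDict` is a placeholder name. After __init__ runs, `load_url` is an
# iterator over download URLs for every matched Sogou cell dictionary.
dicts = SogouDict('计算机')
for link in dicts.load_url:
    print(link)  # e.g. https://pinyin.sogou.com/d/dict/download_cell.php?id=...&name=...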