def _get_comment_svg(url,svg=None): if svg is None: resp = send_http( requests.Session(), 'get', url, retries=-1, headers=CSS_HEADERS ) svg = resp[0].text if svg: res = {} text = bs(svg,'lxml') text_path = text('textpath') if not text_path: texts = text('text') ys = {i['y']: i.text for i in texts if i} return ys,svg else: path = text('path') for _,i in enumerate(path): d = i['d'] num = int(d.split(' ')[1].strip()) string = text_path[_].text.strip() res[num]=string return res,svg
def fetch_map_page(self, data):
    """POST ``data`` to the map search API and return the decoded JSON.

    On success the proxy and headers handed back by ``send_http`` replace
    ``self.proxy`` and ``self.map_headers``.

    :param data: form payload forwarded to the request.
    :return: the parsed JSON body, or ``None`` when ``send_http`` yields a
        falsy result.
    """
    outcome = send_http(
        self.session,
        'post',
        API_MAP_SEARCH,
        retries=MAX_RETRY,
        proxy=self.proxy,
        headers=self.map_headers,
        timeout=TIMEOUT,
        data=data,
        kind='JSON',
    )
    if not outcome:
        return None
    response, self.proxy, self.map_headers = outcome
    return response.json()
def get(self, headers=HEADERS, proxy=None):
    """Download the shop home page and cache its HTML on the instance.

    On success ``self.proxy`` and ``self.headers`` are replaced by the values
    handed back by ``send_http``, ``self.homepage`` receives the response
    text and ``self._fetched`` is set to ``True``. Nothing is changed when
    the request yields a falsy result.

    :param headers: request headers to send.
    :param proxy: proxy to route the request through, if any.
    """
    outcome = send_http(
        self.session,
        'get',
        self.url,
        retries=MAX_RETRY,
        headers=headers,
        proxy=proxy,
        timeout=TIMEOUT,
        kind='SHOP',
    )
    if not outcome:
        return
    response, self.proxy, self.headers = outcome
    self.homepage = response.text
    self._fetched = True
    logger.info(f'成功获取店铺:{self.id} 首页.')
def get_map(self, headers=HEADERS, proxy=None):
    """Download the map search page for this city.

    The URL is ``API_CITY_MAP`` formatted with ``self.id``. On success the
    response text is stored in ``self.map_page`` and ``self.proxy`` /
    ``self.headers`` are replaced by the values ``send_http`` hands back.

    :param headers: request headers to send.
    :param proxy: proxy to route the request through, if any.
    """
    map_url = API_CITY_MAP.format(id=self.id)
    outcome = send_http(
        self.session,
        'get',
        map_url,
        retries=MAX_RETRY,
        headers=headers,
        proxy=proxy,
        timeout=TIMEOUT,
        kind='MAP',
    )
    if not outcome:
        return
    response, self.proxy, self.headers = outcome
    self.map_page = response.text
    logger.info(f'获取 “{self.city}” 地图搜索页成功')
def get(self, url=None, headers=LOGIN_HEADERS, proxy=None):
    """Download a review-related page of the shop.

    :param url: page to request; ``self.home_url`` is used when this is
        falsy.
    :param headers: request headers to send.
    :param proxy: proxy to route the request through, if any.

    On success ``self.homepage`` holds the response text and ``self.proxy`` /
    ``self.headers`` are replaced by the values ``send_http`` hands back.
    On failure ``self.homepage`` is reset to ``None``.
    """
    target = url or self.home_url
    outcome = send_http(
        self.session,
        'get',
        target,
        retries=MAX_RETRY,
        headers=headers,
        proxy=proxy,
        timeout=TIMEOUT,
        kind='SHOP',
    )
    if not outcome:
        self.homepage = None
        return
    response, self.proxy, self.headers = outcome
    self.homepage = response.text
    logger.info(f'成功获取店铺:{self.id} 点评相关页.')
def get_shop_css(self, reget=False):
    """Locate the stylesheet referenced by the cached page and download it.

    The stylesheet reference is extracted from ``self.homepage`` with
    ``PATTERN_CSS`` and joined to ``CSS_URL_PREFIX`` with ``//``.

    :param reget: accepted for interface compatibility; not used here.
    :return: the stylesheet text (also stored in ``self.css``), or ``None``
        when no reference is found or the download yields a falsy result.
    """
    css_src = from_pattern(PATTERN_CSS, self.homepage)
    if not css_src:
        return None

    css_url = CSS_URL_PREFIX + '//' + css_src
    outcome = send_http(
        self.session,
        'get',
        css_url,
        retries=MAX_RETRY,
        headers=self.css_headers,
        proxy=self.css_proxy,
        timeout=TIMEOUT,
        kind='CSS',
    )
    if not outcome:
        return None
    response, self.css_proxy, self.css_headers = outcome
    self.css = response.text
    return self.css
def fetch_map_page(self, data):
    """POST ``data`` to the map search API with a matching Referer header.

    The Referer is ``SEARCH_MAP_POST_REFERER`` formatted with the payload's
    ``cityId``, ``regionId`` and URL-quoted ``keyword``; it is written into
    ``self.map_headers`` before the request is sent.

    :param data: form payload; must contain ``cityId``, ``regionId`` and
        ``keyword``.
    :return: the parsed JSON body, or ``None`` when ``send_http`` yields a
        falsy result.
    """
    referer = SEARCH_MAP_POST_REFERER.format(
        data["cityId"],
        data["regionId"],
        quote(data["keyword"]),
    )
    self.map_headers['Referer'] = referer

    outcome = send_http(
        self.session,
        'post',
        API_MAP_SEARCH,
        retries=MAX_RETRY,
        proxy=self.proxy,
        headers=self.map_headers,
        timeout=TIMEOUT,
        data=data,
        kind='JSON',
    )
    if not outcome:
        return None
    response, self.proxy, self.map_headers = outcome
    return response.json()
def get_hot(self):
    """Fetch the hot search keywords for the current city.

    The original author's note describes the result as the top ten keywords
    by search popularity, each entry carrying fields such as a sub-tag, an
    index, a main category id, a data type, an id and the keyword itself
    (not verifiable from this code alone).

    :return: a list with the ``valueMap`` of every item in the response's
        ``recordList`` (also stored in ``self._hot``), or ``None`` when the
        request yields a falsy result.
    """
    hot_url = API_CITY_HOT.format(id=self.id)
    outcome = send_http(
        self.session,
        'get',
        hot_url,
        headers=self.headers,
        retries=MAX_RETRY,
        kind='JSON',
        proxy=self.proxy,
    )
    if not outcome:
        return None
    response, self.proxy, _ = outcome
    payload = response.json()
    self._hot = [record['valueMap'] for record in payload['recordList']]
    return self._hot
def start_request(self):
    """Request the shop detail endpoint and cache the decoded JSON.

    The request carries the ``SHOP_INFO_HEADERS`` template plus a Referer
    pointing at this shop's page, and a fresh token from ``self.token.new()``.
    On success ``self.homepage`` holds the parsed JSON and ``self._fetched``
    is set to ``True``; nothing changes when the request yields a falsy
    result.
    """
    # Work on a copy: assigning the Referer directly on SHOP_INFO_HEADERS
    # would rewrite the module-level template shared by every shop.
    headers = dict(SHOP_INFO_HEADERS)
    headers['Referer'] = "http://www.dianping.com/shop/{}".format(
        self.shopId)
    result = send_http(
        self.session,
        'get',
        self.url,
        retries=MAX_RETRY,
        headers=headers,
        timeout=TIMEOUT,
        _token=self.token.new(),
        kind='SHOP',
    )
    if not result:
        return
    response, _, _ = result
    self.homepage = response.json()
    self._fetched = True
    logger.info(f'成功获取店铺:{self.shopId} 详情.')
def get(self, headers=HEADERS, proxy=None):
    """Fetch the home page of the current city.

    On success the response text is stored in ``self.homepage`` and
    ``self.proxy`` / ``self.headers`` are replaced by the values
    ``send_http`` hands back.

    :param proxy: proxy to use for the request.
    :param headers: request headers to send.
    """
    outcome = send_http(
        self.session,
        'get',
        self.url,
        retries=MAX_RETRY,
        headers=headers,
        proxy=proxy,
        timeout=TIMEOUT,
        kind='CITY',
    )
    if not outcome:
        return
    response, self.proxy, self.headers = outcome
    self.homepage = response.text
    logger.info(f'获取 “{self.city}” 首页成功.')
def get_relative(self, keyword):
    """Return search suggestions related to ``keyword`` with their counts.

    Each item of the response's ``msg.shop`` list is split on ``|``: the
    first field becomes the key and ``PATTERN_NUMS`` is applied to the
    second-to-last field to obtain the value.

    :param keyword: the keyword to look up.
    :return: ``{suggestion: count, ...}``, or ``None`` when the request
        yields a falsy result.
    """
    lookup_url = API_KEY_RELATIVE.format(id=self.id, key=keyword)
    outcome = send_http(
        self.session,
        'get',
        lookup_url,
        headers=self.headers,
        retries=MAX_RETRY,
        kind='JSON',
        proxy=self.proxy,
    )
    if not outcome:
        return None
    response, self.proxy, _ = outcome
    payload = response.json()

    related = {}
    for entry in payload['msg']['shop']:
        fields = entry.split('|')
        related[fields[0]] = from_pattern(PATTERN_NUMS, fields[-2])
    return related
def _get_num_svg(url):
    """Download an SVG document and build a lookup table from it.

    :param url: address of the SVG to request.
    :return: ``{y: text}`` built from the ``<text>`` elements when the
        document has any; otherwise ``{offset: text}`` where ``offset`` is
        the integer second space-separated token of each ``<path>``'s ``d``
        attribute and ``text`` is the stripped content of the ``<textpath>``
        at the same position. ``None`` when the request yields a falsy
        result.
    """
    resp = send_http(
        requests.Session(),
        'get',
        url,
        retries=-1,
        headers=CSS_HEADERS,
    )
    if not resp:
        return None

    soup = bs(resp[0].text, 'lxml')
    text_nodes = soup('text')
    if text_nodes:
        return {node['y']: node.text for node in text_nodes if node}

    glyphs = {}
    text_paths = soup('textpath')
    for idx, path in enumerate(soup('path')):
        path_data = path['d']
        offset = int(path_data.split(' ')[1].strip())
        glyphs[offset] = text_paths[idx].text.strip()
    return glyphs
def get_city_list(url, headers=HEADERS, proxy=None):
    """Download the city index, cache it as JSON and return it.

    Every ``<a>`` inside the ``<li>`` items of the ``city_list`` element is
    mapped from its text to ``CITY_URL_PREFIX`` plus its ``href``. The
    mapping is written to ``CITY_LIST_FILE_PATH`` as JSON.

    :param url: address of the city index page.
    :param headers: request headers to send.
    :param proxy: proxy to route the request through, if any.
    :return: ``{city_name: city_url, ...}``, or ``None`` when the request
        yields a falsy result or the ``city_list`` element is not found.
    """
    result = send_http(
        requests.session(),
        'get',
        url,
        retries=-1,
        proxy=proxy,
        headers=headers,
        timeout=TIMEOUT,
        kind='CITY_LIST',
    )
    if not result:
        return None

    ul = get_sub_tag(result[0].text, 'city_list')
    if not ul:
        return None

    # Build the whole mapping before touching the cache file: opening in 'w'
    # mode truncates it, so a parse error must not be able to happen while
    # the file is open and empty.
    cities = {}
    for li in ul('li'):
        for a in li('a'):
            cities[a.text] = CITY_URL_PREFIX + a['href']

    with open(CITY_LIST_FILE_PATH, 'w', encoding='utf-8') as f:
        json.dump(cities, f)
    return cities