async def fetch_blog_post_published_count(keyword, start_date: datetime.date, end_date: datetime.date):
    """Return the count of Naver blog posts published for *keyword* in [start_date, end_date].

    Queries the blog-section search endpoint and reads ``result.totalCount``
    from its JSON payload.
    """
    headers = {
        'authority': 'section.blog.naver.com',
        'sec-ch-ua': '"Google Chrome";v="89", "Chromium";v="89", ";Not A Brand";v="99"',
        'accept': 'application/json, text/plain, */*',
        'sec-ch-ua-mobile': '?0',
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.128 Safari/537.36',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'cors',
        'sec-fetch-dest': 'empty',
        'referer': 'https://section.blog.naver.com/Search/Post.nhn?pageNo=1&rangeType=ALL&orderBy=sim&keyword=%ED%8F%BC%ED%81%B4%EB%A0%8C%EC%A7%95',
        'accept-language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
    }
    params = {
        'countPerPage': '7',
        'currentPage': '1',
        'keyword': keyword,
        'orderBy': 'sim',
        'type': 'post',
        'startDate': start_date.isoformat(),
        'endDate': end_date.isoformat(),
    }
    async with aiohttp.ClientSession() as session:
        async with session.get('https://section.blog.naver.com/ajax/SearchList.nhn',
                               headers=headers, params=params) as response:
            payload = await response.text()
    # The endpoint prepends 6 junk characters before the JSON body; drop them.
    parsed = json.loads(payload[6:])
    return int(parsed['result']['totalCount'])
async def fetch_category_shopping_trending_keywords(category_id, start_date: datetime.date, end_date: datetime.date):
    """Fetch the top trending shopping keywords for a Naver DataLab category.

    :param category_id: DataLab shopping category id (``cid`` form field)
    :param start_date: first day of the window, inclusive
    :param end_date: last day of the window, inclusive
    :return: the ``ranks`` list from the endpoint's JSON response
    """
    headers = {
        'authority': 'datalab.naver.com',
        'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
        'accept': '*/*',
        'x-requested-with': 'XMLHttpRequest',
        'sec-ch-ua-mobile': '?0',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
        'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'origin': 'https://datalab.naver.com',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'cors',
        'sec-fetch-dest': 'empty',
        'referer': 'https://datalab.naver.com/shoppingInsight/sCategory.naver',
        'accept-language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
    }
    form = {
        'cid': category_id,
        'timeUnit': 'date',
        'startDate': start_date.isoformat(),
        'endDate': end_date.isoformat(),
        'age': '',
        'gender': '',
        'device': '',
        'page': '1',
        'count': '100'
    }
    # FIX: the original called blocking requests.post() inside an async def,
    # which stalls the event loop. Use aiohttp like the sibling fetchers.
    # content_type=None: the endpoint may not declare application/json.
    async with aiohttp.ClientSession() as session:
        async with session.post('https://datalab.naver.com/shoppingInsight/getCategoryKeywordRank.naver',
                                headers=headers, data=form) as response:
            payload = await response.json(content_type=None)
    return payload['ranks']
async def fetch_keyword_graph_statistics(keyword, category_id, time_unit: TimeUnit, start_date: datetime.date, end_date: datetime.date):
    """Collect the chart data series (click trend, gender rate, age rate) for a keyword.

    Issues three sequential POSTs to the DataLab shopping-insight endpoints and
    returns ``{'clickTrend': [...], 'genderRate': [...], 'ageRate': [...]}``.
    The device-rate endpoint (getKeywordDeviceRate.naver) was deliberately left
    out in the original and remains unqueried.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
        'referer': 'https://datalab.naver.com/shoppingInsight/sKeyword.naver',
        'origin': 'https://datalab.naver.com'
    }
    form = {
        'cid': category_id,
        # NOTE(review): fetch_relative_ratio sends time_unit.value, this sends the
        # enum itself — confirm TimeUnit serializes to 'date'/'week'/'month' here.
        'timeUnit': time_unit,
        'startDate': start_date.isoformat(),
        'endDate': end_date.isoformat(),
        'age': '',     # e.g. '30,40' — comma-joined decades; empty = all
        'gender': '',  # 'f', 'm', or 'f,m'; empty = all
        'device': '',  # 'pc', 'mo', or 'pc,mo'; empty = all
        'keyword': keyword
    }
    endpoints = (
        ('clickTrend', 'https://datalab.naver.com/shoppingInsight/getKeywordClickTrend.naver'),
        ('genderRate', 'https://datalab.naver.com/shoppingInsight/getKeywordGenderRate.naver'),
        ('ageRate', 'https://datalab.naver.com/shoppingInsight/getKeywordAgeRate.naver'),
    )
    statistics = {}
    async with aiohttp.ClientSession() as session:
        for field, url in endpoints:
            # content_type=None because the endpoints do not declare application/json.
            async with session.post(url, headers=headers, data=form) as response:
                body = await response.json(content_type=None)
            statistics[field] = body['result'][0]['data']
    return statistics
async def fetch_relative_ratio(keywords: List[str], start_date: datetime.date, end_date: datetime.date, time_unit: TimeUnit):
    """Query the Naver DataLab open search API for relative search-volume ratios.

    Each keyword becomes its own keyword group. Returns a list of
    ``{'keyword': <keyword>, 'data': [{'period': ..., 'ratio': ...}, ...]}``
    built from the API's ``results`` array.
    """
    # SECURITY(review): API credentials are hard-coded in source; they should be
    # moved to configuration/environment and rotated.
    headers = {
        'X-Naver-Client-Id': '8VcP69maRqven9qJWV1b',
        'X-Naver-Client-Secret': 'BH21bBnIJz'
    }
    body = {
        'startDate': start_date.isoformat(),
        'endDate': end_date.isoformat(),
        'timeUnit': time_unit.value,
        'keywordGroups': [
            {'groupName': keyword, 'keywords': [keyword]}
            for keyword in keywords
        ]
    }
    # FIX: the original called blocking requests.post() inside an async def,
    # which stalls the event loop. Use aiohttp like the other fetchers here.
    async with aiohttp.ClientSession() as session:
        async with session.post("https://openapi.naver.com/v1/datalab/search",
                                headers=headers, data=json.dumps(body)) as response:
            raw = (await response.json(content_type=None))['results']
    return [{'keyword': d['keywords'][0], 'data': d['data']} for d in raw]
def list_feriados(dataInicial: datetime.date, dataFinal: datetime.date):
    """Return holiday ('feriado') entities with dt_feriado in [dataInicial, dataFinal].

    Queries the GCloud Datastore FERIADOS kind, ordered ascending by
    'dt_feriado', and converts each entity via from_datastore.
    """
    client = get_client()
    query = client.query(kind=TipoEntidade.FERIADOS.value)
    # Dates are stored as ISO strings, so the range filter compares ISO text.
    query.add_filter('dt_feriado', '>=', dataInicial.isoformat())
    query.add_filter('dt_feriado', '<=', dataFinal.isoformat())
    query.order = ['dt_feriado']
    return builtin_list(map(from_datastore, query.fetch()))
def _do_validate(self, validated_date: datetime.date) -> bool:
    """Check that the date's ISO string parses against self.__DATE_PATTERN.

    A falsy *validated_date* (e.g. None) is considered valid. Returns False
    only when strptime raises ValueError for the ISO representation.
    """
    if not validated_date:
        return True
    try:
        datetime.strptime(validated_date.isoformat(), self.__DATE_PATTERN)
        return True
    except ValueError:
        return False
def crawl_library_group_room_availability(session: WebSession, date: datetime.date):
    """Fetch and parse the library group-room availability page for *date*.

    POSTs the availability form to urls.LIBRARY_GROUP_ROOMS through the web
    session and delegates extraction to the parser module.
    """
    form_data = {
        'submit:reservas:es': 'Ver+disponibilidade',
        'data': date.isoformat()
    }
    soup = session.get_broken_simplified_soup(urls.LIBRARY_GROUP_ROOMS, post_data=form_data)
    return parser.get_library_group_room_availability(soup)
def list_indices(indexador: str, dataInicial: datetime.date, dataFinal: datetime.date):
    """Return index entities of type *indexador* with dt_referencia in the date range.

    Queries the GCloud Datastore INDICES kind, ordered ascending by
    'dt_referencia', and converts each entity via from_datastore.
    """
    client = get_client()
    query = client.query(kind=TipoEntidade.INDICES.value)
    query.add_filter('tp_indice', '=', indexador)
    # Dates are stored as ISO strings, so the range filter compares ISO text.
    query.add_filter('dt_referencia', '>=', dataInicial.isoformat())
    query.add_filter('dt_referencia', '<=', dataFinal.isoformat())
    query.order = ['dt_referencia']
    return builtin_list(map(from_datastore, query.fetch()))
def remove_old_ts(self, output_dir, before_date: datetime.date):
    """Back up to JSON files and then delete HouseTS rows created before the cutoff.

    Rows are dumped via self.dump_row under output_dir (in
    <ts_dir>/YYYY/MM/DD subdirectories keyed by each row's creation date),
    deleted one by one, and processed in pages of self.items_per_page rows,
    with a '\\r'-rewritten percentage progress line on stdout.
    """
    # Advance the cutoff one day so rows created on before_date itself are included.
    before_date += timedelta(1)
    old_houses = HouseTS.objects.filter(
        created__lt=before_date
    )
    total_house = old_houses.count()
    total_done = 0
    # Non-zero sentinel so the paging loop below runs at least once.
    done_this_ite = 1
    # Suppress automatic newlines so '\r' progress updates rewrite a single line.
    self.stdout.ending = ''
    self.stdout.write("[HouseTS] Start to backup {} rows before {}.\n".format(
        total_house, before_date.isoformat()
    ))
    while done_this_ite:
        # Re-query each pass: deletions shrink the set, so always take the first page.
        old_houses = HouseTS.objects.filter(
            created__lt=before_date
        )[:self.items_per_page]
        done_this_ite = 0
        for house in old_houses:
            # Backup path mirrors the row's creation date.
            sub_dir = '{}/{:04d}/{:02d}/{:02d}'.format(
                self.ts_dir,
                house.created.year,
                house.created.month,
                house.created.day
            )
            filename = 'house.{}.{}.json'.format(house.vendor.name, house.vendor_house_id)
            self.dump_row(
                base_dir=output_dir,
                sub_dir=sub_dir,
                filename=filename,
                house=house
            )
            # Delete only after the dump succeeded for this row.
            house.delete()
            total_done += 1
            done_this_ite += 1
            self.stdout.write("\r[HouseTS] {:3.0f}% done".format(
                100 * total_done / total_house
            ))
    self.stdout.write("\n[HouseTS] done!\n")
    # Restore the default line ending for subsequent writes.
    self.stdout.ending = '\n'
def list_indexadores(dt_referencia: datetime.date = None, tipo_atualizacao: str = None):
    """Return indexer entities, optionally filtered by last reference date and update type.

    Queries the GCloud Datastore INDEXADORES kind ordered ascending by
    'dt_ult_referencia'; each filter is applied only when its argument is
    provided. Entities are converted via from_datastore.
    """
    client = get_client()
    query = client.query(kind=TipoEntidade.INDEXADORES.value)
    if dt_referencia is not None:
        # Dates are stored as ISO strings, so the filter compares ISO text.
        query.add_filter('dt_ult_referencia', '<', dt_referencia.isoformat())
    if tipo_atualizacao is not None:
        query.add_filter('tipo_atualizacao', '=', tipo_atualizacao)
    query.order = ['dt_ult_referencia']
    return builtin_list(map(from_datastore, query.fetch()))