def test_add_or_replace_parameters(self): url = "http://domain/test" self.assertEqual(add_or_replace_parameters(url, {"arg": "v"}), "http://domain/test?arg=v") url = "http://domain/test?arg1=v1&arg2=v2&arg3=v3" self.assertEqual( add_or_replace_parameters(url, {"arg4": "v4"}), "http://domain/test?arg1=v1&arg2=v2&arg3=v3&arg4=v4", ) self.assertEqual( add_or_replace_parameters(url, { "arg4": "v4", "arg3": "v3new" }), "http://domain/test?arg1=v1&arg2=v2&arg3=v3new&arg4=v4", ) url = "http://domain/test?arg1=v1&arg2=v2&arg1=v3" self.assertEqual( add_or_replace_parameters(url, {"arg4": "v4"}), "http://domain/test?arg1=v1&arg2=v2&arg1=v3&arg4=v4", ) self.assertEqual( add_or_replace_parameters(url, {"arg1": "v3"}), "http://domain/test?arg1=v3&arg2=v2", )
def get_weather(url_params):
    db_hook = SqliteHook(conn_name_attr='sqlite_default ')
    api_hook = HttpHook(http_conn_id='http_default', method='GET')

    url = add_or_replace_parameters('v1/history/daily', url_params)
    resp = api_hook.run(url)
    data = resp.json()['data']
    # usually not needed, but freeing the response early can help with large payloads
    del resp

    weather_insert = """
        insert or ignore into weather
            (station_id, record_date, temperature, temperature_min, temperature_max,
             winddirection, windspeed, sunshine, pressure)
        values (?, ?, ?, ?, ?, ?, ?, ?, ?);
    """

    for day in data:
        db_hook.run(
            weather_insert,
            parameters=(
                url_params['station'],
                day['date'],
                day['temperature'],
                day['temperature_min'],
                day['temperature_max'],
                day['winddirection'],
                day['windspeed'],
                day['sunshine'],
                day['pressure'],
            ),
        )

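# Hypothetical invocation of get_weather: 'station' is the only key the function itself
# reads; the station id and the date-range keys are assumptions about the weather API,
# not taken from the snippet above.
get_weather({'station': '10637', 'start': '2021-01-01', 'end': '2021-01-31'})
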
def start_requests(self):
    first_url = 'https://www.amazon.com.br/s'
    pars = {'k': self.query, 'i': self.category}
    yield Request(add_or_replace_parameters(first_url, pars), callback=self.parse)

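# With hypothetical values self.query = 'ssd' and self.category = 'computers', the
# request above would target https://www.amazon.com.br/s?k=ssd&i=computers
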
def start_requests(self):
    for url in self.start_urls:
        # self.current_brand = url.split('item.rakuten.co.jp/')[1].split('/')[0]
        # yield SplashRequest(url, endpoint='render.html', callback=self.parse)
        for i in range(1, 100, 2):
            params = {
                "max": i + 1,
                "min": i,
            }
            search_url = add_or_replace_parameters(url, params)
            yield scrapy.Request(search_url, callback=self.parse_page)

def handle(self, *args, **options):
    final_dictionary = {}
    for year in range(options['year'], options['year'] + 1):
        final_dictionary[year] = {'Name': year, 'Childs': {}}
        year_lookup = {'year': year}
        for make_id, _ in get_dict(year_lookup, final_dictionary):
            make_lookup = year_lookup.copy()
            make_lookup['make'] = make_id
            for model_id, _ in get_dict(make_lookup, final_dictionary):
                model_lookup = make_lookup.copy()
                model_lookup['model'] = model_id
                for part_id, _ in get_dict(model_lookup, final_dictionary):
                    part_lookup = model_lookup.copy()
                    part_lookup['partname'] = part_id
                    part_lookup['part'] = part_id[:4]
                    for fitment_id, make in get_dict(part_lookup, final_dictionary):
                        nav_lookup = part_lookup.copy()
                        nav_lookup['cnsuffix'] = fitment_id
                        fitment_lookup = part_lookup.copy()
                        fitment_lookup.pop('part')
                        fitment_lookup['make'] = make
                        fitment_lookup['cnsuffix'] = fitment_id
                        url = add_or_replace_parameters(
                            'https://www.buyautoparts.com/partsearch/newfitmentajax.asp',
                            fitment_lookup)
                        soup = BeautifulSoup(
                            requests.get(url, headers=headers).text,
                            features="html.parser")
                        current_dict = current(nav_lookup, final_dictionary)
                        try:
                            current_dict['Link'] = soup.text
                            fill_table(current_dict['Link'], fitment_id)
                        except Exception as e:
                            errors_lookups.append(nav_lookup)
    if errors_lookups:
        json_text = json.dumps(errors_lookups, indent=4)
        print(json_text)
        with open('error_lookups.json', 'w') as error_file:
            error_file.writelines(json_text)

class AnJuKeListSpider(Spider):
    name = "anjuke_list_spider"
    start_urls = [
        add_or_replace_parameters(NEWS_LIST_URL, {PAGE_KEY: 1, TYPE_KEY: 2})
    ]
    custom_settings = {
        'DEFAULT_REQUEST_HEADERS': {
            'authority': 'sh.news.anjuke.com',
            'accept': '*/*',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
            'x-requested-with': 'XMLHttpRequest',
            'sec-fetch-site': 'same-origin',
            'sec-fetch-mode': 'cors',
            'sec-fetch-dest': 'empty',
            'referer': 'https://sh.news.anjuke.com/?from=navigation',
            'accept-language': 'zh-CN,zh;q=0.9',
        },
        'DOWNLOAD_DELAY': 2,
    }

    def parse(self, response):
        result = json.loads(response.text)
        news_list = result['list']
        release_time = None
        if news_list:
            for news_item in news_list:
                loader = ItemLoader(item=NewsItem())
                loader.add_value('release_time', news_item['time'])
                loader.add_value('title', news_item['title'])
                loader.add_value('url', news_item['url'])
                loader.add_value('thumbnail', news_item['image_url'])
                item = loader.load_item()
                release_time = item['release_time']
                print(item)
        else:
            if isinstance(release_time, datetime) \
                    and (datetime.now() - release_time).days < settings['MAX_CRAWLED_DAYS']:
                page = url_query_parameter(response.url, PAGE_KEY)
                page = int(page) + 1
                next_page_url = add_or_replace_parameters(
                    response.url, {PAGE_KEY: page})

def get_dict(lookup, final_dictionary):
    url = add_or_replace_parameters(
        'https://www.buyautoparts.com/partsearch/homegetymmdetails.asp', lookup)
    soup = BeautifulSoup(requests.get(url, headers=headers).text,
                         features="html.parser")
    options = soup.find_all('option')
    current_dict = current(lookup, final_dictionary)
    if options:
        reqmake = re.findall(r"var reqmake = '(.*)'", soup.find('script').string)
        for option in options:
            value = option.get('value')
            if value:
                current_dict[value] = {'Name': option.text, 'Childs': {}}
                yield value, reqmake[0] if reqmake else None
    else:
        try:
            current_dict['Link'] = re.findall(r"0 ~(.*)~", soup.text)[0]
            fill_table(current_dict['Link'])
        except Exception as e:
            errors_lookups.append(lookup)

def test_add_or_replace_parameters_does_not_change_input_param(self): url = "http://domain/test?arg=original" input_param = {"arg": "value"} add_or_replace_parameters(url, input_param) # noqa self.assertEqual(input_param, {"arg": "value"})