Example no. 1
 def test_add_or_replace_parameters(self):
     url = "http://domain/test"
     self.assertEqual(add_or_replace_parameters(url, {"arg": "v"}),
                      "http://domain/test?arg=v")
     url = "http://domain/test?arg1=v1&arg2=v2&arg3=v3"
     self.assertEqual(
         add_or_replace_parameters(url, {"arg4": "v4"}),
         "http://domain/test?arg1=v1&arg2=v2&arg3=v3&arg4=v4",
     )
     self.assertEqual(
         add_or_replace_parameters(url, {
             "arg4": "v4",
             "arg3": "v3new"
         }),
         "http://domain/test?arg1=v1&arg2=v2&arg3=v3new&arg4=v4",
     )
     url = "http://domain/test?arg1=v1&arg2=v2&arg1=v3"
     self.assertEqual(
         add_or_replace_parameters(url, {"arg4": "v4"}),
         "http://domain/test?arg1=v1&arg2=v2&arg1=v3&arg4=v4",
     )
     # replacing a key that occurs twice in the query collapses the duplicates
     self.assertEqual(
         add_or_replace_parameters(url, {"arg1": "v3"}),
         "http://domain/test?arg1=v3&arg2=v2",
     )
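All of these tests exercise add_or_replace_parameters from w3lib.url: existing query keys are replaced in place, missing keys are appended. A minimal standalone sketch of that behaviour (assuming w3lib is installed):

    from w3lib.url import add_or_replace_parameters

    url = "http://domain/test?arg1=v1&arg2=v2"
    # arg2 exists and is replaced; arg3 is new and gets appended
    print(add_or_replace_parameters(url, {"arg2": "new", "arg3": "v3"}))
    # http://domain/test?arg1=v1&arg2=new&arg3=v3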
Example no. 2
 def test_add_or_replace_parameters(self):
     url = 'http://domain/test'
     self.assertEqual(add_or_replace_parameters(url, {'arg': 'v'}),
                      'http://domain/test?arg=v')
     url = 'http://domain/test?arg1=v1&arg2=v2&arg3=v3'
     self.assertEqual(add_or_replace_parameters(url, {'arg4': 'v4'}),
                      'http://domain/test?arg1=v1&arg2=v2&arg3=v3&arg4=v4')
     self.assertEqual(add_or_replace_parameters(url, {'arg4': 'v4', 'arg3': 'v3new'}),
                      'http://domain/test?arg1=v1&arg2=v2&arg3=v3new&arg4=v4')
Example no. 3
 def test_add_or_replace_parameters(self):
     url = 'http://domain/test'
     self.assertEqual(add_or_replace_parameters(url, {'arg': 'v'}),
                      'http://domain/test?arg=v')
     url = 'http://domain/test?arg1=v1&arg2=v2&arg3=v3'
     self.assertEqual(add_or_replace_parameters(url, {'arg4': 'v4'}),
                      'http://domain/test?arg1=v1&arg2=v2&arg3=v3&arg4=v4')
     self.assertEqual(
         add_or_replace_parameters(url, {
             'arg4': 'v4',
             'arg3': 'v3new'
         }), 'http://domain/test?arg1=v1&arg2=v2&arg3=v3new&arg4=v4')
Example no. 4
def get_weather(url_params):
    db_hook = SqliteHook(sqlite_conn_id='sqlite_default')
    api_hook = HttpHook(http_conn_id='http_default', method='GET')

    url = add_or_replace_parameters('v1/history/daily', url_params)

    resp = api_hook.run(url)
    data = resp.json()['data']

    # drop the response object as soon as it is parsed; only really matters for large payloads
    del resp

    weather_insert = """
    insert or ignore into weather 
    (station_id, record_date, temperature, 
    temperature_min, temperature_max, winddirection, windspeed, sunshine, pressure) 
    values (?, ?, ?, ?, ?, ?, ?, ?, ?);
    """

    for day in data:
        db_hook.run(weather_insert,
                    parameters=(url_params['station'], day['date'],
                                day['temperature'], day['temperature_min'],
                                day['temperature_max'], day['winddirection'],
                                day['windspeed'], day['sunshine'],
                                day['pressure']))
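For context, a sketch of how get_weather might be scheduled (assuming classic Airflow 1.x import paths; the DAG name and station ID are made up):

    from datetime import datetime
    from airflow import DAG
    from airflow.operators.python_operator import PythonOperator

    dag = DAG('weather_etl', start_date=datetime(2020, 1, 1),
              schedule_interval='@daily')

    fetch_weather = PythonOperator(
        task_id='fetch_weather',
        python_callable=get_weather,
        op_kwargs={'url_params': {'station': '10637'}},  # hypothetical station ID
        dag=dag,
    )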
Example no. 5
    def start_requests(self):

        first_url = 'https://www.amazon.com.br/s'

        pars = {'k': self.query, 'i': self.category}

        yield Request(add_or_replace_parameters(first_url, pars),
                      callback=self.parse)
Example no. 6
 def start_requests(self):
     for url in self.start_urls:
         # self.current_brand = url.split('item.rakuten.co.jp/')[1].split('/')[0]
         # yield SplashRequest(url, endpoint='render.html', callback=self.parse)
         # query narrow two-unit price windows: min=i, max=i+1
         for i in range(1, 100, 2):
             params = {
                 "max": i + 1,
                 "min": i,
             }
             search_url = add_or_replace_parameters(url, params)
             yield scrapy.Request(search_url, callback=self.parse_page)
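Each iteration builds one search URL per price window; for instance (base URL hypothetical):

    >>> from w3lib.url import add_or_replace_parameters
    >>> add_or_replace_parameters('https://example.com/search', {'max': '2', 'min': '1'})
    'https://example.com/search?max=2&min=1'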
Example no. 7
    def handle(self, *args, **options):
        final_dictionary = {}

        # effectively a single year; widen the range bounds to crawl more years
        for year in range(options['year'], options['year'] + 1):
            final_dictionary[year] = {'Name': year, 'Childs': {}}
            year_lookup = {'year': year}

            for make_id, _ in get_dict(year_lookup, final_dictionary):
                make_lookup = year_lookup.copy()
                make_lookup['make'] = make_id

                for model_id, _ in get_dict(make_lookup, final_dictionary):
                    model_lookup = make_lookup.copy()
                    model_lookup['model'] = model_id

                    for part_id, _ in get_dict(model_lookup, final_dictionary):
                        part_lookup = model_lookup.copy()
                        part_lookup['partname'] = part_id
                        part_lookup['part'] = part_id[:4]

                        for fitment_id, make in get_dict(
                                part_lookup, final_dictionary):
                            nav_lookup = part_lookup.copy()
                            nav_lookup['cnsuffix'] = fitment_id

                            fitment_lookup = part_lookup.copy()
                            fitment_lookup.pop('part')
                            fitment_lookup['make'] = make
                            fitment_lookup['cnsuffix'] = fitment_id

                            url = add_or_replace_parameters(
                                'https://www.buyautoparts.com/partsearch/newfitmentajax.asp',
                                fitment_lookup)
                            soup = BeautifulSoup(requests.get(
                                url, headers=headers).text,
                                                 features="html.parser")

                            current_dict = current(nav_lookup,
                                                   final_dictionary)
                            try:
                                current_dict['Link'] = soup.text
                                fill_table(current_dict['Link'], fitment_id)
                            except Exception:
                                errors_lookups.append(nav_lookup)
        if errors_lookups:
            json_text = json.dumps(errors_lookups, indent=4)
            print(json_text)
            with open('error_lookups.json', 'w') as error_file:
                error_file.write(json_text)
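The helpers current() and fill_table(), along with the module-level headers and errors_lookups, are defined elsewhere in the same project and are not shown here; get_dict() is the generator listed in Example no. 10 below.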
Example no. 8
class AnJuKeListSpider(Spider):
    name = "anjuke_list_spider"

    start_urls = [
        add_or_replace_parameters(NEWS_LIST_URL, {
            PAGE_KEY: 1,
            TYPE_KEY: 2
        })
    ]

    custom_settings = {
        'DEFAULT_REQUEST_HEADERS': {
            'authority': 'sh.news.anjuke.com',
            'accept': '*/*',
            'user-agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
            'x-requested-with': 'XMLHttpRequest',
            'sec-fetch-site': 'same-origin',
            'sec-fetch-mode': 'cors',
            'sec-fetch-dest': 'empty',
            'referer': 'https://sh.news.anjuke.com/?from=navigation',
            'accept-language': 'zh-CN,zh;q=0.9',
        },
        'DOWNLOAD_DELAY': 2
    }

    def parse(self, response):

        result = json.loads(response.text)
        news_list = result['list']
        release_time = None
        if news_list:
            for news_item in news_list:
                loader = ItemLoader(item=NewsItem())
                loader.add_value('release_time', news_item['time'])
                loader.add_value('title', news_item['title'])
                loader.add_value('url', news_item['url'])
                loader.add_value('thumbnail', news_item['image_url'])
                item = loader.load_item()
                release_time = item['release_time']
                print(item)
            else:
                if isinstance(release_time, datetime) \
                        and (datetime.now() - release_time).days < settings['MAX_CRAWLED_DAYS']:
                    page = url_query_parameter(response.url, PAGE_KEY)
                    page = int(page) + 1
                    next_page_url = add_or_replace_parameters(
                        response.url, {PAGE_KEY: page})
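The listing breaks off after next_page_url is computed; presumably the method goes on to request the next page, along the lines of (a guess, not shown in the source):

    yield Request(next_page_url, callback=self.parse)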
Example no. 10
def get_dict(lookup, final_dictionary):
    url = add_or_replace_parameters(
        'https://www.buyautoparts.com/partsearch/homegetymmdetails.asp',
        lookup)
    soup = BeautifulSoup(requests.get(url, headers=headers).text,
                         features="html.parser")

    options = soup.find_all('option')
    current_dict = current(lookup, final_dictionary)
    if options:
        reqmake = re.findall(r"var reqmake =  '(.*)'",
                             soup.find('script').string)
        for option in options:
            value = option.get('value')
            if value:
                current_dict[value] = {'Name': option.text, 'Childs': {}}
                yield value, reqmake[0] if reqmake else None
    else:
        try:
            current_dict['Link'] = re.findall(r"0 ~(.*)~", soup.text)[0]
            fill_table(current_dict['Link'])
        except Exception:
            errors_lookups.append(lookup)
Example no. 11
 def test_add_or_replace_parameters_does_not_change_input_param(self):
     url = "http://domain/test?arg=original"
     input_param = {"arg": "value"}
     add_or_replace_parameters(url, input_param)  # noqa
     self.assertEqual(input_param, {"arg": "value"})
Example no. 12
 def test_add_or_replace_parameters_does_not_change_input_param(self):
     url = 'http://domain/test?arg=original'
     input_param = {'arg': 'value'}
     new_url = add_or_replace_parameters(url, input_param)  # noqa
     self.assertEqual(input_param, {'arg': 'value'})
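Both of these tests pin down the same guarantee: the function returns a new URL and leaves the input dict untouched. A minimal interactive check (assuming w3lib is installed):

    >>> from w3lib.url import add_or_replace_parameters
    >>> params = {'arg': 'value'}
    >>> add_or_replace_parameters('http://domain/test?arg=original', params)
    'http://domain/test?arg=value'
    >>> params  # unchanged
    {'arg': 'value'}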