Example #1
 def start_requests(self):
     urls = ['https://api.github.com/users/{}/starred'.format(name) for name in usernames]
     for url in urls:
         req = JsonRequest(url=url, callback=self.parse_stars)
        # API call; no need to check robots.txt
         req.meta['dont_obey_robotstxt'] = True
         yield req
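A note on the meta flag used above: since the request is built and configured in one place, the same meta can be passed straight to the JsonRequest constructor. A minimal equivalent sketch (same spider context assumed):

 def start_requests(self):
     urls = ['https://api.github.com/users/{}/starred'.format(name) for name in usernames]
     for url in urls:
         # meta supplied at construction time instead of mutating req.meta afterwards
         yield JsonRequest(url=url, callback=self.parse_stars,
                           meta={'dont_obey_robotstxt': True})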
Example #2
 def test_request_class(self):
     r1 = FormRequest("http://www.example.com")
     self._assert_serializes_ok(r1, spider=self.spider)
     r2 = CustomRequest("http://www.example.com")
     self._assert_serializes_ok(r2, spider=self.spider)
     r3 = JsonRequest("http://www.example.com", dumps_kwargs={"indent": 4})
     self._assert_serializes_ok(r3, spider=self.spider)
Example #3
 def start_requests(self):
     yield FormRequest(self.start_url,
                       callback=self.parse_response,
                       formdata=self.data)
     yield JsonRequest(self.start_url,
                       callback=self.parse_response,
                       data=self.data)
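The two requests above differ only in how the payload is encoded: FormRequest sends formdata as application/x-www-form-urlencoded, while JsonRequest serializes data with json.dumps and sets a Content-Type: application/json header. A minimal sketch of the resulting bodies, assuming a toy payload:

 from scrapy import FormRequest
 from scrapy.http import JsonRequest

 data = {'q': 'test'}
 form_req = FormRequest('https://example.com', formdata=data)
 json_req = JsonRequest('https://example.com', data=data)

 print(form_req.body)  # b'q=test'          (form-encoded)
 print(json_req.body)  # b'{"q": "test"}'   (JSON-encoded)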
Example #4
 def get_token(self):
     headers = {
         'authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA',
         'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36'
     }
     url = 'https://api.twitter.com/1.1/guest/activate.json'
     yield JsonRequest(url=url, method='POST', headers=headers, callback=self.parse_token)
Example #5
 def start_requests(self):
     print('\n--- Starting crawl of stock: {} ({}) ---\n'.format(
         self.current_stock_info['name'],
         self.current_stock_info['symbol'].upper()))
     yield JsonRequest(url=self.start_url,
                       data=self.start_query_params,
                       dont_filter=True)
Example #6
 def start_requests(self):
     return [
         JsonRequest(url=self.link,
                     data=formdata(),
                     headers=headers,
                     callback=self.parse)
     ]
Example #7
    def parse_product_list(self, response: Response):
        json_data = response.text
        data = json.loads(json_data)[0]['data']['valentino']
        products = data['products']
        edges = products['edges']

        codes = [edge['node']['code'] for edge in edges]

        for code in codes:
            req_body = [{
                "operationName":
                "fetchProductDetail",
                "variables": {
                    "code": code,
                    "breadcrumbFlg": "NO",
                    "platform": "valentino"
                },
                "query":
                "query fetchProductDetail($code: ID, $breadcrumbFlg: breadcrumbFlg, $platform: Platform) {  shop {    productDetail(code: $code, breadcrumbFlg: $breadcrumbFlg, platform: $platform) {      userErrors {        code        message      }      product {        code        title        description        images {          url        }        styleProducts {          code          title          images {            url          }          salePrice {            amount            currencyCode          }          skus {            code            salePrice {              amount              currencyCode            }            options {              code              frontName              values {                frontName                code                images {                  url                }              }            }          }          options {            code            frontName            values {              code              frontName              images {                url              }            }          }        }        salePrice {          amount          currencyCode        }        skus {          code          salePrice {            amount            currencyCode          }          options {            code            frontName            values {              frontName              code              images {                url              }            }          }        }        options {          code          frontName          values {            code            frontName            images {              url            }          }          code        }      }    }  }}"
            }]
            json_body = json.dumps(req_body)
            yield JsonRequest(self.ql_url,
                              method='POST',
                              body=json_body,
                              callback=self.parse_product)
Example #8
    def start_requests(self):

        busca_url = "https://www.rentfaster.ca/api/map.json"

        dados = {
            "e": "zoom_changed",
            "l": "12,51.0687,-114.0899",
            "beds": ",bachelor",
            "baths": "1,1.5,2,2.5,1,1.5,2,2.5,3+",
            "type": "Apartment,Condo,Loft,Condo,Loft",
            "price_range_adv[from]": "0",
            "price_range_adv[to]": "1000",
            "furnishing": "Unfurnished",
            "area": "51.16417413931004,-113.96716211547852,50.97302850876522,-114.21263788452148"
        }

        yield JsonRequest(busca_url, data=dados, callback=self.parse)
Example #9
 def start_requests(self):
     size = 50
     max_page = 33
     for page in range(1, max_page + 1):
         req_body = [{
             "operationName":
             "fetchProductList",
             "variables": {
                 "input": {
                     "size": size,
                     "page": page,
                     "filters": {
                         'keyword': ''
                     },
                     "breadcrumbFlg": "YES"
                 }
             },
             "query":
             "query fetchProductList($input: ProductFilters) {  valentino {    products(input: $input) {      pageInfo {                totalCount                size                page                hasNextPage      }        edges {             node {                   code        }      }    }  }}"
         }]
         json_body = json.dumps(req_body)
         yield JsonRequest(self.ql_url,
                           method='POST',
                           body=json_body,
                           callback=self.parse_product_list)
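Examples #7 and #9 serialize the GraphQL payload manually and pass it as body=. JsonRequest can do this itself via data=, which also sets the JSON headers; the output is not byte-identical only because JsonRequest passes sort_keys=True to json.dumps by default. A sketch of the equivalent call for the loop above:

 # equivalent to body=json.dumps(req_body); JsonRequest serializes
 # the `data` dict itself and sets Content-Type: application/json
 yield JsonRequest(self.ql_url,
                   method='POST',
                   data=req_body,
                   callback=self.parse_product_list)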
Example #10
 def post_item(self, item):
     if '?' not in item.post_link:
         post_link = f'{item.post_link}?json=1'
     else:
         # join with '&' when the link already carries a query string
         post_link = f'{item.post_link}&json=1'
     return JsonRequest(post_link, method='POST', data=item, dont_filter=True,
                        callback=self.post_success, errback=self.parse_fail)
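String concatenation like this stays fragile (a link ending in '?' or '&' would still come out wrong). A more robust sketch using only the standard library; with_json_flag is a hypothetical helper name:

 from urllib.parse import urlsplit, urlunsplit, parse_qsl, urlencode

 def with_json_flag(link):
     """Append json=1 to a URL regardless of its existing query string."""
     parts = urlsplit(link)
     query = parse_qsl(parts.query)
     query.append(('json', '1'))
     return urlunsplit(parts._replace(query=urlencode(query)))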
Example #11
 def start_requests(self):
     # get cat data
     if not hasattr(self, 'bidDatas') or not self.bidDatas:
         self.page += 1
         url = f'http://crawler.wemarry.vn/api/get-detail-multi?id={self.params}&page={self.page}'
         yield JsonRequest(url, callback=self.parse_data, dont_filter=True)
     # create requests
     if getattr(self, 'bidDatas', None):
         data = self.bidDatas.pop()
         item = obj(
             id_web=data.get('ID_WEB'),
             id=data.get('ID'),
             link=data.get('LINK'),
             post_link=data.get('POST_LINK'),
             arr_law=data.get('ARR_LAW'),
             # hotel_city_id=None, # ha noi/ da nang/ ho chi minh
             # hotel_search_image='',
             # # detail hotel
             # hotel_source=None, # Agoda,...
             # hotel_type=None, # str nha nghi/ khach san, ...
             # hotel_name=None, # str Hotel name
             # hotel_star=0, # int Hotel rating
             # hotel_address=None, # str Hotel address
             # hotel_description=None, # str Hotel description
             # hotel_image=None, # list of Hotel image
             # hotel_attribute=None, # list() Hotel attribute
             # hotel_latlng=None, # str 'lat,lon'
             # hotel_price=None, # list of room info: { name, price, guest, attribute }
             # # review
             # hotel_review=None # list of reviews: { name, image, rating, title, content }
         )
         for request in self.create_request(item):
             yield request
Example #12
    def parse(self, response):
        company_name = ""
        for gpu in response.xpath('//*[@class="processors"]//tr'):

            current_company_name = gpu.xpath(
                '*[@class="mfgr"]//text()').extract_first()
            # remember the most recent manufacturer so it carries over to following rows
            if current_company_name in ("AMD", "Intel", "NVIDIA"):
                company_name = current_company_name

            data = {
                'company': company_name,
                'product_name': gpu.xpath('td[1]//a//text()').extract_first(),
                'gpu_chip': gpu.xpath('td[2]//a//text()').extract_first(),
                'release_date': gpu.xpath('td[3]//text()').extract_first(),
                'bus': gpu.xpath('td[4]//text()').extract_first(),
                'memory': gpu.xpath('td[5]//text()').extract_first(),
                'gpu_clock': gpu.xpath('td[6]//text()').extract_first(),
                'memory_clock': gpu.xpath('td[7]//text()').extract_first(),
            }

            yield JsonRequest(url='http://localhost/api/gpu/add',
                              headers={'X-AUTH-TOKEN': apikey},
                              data=data)
Example #13
    def parse(self, response):
        company_name = ""
        for cpu in response.xpath('//*[@class="processors"]//tr'):

            current_company_name = cpu.xpath(
                '*[@class="mfgr"]//text()').extract_first()
            # remember the most recent manufacturer so it carries over to following rows
            if current_company_name in ("AMD", "Intel"):
                company_name = current_company_name

            data = {
                'company': company_name,
                'product_name': cpu.xpath('td[1]//a//text()').extract_first(),
                'code_name': cpu.xpath('td[2]//text()').extract_first(),
                'cores': cpu.xpath('td[3]//text()').extract_first(),
                'clock': cpu.xpath('td[4]//text()').extract_first(),
                'socket': cpu.xpath('td[5]//text()').extract_first(),
                'process': cpu.xpath('td[6]//text()').extract_first(),
                'l3_cache': cpu.xpath('td[7]//text()').extract_first(),
                'tdp': cpu.xpath('td[8]//text()').extract_first(),
                'released': cpu.xpath('td[9]//text()').extract_first(),
            }

            yield JsonRequest(url='http://localhost/api/cpu/add',
                              headers={'X-AUTH-TOKEN': apikey},
                              data=data)
Example #14
    def parse(self, response):

        for debate in response.xpath(
                "/html/body/div[2]/div[4]/div/div/div/div[4]/div[1]/ul/li"):
            # for the "yes" side
            title_1 = debate.xpath('.//@href').extract()
            list_A.append(title_1)
            desc_yes = debate.xpath('p/text()').extract()
            list_B.append(desc_yes)

        print(len(list_A), len(list_B))

        print("---------------")

        # for debate1 in response.xpath("/html/body/div[2]/div[4]/div/div/div/div[4]/div[2]/ul/li"):
        #     # for the "no" side
        #     title2 = debate1.xpath('.//@href').extract()
        #     list_C.append(title2)
        #     desc_no = debate1.xpath('p/text()').extract()
        #     list_D.append(desc_no)
        #     title = debate1.xpath('a/text()').extract()
        #     list_E.append(title)
        #     # print(len(title2), len(desc_no), len(title))

        # print(len(list_C), len(list_D), len(list_E))

        # print("---------------*****")

        # for debate1 in zip(title_1, desc_yes, desc_no):
        for i in range(2):
            scraped_info = {
                'title1': list_A[i],
                'desc_yes': list_B[i]
                # 'desc_no': list_D[i]
            }

            yield scraped_info

        # next_page = response.xpath("/html/body/div[2]/div[4]/div/div/div/div[5]/a/@href").extract_first()
        # if next_page is not None:
        #     next_page_link = response.urljoin(next_page)
        #     yield scrapy.Request(url=next_page_link, callback=self.parse)

        # next_page = response.xpath("//a[@rel='next']/@href").get()
        # if next_page is not None:
        #     yield response.follow(url=next_page, callback=self.parse)

        data = {
            # 'debateId': 'DF5F0C8D-BDA6-4C05-9C50-07FCD527D8BE',
            'page': '5',
        }
        yield JsonRequest(
            url='https://www.debate.org/opinions/do-you-agree-with-the-black-lives-matter-movement-1',
            data=data)
Example #15
 def test_POST_small_json_x10(self):
     request = JsonRequest(url=self.get_url('/post-data-json-small'), method='POST', data=Data.JSON_SMALL)
     return self._check_POST_json_x10(
         request,
         Data.JSON_SMALL,
         Data.EXTRA_SMALL,
         200
     )
Example #16
 def test_POST_large_json_x10(self):
     request = JsonRequest(url=self.get_url('/post-data-json-large'), method='POST', data=Data.JSON_LARGE)
     return self._check_POST_json_x10(
         request,
         Data.JSON_LARGE,
         Data.EXTRA_LARGE,
         200
     )
Example #17
 def start_requests(self):
     params = self.post_params.copy()
     yield JsonRequest(url=self.url,
                       data=params,
                       callback=self.parse_results_list,
                       meta={
                           'dont_obey_robotstxt': True,
                           'from': 0
                       })
Example #18
 def start_requests(self):
     for page in range(1000):
         self.data['pageNo'] = page
         temp = JsonRequest(url=self.api_url,
                            headers=self.headers,
                            data=self.data,
                            callback=self.parse,
                            dont_filter=True)
         yield temp
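Mutating self.data inside the loop only works because JsonRequest serializes data with json.dumps at construction time, so each request captures the pageNo current at that moment. A slightly more defensive sketch builds a fresh dict per request instead:

 def start_requests(self):
     for page in range(1000):
         payload = dict(self.data, pageNo=page)  # copy per request; self.data stays untouched
         yield JsonRequest(url=self.api_url,
                           headers=self.headers,
                           data=payload,
                           callback=self.parse,
                           dont_filter=True)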
Example #19
    def parse(self, response):
        for hospital in response.css('table#dataList tbody tr'):
            data = {
                'contact-text': self.removeWhiteSpace(
                    self.trySafeIndexAccess(
                        hospital.xpath('./td[2]/text()').extract(), 0)),
                'contact-website': self.trySafeIndexAccess(
                    hospital.xpath('./td[2]//a/@href').extract(), 0),
                'region-abbreviation': self.translateBundesland(
                    self.removeWhiteSpace(
                        hospital.xpath('./td[3]/text()').extract()[0])),
                'icu-low-care': self.translateStatus(
                    hospital.xpath('./td[4]//span/@class').extract()[0]),
                'icu-high-care': self.translateStatus(
                    hospital.xpath('./td[5]//span/@class').extract()[0]),
                'ecmo': self.translateStatus(
                    hospital.xpath('./td[6]//span/@class').extract()[0]),
                'date-of-information': self.removeWhiteSpace(
                    hospital.xpath('./td[7]/text()').extract()[0]),
                'time-of-information': self.removeWhiteSpace(
                    hospital.xpath('./td[7]/text()').extract()[1]),
            }

            data['hospital-name'] = self.removeWhiteSpace(
                hospital.xpath('./td[1]/text()').extract()[0])

            additional_hospital_data = hospital.xpath(
                './td[1]/small/text()').extract()
            if len(additional_hospital_data) == 2:
                data['hospital-street'] = self.removeWhiteSpace(
                    additional_hospital_data[0])
                splitted_data = additional_hospital_data[1].split()
                data['hospital-postalcode'] = splitted_data.pop(0)
                data['hospital-city'] = " ".join(splitted_data)

            if len(additional_hospital_data) == 3:
                data['hospital-department'] = additional_hospital_data[0]
                data['hospital-street'] = self.removeWhiteSpace(
                    additional_hospital_data[1])
                splitted_data = additional_hospital_data[2].split()
                data['hospital-postalcode'] = splitted_data.pop(0)
                data['hospital-city'] = " ".join(splitted_data)

            self.query = f'{self.basic_query}&inputtype=textquery&input={data["hospital-street"]} {data["hospital-postalcode"]} {data["hospital-city"]}'
            yield JsonRequest(
                url=f'https://maps.googleapis.com/maps/api/place/findplacefromtext/json?{self.query}',
                cb_kwargs=dict(hospital=data),
                callback=self.parseGoogleResponse)
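One caveat with the query built above: the address is interpolated without URL-encoding, so spaces and umlauts in German street names can corrupt the Places request. A sketch of the same call with urllib.parse.quote_plus applied to the free-text input:

            from urllib.parse import quote_plus

            address = f'{data["hospital-street"]} {data["hospital-postalcode"]} {data["hospital-city"]}'
            self.query = f'{self.basic_query}&inputtype=textquery&input={quote_plus(address)}'
            yield JsonRequest(
                url=f'https://maps.googleapis.com/maps/api/place/findplacefromtext/json?{self.query}',
                cb_kwargs=dict(hospital=data),
                callback=self.parseGoogleResponse)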
Example #20
 def search(self, offset=0):
     return JsonRequest(
         url=self.buildUrl(offset),
         data={"criterias": [{
             "property": "ngsearchword",
             "values": [""]
         }]},
         callback=self.parse,
     )
Example #21
 def parse(self, response):
     id_api = int(response.css('div::attr(data-good-link)').get())
     data = {
         "imtId": id_api,
         "take": self.count_comment,
         "order": "dateDesc"
     }
     yield JsonRequest(
         url='https://public-feedbacks.wildberries.ru/api/v1/feedbacks/site',
         data=data,
         callback=self.parse_my_url)
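int() raises a TypeError when the CSS selector matches nothing, which kills the whole parse. A guarded sketch, assuming the same spider attributes:

 def parse(self, response):
     raw_id = response.css('div::attr(data-good-link)').get()
     if raw_id is None:
         self.logger.warning('data-good-link attribute not found on %s', response.url)
         return
     yield JsonRequest(
         url='https://public-feedbacks.wildberries.ru/api/v1/feedbacks/site',
         data={"imtId": int(raw_id), "take": self.count_comment, "order": "dateDesc"},
         callback=self.parse_my_url)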
Example #22
 def login_request(username, password):
     """Build login request."""
     return JsonRequest(
         'https://api.makeupalley.com/api/v1/users/auth/login',
         headers={'Referer': 'https://www.makeupalley.com/'},
         data={
             'userName': username,
             'password': password,
             'rememberMe': True,
             'fromSavedCookie': False
         },
         meta={'handle_httpstatus_list': [401]})
Example #23
 def parse(self, response):
     print('\n>>> Get html from URL: %s' % response.url)
     # print(response.text)
     self.token = response.xpath('//input[@id="tokenvalue"]/@value').get()
     print('token: %s' % self.token)
     return [JsonRequest(
         url='http://www.ttsucha.com/api/ttscapi/noTosearch',
         method='POST',
         headers=self.custom_headers,
         data={'trackingNo': self.num, 'token': self.token},
         callback=self.parse_trackinginfo
     )]
Example #24
    def get_num_papers(self, response):
        meta = response.meta
        data = json.loads(response.body)
        num_papers = data['hits']['total']
        num_iterations = num_papers - (num_papers % 10)  # at most 10 papers per page

        for iteration in range(0, num_iterations + 1, 10):
            self.post_params['from'] = iteration
            yield JsonRequest(url=self.url,
                              data=self.post_params,
                              callback=self.parse_query_result,
                              meta=meta,
                              dont_filter=True)
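The offset arithmetic rounds num_papers down to the nearest multiple of 10 before building the range, which still covers every hit. A quick worked check, assuming num_papers = 47:

    num_papers = 47
    num_iterations = num_papers - (num_papers % 10)  # 40
    list(range(0, num_iterations + 1, 10))           # [0, 10, 20, 30, 40] -> offsets covering all 47 hits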
Example #25
    def parse_results_list(self, response):
        data = json.loads(response.body)
        last_time = datetime.now()
        has_new_paper = False

        for item in data['hits']['hits']:
            item = item['_source']
            item.update({
                'date': parse_date(item['date']),
                'date_created': parse_date(item['date_created']),
                'date_modified': parse_date(item['date_modified']),
                'date_published': parse_date(item['date_published']),
                'date_updated': parse_date(item['date_updated']),
            })
            last_time = min(last_time, item['date_updated'])

            # extract the DOI from the identifier URLs; stop when an item has none
            doi = None
            for x in item['identifiers']:
                m = re.match(r'^https?://(?:dx\.)?doi\.org/(.*)$', x)
                if m:
                    doi = m.group(1)
                    break
            if not doi:
                break

            item['doi'] = doi
            item['osf_id'] = item['id']
            del item['id']
            if self.has_duplicate(where='Scraper_osf_org', query={'doi': doi}):
                continue

            has_new_paper = True
            self.save_article(item,
                              to='Scraper_osf_org',
                              push_lowercase_to_meta=False)

        if has_new_paper and last_time > datetime(year=2020, month=1, day=1):
            params = self.post_params.copy()
            params['from'] = response.meta['from'] + len(data['hits']['hits'])
            yield JsonRequest(url=self.url,
                              data=params,
                              callback=self.parse_results_list,
                              meta={
                                  'dont_obey_robotstxt': True,
                                  'from': params['from']
                              })
Example #26
    def parse(self, response):
        item = response.meta.get('item')

        self.zoneid = response.css('#hdZoneId::attr(value)').extract_first()
        self.excluid = response.css('#hdExcluId::attr(value)').extract_first()

        url = self.url.format(page=self.page,
                              zoneid=self.zoneid,
                              excluid=self.excluid)
        yield JsonRequest(url,
                          headers=self.headers,
                          callback=self.parse_api,
                          errback=self.fail,
                          dont_filter=True,
                          meta={'item': item})
Example #27
 def parse(self, response):
     endFlag = '0'
     body = json.loads(response.text)
     for each in body['data']['records']:
         item = GovinvestHunanItem()
         investDict = {}
         if 'approvalDate' not in each:
             continue
         createTime = each['createTime']
         recordDate = datetime.strptime(createTime, "%Y-%m-%d")
         print(recordDate)
         currDate = datetime.strptime(datetime.now().strftime("%Y-%m-%d"), "%Y-%m-%d")
         #print(currDate)
         yesterday = datetime.strptime((datetime.today() + timedelta(-1)).strftime("%Y-%m-%d"), "%Y-%m-%d")
         #print(yesterday)
         if currDate == recordDate:
             print('currDate == recordDate')
             continue
         if yesterday > recordDate:
             print('yesterday > recordDate')
             endFlag = '1'
             continue

         pid = each['id']
         projectName = each['prjName']  # project name
         projectCode = each['projectCode']  # project code
         approvalNum = each['approvalNum']  # approval document number
         fileGuid = each['fileGuid']  # file id
         approvalDepartName = each['approvalDepartName']  # approving authority
         approvalDate = each['approvalDate']  # approval date

         investDict[u'发布日期'] = createTime  # publication date
         investDict[u'批复时间'] = approvalDate  # approval time
         investDict[u'项目名称'] = projectName  # project name
         investDict[u'项目代码'] = projectCode  # project code
         investDict[u'批复文号'] = approvalNum  # approval document number
         investDict[u'审批单位'] = approvalDepartName  # approving authority
         investDict[u'id'] = pid  # project id
         investDict[u'附件地址'] = self.downloadLink.format(fileGuid=fileGuid)  # attachment URL

         item['dic'] = investDict
         yield item

     self.count += 1
     if self.count < 100 and endFlag == '0':
         print('go next page ------------------------------' + str(self.count))
         self.packet['pageIndex'] = self.count
         yield JsonRequest(self.start_urls[0], data=self.packet, callback=self.parse)
Example #28
 def parse_item(self,
                response,
                domain,
                job_id,
                crawl_variations,
                lat='43.769037',
                lng='-79.371951'):
     self._referer_for_jsonrequest = response.request.url
     self._domain = domain
     self._job_id = job_id
     self._sku = utils.extract_sku_from_url(response.url, self._domain)
     if not self._sku:
         self.logger.exception("[{}][null] Request ignored - no SKU".format(
             self._domain))
         raise IgnoreRequest
     if response.status != 200:
         # broken link or inactive item
         yield self.build_listing_item(response)
     else:
         _data = self.__get_preloaded_data_components(response)
         _meta_data = self.__extract_meta_data(response)
         self._parent_sku = _data.get('SkuSelectors', {}).get(
             'pCode', '{}P'.format(self._sku))
         if crawl_variations:
             _skus = list(
                 _data.get('SkuSelectors', {}).get('skuListProperties', {}).keys())
         else:
             _skus = [self._sku]
         yield self.build_listing_item(response,
                                       data=_data,
                                       meta_data=_meta_data)
         yield JsonRequest(
             settings.CANADIANTIRE_CA_API_STORES_LINK_FORMAT.format(
                 lat=lat, lng=lng, pid=self._parent_sku),
             callback=self.parse_near_stores,
             errback=parsers.resp_error_handler,
             meta={
                 # avoid error - Crawled (503) <GET https://api-triangle.canadiantire.ca/robots.txt>
                 'dont_obey_robotstxt': True,
             },
             headers={
                 'Referer': self._referer_for_jsonrequest,
             },
             cb_kwargs={
                 'skus': _skus,
             })
Example #29
 def start_requests(self):
     print('>>> post request.')
     return [
         JsonRequest(url="https://t.17track.net/restapi/track",
                     method='POST',
                     headers=self.custom_headers,
                     data={
                         'data': [{
                             'num': self.num,
                             'fc': 0,
                             'sc': 0
                         }],
                         'guid': '',
                         'timeZoneOffset': -480
                     },
                     callback=self.after_post)
     ]
Example #30
    def parse(self, response):
        json_response = json.loads(response.text)  # body_as_unicode() is deprecated; use response.text

        self.reqs_number += 1

        # HeadlinesResponse is a placeholder object wrapping the result lists
        for json_list in json_response['HeadlinesResponse']:
            # the Summary object is the one to check to see whether there are
            # any more results beyond a given date
            if 'Summary' in json_list:
                oldest_date_obj = min(
                    json_list['Summary'],
                    key=lambda x: x['CreateTimestamp']['Value'])

                doc_id = oldest_date_obj['DocumentIdUri'].split('/')[-1]
                oldest_res_date = oldest_date_obj['CreateTimestamp']['Value']

                yield {
                    'HeadlinesResponse': json_response['HeadlinesResponse'],
                }

                # stop once max_requests is reached or the oldest result is
                # older than the requested end time
                can_stop = ((self.max_requests and self.reqs_number >= self.max_requests)
                            or (self.end_time and oldest_res_date <= self.end_time))

                if not can_stop:
                    # updating params for next request
                    query_params = self.start_query_params.copy()
                    query_params['datetime'] = oldest_res_date
                    query_params['direction'] = 'older'

                    # This param seems to be unused, but the official site's request includes it.
                    query_params['docid'] = doc_id

                    yield JsonRequest(url=response.url,
                                      data=query_params,
                                      dont_filter=True,
                                      callback=self.parse)
                # start new stock
                else:
                    yield from self.start_new_stock(response)
            else:
                yield from self.start_new_stock(response)
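Since Scrapy 2.2, the json.loads(response.text) boilerplate at the top of this method can also be shortened to response.json(), which parses and caches the JSON body. A minimal sketch:

    def parse(self, response):
        json_response = response.json()  # Scrapy >= 2.2; equivalent to json.loads(response.text)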