Code example #1
 def next_page(self,response,now_page,total_page,__VIEWSTATE,__EVENTTARGET):
     if int(total_page) > int(now_page):
     #if int(now_page) < 6:
         formdata_n = {
             '__EVENTTARGET': __EVENTTARGET,
             '__VIEWSTATE': __VIEWSTATE,
             'FManageDeptID': '-1',
             'FLevel': '0',
             'FIsWright': '-1'
         }
         return FormRequest(response.url, formdata=formdata_n, callback=self.parse)
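A minimal sketch (not part of the original project) of where the __VIEWSTATE and __EVENTTARGET arguments above would typically come from; the XPaths and the pager control name are hypothetical placeholders for whatever the target ASP.NET page actually uses:

 from scrapy import FormRequest

 def parse(self, response):
     # ASP.NET WebForms keeps its postback state in hidden <input> fields
     viewstate = response.xpath('//input[@id="__VIEWSTATE"]/@value').get()
     event_target = 'Pager1$NextPage'  # hypothetical pager control name
     now_page = response.xpath('//span[@class="current"]/text()').get('1')  # hypothetical XPath
     total_page = response.xpath('//span[@class="total"]/text()').get('1')  # hypothetical XPath
     request = self.next_page(response, now_page, total_page, viewstate, event_target)
     if request:
         yield request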
Code example #2
 def start_requests(self):
     for i in self.keywords.keys():
         for j in range(self.keywords[i]):
             self.payload['search'] = i
             self.payload['page'] = str(j)
             url = 'https://www.laosiji.com/proxy/api'
             yield FormRequest(url=url,
                               callback=self.parse,
                               formdata=self.payload,
                               headers=self.headers,
                               dont_filter=True)
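Note: self.keywords is presumably a dict mapping each search keyword to the number of result pages to request, which is why the inner loop iterates range(self.keywords[i]) times.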
Code example #3
File: lagou.py Project: MarphyDemon/Crawler
    def start_requests(self):

        yield FormRequest(self.url,
                          headers=self.headers,
                          formdata={
                              'first': 'false',
                              'pn': str(self.page),
                              'kd': 'Python',
                              'city': '广州'
                          },
                          callback=self.parse)
Code example #4
    def start_requests(self):
        with open('../out/docs_simple4.json', 'r') as f:
            for line in f.readlines():
                doc = json.loads(line)
                if not os.path.isfile(get_path(doc)):
                    # keep only a random ~7% sample of the not-yet-downloaded documents
                    if random.random() > 0.07:
                        continue

                    yield FormRequest('http://ras.arbitr.ru/Ras/HtmlDocument/%s' % doc['doc_id'],
                                      formdata={'hilightText': 'null'},
                                      meta=doc, headers={'User-Agent': 'Wget/1.19.4 (linux-gnu)'})
Code example #5
 def start_requests(self):
     # maximum page number
     # MAX_PAGE_COUNT = 64;
     MAX_PAGE_COUNT = 1
     for page in range(0, MAX_PAGE_COUNT):
         url = list_origin_url % (page * 60)
         yield FormRequest(url,
                           meta={'cookiejar': str(page)},
                           headers=headers,
                           cookies=cookies,
                           callback=self.parse)
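The 'cookiejar' meta key above is Scrapy's built-in mechanism for keeping a separate cookie session per listing page. It is not sticky, so follow-up requests must pass it along explicitly; a sketch (the selector and parse_detail callback are placeholders):

 from scrapy import Request

 def parse(self, response):
     next_url = response.urljoin(response.css('a.next::attr(href)').get())  # hypothetical selector
     yield Request(next_url,
                   meta={'cookiejar': response.meta['cookiejar']},
                   callback=self.parse_detail)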
Code example #6
 def run_basket_parse(self, response):
     formdata = {
         'dlw100$Update$1': '1',
         'ddlDeliveryCC': '36',
         'dlw$MatrixID': '1',
         '__EVENTTARGET': 'dlw100$DeliveryUpdate'
     }
     yield FormRequest(
         'http://www.prodirectsoccer.com/V3_1/V3_1_Basket.aspx',
         callback=self.parse_shipping,  # was passed positionally; keyword makes the intent explicit
         formdata=formdata)
Code example #7
 def _parse_hg_mid(self, response):
     mid_categories = response.xpath(".//*[@class='yahei f14 rgt mr20']")
     for mid_category in mid_categories:
         page_url = mid_category.xpath("./@href").extract()[0]
         url = urljoin(self.base_url, page_url)
         request = FormRequest(url,
                               callback=self._parse_first,
                               dont_filter=True)
         request.meta["large_category"] = response.meta["large_category"]
         request.meta["callback"] = self._parse_hg
         yield request
Code example #8
 def start_requests(self):
     data = {
         'start': '0',
         'length': '6',
         'pageLength': '6',
         '_order': '1:b'
     }
     yield FormRequest(url=self.start_url,
                       formdata=data,
                       callback=self.dataset_list_parse,
                       meta={'data': data})
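The same dict is sent as the POST body and also tucked into meta, presumably so that dataset_list_parse can bump 'start' and re-issue the request for the next page of results.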
Code example #9
 def _parse_page_free(self, response):
     total_pages = int(clean_text(
         response.xpath(".//*[@class='pages']//a//text()").extract()[-2].strip()))
     first_url = response.meta["first_url"]
     request = FormRequest(first_url,
                           callback=self._parse_free,
                           dont_filter=True)
     request.meta["large_category"] = response.meta["large_category"]
     yield request
     if total_pages > 1:
         for i in range(1, total_pages):
             next_page = first_url[:-5] + '-p' + str(i + 1) + '.html'
             request = FormRequest(next_page,
                                   callback=self._parse_free,
                                   dont_filter=True)
             request.meta["large_category"] = response.meta[
                 "large_category"]
             yield request
Code example #10
 def start_requests(self):
     yield FormRequest(
         url='http://www.sanxianginvest.com/api.php?c=login&f=save&_noCache=0.8049219487167945',
         headers={'Referer': 'http://www.sanxianginvest.com'},
         formdata={
             'post_date': '2025-05-08 16:00:43',
             'pdip': '203.110.179.245',
             'user': self.username,
             'pass': self.password
         })
Code example #11
File: ZhongyuInvset.py Project: qianbin0205/ggscrap
 def parse_pre_login(self, response):
     yield FormRequest(url='http://www.shcfic.com/user.php?act=act_login',
                       formdata={'mobile': '13523794375',
                                 'name': '123456',
                                 'utf8': '✓',
                                 'back_act': 'http://www.shcfic.com/index.php'
                                 },
                       meta={
                           'handle_httpstatus_list': [302],
                       },
                       callback=self.parse_login)
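Putting handle_httpstatus_list into meta, as above, is the per-request way to let the callback receive the 302 login redirect itself rather than having Scrapy's redirect middleware follow it.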
Code example #12
 def parse(self, response):
     areas = response.xpath('//*[@id="selArea"]/optgroup/option')
     for area in areas:
         formdata = {
             "areaID": area.xpath('.//@value').extract_first(),
             "areaNombre": area.xpath('.//text()').extract_first()
         }
         yield FormRequest(URL_API_MATERIAS,
                           formdata=formdata,
                           callback=self.parse_areas,
                           meta={'formdata': formdata})
Code example #13
 def parse_link(self, response):
     id = response.meta['id']
     newsType = response.meta['newsType']
     url = 'http://xyxx.zjfda.gov.cn/ajax/ajax!detail_cjbhg_sp.do'
     post_request = FormRequest(url=url,
                                formdata={
                                    'queryBean.id': '%s' % id,
                                    'queryBean.newsType': '%s' % newsType
                                },
                                callback=self.parse_page)
     yield post_request
Code example #14
 def start_requests(self):
     # Get the year to be crawled from the arguments
     # The year is passed like this: scrapy crawl gazettes -a year=2017
     # Default to current year if year not passed in
     try:
         year = self.year
     except AttributeError:
         year = datetime.now().strftime('%Y')
     url = 'https://dds.crl.edu/item/json'
     form_data = {'year': str(year), 'TitleLink': str(27040)}
     yield FormRequest(url, callback=self.parse, formdata=form_data)
Code example #15
 def start_requests(self):
     keyword = '000001'
     url = '{url}?keyword={keyword}'.format(url=self.search_url,
                                            keyword=keyword)
     # for page in range(self.max_page + 1):
     for page in range(1):
         data = {'mp': str(self.max_page), 'page': str(page)}
         yield FormRequest(url=url,
                           callback=self.parse_index,
                           formdata=data,
                           dont_filter=True)
Code example #16
 def parse_pre_login(self, response):
     yield FormRequest(url='http://www.sz-sgd.com/User.ashx',
                       formdata={'username': '******',
                                 'password': '******',
                                 'r': '0.9917246479356379',
                                 'type': 'login'},
                       meta={
                           'handle_httpstatus_list': [302],
                       },
                       callback=self.parse_login)
Code example #17
File: alibaba.py Project: Losmli010/CrawlSpiders
 def start_requests(self):
     post_url = 'https://job.alibaba.com/zhaopin/socialPositionList/doList.json'
     for i in range(1, 804):
         data = {
             'pageSize': '10',
             't': '0.9258839192303483',
             'pageIndex': '%d' % i
         }
         yield FormRequest(url=post_url,
                           formdata=data,
                           callback=self.parse_json)
Code example #18
 def start_requests(self):
     city = '北京'
     needAddtionalResult = 'false'
     url = self.url.format(city=city,
                           needAddtionalResult=needAddtionalResult)
     first = 'true'
     for page in range(self.max_pn + 1):
         data = {'first': first, 'kd': self.kd, 'pn': str(page)}
         yield FormRequest(url, callback=self.parse_info, formdata=data)
         first = 'false'
Code example #19
File: search.py Project: Umi101108/Spider
 def start_requests(self):
     for keyword in self.keywords:
         base_url = '{url}?keyword={keyword}'.format(url=self.search_url,
                                                     keyword=keyword)
         for page in range(self.max_page + 1):
             # rebuild from base_url each time; the original '+=' kept
             # appending another &page=... to the same string
             url = base_url + '&page={page}'.format(page=str(page))
             data = {
                 'keyword': keyword,
                 'page': str(page),
             }
             yield FormRequest(url, callback=self.parse, formdata=data)
Code example #20
    def start_requests(self):
        end_date = date.today()

        periods_of_interest = [(dt.year, dt.month) for dt in rrule(
            freq=MONTHLY, dtstart=self.start_date, until=end_date)]
        for year, month in periods_of_interest:
            data = dict(ano=str(year), mes=str(month), passo="1", enviar="")
            yield FormRequest(
                "http://www.pmf.sc.gov.br/governo/index.php?pagina=govdiariooficial",
                formdata=data,
            )
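This relies on python-dateutil for the month iteration: from dateutil.rrule import rrule, MONTHLY, together with from datetime import date.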
Code example #21
 def start_requests(self):
     url = "https://space.bilibili.com/ajax/member/GetInfo"
     data = {
         "mid": "",
         "csrf": "578fcf3bca4387ff58c87f70040785e8",
     }
     for mid in range(1, 10000):
         if self.is_pause:
             break
         data["mid"] = str(mid)
         yield FormRequest(url=url, formdata=data, callback=self.parse)
Code example #22
 def parse(self, response):
     # total_page = dict(response.xpath("//a[@sf='pagebar']/@*[name()='sf:data']").extract_first().strip("()")).get("pc")
     total_page = int(re.findall(r".*?pc:(\d+).*", response.xpath("//a[@sf='pagebar']/@*[name()='sf:data']").extract_first())[0])
     for page in range(1, total_page+1):
     #for page in range(1, 50):
         formdata = {
             '$total': str(total_page),
             '$reload': '0',
             '$pg': str(page),
             '$pgsz': '15'
         }
         yield FormRequest(response.url, formdata=formdata, callback=self.parse_companylist)
Code example #23
 def start_requests(self):
     mformat = 'csi%Y%m%d.zip'
     end_date = datetime.now().strftime(mformat)
     start_date = self.get_nday_ago(end_date, 10, dformat=mformat)
     while start_date <= end_date:
         furl = self.start_url + start_date
         yield FormRequest(url=furl,
                           method='GET',
                           callback=self.parse,
                           errback=self.errback_httpbin)
         start_date = self.get_tomorrow_date(sdate=start_date,
                                             dformat=mformat)
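Since no form data is sent here, FormRequest(..., method='GET') is equivalent to a plain request; with from scrapy import Request, the yield inside the loop could just as well be:

         yield Request(url=furl, callback=self.parse, errback=self.errback_httpbin)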
Code example #24
    def start_requests(self):
        workbook = xlrd.open_workbook(r'C:\\Users\\99329\\Desktop\\待测试数据.xls')  # opened but never used below; the URLs come from self.open_file()

        for i in self.open_file():
            s = i.strip()
            #s = 'http://upload.51qianmai.com/20180126064925821.jpg'
            data = {'channel': 'abc', 'picturl': s}
            #data = {'channel':'abc','picturl':i}
            yield FormRequest(url=self.start_url,
                              formdata=data,
                              callback=self.parse,
                              meta={'url': s})
Code example #25
    def parse_topic(self, response):  # collect the URL of each sub-topic
        # retrieve the variables passed along via meta
        offset = response.meta.get("offset")
        topic_id = response.meta.get("topic_id")
        topic_name = response.meta.get("name")
        # parse the received response
        json_info = json.loads(response.text)  # json_info is now a dict
        msg_info = json_info['msg']  # the value under the 'msg' key is a list
        offset += len(msg_info)

        date = {"topic": topic_name}

        # if msg_info holds fewer than 20 entries, this is the last page and no further page is requested (see the check after the loop)
        for x in msg_info:
            child_id = re.search(r'\/topic\/(\d+)', x).group()
            id = re.search(r'(\d+)', child_id).group()  # parameter needed for the request below
            url1 = 'https://www.zhihu.com/api/v4/topics/'  # note: 'topics', not 'topic'
            url2 = '/feeds/essence?include=data%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Danswer%29%5D.target.content%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Danswer%29%5D.target.is_normal%2Ccomment_count%2Cvoteup_count%2Ccontent%2Crelevant_info%2Cexcerpt.author.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Darticle%29%5D.target.content%2Cvoteup_count%2Ccomment_count%2Cvoting%2Cauthor.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Dpeople%29%5D.target.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Danswer%29%5D.target.annotation_detail%2Ccontent%2Chermes_label%2Cis_labeled%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%3Bdata%5B%3F%28target.type%3Danswer%29%5D.target.author.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Darticle%29%5D.target.annotation_detail%2Ccontent%2Chermes_label%2Cis_labeled%2Cauthor.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dquestion%29%5D.target.annotation_detail%2Ccomment_count%3B&limit=10&offset=0'
            #url2 = '/feeds/top_question?include=data%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Danswer%29%5D.target.content%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Danswer%29%5D.target.is_normal%2Ccomment_count%2Cvoteup_count%2Ccontent%2Crelevant_info%2Cexcerpt.author.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Darticle%29%5D.target.content%2Cvoteup_count%2Ccomment_count%2Cvoting%2Cauthor.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Dpeople%29%5D.target.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Danswer%29%5D.target.annotation_detail%2Ccontent%2Chermes_label%2Cis_labeled%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%3Bdata%5B%3F%28target.type%3Danswer%29%5D.target.author.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Darticle%29%5D.target.annotation_detail%2Ccontent%2Chermes_label%2Cis_labeled%2Cauthor.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dquestion%29%5D.target.annotation_detail%2Ccomment_count%3B&limit=10&offset=10'
            #url2 = '/feeds/top_activity?include=data%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Danswer%29%5D.target.content%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Danswer%29%5D.target.is_normal%2Ccomment_count%2Cvoteup_count%2Ccontent%2Crelevant_info%2Cexcerpt.author.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Darticle%29%5D.target.content%2Cvoteup_count%2Ccomment_count%2Cvoting%2Cauthor.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Dpeople%29%5D.target.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Danswer%29%5D.target.annotation_detail%2Ccontent%2Chermes_label%2Cis_labeled%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%3Bdata%5B%3F%28target.type%3Danswer%29%5D.target.author.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Darticle%29%5D.target.annotation_detail%2Ccontent%2Chermes_label%2Cis_labeled%2Cauthor.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dquestion%29%5D.target.annotation_detail%2Ccomment_count%3B&limit=5'
            url = url1 + id + url2

            yield Request(
                url=url,
                callback=self.parseQuestions,
                meta=date,
                dont_filter=True,
            )

        if not len(msg_info) < 20:
            yield FormRequest("https://www.zhihu.com/node/TopicsPlazzaListV2",
                              callback=self.parse_topic,
                              dont_filter=True,
                              meta={
                                  "offset": offset,
                                  "topic_id": topic_id,
                                  "name": topic_name
                              },
                              formdata={
                                  "method": "next",
                                  "_xsrf": "anaUqgXhz0GbjNTjnykooNIwJJuQz0CY",
                                  "params": json.dumps({
                                      "topic_id": topic_id,
                                      "offset": offset,
                                      "hash_id": "5d6d053d9cca5b5d463f76e7f866080a"
                                  })
                              })
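Note that the _xsrf token and hash_id in this payload are hard-coded session values; they expire, so a longer-lived spider would have to re-read them from the page or its cookies on each run.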
Code example #26
 def start_requests(self):
     for i in range(1, self.endPageNum):
         form_data = {
             "VENUS_PAGE_NO_KEY_INPUT": str(i),
             "VENUS_PAGE_NO_KEY": str(i),
             # "VENUS_PAGE_COUNT_KEY": "2633",
             "VENUS_PAGE_SIZE_KEY": "15",
         }
         request = FormRequest(self.tmpl_url,
                               callback=self.parse_page,
                               formdata=form_data)
         yield request
Code example #27
File: wiggle.py Project: oceancloud82/scraping
 def get_currency(self, response):
     verification_token = response.xpath('//input[@name="__RequestVerificationToken"]/@value').extract()[0]
     yield FormRequest('http://www.wiggle.fr/internationaloptions/update',
                       formdata={'__RequestVerificationToken': verification_token,
                                 'langId': self._lang_form_lang_id,
                                 'currencyId': self._lang_form_currencyID,
                                 'countryId': self._lang_form_countryID,
                                 'action': 'Update',
                                 'returnUrl': '/',
                                 'cancelUrl': '/'},
                       dont_filter=True,
                       callback=self.init_requests)
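A shorter variant (a sketch, not taken from the wiggle.py project): FormRequest.from_response copies hidden inputs such as __RequestVerificationToken automatically, assuming the token sits inside the form being submitted:

 def get_currency(self, response):
     yield FormRequest.from_response(
         response,
         formdata={'langId': self._lang_form_lang_id,
                   'currencyId': self._lang_form_currencyID,
                   'countryId': self._lang_form_countryID,
                   'action': 'Update',
                   'returnUrl': '/',
                   'cancelUrl': '/'},
         dont_filter=True,
         callback=self.init_requests)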
Code example #28
 def parse(self, response):
     if self.sort_mode and not response.meta.get('sort_forced', False):
         formdata = {"ctl00$content$ddlProductListSort": self.sort_mode,
                     "__EVENTTARGET": "ctl00$content$ddlProductListSort"}
         self._post_set_viewstate(formdata, response)
         meta = response.meta.copy()
         meta['sort_forced'] = True
         yield FormRequest(response.url, formdata=formdata, meta=meta,
                           dont_filter=True)
     else:
         for item in super(BhinnekaProductsSpider, self).parse(response):
             yield item
Code example #29
    def start_requests(self):
        """Simulate login action by sending http post request to get the cookies.

        Args:

        Yields:
            scrapy.FormRequest:

        """
        yield FormRequest(url=self.start_urls[0],
                          formdata={'autpor': '57'},
                          callback=self.parse_main_page)
Code example #30
File: xiaomi.py Project: MyLoveES/12306-scrapy
 def umatk(self, response):
     jdata = json.loads(response.body.decode())
     if jdata['result_code'] == 0:
         print('umatk SUCCESS')
         formdata = {
             'tk': jdata['newapptk']
         }
         url = 'https://kyfw.12306.cn/otn/uamauthclient'
         print('uamtkclient START')
         return FormRequest(url, formdata=formdata, meta={'cookiejar': self.cookiejar}, callback=self.umatkauthclient)
     else:
         print('umatk FAILED, please retry')