def parse_from(self, response):
    """Yield one search request per destination stop, plus one for the origin.

    The origin stop is read from ``response.meta['stop_from']`` and the
    destination ids from the ``#confirm1_ddlTravellingTo`` options (the ``-1``
    placeholder is excluded).  Each destination is submitted through form
    ``ctl01`` with ``parse_to_stop`` as callback.  The last destination is
    then submitted once more with ``parse_from_stop`` as callback.

    Nothing is yielded when the drop-down offers no destination.
    """
    stop_from = response.meta['stop_from']
    stop_ids = Selector(response).css(
        '#confirm1_ddlTravellingTo option[value!="-1"]::attr(value)').extract()

    if not stop_ids:
        # Without a destination there is nothing to search for; the trailing
        # request below used to fail on the unbound loop variable here.
        return

    def build_request(stop_to, callback, **extra):
        # Single place that knows the form layout and the meta contract.
        request = FormRequest.from_response(
            response,
            formname='ctl01',
            formdata={
                'confirm1$ddlLeavingFromMap': stop_from,
                'confirm1$ddlTravellingTo': stop_to,
                'confirm1$btnSearch': 'Search',
            },
            callback=callback,
            **extra)
        request.meta['stop_to'] = stop_to
        request.meta['stop_from'] = stop_from
        return request

    for stop_id in stop_ids:
        yield build_request(stop_id, self.parse_to_stop)

    # Repeat the last search to get the first stop (from).  The request is
    # identical to the previous one apart from its callback, so it has to
    # bypass the duplicate filter to be scheduled at all.
    yield build_request(stop_ids[-1], self.parse_from_stop, dont_filter=True)
  def listing_parser(self, response):
    """
    Turn a paginated alumni listing into follow-up form requests.

    Every pager link found under the ``rgNumPart`` div is posted back through
    ``__EVENTTARGET`` and handled by ``listing_scraper``.  When the last link
    is a "Next Pages" link it is handled by ``listing_parser`` again instead,
    and a leading "Previous Pages" link is dropped.

    Returns a list of requests; the list is empty when the response has no
    pager links.
    """
    x = HtmlXPathSelector(response)
    pages = x.select("//div[contains(@class,'rgNumPart')]/a")
    if not pages:
      # No pager on this response: indexing pages[0] would raise IndexError.
      return []

    # The postback target is the text between the first pair of single quotes.
    pages_hrefs = [i.extract().split("'")[1] for i in x.select("//div[contains(@class,'rgNumPart')]/a/@href")]

    requests = []

    # if the first link is a previous pages, delete it
    if pages_hrefs and "Previous Pages" in pages[0].extract():
      del pages_hrefs[0]

    # if next pages, pop and rerun listing_parser on it
    if pages_hrefs and "Next Pages" in pages[-1].extract():
      requests.append(FormRequest.from_response(response,
          formdata = {'__EVENTTARGET' : pages_hrefs.pop(), },
          callback=self.listing_parser))

    # parse each page including the first with scraper
    requests.extend(
        FormRequest.from_response(response,
            formdata = {'__EVENTTARGET' : href, },
            callback=self.listing_scraper)
        for href in pages_hrefs)

    return requests
    def parse_study(self,response):
        """Build the item for one area of study and a request per discipline.

        ``response.meta`` must carry ``study`` and ``areaId``.  Every option of
        the ``disciplineId`` select except the first one becomes a POST to
        ``findDiscipline.do`` handled by ``parse_discipline``.

        Returns the discipline requests followed by the ``StudyItem``.
        """
        study_name = response.meta['study']
        area_id = response.meta['areaId']

        options = response.xpath('//select[@name="disciplineId"]/option')
        description = response.xpath('//p[@class="tablehead"]/following::p/text()')[0].extract()

        overview = StudyItem()
        overview['name'] = study_name
        overview['description'] = description

        def request_for(option):
            # One request per <option>: its text is the name, its value the id.
            label = option.xpath('text()').extract()[0]
            value = option.xpath('@value').extract()[0]
            request = FormRequest(
                url='http://www2.assist.org/exploring-majors/findDiscipline.do',
                formdata={'areaId': str(area_id),
                          'disciplineId': str(value)},
                callback=self.parse_discipline)
            request.meta['study'] = study_name
            request.meta['discipline'] = label
            return request

        # The first option is skipped.
        return [request_for(option) for option in options[1:]] + [overview]
Example #4
0
    def parse_item(self, response):
        """Yield one JSON API request per selectable option of each product.

        For every product block the title, the ``product_id`` form value and
        the option ``<select>`` are read.  Options with an empty value are
        skipped.  Each remaining option is posted to ``json_api_url`` with a
        copy of ``json_api_request_args`` extended by the option and product
        id; the response is handled by ``_parse_item_json``.
        """
        blocks = HtmlXPathSelector(response).select("//div[@id='ProductDetails']/div[@class='BlockContent']")
        for block in blocks:
            title = block.select("h2/text()").extract()[0]
            page_url = response.url
            product_id = block.select(
                "div[@class='ProductMain']/div[@class='productAddToCartRight']/\
                 form[@id='productDetailsAddToCartForm']/input[@name='product_id']/@value").extract()[0]
            chooser = block.select(
                "div[@class='ProductMain']/div[@class='productAddToCartRight']/\
                 form[@id='productDetailsAddToCartForm']/div[@class='ProductDetailsGrid ProductAddToCart']/\
                 div[@class='productAttributeList']/div/\
                 div[@class='productAttributeValue']/div[@class='productOptionViewSelect']/select")
            field_name = chooser.select("@name").extract()[0]
            for choice in chooser.select('option'):
                label = choice.select("text()").extract()[0]
                value = choice.select("@value").extract()[0]
                if value:
                    payload = json_api_request_args.copy()
                    payload[field_name] = value
                    payload['product_id'] = product_id

                    request = FormRequest(
                        url=json_api_url,
                        formdata=payload,
                        callback=self._parse_item_json
                    )
                    request.meta['item_name'] = title + " " + label
                    request.meta['item_url'] = page_url
                    request.meta['subtype_id'] = value
                    yield request
 def build_form_request(self, search_term, formdata):
     """Return the first-page EDGAR full-text search request for *search_term*.

     The request posts *formdata* and is handled by
     ``parse_search_results_follow_next_page``; its meta records the search
     term and starts ``page_num`` at 1.
     """
     request = FormRequest(
         "https://searchwww.sec.gov/EDGARFSClient/jsp/EDGAR_MainAccess.jsp",
         formdata=formdata,
         callback=self.parse_search_results_follow_next_page)
     request.meta.update({'search_term': search_term, 'page_num': 1})
     return request
    def parse(self, response):
        """Remove stale output files, then submit the login form.

        The credentials for the requested URL are looked up in
        ``self.credentials`` (user at index 0, password at index 1) and handed
        to ``fill_login_form``.  The form is selected by name when one is
        reported, otherwise by number.  The response goes to ``after_login``.
        """
        # Best-effort cleanup of files left by a previous run.  A failure on
        # one file must not prevent the others from being removed, and only
        # filesystem errors are ignored.
        for stale in ("newPost.txt", "newgetLinks.txt", "scrappedurls.txt"):
            try:
                if os.path.exists(stale):
                    os.remove(stale)
            except OSError:
                pass

        account = self.credentials[response.request.url]
        login_user = account[0]
        login_pass = account[1]
        # Only the user name is echoed; the password is kept out of the output.
        print(login_user)
        args, url, method, name, number = fill_login_form(response.url, response.body, login_user, login_pass)

        if name:
            yield FormRequest.from_response(
                response, method=method, formdata=args, formname=name, callback=self.after_login
            )
        else:
            yield FormRequest.from_response(
                response, method=method, formdata=args, formnumber=number, callback=self.after_login
            )
    def parse(self, response):
        """Record the login field names, then submit the login form.

        ``fill_login_form`` supplies the form arguments.  The names of the
        arguments containing "user"/"admin" and "password" are written,
        together with the first start URL, to ``json/credentials.json``.
        The form is then submitted (selected by name when one is reported,
        otherwise by number) with ``after_login`` as callback.
        """
        args, url, method, name, number = fill_login_form(response.url, response.body, self.login_user, self.login_pass)
        print(args)

        tmpparam = {}
        for field in args:
            field_name = field[0]
            if "user" in field_name or "admin" in field_name:
                tmpparam["userid"] = field_name
            if "password" in field_name:
                tmpparam["passwordid"] = field_name

        tmpparam["url"] = self.start_urls[0]
        tmpparam["login"] = ""
        credentials = [tmpparam]

        # The context manager closes the file even if serialisation fails.
        with open("json/credentials.json", 'w') as f:
            f.write(json.dumps(credentials, indent=4, sort_keys=True))

        if name:
            yield FormRequest.from_response(response, method=method, formdata=args, formname=name, dont_filter=True, callback=self.after_login)
        else:
            yield FormRequest.from_response(response, method=method, formdata=args, formnumber=number, dont_filter=True, callback=self.after_login)
Example #8
0
 def start_requests(self):
     """Drop the ``subsidaries`` table, then request every org id 91/00001-91/15999.

     Each POST to ``view_detail.php`` carries the id as ``org_id``, stores it
     in ``meta['uid']`` and is handled by ``parse_page``.
     """
     scraperwiki.sql.execute('DROP TABLE IF EXISTS subsidaries')
     detail_url = 'https://www.ird.gov.hk/charity/view_detail.php'
     for serial in range(1, 16000):
         org_id = '91/%05d' % serial
         detail_request = FormRequest(detail_url,
                                      formdata={'org_id': org_id},
                                      callback=self.parse_page)
         detail_request.meta['uid'] = org_id
         yield detail_request
Example #9
0
    def parse_start_url(self, response):
        """Submit the search form found in *response* for ``PROFESSION``.

        The form is filled with fixed default values plus the profession and
        returned as a single ``FormRequest``.

        NOTE(review): the previous body had a loop over ``PROFESSIONS`` after
        the ``return``.  It could never run and has been removed; the returned
        request is unchanged.
        """
        data = {
            'name': '',
            'country': '',
            '_state': '1',
            'language': '0',
            'qualification': '0--',
            'level': '0',
            'speciality': '0',
            '_isTrainer': 'on',
            'profession': str(PROFESSION),
        }
        return FormRequest.from_response(response, formdata=data)
Example #10
0
    def parseVoteSearchResults(self, response):
        """Yield the vote items of this page and requests for following pages.

        The items come from ``parseVoteTableResults``.  The last row of the
        ``#SelectVoteHistoryView_GridView1`` table is read as the pager: its
        first ``td>span`` text is the current page and the ``onclick`` of the
        link in its last cell carries the index of the final page.  When
        ``response.meta['fetch']`` is true, callback requests for up to eleven
        pages starting at the current one are yielded; only the request for
        the eleventh page is flagged to fetch further pages.

        Paging is skipped when the table has no rows, no pager can be found,
        or there is no "last page" link.
        """
        sel = Selector(response)

        validationstr = self.getValidationString(response)

        # Parse the first page of results
        for voteItem in self.parseVoteTableResults(response):
            yield voteItem

        # Grab the vote table; the last row contains the page links
        rows = sel.css('#SelectVoteHistoryView_GridView1').css('tr')
        if not rows:
            # No table rows at all, hence no pager to follow.
            return
        paginationRow = rows[-1]

        firstCellElement = paginationRow.css('td>span::text')
        if not firstCellElement:
            # Can't find the navigate bar??
            return
        firstCellContent = firstCellElement.extract()[0]

        # Check if there are any pages..
        if not str(firstCellContent).isdigit():
            return
        thisPage = int(firstCellContent) - 1

        # The last cell contains some js to skip to the final page
        lastPageElem = paginationRow.css('td')[-1].css('a::attr(onclick)')
        if len(lastPageElem) != 1:
            # We're on the last set of pages, so there's no last page link
            return
        lastPageLink = lastPageElem[0].extract()

        # Need to pull page number out of this: javascript:__gvSelectVoteHistoryView_GridView1.callback("69|0|/wFlpZg1Fw7NYQJzL0vuw+Y57ZPx1ug=|"); return false;
        # It's the first value in the param string
        # TODO: Error handling...
        pages = int(re.split(r'"|\|', lastPageLink)[1]) + 1

        # Only iterate over pages if this is a new table
        if not response.meta['fetch']:
            return

        for i in range(thisPage, min(pages, thisPage + 11)):
            # Only the last request of the batch continues the paging.
            fetch = (i == thisPage + 10)

            formdata = {
                '__CALLBACKID': 'SelectVoteHistoryView$GridView1',
                '__CALLBACKPARAM': str(i) + '|0|/wFlpZg1Fw7NYQJzL0vuw+Y57ZPx1ug=||' + str(thisPage) + '|0|/wFlpZg1Fw7NYQJzL0vuw+Y57ZPx1ug=|/wFk7SDVa18iduwa7ivhFHIM55t+AhI=',
            }
            if validationstr:
                # The fact we have a validationstr means we need a surrogate form (this is a callback)
                form = response.meta['form']
                formdata['__EVENTVALIDATION'] = validationstr
            else:
                # Should only get here if this is first page
                form = response

            yield FormRequest.from_response(form,
                                            callback=self.parseVoteSearchResults,
                                            formdata=formdata,
                                            meta={'fetch': fetch, 'form': form})
Example #11
0
    def search(self, response):
        """Submit the search form once for one pending (key, value) pair.

        Pending pairs come from ``response.meta['data_form']`` when present,
        otherwise from ``get_input_data(self.input_file)``.  One pair is
        popped and sent as ``newNtk``/``newNtt``.  When it was the last pair
        the response goes to ``parse_results``; otherwise it comes back to
        ``search`` with the remaining pairs in ``meta['data_form']``.

        Returns ``None`` when there is nothing to submit.
        """
        if 'data_form' in response.meta:
            pending = response.meta['data_form']
        else:
            pending = self.get_input_data(self.input_file)

        if not pending:
            return None

        is_last = len(pending) == 1
        field, term = pending.popitem()
        next_request = FormRequest.from_response(
            response,
            formdata={"newNtk": field,
                      "newNtt": term},
            callback=self.parse_results if is_last else self.search)
        if not is_last:
            next_request.meta['data_form'] = pending
        return next_request
Example #12
0
 def parse(self, response):
     """Request the retailer finder once for each of the states AZ and TX.

     Every POST carries the state as ``txtState``, records it in
     ``meta['state']`` and is handled by ``result_page``.
     """
     finder_url = 'http://nssf.org/retailers/find/index.cfm'
     for state in ('AZ', 'TX'):
         state_request = FormRequest(url=finder_url,
                                     formdata={'txtState': state, 'Submit': 'Submit'},
                                     callback=self.result_page)
         state_request.meta['state'] = state
         yield state_request
 def parse(self, response):
     """Submit the login form described by ``fill_login_form``.

     The form is selected by name when one is reported, otherwise by number.
     The request bypasses the duplicate filter and is handled by
     ``after_login``.
     """
     args, url, method, name, number = fill_login_form(response.url, response.body, self.login_user, self.login_pass)
     form_selector = {'formname': name} if name else {'formnumber': number}
     yield FormRequest.from_response(response,
                                     method=method,
                                     formdata=args,
                                     dont_filter=True,
                                     callback=self.after_login,
                                     **form_selector)
Example #14
0
File: Lou.py Project: tobinchen/Lou
    def parse(self,response):
        """Request the statistics page for every month of 2006 through 2015.

        Each POST sends the year and the month (as both ``yue1`` and ``yue2``),
        is handled by ``parseData`` and carries the first day of that month in
        ``meta['date']``.
        """
        for year in range(2006, 2016):
            for month in range(1, 13):
                month_text = str(month)
                payload = {
                    'nian': str(year),
                    'yue1': month_text,
                    'yue2': month_text,
                    'tijiao': 'ч╗Я шоб',
                }
                req = FormRequest(url='http://www.qyfgj.cn/gz/ti_result.asp',
                                  formdata=payload,
                                  callback=self.parseData)
                req.meta['date'] = datetime(year, month, 1)
                yield req
Example #15
0
 def test_from_response_override_method(self):
     """``from_response`` uses GET for this form unless ``method`` is given."""
     html = '''<html><body>
             <form action="/app"></form>
             </body></html>'''
     response = _buildresponse(html)
     for overrides, expected_method in (({}, 'GET'), ({'method': 'POST'}, 'POST')):
         request = FormRequest.from_response(response, **overrides)
         self.assertEqual(request.method, expected_method)
Example #16
0
 def start_requests(self):
     """Yield one request per entry of ``self.items``.

     The URL is ``self.link_template`` followed by the item id.  The id is
     also stored in ``meta['id']`` and the response goes to ``parse_item``.
     """
     for item_id in self.items:
         item_request = FormRequest(self.link_template + item_id,
                                    headers=self.headers,
                                    cookies=self.cookies,
                                    callback=self.parse_item)
         item_request.meta['id'] = item_id
         yield item_request
Example #17
0
 def test_from_response_override_method(self):
     """Check the default method and an explicit ``method`` override."""
     markup = """<html><body>
             <form action="/app"></form>
             </body></html>"""
     response = _buildresponse(markup)

     default_request = FormRequest.from_response(response)
     self.assertEqual(default_request.method, "GET")

     overridden_request = FormRequest.from_response(response, method="POST")
     self.assertEqual(overridden_request.method, "POST")
Example #18
0
 def __init__(self, url=None, callback=None, method=None, formdata=None,
              body=None, **kwargs):
     """Initialise the request, encoding *formdata* first when it is given.

     When *formdata* is truthy, ``FormRequest.__init__`` is run with *url*,
     *method* and *formdata*, and the resulting ``self.url``, ``self.method``
     and ``self.body`` replace the *url*, *method* and *body* arguments.
     ``SplashRequest.__init__`` is then always run with those values, the
     *callback* and all remaining keyword arguments.
     """
     # First init FormRequest to get url, body and method
     if formdata:
         FormRequest.__init__(
             self, url=url, method=method, formdata=formdata)
         # Read back what FormRequest produced; this overrides the caller's body.
         url, method, body = self.url, self.method, self.body
     # Then pass all other kwargs to SplashRequest
     SplashRequest.__init__(
         self, url=url, callback=callback, method=method, body=body,
         **kwargs)
Example #19
0
 def generate_new_category_request(self,index):
     """Return the refine request for ``self.values[index]``, or ``None``.

     ``None`` is returned once *index* is past the end of ``self.values``.
     Otherwise the request posts the plan type to the ``refine_results``
     variant of ``self.result_url``, bypasses the duplicate filter, is
     handled by ``step_results`` and stores ``index + 1`` in ``meta['next']``.
     """
     if index >= len(self.values):
         return None

     refine_url = self.result_url.replace("results", "refine_results")
     profile = {
         'profile[electricity_plan_type]': str(self.values[index]),
         'profile[discounts][EP]': '1',
         'profile[discounts][PP]': '1',
     }
     category_request = FormRequest(url=refine_url,
                                    formdata=profile,
                                    callback=self.step_results,
                                    dont_filter=True)
     category_request.meta['next'] = index + 1
     return category_request
Example #20
0
 def test_from_response_override_url(self):
     """``from_response`` uses the form action unless ``url`` is given.

     Checks the default, an absolute override and a relative override.
     """
     html = '''<html><body>
             <form action="/app"></form>
             </body></html>'''
     response = _buildresponse(html)
     expectations = (
         ({}, 'http://example.com/app'),
         ({'url': 'http://foo.bar/absolute'}, 'http://foo.bar/absolute'),
         ({'url': '/relative'}, 'http://example.com/relative'),
     )
     for overrides, expected_url in expectations:
         request = FormRequest.from_response(response, **overrides)
         self.assertEqual(request.url, expected_url)
Example #21
0
	def get_media_requests(self, item, info):
		"""Return the download request for an item's ``post`` attachment.

		``item['attach']`` is read as (kind, url, formdata).  A one-element
		list holding a ``FormRequest`` (with the item in ``meta['item']``) is
		returned when the kind is ``'post'``; otherwise ``None`` is returned.
		A log line is emitted whenever the attachment is non-empty.
		"""
		attachment = item['attach']
		if not attachment:
			return None

		log.msg('-----++++++++++++++')
		if attachment[0] != 'post':
			return None

		download = FormRequest(url=attachment[1], formdata=attachment[2])
		download.meta['item'] = item
		return [download]
Example #22
0
    def parse_zhaopin(self, response):
        """Return the first JSON page request for the keyword in the URL.

        The keyword is the path segment after ``/zhaopin/`` in
        ``response.url``.  It is sent as ``kd`` together with ``first=false``
        and ``pg=1`` to ``self.json_url``; the response is handled by
        ``parse_first_json``.

        ``meta['kd']`` holds the keyword, or ``None`` when the URL does not
        match the pattern (``kd`` is then also left out of the form data).
        """
        regex_rule = r'http://www.lagou.com/zhaopin/(.*?)/'
        result = re.match(regex_rule, response.url)
        formdata = {'first': 'false', 'pg': str(1)}
        if result:
            formdata['kd'] = result.group(1)

        request = FormRequest(url=self.json_url,
                              method='POST',
                              formdata=formdata,
                              callback=self.parse_first_json)
        # .get() avoids a KeyError when the URL carried no keyword.
        request.meta['kd'] = formdata.get('kd')
        return request
 def parse_browse(self,response):
     """Return one overview request per area of study.

     Every option of the ``areaId`` select except the first one becomes a
     POST to ``findAreaOfStudyOverview.do`` handled by ``parse_study``; the
     option text and value travel in ``meta`` as ``study`` and ``areaId``.
     """
     def request_for(option):
         # The option text is the study name, its value the area id.
         label = option.xpath('text()').extract()[0]
         value = option.xpath('@value').extract()[0]
         request = FormRequest(
             url='http://www2.assist.org/exploring-majors/findAreaOfStudyOverview.do',
             formdata={'areaId': str(value)},
             callback=self.parse_study)
         request.meta['study'] = label
         request.meta['areaId'] = value
         return request

     area_options = response.xpath('//select[@name="areaId"]/option')
     # The first option is skipped.
     return [request_for(option) for option in area_options[1:]]
Example #24
0
 def parse(self, response):
     """Submit the form in *response* via GET, filled with the module-level ``data``.

     The response is handled by ``search_result``.
     """
     search_request = FormRequest.from_response(response,
                                                formdata=data,
                                                method="GET",
                                                callback=self.search_result)
     return search_request
Example #25
0
 def login(self, response):
     """Submit the ``login-nav`` form with the data stored at ``self.login_path``.

     The file is read as UTF-8 JSON and used as form data; the response is
     handled by ``check_login``.
     """
     with open(self.login_path, encoding='utf8') as credentials_file:
         credentials = json.load(credentials_file)
     login_request = FormRequest.from_response(response,
                                               formdata=credentials,
                                               formid='login-nav',
                                               callback=self.check_login)
     yield login_request
Example #26
0
 def after_login(self, response):
     """Yield a lookup request for every entry of ``self.bins``.

     Nothing is yielded unless the response body contains ``BIN Lookup``.
     Each entry is stripped, submitted as ``search`` and handled by
     ``got_bin``.
     """
     if 'BIN Lookup' not in response.body:
         return
     for raw_bin in self.bins:
         yield FormRequest.from_response(
             response,
             formdata={'search': raw_bin.strip()},
             callback=self.got_bin)
Example #27
0
 def parse(self, response):
     """Return the login submission when the response URL contains ``login``.

     The form is filled with ``self.username`` / ``self.password`` and the
     response is handled by ``after_login``.  For any other URL ``None`` is
     returned.
     """
     if 'login' not in response.url:
         return None
     credentials = {'j_username': self.username,
                    'j_password': self.password}
     login_request = FormRequest.from_response(response,
                                               formdata=credentials,
                                               callback=self.after_login)
     return [login_request]
Example #28
0
 def login(self, response):
     """Log the page as ``amazon_login.html`` and return the login submission.

     The form is filled with ``self.formdata``, sent with ``self.headers``
     and keeps the cookie jar of the incoming response; the result is handled
     by ``parse_item``.
     """
     self._log_page(response, 'amazon_login.html')
     login_request = FormRequest.from_response(
         response,
         formdata=self.formdata,
         headers=self.headers,
         meta={'cookiejar': response.meta['cookiejar']},
         callback=self.parse_item,
     )
     return [login_request]
Example #29
0
 def parse(self, response):
     """Submit form ``frmConsulta`` with a date-range query.

     The query restricts ``@DTDE`` to the range ``self.iDate`` to
     ``self.fDate``; the response is handled by ``parseSearchResults``.
     """
     date_filter = '(@DTDE >="' + self.iDate + '") E (@DTDE <="' + self.fDate + '")'
     query = {'data': date_filter, 'b': 'ACOR'}
     yield FormRequest.from_response(response,
                                     formname='frmConsulta',
                                     formdata=query,
                                     callback=self.parseSearchResults)
Example #30
0
 def course_terms_page(self, response):
     """Yield one form submission per entry of ``self.terms``.

     The form at ``/html/body/div[5]/form`` is submitted with the term as
     ``TERM``; the term is also stored in ``meta['term']``.
     """
     form_path = '/html/body/div[5]/form'
     for term in self.terms:
         term_request = FormRequest.from_response(response,
                                                  formxpath=form_path,
                                                  formdata={'TERM': term},
                                                  meta={'term': term})
         yield term_request
Example #31
0
    def traffic_fines_details(self, response):
        """Collect the fine records of the details page.

        Rows of table ``grdDados`` whose infringement date lies between
        ``self.start_date`` and ``self.end_date`` (inclusive) are selected in
        the form data and stored in ``self.result`` under the page heading.
        When ``self.get_files`` is set, the form is posted to request the
        report, which is handled by ``report_table``.  A warning is logged
        when no row qualifies.
        """
        page = response.selector

        def span_text(span_id):
            # Stripped text of the <span> with this id ("" when missing).
            return page.xpath("//span[@id='%s']/text()" % span_id).get("").strip()

        def input_value(input_id):
            # Value of the <input> with this id ("" when missing).
            return page.xpath("//input[@id='%s']/@value" % input_id).get("")

        renavam = span_text("lblRenavam")
        placa = span_text("lblPlaca")
        file_type = self.remove_diacritics(span_text("LblCabecalho01"))
        print("renavam:", renavam)
        print("placa:", placa)
        print("file_type:", file_type)

        # Get options for request; the insertion order below is kept as the
        # order of the posted fields.
        frm_data = {}
        for field in ('PageProdamSPOnChange', 'PageProdamSPPosicao',
                      'PageProdamSPFocado', '__EVENTTARGET',
                      '__EVENTARGUMENT', '__VIEWSTATE',
                      '__VIEWSTATEGENERATOR', '__EVENTVALIDATION'):
            frm_data[field] = input_value(field)
        frm_data['chkSelecionarTodos'] = 'on'
        for field in ('btnGerarDocumento', 'txthvalor_total', 'txthqtd_total'):
            frm_data[field] = input_value(field)

        # Output key -> cell XPath, in output order.
        columns = (
            ("notificacao", ".//td[3]/text()"),
            ("auto_infracao", ".//td[4]/text()"),
            ("descricao", ".//td[5]/text()"),
            ("data_infracao", ".//td[6]/text()"),
            ("hora", ".//td[7]/text()"),
            ("local_da_infracao", ".//td[8]/text()"),
            ("vencimento", ".//td[9]/text()"),
            ("valor", ".//td[10]/span/text()"),
            ("situacao_na_divida_ativa", ".//td[11]/text()"),
            ("codigo_do_parcelamento", ".//td[12]/text()"),
            ("descricao_da_situacao", ".//td[13]/text()"),
            ("data", ".//td[14]/text()"),
        )

        all_rows_data = []
        for row in page.xpath("//table[@id='grdDados']//tr[@class]"):
            # check if infringement_date between start_date and end_date
            infringement_date = row.xpath(".//td[6]/text()").get("").strip()
            infringement_datetime = dt.strptime(infringement_date, "%d/%m/%Y")
            if not (self.start_date <= infringement_datetime <= self.end_date):
                continue

            # choose the record
            checkbox_name = row.xpath(".//td[1]/span/input/@name").get("")
            hidden_name = row.xpath(".//td[1]/input/@name").get("")
            frm_data[checkbox_name] = "on"
            frm_data[hidden_name] = ""

            # get fields
            row_data = {key: row.xpath(path).get("").strip()
                        for key, path in columns}
            # Kept after the loop: the report request uses the last one.
            notification = row_data["notificacao"]
            all_rows_data.append(row_data)

        if not all_rows_data:
            error_msg = "traffic_fines_details doesn't contain any data."
            self.logger.warning(error_msg)
            return

        # add data to result
        self.result.update({file_type: all_rows_data})
        # check if get_files is True
        if self.get_files:
            report_url = "https://meuveiculo.prefeitura.sp.gov.br/forms/frmResumoMultasDetalhe.aspx"
            yield FormRequest(url=report_url,
                              formdata=frm_data,
                              meta={
                                  "file_type": "boleto",
                                  "result_key": file_type,
                                  "notification": notification
                              },
                              callback=self.report_table,
                              dont_filter=True)
Example #32
0
    def parse_product(self, response):
        """
        Main parsing product method.

        Reads the product from ``response.meta['product']`` and stores the
        product id (the last ``/<digits>`` group of the URL, or ``None``) in
        ``response.meta['product_id']``.

        When the response status is listed in ``self.default_hhl`` the product
        is returned immediately with its locale set to ``en_CA``.  Otherwise
        the product is populated from the page and the follow-up requests
        (online status when ``meta['skus']`` is present, related products,
        product info) are handed to ``send_next_request``, whose result is
        returned.
        """
        reqs = []
        product = response.meta['product']

        # Product ID
        url_numbers = re.findall(r'\/(\d+)', response.url)
        product_id = url_numbers[-1] if url_numbers else None
        response.meta['product_id'] = product_id

        if response.status in self.default_hhl:
            product.update({"locale": 'en_CA'})
            return product

        self._populate_from_js(response, product)

        # Send request to get if limited online status
        try:
            skus = [{"skuid": sku} for sku in response.meta['skus']]
        except KeyError:
            # No SKUs recorded for this product: skip the status request.
            pass
        else:
            request_data = [{
                "productid": product_id,
                "skus": [skus]
            }]
            request_data = json.dumps(request_data).replace(' ', '')

            reqs.append(FormRequest(
                url="http://www.walmart.ca/ws/online/products",
                formdata={"products": request_data},
                callback=self._parse_online_status,
                headers={
                    'X-Requested-With': 'XMLHttpRequest'
                }
            ))

        if response.xpath('//span[@class="infoText"]/'
                          'text()').re('This product is not available'):
            product['no_longer_available'] = True

        self._populate_from_html(response, product)

        cond_set_value(product, 'locale', 'en_CA')  # Default locale.

        # Get featured products from generated JS script, evaluating parent script
        RR_entity = RR(
            response.url,
            product_id,
            response
        )
        featured_products_url = RR_entity.js()

        reqs.append(Request(
            url=featured_products_url,
            callback=self._parse_related_products
        ))

        # Get product base info, QA and reviews straight from JS script
        product_info_url = self.PRODUCT_INFO_URL.format(product_id=product_id)
        reqs.append(Request(
            url=product_info_url,
            callback=self._parse_product_info
        ))

        # Reseller id: the first "/<digits>" group of the URL.
        reseller_ids = re.findall(r"\/(\d+)\??", response.url)
        reseller_id = reseller_ids[0] if reseller_ids else None
        cond_set_value(product, "reseller_id", reseller_id)

        # Two requests are always queued above, so there is always a next one.
        return self.send_next_request(reqs, response)
Example #33
0
 def start_requests(self):
     """Return the single login POST to ``https://slashdot.org/my/login``."""
     login_fields = {
         "op": "userlogin",
         "returnto": "",
         "unickname": "<username>",
         "upasswd": "<passwd>",
         "userlogin": "******",
     }
     return [FormRequest("https://slashdot.org/my/login", formdata=login_fields)]
    def get_recaptchaClientToken(self, response):
        """Handle one poll of the captcha result and yield the next request.

        The response text is read as ``STATUS|TOKEN``:

        * ``OK``: the token is stored and the details form is posted to
          ``self.post_url`` (handled by ``get_details``).
        * otherwise the poll counter from ``meta['resp_cnt']`` is increased.
          Once it reaches ``self.max_resp_cnt`` a new captcha job is
          submitted to ``self.captcha_in_url``; before that, the method
          sleeps ``self.resp_time`` and polls the result URL again.
        """
        attempt = response.meta['resp_cnt']

        print("\t[{}] {}".format(attempt, response.text))
        reply = response.text.split('|')
        if reply[0] == 'OK':
            self.recaptchaClientToken = reply[1]

            headers = {
                'user-agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',
                'Accept':
                'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
                'Content-Type': 'application/x-www-form-urlencoded',
                'Sec-Fetch-Mode': 'navigate',
                'Sec-Fetch-Site': 'same-origin',
                'Sec-Fetch-User': '******',
                'cookie': self.cookie
            }
            payload = {
                '__RequestVerificationToken': self.__RequestVerificationToken,
                'Registreringsnummer': self.Registreringsnummer,
                'recaptchaClientToken': self.recaptchaClientToken,
                'Captcha.CaptchaResponse': ''
            }
            yield FormRequest(url=self.post_url,
                              method='POST',
                              headers=headers,
                              formdata=payload,
                              callback=self.get_details,
                              errback=self.fail_details,
                              dont_filter=True,
                              meta={})
            return

        attempt += 1
        if attempt >= self.max_resp_cnt:
            # Polling budget used up: submit a fresh captcha job.
            formdata = {
                'key': self.api_key,
                'method': 'userrecaptcha',
                'googlekey': self.google_key,
                'pageurl': self.post_url,
            }

            headers = make_headers_1()
            headers['cookie'] = self.cookie

            yield FormRequest(url=self.captcha_in_url,
                              method='POST',
                              formdata=formdata,
                              headers=headers,
                              callback=self.get_captcha_id,
                              errback=self.fail_captcha_id,
                              dont_filter=True,
                              meta={})
            return

        # Not ready yet: wait, then poll the result URL again.
        sleep(self.resp_time)
        fetch_url = self.captcha_res_url.format(self.captcha_id)
        yield FormRequest(url=fetch_url,
                          method='GET',
                          headers=make_headers_1(),
                          callback=self.get_recaptchaClientToken,
                          errback=self.fail_recaptchaClientToken,
                          dont_filter=True,
                          meta={
                              'resp_cnt': attempt,
                          })
Example #35
0
def send_myapks_request(url, **kwargs):
    """Build a GET ``FormRequest`` for ``url``.

    ``kwargs`` must contain ``apk_name`` (a ``KeyError`` is raised
    otherwise); the whole mapping is forwarded as the request ``meta`` and
    the response is handled by ``get_myapks_search_formdata``.
    """
    # Lookup kept purely so a missing key fails here, before the request
    # is created.
    _ = kwargs['apk_name']
    request = FormRequest(
        url,
        callback=get_myapks_search_formdata,
        meta=kwargs,
        method='GET',
    )
    return request
 def parse(self, response):
     """Store the text for the page's current year and request other years.

     Extracts the list of available years, the current year and the text
     from ``response``, inserts one row into ``stock_txt_data`` and, unless
     ``response.meta['posted']`` is set, yields one POST ``FormRequest``
     (handled by this same callback) per listed year that is greater than
     the current one.

     ``response.meta`` must carry ``symbol``, ``mc_symbol`` and ``posted``.
     """
     selector = Selector(response)
     allyears = ret0IfExist(selector.xpath(YEARS_XPATH).extract())

     # Without a current year the row cannot be labelled. Previously an
     # empty result raised IndexError and a blank one was stored as "['']".
     year_nodes = selector.xpath(CURYEAR_XPATH).extract()
     if not year_nodes or not year_nodes[0]:
         return
     curyear = year_nodes[0][:4]

     # Prefer the first text XPath and fall back to the second one.
     data = selector.xpath(TEXT_XPATH8).extract()
     if not data:
         data = selector.xpath(TEXT_XPATH6).extract()
         if not data:
             return
     data = ret0IfExist(data)

     finData = MoneycontrolItem()
     finData['symbol'] = str(response.meta['symbol'])
     finData['type'] = 'ds'  # directors speech
     finData['year'] = str(curyear)
     finData['data'] = data
     print('inserting')
     print(finData['symbol'])
     print(finData['type'])
     print(finData['year'])
     print('trying now')

     conn = MySQLdb.connect(user='******',
                            passwd='stocks_pass',
                            db='stocks',
                            host='localhost',
                            charset="utf8",
                            use_unicode=True)
     try:
         cursor = conn.cursor()
         try:
             cursor.execute(
                 "insert into stock_txt_data (symbol,data,type,year) values (%s,%s,%s,%s)",
                 (finData['symbol'], finData['data'], finData['type'],
                  finData['year']))
         except Exception:
             # Best effort: a failed insert is reported but must not stop
             # the crawl from requesting the remaining years.
             print("Unexpected error:", sys.exc_info())
         else:
             conn.commit()
         finally:
             cursor.close()
     finally:
         # The connection is opened per response, so always release it.
         conn.close()

     if allyears is None:
         return
     allyears = allyears.split(",")
     # All the other years have already been requested from the GET
     # response, so a POST response must not yield more requests.
     if response.meta['posted']:
         return

     try:
         current_year = int(curyear)
     except ValueError:
         # No year can be compared against a non-numeric current year.
         return

     headers = {
         'content-type':
         "application/x-www-form-urlencoded",
         'cache-control':
         "no-cache",
         'User-Agent':
         'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/50.0.2661.102 Chrome/50.0.2661.102 Safari/537.36',
         'Content-Type':
         'application/x-www-form-urlencoded',
         'Referer':
         'http://www.moneycontrol.com/annual-report/infosys/chairmans-speech/IT',
     }

     for y in allyears:
         if not y:
             continue
         try:
             year_prefix = int(y[:4])
             y = int(y)
         except ValueError:
             continue
         if year_prefix <= current_year:
             continue
         formdata = {
             'sel_year': str(y),
             'sc_did': str(response.meta['mc_symbol'])
         }
         print(formdata)
         # Flag the follow-up responses so they do not fan out again.
         response.meta['posted'] = True
         yield FormRequest(response.url,
                           method='POST',
                           meta=response.meta,
                           headers=headers,
                           formdata=formdata,
                           callback=self.parse)
Example #37
0
    def parse_product(self, response):
        """Handle either a product listing or a single product page.

        When product links are found, one ``Request`` per link (handled by
        this callback) and one ``Request`` per page-number link are yielded
        and nothing else happens. Otherwise a product item is loaded from
        the page and a stock ``FormRequest`` carrying the item in ``meta``
        is yielded. A page with no price is handed over to ``self.parse``.
        """
        product_links = response.xpath(
            '//div[contains(@class, "product")]//a[div[@class="name"]]/@href'
        ).extract()
        if product_links:
            for href in product_links:
                yield Request(response.urljoin(href),
                              callback=self.parse_product)

            page_labels = response.xpath(
                '//a[contains(@class, "pageNumber")]/text()').extract()
            for label in page_labels:
                yield Request(response.urljoin(label))

            return

        name = response.xpath('//div/h1/text()').extract()
        price_values = response.xpath(
            '//div[@class="bigprice GBP"]/@data-price').extract()
        if not price_values:
            # No price on the page: let the generic parser deal with it.
            for result in self.parse(response):
                yield result
            return
        price = price_values[0]

        brand = ''
        categories = response.xpath(
            '//ul[@class="breadcrumb"]/li/a/text()').extract()[1:]

        loader = ProductLoader(item=Product(), response=response)

        image_sources = response.xpath(
            '//div[@id="mainImage"]/img/@src').extract()
        if image_sources:
            image_url = response.urljoin(image_sources[0])
        else:
            image_url = ''
        loader.add_value('image_url', image_url)
        loader.add_value('url', response.url)
        loader.add_value('name', name)
        loader.add_value('price', extract_price(price))
        loader.add_value('brand', brand)
        loader.add_value('category', categories)
        sku_matches = response.xpath('//p[@class="partcode"]/text()').re(
            'Quick Code: (.*)')
        loader.add_value('sku', sku_matches[0] if sku_matches else '')
        loader.add_xpath('identifier', '//input[@name="product_id"]/@value')

        item = loader.load_item()

        promotions = response.xpath(
            '//div[contains(@class, "price_box")]//div[@class="GBP"]/span[@class="desktop_rrp" or @class="saving"]/text()'
        ).extract()

        flash_matches = response.xpath(
            '//img[@class="cornerflash"]/@src').re('Empire/(.*).png')
        flash_code = flash_matches[0] if flash_matches else ''

        # Maps the corner-flash image name to its display label.
        corner_promotions = {
            'pricedrop': 'Price Drop',
            'deal': 'Deal',
            'freedel': 'Free Delivery',
            'newarrival': 'New Arrival',
            'sale': 'Sale',
            'bestseller': 'Bestseller',
            'wasteincluded': 'Waste Included',
            'trayincluded': 'Tray Included',
            'clearance': 'Clearance',
            'pricedropred': 'Price Drop',
            'asseenontv': 'As Seen On T.V'
        }

        metadata = MetaData()
        metadata['corner_promotion'] = corner_promotions.get(flash_code, '')
        # Joining an empty list already gives the empty string.
        metadata['Promotions'] = ' '.join(promotions)
        item['metadata'] = metadata

        part_code = response.xpath(
            '//div[contains(@class, "stock_report")]/@data-partcode'
        ).extract()[0]
        manufacturers_id = response.xpath(
            '//div[contains(@class, "stock_report")]/@data-manufacturers_id'
        ).extract()[0]

        yield FormRequest(
            "http://soak.com/includes/ajax/in_stock.php",
            formdata={
                'action': 'in_stock',
                'manufacturers_id': manufacturers_id,
                'part_code': part_code
            },
            callback=self.parse_stock,
            meta={'item': item})
Example #38
0
 def start_requests(self):
     """Return the search request for the first ID found in ``crawlID``.

     Only the first iterated ID is used: it is recorded in ``finishID`` and
     ``SelectedID`` and a one-element list with its search ``FormRequest``
     is returned. When ``crawlID`` is empty the method returns ``None``.
     """
     for content_id in self.crawlID:
         self.finishID.add(content_id)
         self.SelectedID = content_id
         search_request = FormRequest(
             url="http://weibo.cn/search/",
             formdata={'keyword': content_id, 'smblog': '搜微博'},
             callback=self.parse_Content,
         )
         return [search_request]
Example #39
0
    def parse_make(self, response):
        """Post the tyre search for the next unprocessed make of the row.

        The form data returned by ``get_post_data`` is filled from
        ``response.meta['row']``. A single ``FormRequest`` is yielded for the
        first make not yet in ``current_row_processed_makes``; when every
        make has been processed ``prepare_for_next_row`` is called instead.
        """
        selector = HtmlXPathSelector(response)
        row = response.meta['row']
        base_data = self.get_post_data(response)

        base_data[
            'ctl00$ContentPlaceHolder1$FindTyreSliderHome1$ValidCar'] = 'rbYes'
        base_data[
            '__EVENTTARGET'] = 'ctl00$ContentPlaceHolder1$FindTyreSliderHome1$lnkbtnSizeGo2'

        # (form field, row column) pairs, applied in this order.
        row_fields = (
            ('ctl00$ContentPlaceHolder1$FindTyreSliderHome1$txtPostCodeL',
             'IP code'),
            ('ctl00$ContentPlaceHolder1$FindTyreSliderHome1$ddlSpeedL',
             'Speed rating'),
            ('ctl00$ContentPlaceHolder1$FindTyreSliderHome1$ddlDiameterL',
             'Rim'),
            ('ctl00$ContentPlaceHolder1$FindTyreSliderHome1$ddlProfileL',
             'Aspect Ratio'),
            ('ctl00$ContentPlaceHolder1$FindTyreSliderHome1$ddlWidthL',
             'Width'),
        )
        for form_key, column in row_fields:
            base_data[form_key] = row[column]
        # The post code taken from the row is then replaced by a fixed value.
        base_data[
            'ctl00$ContentPlaceHolder1$FindTyreSliderHome1$txtPostCodeL'] = '2151953'

        # Copy each drop-down value into its companion "hv" field.
        mirrored_fields = (
            ('ctl00$ContentPlaceHolder1$FindTyreSliderHome1$ddlWidthL',
             'ctl00$ContentPlaceHolder1$FindTyreSliderHome1$hvWidth'),
            ('ctl00$ContentPlaceHolder1$FindTyreSliderHome1$ddlProfileL',
             'ctl00$ContentPlaceHolder1$FindTyreSliderHome1$hvProfile'),
            ('ctl00$ContentPlaceHolder1$FindTyreSliderHome1$ddlDiameterL',
             'ctl00$ContentPlaceHolder1$FindTyreSliderHome1$hvDiameter'),
            ('ctl00$ContentPlaceHolder1$FindTyreSliderHome1$ddlSpeedL',
             'ctl00$ContentPlaceHolder1$FindTyreSliderHome1$hvSpeed'),
        )
        for source_key, target_key in mirrored_fields:
            base_data[target_key] = base_data[source_key]

        if not self.makes:
            option_values = selector.select(
                ".//select[@id='ddlMake']/option/@value").extract()
            self.makes = [value for value in option_values if value != '0']

        for make in self.makes:
            if make not in self.current_row_processed_makes:
                self.log("Crawling row for make: %s" % make)
                self.current_row_processed_makes.add(make)
                data = base_data.copy()
                data[
                    'ctl00$ContentPlaceHolder1$FindTyreSliderHome1$ddlMake'] = make

                yield FormRequest(response.url,
                                  formdata=sorted(data.items()),
                                  callback=self.pre_parse_search,
                                  dont_filter=True,
                                  meta={
                                      'row': response.meta['row'],
                                      'formdata': data
                                  })
                # Only one make is requested per call.
                return
        self.prepare_for_next_row()
Example #40
0
    def parse(self, response):
        """Post the width selection for the current (or next) search row.

        Uses ``self.current_row`` or, when that is empty, ``get_next_row()``;
        with no row left ``self.done`` is set and nothing is yielded. The
        form data is built from the ``__``-prefixed inputs and the selects
        of ``form1`` plus fixed and row-derived values, and is sent in a
        ``FormRequest`` handled by ``parse2``.
        """
        row = self.current_row or self.get_next_row()
        if not row:
            self.done = True
            return
        self.log("[CARTYRES] Searching row: %s" % str(row))

        form = HtmlXPathSelector(response).select("//form[@id='form1']")
        data = {}

        # Only inputs whose name starts with "__" are carried over.
        for field in form.select(".//input"):
            names = field.select("@name").extract()
            values = field.select("@value").extract()
            if not names or not names[0].startswith('__'):
                continue
            data[names[0]] = values[0] if values else ''

        # Every named select contributes its selected option (or '').
        for field in form.select(".//select"):
            names = field.select("@name").extract()
            selected = field.select(".//option[@selected]/@value").extract()
            if names:
                data[names[0]] = selected[0] if selected else ''

        data['__ASYNCPOST'] = 'true'
        data[
            'ctl00$ContentPlaceHolder1$FindTyreSliderHome1$ValidCar'] = 'rbYes'
        data['ctl00$ContentPlaceHolder1$FindTyreSliderHome1$ddlMake'] = '0'
        data[
            '__EVENTTARGET'] = 'ctl00$ContentPlaceHolder1$FindTyreSliderHome1$ddlWidthL'
        data['ctl00$sp'] = \
             'ctl00$ContentPlaceHolder1$FindTyreSliderHome1$upPanel|ctl00$ContentPlaceHolder1$FindTyreSliderHome1$ddlWidthL'

        # Values taken from the row being searched.
        row_fields = (
            ('ctl00$ContentPlaceHolder1$FindTyreSliderHome1$ddlWidthL',
             'Width'),
            ('ctl00$ContentPlaceHolder1$FindTyreSliderHome1$ddlDiameterL',
             'Rim'),
            ('ctl00$ContentPlaceHolder1$FindTyreSliderHome1$ddlProfileL',
             'Aspect Ratio'),
        )
        for form_key, column in row_fields:
            data[form_key] = row[column]

        data['ctl00$ContentPlaceHolder1$FindTyreSliderHome1$ddlSpeedL'] = 'V'

        # Fields that are sent blank.
        blank_fields = (
            'ctl00$ContentPlaceHolder1$FindTyreSliderHome1$hvDiameter',
            'ctl00$ContentPlaceHolder1$FindTyreSliderHome1$hvProfile',
            'ctl00$ContentPlaceHolder1$FindTyreSliderHome1$hvSpeed',
            'ctl00$ContentPlaceHolder1$FindTyreSliderHome1$hvWidth',
            'ctl00$ContentPlaceHolder1$FindTyreSliderHome1$txtPostCodeL',
            'ctl00$ContentPlaceHolder1$FindTyreSliderHome1$txtPostCodeR',
            'ctl00$ContentPlaceHolder1$FindTyreSliderHome1$txtRegNo',
            'ctl00$ContentPlaceHolder1$FindTyreSliderHome1$txtValidRegNo',
        )
        for form_key in blank_fields:
            data[form_key] = ""

        yield FormRequest(response.url,
                          formdata=sorted(data.items()),
                          callback=self.parse2,
                          errback=self.error_callback,
                          dont_filter=True,
                          meta={
                              'row': row,
                              'formdata': data
                          })
Example #41
0
 def parse_form(self, response):
     """Submit the page's second form with ``RdoTimeLimit`` set to ``42``.

     The resulting response is handled by ``parse_pages``.
     """
     form_request = FormRequest.from_response(
         response,
         formxpath='(//form)[2]',
         formdata={'RdoTimeLimit': '42'},
         callback=self.parse_pages,
         dont_filter=True,
     )
     yield form_request
Example #42
0
    def input_process(self, response):
        """Build the search POST for the ``Bkkn001Form`` form.

        Reads the form action, the Struts token, ``randomID`` and
        ``seniGenGamenID`` from ``response``, records ``response.url`` as the
        ``Referer`` in ``self.headers`` and returns a ``FormRequest`` whose
        response is handled by ``parse_list_page``.

        Raises ``IndexError`` when one of the expected inputs is missing.
        """

        def first_value(xpath):
            # First match of the XPath; IndexError when nothing matches.
            return response.xpath(xpath).extract()[0]

        req_url = urljoin(
            response.url, first_value('//form[@name="Bkkn001Form"]/@action'))
        token = first_value(
            '//input[@name="org.apache.struts.taglib.html.TOKEN"]/@value')

        self.headers['Referer'] = response.url

        random_id = first_value('//input[@id="randomID"]/@value')
        seni_gen_gameen_id = first_value(
            '//input[@name="seniGenGamenID"]/@value')

        # NOTE(review): the original literal repeated 'bkknShmkDispList1' six
        # times and 'bkknShmkDispList2' five times. A dict keeps one entry
        # per key, so each is listed once here; if the form really needs the
        # repeated fields, a list of pairs would be required - confirm.
        form_data = {
            'org.apache.struts.taglib.html.TOKEN': token,
            'randomID': random_id,
            'contextPath': '/reins',
            'event': 'forward_searchbabi',
            'bbTtKbn': '1',
            'stateMode': '',
            'stWttBg': '',
            'hzMi': '',
            'zkSyKbn': '2',
            'stJyk': '',
            'bkknShmk1': '',
            'bkknShmk2': '',
            'bkknShbt1': '03',
            'bkknShmkDispList1': '',
            'bkknShbt2': '',
            'bkknShmkDispList2': '',
            'shtkChkKbnShti': '0',
            'shkcknShriSti': '0',
            'shgUmKbn': '1',
            'trhkJyukyu': '0',
            'tdfkMi1': '東京都',
            'shzicmi1_1': '',
            'shzicmi2_1': '',
            'shzicJyk_1': '1',
            'ttmnmi_1': '',
            'ttmnJyk_1': '1',
            'tdfkMi2': '',
            'shzicmi1_2': '',
            'shzicmi2_2': '',
            'shzicJyk_2': '1',
            'ttmnmi_2': '',
            'ttmnJyk_2': '1',
            'tdfkMi3': '',
            'shzicmi1_3': '',
            'shzicmi2_3': '',
            'shzicJyk_3': '1',
            'ttmnmi_3': '',
            'ttmnJyk_3': '1',
            'ensnmi1': '',
            'ekmiFrom1': '',
            'ekmiTo1': '',
            'thNyrkt1': '',
            'thMbKbn1': '',
            'krmKm1': '',
            'bsB1': '',
            'ensnmi2': '',
            'ekmiFrom2': '',
            'ekmiTo2': '',
            'thNyrkt2': '',
            'thMbKbn2': '',
            'krmKm2': '',
            'bsB2': '',
            'ensnmi3': '',
            'ekmiFrom3': '',
            'ekmiTo3': '',
            'thNyrkt3': '',
            'thMbKbn3': '',
            'krmKm3': '',
            'bsB3': '',
            'bsRsmi': '',
            'bsTmiSh': '',
            'tihNyrkt': '',
            'tihMbKbn': '',
            'sotKtu': '',
            'sotKtuNyrkt': '',
            'sotKtuMbKbn': '',
            'kkkuCnryuFrom': '',
            'kkkuCnryuTo': '',
            'siykKkkuCnryuFrom': '',
            'siykKkkuCnryuTo': '',
            'tbTnkFrom': '',
            'tbTnkTo': '',
            'siykTbTnkFrom': '',
            'siykTbTnkTo': '',
            'tcMnskFrom': '',
            'tcMnskTo': '',
            'ttmnMnskFrom': '',
            'ttmnMnskTo': '',
            'snyuMnskFrom': '',
            'snyuMnskTo': '',
            'mdrHysuFrom': '',
            'mdrHysuTo': '',
            'shzikiFrom': '',
            'shzikiTo': '',
            'blcnyHuku': '',
            'stdoHuku': '',
            'stdoJyukyu': '',
            'stdoStmn': '',
            'stdoFkin': '',
            'tskikk': '',
            'yutCik': '',
            'sitkYut': '',
            'ktcJok': '',
            'chushjyuZih': '',
            'cknngtYearFrom': '',
            'cknngtMonthFrom': '',
            'cknngtYearTo': '',
            'cknngtMonthTo': '',
            'kjkrngGgFrom': 'R',
            'kjkrngYearFrom': '',
            'kjkrngMonthFrom': '',
            'kjkrngGgTo': 'R',
            'kjkrngYearTo': '',
            'kjkrngMonthTo': '',
            'optId': '',
            'strStbJok': '',
            'bk1': '',
            'shuhnKnkyu': '',
            'turkKknFlg': '1',
            'turkNngppGgFrom': 'R',
            'turkNngppNenFrom': '',
            'turkNngppGatuFrom': '',
            'turkNngppHiFrom': '',
            'turkNngppGgTo': 'R',
            'turkNngppNenTo': '',
            'turkNngppGatuTo': '',
            'turkNngppHiTo': '',
            'hcKknFlg': '1',
            'hnkuNngppGgFrom': 'R',
            'hnkuNngppNenFrom': '',
            'hnkuNngppGatuFrom': '',
            'hnkuNngppHiFrom': '',
            'hnkuNngppGgTo': 'R',
            'hnkuNngppNenTo': '',
            'hnkuNngppGatuTo': '',
            'hnkuNngppHiTo': '',
            'siykKknFlg': '5',
            'siykNngppGgFrom': 'R',
            'siykNngppNenFrom': '1',
            'siykNngppGatuFrom': '7',
            'siykNngppHiFrom': '1',
            'siykNngppGgTo': 'R',
            'siykNngppNenTo': '1',
            'siykNngppGatuTo': '7',
            'siykNngppHiTo': '2',
            'siykTurkKknFlg': '1',
            'siykTurkNngppGgFrom': 'R',
            'siykTurkNngppNenFrom': '',
            'siykTurkNngppGatuFrom': '',
            'siykTurkNngppHiFrom': '',
            'siykTurkNngppGgTo': 'R',
            'siykTurkNngppNenTo': '',
            'siykTurkNngppGatuTo': '',
            'siykTurkNngppHiTo': '',
            'seniMotFlg': '',
            'seniGenGamenID': seni_gen_gameen_id
        }

        search_request = FormRequest(
            url=req_url,
            method='POST',
            headers=self.headers,
            formdata=form_data,
            dont_filter=True,
            callback=self.parse_list_page
        )
        return search_request
Example #43
0
 def select(self, response):
     """Submit the response's form filled with ``self.params``.

     The resulting response is handled by ``submit``.
     """
     form_request = FormRequest.from_response(
         response,
         callback=self.submit,
         dont_filter=True,
         formdata=self.params,
     )
     yield form_request
    def parse_get_email_read(self, response):
        """Request the detail page of every mail listed on one list page.

        The page number is read from the ``var pageNo = ...;`` statement of
        the fifth ``<head>`` script. For each of the (at most 20) rows whose
        overall index does not exceed ``self.emial_num`` a ``FormRequest``
        to the mail-info URL is yielded, handled by
        ``parse_get_email_info``. Rows without a mail identity are skipped.
        """
        # Read the mails (20 per page).
        print('返回的地址:')
        print(response.url)

        # Extract the page index from the inline script.
        _page_no_mark = 'var pageNo = '
        script = response.xpath('/html/head/script[5]').extract_first()
        mark_pos = script.find(_page_no_mark)
        pageNo = int(
            script[mark_pos:script.find(';', mark_pos)].replace(
                _page_no_mark, ''))

        base_url = "https://mail.263.net/wm2e/mail/mailOperate/mailOperateAction_mailInfo.do"

        # The form body is the same for every mail on the page.
        if self.Emailmode == EmailMode.FILED:  # read from the archived inbox
            formdata = {
                "pageNo": str(pageNo),
                "qstr": "",
                "sortStr": '{"time":"desc"}',
                "fstr": "{}",
                "folderId": "10",
                "type": "10",
                "fullSearchIfmIsHide": "null",
            }
        elif self.Emailmode == EmailMode.QUERY:  # read from the filtered inbox
            formdata = {
                "pageNo": str(pageNo),
                "qstr": "{ \"ifQuick\" : \"0\" , \"sender\" : \"sales08\" }",
                "sortStr": "{\"time\":\"desc\"}",
                "fstr": "{}",
                "folderId": "",
                "type": "",
                "fullSearchIfmIsHide": "null",
            }
        else:
            formdata = {}

        for index in range(1, 21):
            # Index of the mail within the whole result set.
            indexNum = (pageNo - 1) * 20 + index
            if indexNum > self.emial_num:
                break

            emailIdentity = response.xpath(
                '//*[@id="contList2"]/ul[{0}]/li[1]/span[2]/input/@value'.format(index)).extract_first()
            if emailIdentity is None:
                # No such row on this page; concatenating None would raise.
                continue

            # NOTE(review): the "usr" value was redacted in the source this
            # was recovered from; self.user_email is assumed because the
            # list request of this spider sends it as "usr" - confirm.
            query_params = (
                ("mailOperateType", "read"),
                ("emailIdentity", emailIdentity),
                ("selfFolderId", "10"),
                ("usr", self.user_email),
                ("sid", self.sid),
                ("statFlag", "2"),
                ("starred", "0"),
                ("waited", "0"),
                ("floderType", "10"),
                ("indexNum", str(indexNum)),
                ("reachStoragePoint", "true"),
                ("undoSend", ""),
                ("encryptMail", "false"),
                ("mailPasswdStatus", "0"),
                ("securityMark", "0"),
                ("securityType", "0"),
                ("frameJump", "1"),
            )
            # Values are joined as-is (no URL-encoding), like the original
            # string concatenation.
            url = base_url + '?' + "&".join(
                key + "=" + value for key, value in query_params)

            yield FormRequest(
                url, formdata=formdata, callback=self.parse_get_email_info)
    def parse_get_email_list_by_emial_filed(self, response):
        """Read the total mail count and request every mail list page.

        The count is taken from the ``var total =...;`` statement of the
        fifth ``<head>`` script and stored in ``self.emial_num``. One
        ``FormRequest`` per page, from 1 to ``self.page_num``, is then
        yielded; each response is handled by ``parse_get_email_read``.
        """
        # The page number found in the script is not needed here: the loop
        # below requests the pages by number, so only the total is parsed.
        script = response.xpath('/html/head/script[5]').extract_first()
        _email_num_mark = 'var total ='
        mark_pos = script.find(_email_num_mark)
        emial_num = script[mark_pos:script.find(';', mark_pos)].replace(
            _email_num_mark, '')
        self.emial_num = int(emial_num)
        print('查找到邮件封数:%s' % self.emial_num)

        base_url = "https://mail.263.net/wm2e/mail/mailIndex/mailIndexAction_indexList.do"
        url = (
            base_url+"?usr={0}&sid={1}&12").format(self.user_email, self.sid)
        for pageNo in range(1, self.page_num + 1):
            yield FormRequest(url, formdata={
                "pageNo": str(pageNo),
                "qstr": '{ "ifQuick" : "0" , "sender" : "sales08" }',
                "sortStr": '{"time":"desc"}',
                "fstr": "{}",
                "folderId": "",
                "type": "",
                "fullSearchIfmIsHide": "null",
            }, callback=self.parse_get_email_read)
Example #46
0
 def parse(self, response):
     """Submit the form named ``naam`` with ``artist_name_search``.

     The resulting response is handled by ``parse1``.
     """
     search_request = FormRequest.from_response(
         response,
         formdata={"naam": artist_name_search},
         formname='naam',
         callback=self.parse1,
     )
     yield search_request
Example #47
0
 def login(self, response):
     """Submit the login form with credentials typed on the console.

     The user name is read with ``input`` and the password with
     ``getpass.getpass``; the response to the submitted form is handled by
     ``check_login_response``.
     """
     # NOTE(review): the separator between the two dict entries was lost to
     # redaction in the recovered source; the closing parenthesis and comma
     # below are the reconstruction - confirm against the original.
     credentials = {
         'user': input('Username: '),
         'pwd': getpass.getpass('Password: '),
     }
     return FormRequest.from_response(
         response,
         formdata=credentials,
         callback=self.check_login_response
     )
 def start_requests(self):
     """Yield one request per start URL, each with its own cookie jar.

     The position of the URL in ``start_urls`` is used as the ``cookiejar``
     meta value; responses are handled by ``parse_item``.
     """
     for jar_id, start_url in enumerate(self.start_urls):
         # jump to login page
         login_page_request = FormRequest(
             start_url,
             headers=self.headers,
             cookies=self.cookies,
             meta={'cookiejar': jar_id},
             callback=self.parse_item,
         )
         yield login_page_request
    def get_details(self, response):
        """Handle a details response: solve a captcha or emit the result.

        When the page references the reCAPTCHA script, a POST to
        ``self.captcha_in_url`` is yielded (handled by ``get_captcha_id``)
        and nothing else happens. Otherwise the detail fields are read from
        the page, yielded as a ``FuRegnrItem`` and passed to
        ``insert_row``; then the next non-empty line is popped from
        ``self.input_data`` and a GET to ``self.get_url`` is yielded for its
        registration number (column 18). When no line is left the method
        stops with ``total_scraping_done`` left set to ``True``.
        """
        if response.xpath(
                '//script[contains(@src, "https://www.google.com/recaptcha/api")]/@src'
        ):
            print(f"\t[{self.Registreringsnummer}] Captcha is found")

            formdata = {
                'key': self.api_key,
                'method': 'userrecaptcha',
                'googlekey': self.google_key,
                'pageurl': self.post_url,
            }

            headers = make_headers_1()
            headers['cookie'] = self.cookie
            yield FormRequest(url=self.captcha_in_url,
                              method='POST',
                              formdata=formdata,
                              headers=headers,
                              callback=self.get_captcha_id,
                              errback=self.fail_captcha_id,
                              dont_filter=True,
                              meta={})
            return

        def text_at(xpath, index=0):
            """Return the index-th non-blank, stripped text match, or ''."""
            texts = [
                text.strip() for text in response.xpath(xpath).extract()
                if text.strip()
            ]
            try:
                return texts[index]
            except IndexError:
                # The field is simply absent from this page.
                return ""

        # (item key, XPath, index among the non-blank matches), listed in
        # the column order of the result row.
        field_queries = (
            ('Försäkringsbolag',
             '//strong[contains(text(), "kringsbolag")]/../text()', 0),
            ('Försäkringsdatum',
             '//strong[contains(text(), "kringsdatum")]/../text()', 0),
            ('Fordonsstatus',
             '//a[@href="#ts-fordonsstatus"]/../../text()', 0),
            ('Besiktigas_senast_8',
             '//strong[contains(text(), "Besiktigas senast")]/../text()', -2),
            ('Upplysningar',
             '//strong[contains(text(), "Upplysningar")]/../text()', 0),
            ('Import_införsel',
             '//a[@href="#ts-import"]/../../text()', 0),
            ('Besiktigas_senast',
             '//strong[contains(text(), "Besiktigas senast")]/../text()', -1),
            ('Senast_godkända_besiktning',
             '//strong[contains(text(), "Senast god") and contains(text(), "besiktning")]/../text()',
             0),
            ('Mätarställning',
             '//a[@href="#ts-matarstallning"]/../../text()', 0),
        )

        item = FuRegnrItem()
        item['Registreringsnummer'] = self.Registreringsnummer
        result_row = [self.Registreringsnummer]
        for key, xpath, index in field_queries:
            value = text_at(xpath, index)
            item[key] = value
            result_row.append(value)

        yield item

        self.total_cnt += 1
        print("\t[Result {}] {}".format(self.total_cnt, result_row))
        self.insert_row(result_row=result_row)

        self.total_scraping_done = True

        # Pop lines until a non-empty one is found.
        line = None
        while self.input_data and not line:
            line = self.input_data.pop()
        if not line:
            # Input exhausted: previously this fell through to ``line[18]``
            # with ``line`` unbound or empty and raised.
            return

        self.Registreringsnummer = line[18]

        self.total_cnt += 1
        print("[{}] Scanning ...".format(self.Registreringsnummer))
        self.total_scraping_done = False
        yield FormRequest(url=self.get_url,
                          method='GET',
                          headers=make_headers_1(),
                          callback=self.get__RequestVerificationToken,
                          errback=self.fail__RequestVerificationToken,
                          dont_filter=True,
                          meta={})
Example #50
0
def send_cmb_request(url, **kwargs):
    """Build a GET ``FormRequest`` for *url* handled by ``get_cmb_detail``.

    All keyword arguments are forwarded as the request's ``meta``; the
    ``apk_name`` entry is always set to the fixed name below, overriding any
    value the caller supplied.
    """
    request_meta = dict(kwargs, apk_name=u'招商银行Android客户端')
    return FormRequest(
        url,
        method='GET',
        meta=request_meta,
        callback=get_cmb_detail,
    )
Example #51
0
 def parse_welcome(self, response):
     """Fill in the login fields of the form found in *response*.

     Returns the ``FormRequest`` built from the page's form; no explicit
     callback is attached here.
     """
     credentials = {"user": "******", "pass": "******"}
     return FormRequest.from_response(response, formdata=credentials)
Example #52
0
 def start_requests(self):
     """Yield one form request per search keyword, sent with ``self.headers``."""
     search_url = "https://www.lagou.com/jobs/positionAjax.json?city=%E4%B8%8A%E6%B5%B7&needAddtionalResult=false"
     for keyword in ["python"]:
         # First result page ("pn": "1") for this keyword.
         form_fields = {"first": "true", "pn": "1", "kd": keyword}
         yield FormRequest(search_url, formdata=form_fields, headers=self.headers)
Example #53
0
 def login(self, response):
     """Submit the form in *response* with the login fields filled in.

     The resulting response is routed to ``self.check_login_response``.
     """
     credentials = {'id': '0000', 'password': '******'}
     return FormRequest.from_response(
         response,
         formdata=credentials,
         callback=self.check_login_response,
     )
Example #54
0
 def parse_review(self, response):
     """Emit the raw review payload, then request the following review page.

     Nothing is produced unless the response status is 200. The package name
     and the incremented page number are carried forward in the next
     request's ``meta``.
     """
     if response.status != 200:
         return

     package_name = response.meta["package_name"]
     yield {"packageName": package_name, "review": response.body}

     next_page = response.meta["page"] + 1
     yield FormRequest(
         self.review_url,
         callback=self.parse_review,
         formdata={
             "id": package_name,
             "reviewType": '0',
             "reviewSortOrder": '0',
             "pageNum": str(next_page),
         },
         meta={"package_name": package_name, "page": next_page},
     )
Example #55
0
  def parse(self, response):
    """Scrape a product page into a ``BigCItem`` and request its option data.

    Fills name, prices, description, category, brand, sort order, fixed
    inventory fields and per-colour image-set URLs from the page, then
    returns a ``FormRequest`` to the build-selections endpoint. The item is
    passed along in ``request.meta['item']`` and the response is handled by
    ``self.parseJsonProduct``.

    Raises ``IndexError`` if a required node (title, regular price, brand)
    is missing from the page, as the first extracted match is used directly.
    """
    sel = Selector(response)
    url = 'http://www.roadrunnersports.com/rrs/product-detail/build-selections.jsp'
    item = BigCItem()

    # The og:title meta tag is used as the product title throughout.
    title = response.xpath("//meta[@property='og:title']/@content").extract()[0]
    item["Product_Name"] = title
    if "Trail" in title:
      item["Product_Name"] = title + " Running Shoe"

    mrp = float(sel.xpath("//span[@class='prod_detail_reg_price']/span/text()").extract()[0])
    item["Retail_Price"] = str((mrp*65 + mrp*30/100*70/100*65)*112.5/100 + mrp*65*15/100)

    sale_prices = response.xpath("//span[@class='prod_detail_sale_price']/span/text()").extract()
    if sale_prices:
      # A price range such as "$50-$60" keeps only the upper bound.
      sp = float(sale_prices[0].split("-")[-1].replace("$", ""))
      # NOTE(review): unlike Retail_Price, the second term here does not
      # scale with the price (30/100*70*65 is a constant) -- confirm whether
      # sp*30/100*70/100*65 was intended. Formula kept unchanged.
      item["Sale_Price"] = str((sp*65 + 30/100*70*65)*112.5/100 + sp*65*15/100)
    else:
      item["Sale_Price"] = ''

    # categorization
    cat_labels = response.xpath("//div[@id='grp_1']/p/span[1]/text()").extract()
    # The title must CONTAIN "Women's"; testing `title in "Women's"` (the
    # reverse) is almost never true and labelled nearly every product "Men's".
    sex = "Women's" if "Women's" in title else "Men's"

    description = response.xpath("//div[@id='grp_1']/p").extract() + response.xpath("//div[@id='grp_1']/ul/li").extract()
    item["Product_Description"] = description

    if cat_labels:
      cat = ";Shoes/" + sex + " Running Shoes/" + cat_labels[0].replace("+", "") + " Running Shoes"
      item["Product_Name"] = title + " " + cat_labels[0] + " Running Shoe"
    else:
      cat = ""

    # First rule whose keywords appear in any description fragment wins.
    category_rules = (
      (("hiking", "Hiking"), " Shoes/Hiking Shoes"),
      (("trail", "Trail"), " Running Shoes/Trail Running Shoes"),
      (("minimalist", "barefoot", "Barefoot"), " Running Shoes/Barefoot Running Shoes"),
      (("spike",), " Running Shoes/Racing Spikes"),
      (("cross-train", "trainer", "training shoe", "gym", "workout"), " Shoes/Cross Training Shoes"),
    )
    base_category = "Run & Cycle/Running/Running Shoes"
    for keywords, suffix in category_rules:
      if any(keyword in fragment for fragment in description for keyword in keywords):
        item["Category"] = base_category + ";Shoes/" + sex + suffix + cat
        break
    else:
      item["Category"] = base_category + cat if cat else "NULL"

    brand = response.xpath("//span[@itemprop='brand']/text()").extract()[0]
    item["Brand_Name"] = brand
    if brand in ("Asics", "Mizuno", "Brooks", "Saucony", "New Balance"):
      sort_base = -300
    elif brand in ("Under Armour", "Altra", "Hoka One One", "Inov8", "Salomon", "Vibram FiveFingers"):
      sort_base = -270
    else:
      sort_base = -250
    # NOTE(review): under Python 2 integer division makes 20/100 equal 0, so
    # the price term vanishes there -- confirm the target interpreter.
    item["Sort_Order"] = str(sort_base - (20/100*mrp))

    item["Product_Availability"] = "12-17 Working Days"
    item["Current_Stock"] = "100"
    item["Free_Shipping"] = "N"
    item["Product_Image_Is_Thumbnail_1"] = "Y"
    item["Track_Inventory"] = "By Option"
    item["Product_Image_Sort_1"] = "1"
    item["Product_Image_Sort_2"] = "2"
    item["Product_Image_Sort_3"] = "3"
    item["Product_Image_Sort_4"] = "4"
    item["Product_Image_Sort_5"] = "5"

    item["imageSetUrls"] = {}
    item["imageSetUrls2"] = {}
    colors = response.xpath("//a[@class='ref2QIColor']/@name").extract()
    item["Product_Image_File1"] = {}
    hrefs = response.xpath("//a[@class='ref2QIColor']/@href").extract()
    item["color"] = {}
    for idx, href in enumerate(hrefs):
      color = colors[idx]
      # Last path segment up to the first underscore identifies the image set.
      image_code = href.split('/')[-1].split('_')[0]
      # create links to image sets
      image_set_url = "http://roadrunnersports.scene7.com/is/image/roadrunnersports/" + image_code + "-IS?req=set,json&scl=1"
      item["imageSetUrls"].setdefault(color, []).append(image_set_url)
      item["imageSetUrls2"].setdefault(color, []).append(image_set_url)
      item["color"][image_code.split('-')[1]] = color

    # request product info as json
    item["sku"] = response.url.strip('/').split('/')[-2]
    payload = {'id': item["sku"]}
    request = FormRequest(url, formdata=payload, callback=self.parseJsonProduct)
    request.meta['item'] = item

    return request
Example #56
0
    def _parse_online_status(self, response):
        """
        Gets limited_stock and is_out_of_stock fields for product and its variants

        Reads the product from ``response.meta['product']`` and the optional
        pending-request list from ``response.meta['reqs']``. When the JSON
        carries no current price, a store-availability request (handled by
        ``self._parse_store_status``) is appended to that list.

        Returns the result of ``self.send_next_request`` when requests are
        pending, otherwise the updated product.
        """
        meta = response.meta.copy()
        reqs = meta.get('reqs')
        if reqs is None:
            # 'reqs' is optional in meta; start an empty queue so the price
            # fallback below can append to it instead of failing on None.
            reqs = []
        product = meta['product']
        data = json.loads(response.body_as_unicode())

        try:
            product_info = data['products'][0]
            variants_info = product_info['skus']
            variants = product['variants']
            final_variants = []
            list_out_of_stock = ['70', '80', '85', '87', '90']
            list_not_sold_online = ['85', '87', '90']

            # Set limited status for main product
            availability = product_info['availability']
            product['is_out_of_stock'] = availability in list_out_of_stock
            product['is_in_store_only'] = availability in list_not_sold_online
            product['limited_stock'] = bool(product_info['isLimitedStock'])

            # Taking right amount of price and availability status
            try:
                currency = re.findall('priceCurrency=(.*?),', str(product['price']))[0]
            except (KeyError, IndexError):
                # No price set yet, or its text form carries no currency.
                currency = 'CAD'

            price = product_info['minCurrentPrice']
            if not price:
                prod_data = [{
                    "productid": response.meta['product_id'],
                    "skus": [{"skuid": str(product['upc']), "storeeligible": True}],
                }]
                # Both payloads are sent as JSON text with all spaces removed.
                prod_data = json.dumps(prod_data).replace(' ', '')
                store_data = json.dumps(['1104', '3057', '1192', '5777']).replace(' ', '')
                reqs.append(FormRequest(
                    url="http://www.walmart.ca/ws/store/products",
                    formdata={'stores': store_data, 'products': prod_data},
                    callback=self._parse_store_status,
                    headers={'X-Requested-With': 'XMLHttpRequest'}
                ))
            else:
                product['price'] = Price(priceCurrency=currency, price=price)

            # Set limited status for product variants
            if variants:
                for var in variants_info:
                    sku_id = var['skuId']

                    availability = var['availability']
                    variants[sku_id]['is_out_of_stock'] = availability in list_out_of_stock
                    variants[sku_id]['is_in_store_only'] = availability in list_not_sold_online

                    final_variants.append(variants[sku_id])

                product['variants'] = final_variants
        except (KeyError, ValueError):
            self.log(
                "Failed to extract limited stock info from %r." % response.url, WARNING
            )

        if reqs:
            return self.send_next_request(reqs, response)

        return product
Example #57
0
 def test_request_class(self):
     """Both the stock form request and the custom subclass must serialize."""
     for request_cls in (FormRequest, CustomRequest):
         request = request_cls("http://www.example.com")
         self._assert_serializes_ok(request, spider=self.spider)
Example #58
0
    def get_login_page(self, response):
        """Function to get request options to login.
        Used to get ReCaptcha token; image captcha value.

        Downloads the captcha image to ``captcha.jpg`` using the cookies set
        by *response*, asks ``self.solve_captcha`` for both captcha answers,
        and yields the login ``FormRequest`` (callback ``self.login_me``).
        Yields nothing when either captcha answer is missing.
        """

        def input_value(field_id):
            # Value of the <input> with this id, or "" when it is absent.
            return response.selector.xpath(
                "//input[@id='%s']/@value" % field_id).get("")

        # get the Captcha's options
        sitekey = response.selector.xpath(
            "//div[@class='g-recaptcha']/@data-sitekey").get("")
        imgcaptcha = response.selector.xpath(
            "//img[@id='imgCaptcha']/@src").get("")
        img_url = "https://meuveiculo.prefeitura.sp.gov.br" + \
            imgcaptcha.replace("..", "")

        # get cookies to download captcha image
        jar = SimpleCookie()
        for raw_cookie in response.headers.getlist('Set-Cookie'):
            jar.load(raw_cookie.decode("utf-8"))

        # The session is closed on exit so its pooled connections are
        # released instead of leaking on every call.
        with requests.Session() as session:
            # set cookies to current session
            for key in jar:
                session.cookies.set(name=key, value=jar[key].value)
            image_bytes = session.get(img_url, stream=True).content

        # save captcha image
        with open("captcha.jpg", 'wb') as f:
            f.write(image_bytes)

        imgcaptcha_txt, gcaptcha_txt = self.solve_captcha(
            sitekey, response.url)
        if not imgcaptcha_txt or not gcaptcha_txt:
            return

        # Get options for request: page state inputs are echoed back
        # unchanged, in the same field order as before.
        frm_data = {
            field: input_value(field)
            for field in (
                '__EVENTTARGET',
                '__EVENTARGUMENT',
                '__LASTFOCUS',
                'PageProdamSPOnChange',
                'PageProdamSPPosicao',
                'PageProdamSPFocado',
                '__VIEWSTATE',
                '__VIEWSTATEGENERATOR',
                '__EVENTVALIDATION',
            )
        }
        frm_data['txtRenavam'] = self.renavam
        frm_data['txtplaca'] = self.placa
        frm_data['__tpAudio'] = input_value('__tpAudio')
        frm_data['__strVal'] = input_value('__strVal')
        frm_data['txtValidacao'] = imgcaptcha_txt
        frm_data['g-recaptcha-response'] = gcaptcha_txt
        frm_data['btnMultas'] = 'Consultar'

        login_url = "https://meuveiculo.prefeitura.sp.gov.br/forms/frmPesquisarRenavam.aspx"
        yield FormRequest(url=login_url,
                          formdata=frm_data,
                          callback=self.login_me,
                          errback=self.errback_func,
                          dont_filter=True)
Example #59
0
    def parse(self, response):
        """Build a ``BilibiliItem`` from a user-info JSON response.

        Profile fields are copied from ``content['data']``; follower and
        view statistics are fetched with two blocking ``requests.get`` calls
        and default to 0 when that fails. After yielding the item, form
        requests for mids 2..99 are yielded back to this same callback.

        Raises ``KeyError`` if the JSON body has no ``'data'`` key.
        """
        item = BilibiliItem()
        content = json.loads(response.body)
        data = content['data']

        try:
            # NOTE(review): presence of 'status' is tested on `data` but the
            # value is read from `content` -- confirm this is intended.
            item['status'] = content['status'] if 'status' in data else 'False'
            item['mid'] = data['mid']
            item['name'] = data['name']
            item['sex'] = data['sex']
            item['rank'] = data['rank']
            item['face'] = data['face']
            try:
                item['regtime'] = time.strftime(
                    "%Y-%m-%d %H:%M:%S", time.localtime(data['regtime']))
            except Exception:
                # Best effort: a missing or unusable timestamp is recorded
                # as 'miss' rather than aborting the remaining fields.
                item['regtime'] = 'miss'
            item['spacesta'] = data['spacesta']
            item['birthday'] = data['birthday'] if 'birthday' in data else 'miss'
            item['sign'] = data['sign']
            item['level'] = data['level_info']['current_level']
            item['officialverify_type'] = data['official_verify']['type']
            item['officialverify_desc'] = data['official_verify']['desc']
            item['viptype'] = data['vip']['vipType']
            item['vipstatus'] = data['vip']['vipStatus']
            item['toutu'] = data['toutu']
            item['toutuid'] = data['toutuId']
            item['coins'] = data['coins']
            print('successful1 get userinfo:' + str(data['mid']))
        except Exception as e:
            # item['mid'] may not have been set when the failure happened;
            # .get avoids raising KeyError from inside this handler.
            print('error1:', item.get('mid'), e)

        try:
            header = {
                'Accept':
                'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                'Accept-Encoding':
                'gzip, deflate, br',
                'Accept-Language':
                'zh-CN,zh;q=0.9',
                'Cache-Control':
                'max-age=0',
                'Connection':
                'keep-alive',
                'Cookie':
                'sid=kh3nfx8z; UM_distinctid=160f901ca0540-077f672036c946-3c604504-130980-160f901ca061d; buvid3=18869AAC-BC92-43DF-8929-333117E24C5231000infoc; fts=1516006134; LIVE_BUVID=AUTO5115160061339503; pgv_pvi=7086867456; rpdid=oqxiqwmllodosomkwxoxw; finger=edc6ecda',
                'Host':
                'api.bilibili.com',
                'Upgrade-Insecure-Requests':
                '1',
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
            }
            mid = int(data['mid'])
            url1 = 'https://api.bilibili.com/x/relation/stat?vmid=%d&jsonp=jsonp' % mid
            content1 = json.loads(requests.get(url=url1, headers=header).text)
            item['following'] = content1['data']['following']
            item['follower'] = content1['data']['follower']
            url2 = 'https://api.bilibili.com/x/space/upstat?mid=%d&jsonp=jsonp' % mid
            content2 = json.loads(requests.get(url=url2, headers=header).text)
            item['archiveview'] = content2['data']['archive']['view']
            item['article'] = content2['data']['article']['view']
            print('successful2 get userinfo:' + str(data['mid']))
        except Exception as e:
            item['following'] = 0
            item['follower'] = 0
            item['archiveview'] = 0
            item['article'] = 0
            # Same guard as above: 'mid' may be absent from the item.
            print('miss2:', item.get('mid'), e)

        yield item
        for i in range(2, 100):
            form_data = {
                'mid': str(i),
                'csrf': '',
            }
            yield FormRequest(url=self.url,
                              callback=self.parse,
                              headers=self.head,
                              formdata=form_data)
Example #60
0
 def submit(self, response):
     """Submit the form found in *response* with the submit-button field set.

     ``dont_filter=True`` is passed so the request is sent even if an
     identical one was already seen.
     """
     # formdata must map field names to values; the previous set literal
     # {'btnSubmit', '选课提交'} was not a valid key/value mapping.
     yield FormRequest.from_response(response,
                                     formdata={'btnSubmit': '选课提交'},
                                     dont_filter=True)