Example No. 1
    def parse(self, response):
        pattern = r'var \$render_data = \[((.|\s)*?})\]'
        raw_data = re.search(pattern,response.text)
        raw_data = raw_data.group(1)
        json_data = json.loads(raw_data)
        status = json_data['status']
        try:
            is_retweeted = status['retweeted_status']
            mblog_id = is_retweeted['id']
            mblog_url = base_mblog_url % (mblog_id)
            # yield (not return) the Request so Scrapy actually schedules it
            yield Request(mblog_url, callback=self.getBlog)
            return
        except KeyError:
            pass

        status = json_data['status']
        items=self.putItem(status)
        usrdetail_url = base_usrdetail_url % (items['usr_id'])
        yield Request(url=usrdetail_url, meta={"item": items}, callback=self.getUsrDetail)

        if status['reposts_count']:
            reposts_url = base_reposts_url % (status['id'],str(1))
            yield Request(reposts_url,callback=self.getReposters)
Example No. 2
    def parse_comment(self, response):
        comments_json = json.loads(response.body_as_unicode())
        if comments_json['message'] == 'success':
            comments = comments_json['data']['comments']
            if len(comments) > 0:
                items = []
                for comment in comments:
                    item = CommentInfo({
                        'comment': comment['text'],
                        'likes': comment['digg_count'],
                        'time': comment['create_time'],
                        'comment_id': comment['id']
                    })
                    self.copy_article_info(response.meta, item)

                    if comment['reply_count'] > 0:
                        reply_to_comment_url = 'http://www.toutiao.com/api/comment/get_reply/?comment_id=' + str(
                            comment['id']) + '&dongtai_id=' + str(
                                comment['dongtai_id']
                            ) + '&offset=0&count=' + str(
                                comment['reply_count'])
                        reply_request = Request(reply_to_comment_url,
                                                callback=self.parse_reply,
                                                method='GET')
                        self.copy_article_info(response.meta,
                                               reply_request.meta)
                        reply_request.meta['reply_to_id'] = comment['id']
                        yield reply_request

                    items.append(item)
                # a generator's return value is discarded, so yield each item instead
                for collected_item in items:
                    yield collected_item
        else:
            return
Example No. 3
    def start_requests(self):
        for row in self.tyre_sizes:
            if self.check_row_is_processed(row):
                continue

            self.add_row_to_history(row)

            meta = {'row': row}
            xl = ''
            if row['XL'] == 'XL':
                xl = 'Y'
                meta['xl'] = True

            run_flat = ''
            if row['Run Flat'] == 'RF':
                run_flat = 'Y'
                meta['run_flat'] = True

            url = ('http://www.point-s.co.uk/tyres?s=&width=' + row['Width'] +
                   '&profile=' + row['Aspect Ratio'] + '&size=' + row['Rim'] +
                   '&speed=' + row['Speed rating'] +
                   '&paginate=true&runflat=' + run_flat + '&extra_load=' + xl)
            yield Request(url, dont_filter=True, meta=meta)

            if row['Alt Speed']:
                url = ('http://www.point-s.co.uk/tyres?s=&width=' + row['Width'] +
                       '&profile=' + row['Aspect Ratio'] + '&size=' + row['Rim'] +
                       '&speed=' + row['Alt Speed'] +
                       '&paginate=true&runflat=' + run_flat + '&extra_load=' + xl)
                yield Request(url, dont_filter=True, meta=meta)
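Note on the example above: the query string is concatenated by hand. A sketch of an equivalent construction using Python 2's urllib.urlencode, assuming the same row fields and the run_flat/xl flags computed above (parameter order may differ, which query strings tolerate):

# Sketch only: build the same query string with urlencode so any unusual
# characters in the row values are escaped automatically.
import urllib

params = {
    's': '',
    'width': row['Width'],
    'profile': row['Aspect Ratio'],
    'size': row['Rim'],
    'speed': row['Speed rating'],
    'paginate': 'true',
    'runflat': run_flat,
    'extra_load': xl,
}
url = 'http://www.point-s.co.uk/tyres?' + urllib.urlencode(params)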
Example No. 4
 def parse(self, response):
     """docstring for parse"""
     yield Request('http://www.ag.senate.gov/hearings', self.parse_data)
     for links in range(2, 10):
         yield Request(
             'http://www.ag.senate.gov/hearings?PageNum_rs=' + str(links),
             self.parse_data)
Example No. 5
    def getReposters(self, response):

        pattern = r'page=(\d+)'
        result = re.search(pattern, response.url)
        page_id = result.group(1)
        try:
            json_data = json.loads(response.text)
            data = json_data['data']
            reposts_data = data['data']

            if int(page_id) == 1:
                self.max = data['max']

            for item in reposts_data:
                items = self.putItem(item)
                time_url = base_mblog_url % (items['mblog_id'])
                yield Request(url=time_url,
                              meta={"item": items},
                              callback=self.get_accurate_time)
        except (ValueError, KeyError):
            pass

        if int(page_id) < int(self.max):
            reposts_url = re.sub(pattern, 'page=' + str(int(page_id) + 1),
                                 response.url)
            yield Request(reposts_url, callback=self.getReposters)
Example No. 6
    def parse_courses(self,response):
        """
        Parses the classlist for a specific subject and term.
        
        Selects the links for each class details page, and uses them to fill in
        the 'url', 'number', 'subject', and 'title' fields for each class
        item. Generates requests to each link in order to get prerequisites;
        these requests are handled by the parse_details callback.
        """
        hxs = HtmlXPathSelector(response)
        courses = hxs.select('//td[@class="nttitle"]/a')
        for c in courses:
            item = CatalogItem()

            url = c.select('@href').extract()[0]
            data = dict(e.split('=') for e in url.split('?')[1].split('&'))
            title = c.select('text()').extract()[0].split(' - ')[1].strip()
            item['url'] = url
            item['number'] = data['crse_numb_in']
            item['subject'] = data['subj_code_in']
            item['title'] = title
            request = Request("https://www.uvic.ca" + url, callback=self.parse_details)
            request.meta['item'] = item

            yield request
Example No. 7
    def parse(self, response):
        sel = Selector(response)

        # Pages from 1998 onwards, new format
        # These normally cover around a 2-6 year period
        proceedings_menu = sel.xpath(
            '//a[starts-with(text(),"Official Record of Proceedings")]/@href')
        if proceedings_menu:
            for url in proceedings_menu.extract():
                absolute_url = urlparse.urljoin(response.url, url.strip())
                req = Request(absolute_url,
                              callback=self.parse_hansard_index_page)
                yield req

        # Former Legislative Council (before 7/1997)
        table = sel.xpath(
            "//h3[contains(text(),'Former Legislative Council (before 7/1997)')]/following::table[1]"
        )
        if table:
            links = table[0].xpath(
                ".//td/a[contains(text(),'Session')]/@href").extract()
            if links:
                for url in links:
                    absolute_url = urlparse.urljoin(response.url, url.strip())
                    req = Request(absolute_url,
                                  callback=self.parse_hansard_index_page)
                    yield req
Example No. 8
    def parse(self, response):
        item = BaiduItem()
        soup = BeautifulSoup(response.text, 'html.parser')

        title = soup.select('.lemmaWgt-lemmaTitle-title h1')[0].text
        subhead = soup.select('.lemmaWgt-lemmaTitle-title h2')
        if len(subhead) != 0:
            # print(subhead[0].text)
            title = title + subhead[0].text
        item['title'] = title

        info_list = soup.select('.lemma-summary div')
        info = ''
        for temp in info_list:
            # extract the text content
            info += temp.text
            # if there are hyperlinks, keep crawling them
            a_list = temp.select('a')
            if len(a_list) != 0:
                for a in a_list:
                    if a.has_attr('href'):
                        yield Request(self.base_url + a['href'], headers=self.headers)
        item['info'] = info

        properties_list = soup.select('.basicInfo-block dt')
        properties = ''
        for pro in properties_list:
            properties += '###' + pro.text.strip().replace('\n', '')
            # if there are hyperlinks, keep crawling them
            a_list = pro.select('a')
            if len(a_list) != 0:
                for a in a_list:
                    if a.has_attr('href'):
                        yield Request(self.base_url + a['href'], headers=self.headers)
        item['properties'] = properties

        values_list = soup.select('.basicInfo-block dd')
        values = ''
        for val in values_list:
            values += '###' + val.text.strip().replace('\n', '')
            # if there are hyperlinks, keep crawling them
            a_list = val.select('a')
            if len(a_list) != 0:
                for a in a_list:
                    if a.has_attr('href'):
                        yield Request(self.base_url + a['href'], headers=self.headers)
        item['values'] = values

        if len(soup.select('.summary-pic img')) != 0:
            item['img'] = soup.select('.summary-pic img')[0]['src']

        print(item['title'])

        yield item
Example No. 9
    def parse(self, response):
        for productxs in response.xpath(
                '//div[contains(@class, "products-list")]//div[@data-product]'
        ):
            yield Request(productxs.xpath(
                './/a[@class="product-card-link"]/@href').extract()[0],
                          callback=self.parse_product)

        next_page = response.xpath('//link[@rel="next"]/@href').extract()
        if next_page and 'Page.Next.Link' not in next_page[0]:
            yield Request(response.urljoin(next_page[0]))
Example No. 10
    def parse_item(self, response):

        m2 = hashlib.md5()
        m2.update(response.url)

        youm = Myoum7Item()
        youm['url'] = response.url
        youm['page_name'] = m2.hexdigest()
        youm['do_main'] = 'm.youm7.com'


        referer = response.request.headers.get('Referer', None)

        reattr = re.findall(r"sectionID=(\w+)", referer) if referer else []
        if reattr:
            youm['url'] = referer
            yield Request(referer, callback=self.parse_item)

        date_str = response.xpath('//div[@class="news-dev"]/@data-id')

        attr = re.findall(r"sectionID=(\w+)", response.url)

        if date_str and attr:
            sectionID = attr[0]
            date_str = date_str.extract()
            url_date = date_str[len(date_str)-1]
            newUrl = "https://m.youm7.com/Section/NewsSectionPaging?lastid="+url_date+"&sectionID="+str(sectionID)
            youm['url'] = newUrl
            yield Request(newUrl, callback=self.parse_item)

        title_str = response.xpath('//title/text()')

        content_str = response.xpath('//div[@class="text-cont"]//div[@id="articleBody"]//p/text()')

        type_str = response.xpath('//div[@class="container"]//div[@class="breadcumb"]//a/text()')  # category from the breadcrumb menu

        if content_str and title_str:
            content = ""
            for s in content_str.extract():
                content += s

            youm['title'] = title_str.extract()[0]
            youm['content'] = content
            youm['str_size'] = len(content)
            youm['type'] = type_str.extract()[1]  # take the category from the detail page

        yield youm
Example No. 11
    def parse_schedule(self,response):
        """
        Parse the schedule for a subject and term 

        Grab all the links for each section to get CRNs and enrollment info               
        """
        hxs = HtmlXPathSelector(response)
        links = hxs.select('//th[@class="ddtitle"]/a/@href').extract()

        for l in links:
            url = 'https://www.uvic.ca'+l
            request = Request(url,callback=self.parse_section)      
            request.meta['item'] = response.meta['item']
            yield request
Example No. 12
    def parse(self,response):
        """
        Parses only the first page of the dynamic class catalog. 
        
        Extracts the available terms from the select box and generates 
        requests for the search pages for each term. These requests are handled
        by the parse_term method
        """
        hxs = HtmlXPathSelector(response)
        
        # get term dates from the options in a select box
        terms = hxs.select('//select[@id="term_input_id"]/child::option').select('@value').extract() 
        
        # eliminate the entry corresponding to None, and remove terms that are
        # too old
        def is_valid_term(term):
            current_year = datetime.now().year
            if len(term) == 0:
                return False
            elif (current_year - int(term[0:4])) > 4:
                return False
            else:
                return True
                
        terms = [t for t in terms if is_valid_term(t)]
        
        self.log('Got terms: '+str(terms))        
        self.terms = terms
        
        # get the complete class listings
        url = self.classlist_url_template.format(term = self.terms[0])
        request = Request(url,callback=self.parse_classlist_search)
        yield request
        
        # get the schedule pages for each term and subject
#        for term in self.terms:
#            for subject in self.subjects:
#                url = self.schedule_url_template.format(term=term,subject=subject,number='')
#                item = ScheduleItem()
#                item['term'] = term
#                item['subject'] = subject
#                request = Request(url,callback=self.parse_schedule)
#                request.meta['item'] = item
#                yield request

        for term in terms:
            term_url = 'https://www.uvic.ca/BAN2P/bwckgens.p_proc_term_date?p_calling_proc=bwckschd.p_disp_dyn_sched&p_term='+term
            request = Request(term_url,callback=self.parse_term)
            request.meta['term'] = term
            yield request
Example No. 13
 def getReposters(self,response):
     json_data = json.loads(response.text)
     data = json_data['data']
     max = data['max']
     reposts_data = data['data']
     pattern = r'page=(\d+)'
     result = re.search(pattern,response.url)
     page_id = result.group(1)
     for item in reposts_data:
         items=self.putItem(item)
         usrdetail_url = base_usrdetail_url % (items['usr_id'])
         yield Request(url=usrdetail_url, meta={"item": items}, callback=self.getUsrDetail)
     if int(page_id) < int(max):
         reposts_url = re.sub(pattern,'page='+str(int(page_id)+1),response.url)
         yield Request(reposts_url,callback=self.getReposters)
Example No. 14
    def parse(self, response):
        # URLs on the current page
        post_nodes = response.css("div.list_body_box1 .art_tit a")
        for post_node in post_nodes:
            # extract the post URL
            post_url = post_node.css("::attr(href)").extract_first("")
            yield Request(url=parse.urljoin(response.url, post_url),
                          callback=self.parse_detail)

        # extract the next page and hand it to scrapy to download
        # (the original marked this as unfinished; follow the href of the pager link)
        next_url = response.css(
            'div.list_body_box1 .pagingNormal a::attr(href)').extract_first("")
        if next_url:
            yield Request(url=parse.urljoin(response.url, next_url),
                          callback=self.parse)
Example No. 15
    def parse(self, response):
        self.log("正在解析第{}页".format(self.current_page))

        no_data = response.xpath(".//div[@class='ico_list']/div[@class='no_data']")
        if no_data or self.current_page > self.max_page:
            self.log("no data = {}".format(no_data))
            self.log("没有数据或超过指定页,爬虫退出!最大爬取页为:{}".format(self.max_page))
            return

        uris = response.xpath(".//div[@class='content']/a/@href").extract()
        for uri in uris:
            yield Request(self.domains + uri, self.parse_detail)

        self.current_page += 1
        yield Request(self.base_url.format(self.current_page), self.parse)
Example No. 16
    def parse(self, response):
        # URLs on the current page
        post_nodes = response.css("ul.listbox li a")
        for post_node in post_nodes:
            # extract the post URL
            post_url = post_node.css("::attr(href)").extract_first("")
            yield Request(url=parse.urljoin(response.url, post_url),
                          callback=self.parse_detail)

        # extract the next page and hand it to scrapy to download
        next_url = response.css(
            "span.next_page a::attr(href)").extract_first("")
        if next_url:
            yield Request(url=parse.urljoin(response.url, next_url),
                          callback=self.parse)
Example No. 17
 def parse(self, response):
     original_key = response.url.split('/')[-1]
     key = urllib.unquote(original_key.split('?')[0])
     texts = response.xpath('//tbody/tr/td/text()').extract()
     filename = os.getenv('RESULT_PATH')
     texts = [t.encode('utf-8') for t in texts if '\n' not in t]
     merged_texts = []
     for i in xrange(0, len(texts)):
         index = i % 2
         if index == 0:
             merged_texts.append(texts[i] + texts[i + 1] + '\n')
     # print 'lines num:', len(merged_texts)
     # not_200_path = os.getenv('NOT_200')
     # if response.status != 200:
     #     with open(key+'\t'+str(len(set(merged_texts)))+'\n')
     legacy_file_path = os.getenv('LEGACY_PATH')
     if len(merged_texts) == 100:
         with open(legacy_file_path, 'a') as legacy_file:
             legacy_file.write(key + '\n')
     with open(filename, 'a') as f:
         f.write(key + '\t' + str(len(set(merged_texts))) + '\n')
     if len(merged_texts) > 0:
         detail_urls = response.xpath('//tbody/tr/td/a/@href').extract()
         for d in detail_urls:
             print "detail url is %s \n" % d
             yield Request(url='http://bcc.blcu.edu.cn{0}'.format(d),
                           meta={
                               'dont_filter': True,
                               'dont_merge_cookies': True
                           },
                           callback=self.parse_detail)
Example No. 18
    def parse(self, response):
        item = {}
        imgurl = response.xpath(
            "//div[@id='waterfall']/div[@class='pin wfc wft']/a/img/@src"
        ).extract()
        for i in range(len(imgurl)):
            item["name"] = self.title
            item["imgurl"] = imgurl[i]

            item["imgherf"] = response.xpath(
                "//div[@id='waterfall']/div[@class='pin wfc wft']/a/@href"
            ).extract()[i]

            item["imgvisit"] = response.xpath(
                "//div[@id='waterfall']/div[@class='pin wfc wft']/p/span[@class='repin']/text()"
            ).extract()[i]
            try:
                item["imglike"] = response.xpath(
                    "//div[@id='waterfall']/div[@class='pin wfc wft']/p/span[@class='like']/text()"
                ).extract()[i]
            except Exception as e:
                item["imglike"] = "0"
            try:
                item["imgdiscrit"] = response.xpath(
                    "//div[@id='waterfall']/div[@class='pin wfc wft']/p[@class='description']/text()"
                ).extract()[i]
            except Exception as e:
                item["imgdiscrit"] = ""
            yield item
        for i in range(4):
            yield Request(url=response.url,
                          callback=self.next,
                          meta={"page": "2"},
                          dont_filter=True)
Example No. 19
 def parse(self, response):
     res = Selector(response)
     totalcount = res.xpath(
         '/html/body/div/div[3]/@totalcount').extract()[0]
     totalpage = int(int(totalcount) / 15 + 1)
     for i in range(1, totalpage + 1):
         url = response.url + '?page.pageNo=' + str(i)
         yield Request(url, callback=self.parse_page)
Example No. 20
 def parse(self, response):
     data = json.loads(response.body)['result']['products'].values()
     for i in data:
         item = Book_Product()
         item['title'] = i['title']
         item['subtitle'] = i['subtitle']
         item['uid'] = i['permanentProductPageUrl'].split('/')[-1].split('?')[0]
         item['fsp'] = i['fsp']
         item['mrp'] = i['mrp']
         self.items.append(item)
     print len(self.items)
     self.count += 10
     if self.count > 100:
         # < Python 3.3 doesn't allows mixing of return and yield statements in same function.
         # So, we yield another method self.return_data which then returns the result.
         yield Request("http://www.flipkart.com/m/store/buk/loadmore?store=buk&start=%d" % self.count, self.return_data)
     yield Request("http://www.flipkart.com/m/store/buk/loadmore?store=buk&start=%d" % self.count, self.parse)
Example No. 21
 def parse_link(self, response):
     based_url = "http://blog.csdn.net"
     soup = BeautifulSoup(response.body, 'html.parser')
     blog = soup.find_all("div", "list_item article_item")
     for item in blog:
         # print item.find("span", "link_title").find("a").get("href"), item.find("span", "link_title").find("a").get_text()
         href = based_url + item.find("span",
                                      "link_title").find("a").get("href")
         yield Request(href, callback=self.parse_get_blog_title)
Example No. 22
 def parse(self, response):
     res = Selector(response)
     totalcount = res.xpath('/html/body/script').re('pageCount": .*,')[0]
     pages = int(re.findall('.*(.\d).*', totalcount)[0])
     for i in range(1, pages + 1):
         if i == 1:
             url = response.url
         else:
             url = self.page_url.format(str(i))
         yield Request(url, callback=self.parse_page)
Example No. 23
 def parse_products(self, response):
     brand_name = ''.join(response.xpath(
         '//p[contains(@class, "category-image")]/img/@title').extract())
     products = response.xpath(
         '//ul[contains(@class, "products-grid")]//*[@class="product-name"]/a/@href').extract()
     for url in products:
         yield Request(response.urljoin(url),
                       callback=self.parse_product,
                       meta={'brand': brand_name})
Example No. 24
    def parse(self, response):
        """ parse first response
        """
        if self.spider_config['url_type'] == 'list_page':
            sel = Selector(response)
            box = sel.xpath(self.spider_config['list_xpath'])
            for x in box:
                item = DynamicItem(self.spider_config['item'])
                for key, value in self.spider_config['xpath']['keys'].iteritems():
                    result = x.xpath(value).extract()
                    if len(result) == 1:
                        # single value
                        item[key] = result[0]
                    else:
                        item[key] = result

                # construct follow request if configured
                if self.spider_config['xpath']['follow'] is not None:
                    # more to follow
                    follow_config = self.spider_config['xpath']['follow']
                    if len(follow_config['follow_info']['url'].keys()) >= 2:
                        # needs string formation
                        arguments = dict()
                        for key, value in follow_config['follow_info']['url'].iteritems():
                            # construct arguments
                            if not key == 'base_url':
                                arguments[key] = item[value]
                        url = follow_config['follow_info']['url']['base_url'].format(**arguments)
                    else:
                        url = follow_config['follow_info']['url']['base_url']

                    request = Request(url, callback=self.parse_follow)
                    request.meta['item'] = item
                    request.meta['config'] = follow_config

                    yield request
                else:
                    # no follow request, so save the item
                    get_model('barrow', 'SpiderResult').objects.add_result(spider_task=self.spider_task,
                                                                           item=item,
                                                                           unique=self.spider_config['unique_result'],
                                                                           unique_keys=self.spider_config[
                                                                               'unique_keys'])
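The parse() above is driven entirely by self.spider_config, whose structure is not shown in the source. A hypothetical config consistent with the keys the code reads could look like this (all names and URLs are illustrative assumptions):

# Hypothetical spider_config matching the lookups in parse() above; the real
# schema lives wherever the spider task loads its configuration from.
spider_config = {
    'url_type': 'list_page',
    'list_xpath': '//div[@class="entry"]',          # one node per item
    'item': 'NewsItem',                             # argument for DynamicItem()
    'unique_result': True,
    'unique_keys': ['slug'],
    'xpath': {
        'keys': {                                   # item field -> relative xpath
            'title': './/h2/text()',
            'slug': './/a/@data-slug',
        },
        'follow': {
            'follow_info': {
                'url': {
                    # two or more keys, so base_url is formatted with item fields
                    'base_url': 'http://example.com/detail/{slug}',
                    'slug': 'slug',                 # item field used as the argument
                },
            },
        },
    },
}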
Example No. 25
    def parse(self, response):
        # items = []
        hxs = HtmlXPathSelector(response)
        courses = hxs.select('//td[@class="nttitle"]/a')
        for c in courses:
            item = Course()

            url = c.select("@href").extract()[0]
            data = dict(e.split("=") for e in url.split("?")[1].split("&"))
            desc = c.select("text()").extract()[0].split(" - ")[1].strip()
            item["url"] = url
            item["number"] = data["crse_numb_in"]
            item["department"] = data["subj_code_in"]
            item["desc"] = desc

            request = Request("https://www.uvic.ca" + url, callback=self.parse_details)
            request.meta["item"] = item

            yield request
Example No. 26
    def parse_city_info(self, response):

        ct = Selector(response)

        # get the total number of pages
        total_pages = ct.xpath(
            '//*[@id="citylistpagination"]/div/a[7]/@data-page').extract()[0]

        for page in range(1, int(total_pages) + 1):
            yield Request(self.cities_url.format(page=page),
                          callback=self.parse)
Example No. 27
    def parse(self, response):
        pageinformation = response.xpath('//*[@id="threadlisttableid"]')
        hxs = HtmlXPathSelector(response)
        march_re = r'">\s*(.*)\<'

        #for eachstudent in pageinformation:
        item = AdmissionInformation()
        item['admission_time'] = hxs.xpath(
            '//*[contains(@id, "normalthread")]/tr/th/span/font[1]').re(
                r'">\s*(.*)\<')
        item['gre'] = hxs.xpath(
            '//*[contains(@id, "normalthread")]/tr/th/span/font[3]').re(
                r': \s*(.*)\</font>')
        item['gpa'] = hxs.xpath(
            '//*[contains(@id, "normalthread")]/tr/th/span/font[5]').re(
                r'">\s*(.*)\<')
        item['undergrad_school'] = hxs.xpath(
            '//*[contains(@id, "normalthread")]/tr/th/span/font[6]').re(
                r'>(.*)\</font>')
        item['major'] = hxs.xpath(
            '//*[contains(@id, "normalthread")]/tr/th/span/font[4]').re(
                r'color="green">\s*(.*)\<')
        item['english_grade'] = hxs.xpath(
            '//*[contains(@id, "normalthread")]/tr/th/span/font[2]').re(
                r'>:\s*(.*)\<')
        item['year'] = hxs.xpath(
            '//*[contains(@id, "normalthread")]/tr/th/span/u/font[1]').re(
                r'\[(.*)\<')
        item['admission_type'] = hxs.xpath(
            '//*[contains(@id, "normalthread")]/tr/th/span/u/font[2]').re(
                r'">\s*(.*)\<')
        item['admission_school'] = hxs.xpath(
            '//*[contains(@id, "normalthread")]/tr/th/span/u/font[5]').re(
                r'">\s*(.*)\<')
        item['admission_major'] = hxs.xpath(
            '//*[contains(@id,"normalthread")]/tr/th/span/u/font[4]/b').re(
                r'<b>\s*(.*)\<')
        item['title'] = hxs.xpath(
            '//*[contains(@id,"normalthread")]/tr/th/a[2]/text()').extract()
        item['status'] = hxs.xpath(
            '//*[contains(@id,"normalthread")]/tr/th/span/u/font[3]/b').re(
                '<b>\s*(.*)\<')
        links = hxs.xpath('//*[contains(@id,"normalthread")]/tr/th/a[2]').re(
            r'href\="([^\"]*)\"')
        urls_real = []
        for each in links:
            urls_real.append(each.replace('&amp;', '&'))
            #print('url is:' + each.replace('&amp;','&'))
        item['link'] = urls_real

        yield item
        next_url = self.get_next_url(response.url)
        if next_url is not None:
            yield Request(next_url)
Example No. 28
 def parse(self, response):
     """通过 xpath 获取热门电子书的链接"""
     sel = Selector(response)
     sites = sel.xpath(
         '//div[@class="section ebook-area"]//ul[@class="list-col list-col5"]/li//div[@class="title"]'
     )
     for site in sites:
         title = site.xpath('a/@title').extract()
         link = site.xpath('a/@href').extract()
         title, link = title[0], link[0]
         # print title, link
         yield Request(url=link, callback=self.parse2)
Example No. 29
 def parse_item(self, response):
     #print(response.url)
     item = MzituScrapyItem()
     item['url'] = response.url
     title = response.xpath('//h2[@class="main-title"]/text()').extract()[0]
     item['name'] = title
     max_num = response.xpath('//div[@class="pagenavi"]/a[last()-1]/span/text()').extract()[0]
     for i in range(1,int(max_num)):
         page_url = response.url+"/"+str(i)
         yield Request(page_url,callback= self.get_image_url)
     item['image_urls'] = self.img_urls
     yield item
Example No. 30
 def parse(self, response):
     based_url = "http://blog.csdn.net"
     list_result = ["http://blog.csdn.net/Temanm/article/list/1"]
     soup = BeautifulSoup(response.body, 'html.parser')
     pages = soup.find("div",
                       "list_item_new").find("div",
                                             "pagelist").find_all("a")
     for i in range(len(pages)):
         href = based_url + pages[i].get("href")
         if href not in list_result:
             list_result.append(href)
     for link in list_result:
         yield Request(link, callback=self.parse_link)
Example No. 31
    def parse(self, response):
        # URLs on the current page
        post_nodes = ((response.css("div.div_list .div_item .div_title a")) or
                      (response.css("div.st_div .div_item .div_itemtitle a")))

        for post_node in post_nodes:
            # extract the post URL
            post_url = post_node.css("::attr(href)").extract_first("")
            yield Request(url=parse.urljoin(response.url, post_url),
                          callback=self.parse_detail)

        # extract the next page and hand it to scrapy to download
        next_url = response.css(
            "div.myp2c_div_paging  ::attr(href)").extract_first("")
        # (not implemented) the next page is driven by javascript; parsing that
        # call is the key to extracting the following page automatically, e.g.
        #     javascript:fn_loaditems_id_6a4e96a3_7f4b_46f4_b383_5c6b27673ec3(2)
        # t_url = response.css("div.myp2c_div_paging  ::attr(href)").extract_first("")
        # next_url = document.execCommand(t_url)

        if next_url:
            yield Request(url=parse.urljoin(response.url, next_url),
                          callback=self.parse)
Example No. 32
    def parse(self, response):

        pattern = r'var \$render_data = \[((.|\s)*?})\]'
        raw_data = re.search(pattern, response.text)
        raw_data = raw_data.group(1)
        json_data = json.loads(raw_data)

        status = json_data['status']
        items = self.putItem(status)
        yield items
        if status['reposts_count']:
            reposts_url = base_reposts_url % (status['id'], str(1))
            yield Request(reposts_url, callback=self.getReposters)
Example No. 33
 def start_requests(self):
     keys = self._get_search_keys()
     for k in keys:
         yield Request(
             url='http://bcc.blcu.edu.cn/zh/search/0/{0}'.format(
                 urllib.quote(k)),
             meta={
                 'dont_filter': True,
                 'dont_merge_cookies': True
             }
             # 'dont_redirect': True,
             # 'handle_httpstatus_list': [302]}
         )
Example No. 34
 def parse_details(self, response):
     """
     Parse class prerequisites.
     """
     hxs = HtmlXPathSelector(response)
     prereqs = hxs.select("//span[text()='Faculty']/following-sibling::text() | //span[text()='Faculty']/following-sibling::a")
     self.log("parsing "+response.url,level=DEBUG)
     self.log('prereqs = '+str(prereqs),level = DEBUG)
     prereqs = self.parse_prereqs(prereqs)
     self.log('parsed prereqs = '+str(prereqs),level = DEBUG)
     
     item = response.meta['item']
     item['prereqs'] = prereqs
     
     yield item
     
     calendar_url = self.calendar_url_template.format(subject=item['subject'],
                                                      number=item['number'])
     request = Request(calendar_url,callback=self.parse_calendar)
     request.meta['handle_httpstatus_list'] = [404]
     request.meta['subject'] = item['subject']
     request.meta['number'] = item['number']
     yield request
Example No. 35
    def parse_term(self,response):
        """
        Parses the schedule search page for a particular term.
        
        Extracts the subject list from the first select box and generates
        requests for the schedule for each subject in the current term. These
        requests are handled by the parse_schedule callback.
        """
        hxs = HtmlXPathSelector(response)
        if not TEST_RUN:
            subjects = hxs.select('//select[@id="subj_id"]/child::option').select('@value').extract()
        else:
            subjects = TEST_SUBJECTS
        term = response.meta['term']
#        self.log('Got subjects; '+str(subjects))        
        
        for subj in subjects:
            url = self.schedule_url_template.format(term=term,subject=subj,number='')
            item = ScheduleItem()
            item['term'] = term
            item['subject'] = subj
            request = Request(url,callback=self.parse_schedule)
            request.meta['item'] = item
            yield request
Example No. 36
    def parse_city(self, response):

        ct = Selector(response)

        # get the link to the city guide page
        gonglve_link = ct.xpath('//*[@class="navbar-btn"]/@href').extract()[0]
        # get the city name
        city_name = response.meta.get('name')

        yield Request(self.domains_url + gonglve_link,
                      callback=self.gong_lve,
                      meta={
                          'name': city_name,
                          'href': gonglve_link
                      })
Example No. 37
 def parse(self, response):
     item = LianjiafItem()
     data = BeautifulSoup(response.text, 'lxml').find_all('li',
                                                          class_='clear')
     for tag in data:
         page_url = response.url
         title = tag.find('div', class_='title').get_text()
         url = tag.div.find('a', attrs={'data-el': 'ershoufang'})['href']
         type = tag.find('div', class_='houseInfo').get_text()
         price = tag.find('div',
                          class_='totalPrice').get_text().replace('万', '')
         for field in item.fields:
             item[field] = eval(field)
         yield item
     page = response.xpath('//div[@comp-module="page"]').re(
         'lPage\"\:(\d+)')[0]
     for u in range(1, int(page) + 1):
         urls = 'https://bj.lianjia.com/ershoufang/pg{}'.format(u)
         yield Request(urls, callback=self.parse)
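A side note on the last example: `item[field] = eval(field)` fills the item from same-named local variables. A sketch of an explicit mapping that avoids eval, assuming the item fields match the locals computed in the loop above:

# Sketch only: collect the loop-body locals into a dict and copy the fields the
# item actually declares, instead of eval()-ing field names.
values = {
    'page_url': page_url,
    'title': title,
    'url': url,
    'type': type,
    'price': price,
}
for field in item.fields:
    item[field] = values.get(field)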