Example No. 1
    def parse(self, response):
        pattern = r'var \$render_data = \[((.|\s)*?})\]'
        raw_data = re.search(pattern,response.text)
        raw_data = raw_data.group(1)
        json_data = json.loads(raw_data)
        status = json_data['status']
        try:
            is_retweeted = status['retweeted_status']
            mblog_id = is_retweeted['id']
            mblog_url = base_mblog_url % (mblog_id)
            # yield (not return) the Request so Scrapy actually schedules it
            yield Request(mblog_url, callback=self.getBlog)
            return
        except KeyError:
            pass

        status = json_data['status']
        items=self.putItem(status)
        usrdetail_url = base_usrdetail_url % (items['usr_id'])
        yield Request(url=usrdetail_url, meta={"item": items}, callback=self.getUsrDetail)

        if status['reposts_count']:
            reposts_url = base_reposts_url % (status['id'],str(1))
            yield Request(reposts_url,callback=self.getReposters)
Example No. 2
    def parse_comment(self, response):
        comments_json = json.loads(response.body_as_unicode())
        if comments_json['message'] == 'success':
            comments = comments_json['data']['comments']
            if len(comments) > 0:
                items = []
                for comment in comments:
                    item = CommentInfo({
                        'comment': comment['text'],
                        'likes': comment['digg_count'],
                        'time': comment['create_time'],
                        'comment_id': comment['id']
                    })
                    self.copy_article_info(response.meta, item)

                    if comment['reply_count'] > 0:
                        reply_to_comment_url = 'http://www.toutiao.com/api/comment/get_reply/?comment_id=' + str(
                            comment['id']) + '&dongtai_id=' + str(
                                comment['dongtai_id']
                            ) + '&offset=0&count=' + str(
                                comment['reply_count'])
                        reply_request = Request(reply_to_comment_url,
                                                callback=self.parse_reply,
                                                method='GET')
                        self.copy_article_info(response.meta,
                                               reply_request.meta)
                        reply_request.meta['reply_to_id'] = comment['id']
                        yield reply_request

                    items.append(item)
                # a generator's return value is discarded, so yield each item instead
                for collected_item in items:
                    yield collected_item
        else:
            return
Example No. 3
    def start_requests(self):
        for row in self.tyre_sizes:
            if self.check_row_is_processed(row):
                continue

            self.add_row_to_history(row)

            meta = {'row': row}
            xl = ''
            if row['XL'] == 'XL':
                xl = 'Y'
                meta['xl'] = True

            run_flat = ''
            if row['Run Flat'] == 'RF':
                run_flat = 'Y'
                meta['run_flat'] = True

            url = ('http://www.point-s.co.uk/tyres?s=&width=' + row['Width'] +
                   '&profile=' + row['Aspect Ratio'] + '&size=' + row['Rim'] +
                   '&speed=' + row['Speed rating'] +
                   '&paginate=true&runflat=' + run_flat + '&extra_load=' + xl)
            yield Request(url, dont_filter=True, meta=meta)

            if row['Alt Speed']:
                url = ('http://www.point-s.co.uk/tyres?s=&width=' + row['Width'] +
                       '&profile=' + row['Aspect Ratio'] + '&size=' + row['Rim'] +
                       '&speed=' + row['Alt Speed'] +
                       '&paginate=true&runflat=' + run_flat + '&extra_load=' + xl)
                yield Request(url, dont_filter=True, meta=meta)
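Note on the example above: the query string is concatenated by hand. A sketch of an equivalent construction using Python 2's urllib.urlencode, assuming the same row fields and the run_flat/xl flags computed above (parameter order may differ, which query strings tolerate):

# Sketch only: build the same query string with urlencode so any unusual
# characters in the row values are escaped automatically.
import urllib

params = {
    's': '',
    'width': row['Width'],
    'profile': row['Aspect Ratio'],
    'size': row['Rim'],
    'speed': row['Speed rating'],
    'paginate': 'true',
    'runflat': run_flat,
    'extra_load': xl,
}
url = 'http://www.point-s.co.uk/tyres?' + urllib.urlencode(params)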
Example No. 4
 def parse(self, response):
     """docstring for parse"""
     yield Request('http://www.ag.senate.gov/hearings', self.parse_data)
     for links in range(2, 10):
         yield Request(
             'http://www.ag.senate.gov/hearings?PageNum_rs=' + str(links),
             self.parse_data)
Example No. 5
    def getReposters(self, response):

        pattern = r'page=(\d+)'
        result = re.search(pattern, response.url)
        page_id = result.group(1)
        try:
            json_data = json.loads(response.text)
            data = json_data['data']
            reposts_data = data['data']

            if int(page_id) == 1:
                self.max = data['max']

            for item in reposts_data:
                items = self.putItem(item)
                time_url = base_mblog_url % (items['mblog_id'])
                yield Request(url=time_url,
                              meta={"item": items},
                              callback=self.get_accurate_time)
        except (ValueError, KeyError):
            pass

        if int(page_id) < int(self.max):
            reposts_url = re.sub(pattern, 'page=' + str(int(page_id) + 1),
                                 response.url)
            yield Request(reposts_url, callback=self.getReposters)
Example No. 6
    def parse_courses(self,response):
        """
        Parses the classlist for a specific subject and term.
        
        Selects the links for each class details page, and uses them to fill in
        the 'url', 'number', 'subject', and 'title' fields for each class
        item. Generates requests to each link in order to get prerequisites;
        these requests are handled by the parse_details callback.
        """
        hxs = HtmlXPathSelector(response)
        courses = hxs.select('//td[@class="nttitle"]/a')
        for c in courses:
            item = CatalogItem()

            url = c.select('@href').extract()[0]
            data = dict(e.split('=') for e in url.split('?')[1].split('&'))
            title = c.select('text()').extract()[0].split(' - ')[1].strip()
            item['url'] = url
            item['number'] = data['crse_numb_in']
            item['subject'] = data['subj_code_in']
            item['title'] = title
            request = Request("https://www.uvic.ca" + url, callback=self.parse_details)
            request.meta['item'] = item

            yield request
Example No. 7
    def parse(self, response):
        sel = Selector(response)

        # Pages from 1998 onwards, new format
        # These normally cover around a 2-6 year period
        proceedings_menu = sel.xpath(
            '//a[starts-with(text(),"Official Record of Proceedings")]/@href')
        if proceedings_menu:
            for url in proceedings_menu.extract():
                absolute_url = urlparse.urljoin(response.url, url.strip())
                req = Request(absolute_url,
                              callback=self.parse_hansard_index_page)
                yield req

        # Former Legislative Council (before 7/1997)
        table = sel.xpath(
            "//h3[contains(text(),'Former Legislative Council (before 7/1997)')]/following::table[1]"
        )
        if table:
            links = table[0].xpath(
                ".//td/a[contains(text(),'Session')]/@href").extract()
            if links:
                for url in links:
                    absolute_url = urlparse.urljoin(response.url, url.strip())
                    req = Request(absolute_url,
                                  callback=self.parse_hansard_index_page)
                    yield req
Example No. 8
    def parse(self, response):
        item = BaiduItem()
        soup = BeautifulSoup(response.text, 'html.parser')

        title = soup.select('.lemmaWgt-lemmaTitle-title h1')[0].text
        subhead = soup.select('.lemmaWgt-lemmaTitle-title h2')
        if len(subhead) != 0:
            # print(subhead[0].text)
            title = title + subhead[0].text
        item['title'] = title

        info_list = soup.select('.lemma-summary div')
        info = ''
        for temp in info_list:
            # extract the text content
            info += temp.text
            # if there are hyperlinks, keep crawling them
            a_list = temp.select('a')
            if len(a_list) != 0:
                for a in a_list:
                    if a.has_attr('href'):
                        yield Request(self.base_url + a['href'], headers=self.headers)
        item['info'] = info

        properties_list = soup.select('.basicInfo-block dt')
        properties = ''
        for pro in properties_list:
            properties += '###' + pro.text.strip().replace('\n', '')
            # if there are hyperlinks, keep crawling them
            a_list = pro.select('a')
            if len(a_list) != 0:
                for a in a_list:
                    if a.has_attr('href'):
                        yield Request(self.base_url + a['href'], headers=self.headers)
        item['properties'] = properties

        values_list = soup.select('.basicInfo-block dd')
        values = ''
        for val in values_list:
            values += '###' + val.text.strip().replace('\n', '')
            # if there are hyperlinks, keep crawling them
            a_list = val.select('a')
            if len(a_list) != 0:
                for a in a_list:
                    if a.has_attr('href'):
                        yield Request(self.base_url + a['href'], headers=self.headers)
        item['values'] = values

        if len(soup.select('.summary-pic img')) != 0:
            item['img'] = soup.select('.summary-pic img')[0]['src']

        print(item['title'])

        yield item
Example No. 9
    def parse(self, response):
        for productxs in response.xpath(
                '//div[contains(@class, "products-list")]//div[@data-product]'
        ):
            yield Request(productxs.xpath(
                './/a[@class="product-card-link"]/@href').extract()[0],
                          callback=self.parse_product)

        next_page = response.xpath('//link[@rel="next"]/@href').extract()
        if next_page and 'Page.Next.Link' not in next_page[0]:
            yield Request(response.urljoin(next_page[0]))
Example No. 10
    def parse_item(self, response):

        m2 = hashlib.md5()
        m2.update(response.url)

        youm = Myoum7Item()
        youm['url'] = response.url
        youm['page_name'] = m2.hexdigest()
        youm['do_main'] = 'm.youm7.com'


        referer = response.request.headers.get('Referer', None)

        reattr = re.findall(r"sectionID=(\w+)", referer) if referer else []
        if reattr:
            youm['url'] = referer
            yield Request(referer, callback=self.parse_item)

        date_str = response.xpath('//div[@class="news-dev"]/@data-id')

        attr = re.findall(r"sectionID=(\w+)", response.url)

        if date_str and attr:
            sectionID = attr[0]
            date_str = date_str.extract()
            url_date = date_str[len(date_str)-1]
            newUrl = "https://m.youm7.com/Section/NewsSectionPaging?lastid="+url_date+"&sectionID="+str(sectionID)
            youm['url'] = newUrl
            yield Request(newUrl, callback=self.parse_item)

        title_str = response.xpath('//title/text()')

        content_str = response.xpath('//div[@class="text-cont"]//div[@id="articleBody"]//p/text()')

        type_str = response.xpath('//div[@class="container"]//div[@class="breadcumb"]//a/text()')  # category from the breadcrumb menu

        if content_str and title_str:
            content = ""
            for s in content_str.extract():
                content += s

            youm['title'] = title_str.extract()[0]
            youm['content'] = content
            youm['str_size'] = len(content)
            youm['type'] = type_str.extract()[1]  # take the category from the detail page

        yield youm
Example No. 11
    def parse_schedule(self,response):
        """
        Parse the schedule for a subject and term 

        Grab all the links for each section to get CRNs and enrollment info               
        """
        hxs = HtmlXPathSelector(response)
        links = hxs.select('//th[@class="ddtitle"]/a/@href').extract()

        for l in links:
            url = 'https://www.uvic.ca'+l
            request = Request(url,callback=self.parse_section)      
            request.meta['item'] = response.meta['item']
            yield request
Example No. 12
    def parse(self,response):
        """
        Parses only the first page of the dynamic class catalog. 
        
        Extracts the available terms from the select box and generates 
        requests for the search pages for each term. These requests are handled
        by the parse_term method
        """
        hxs = HtmlXPathSelector(response)
        
        # get term dates from the options in a select box
        terms = hxs.select('//select[@id="term_input_id"]/child::option').select('@value').extract() 
        
        # eliminate the entry corresponding to None, and remove terms that are
        # too old
        def is_valid_term(term):
            current_year = datetime.now().year
            if len(term) == 0:
                return False
            elif (current_year - int(term[0:4])) > 4:
                return False
            else:
                return True
                
        terms = [t for t in terms if is_valid_term(t)]
        
        self.log('Got terms: '+str(terms))        
        self.terms = terms
        
        # get the complete class listings
        url = self.classlist_url_template.format(term = self.terms[0])
        request = Request(url,callback=self.parse_classlist_search)
        yield request
        
        # get the schedule pages for each term and subject
#        for term in self.terms:
#            for subject in self.subjects:
#                url = self.schedule_url_template.format(term=term,subject=subject,number='')
#                item = ScheduleItem()
#                item['term'] = term
#                item['subject'] = subject
#                request = Request(url,callback=self.parse_schedule)
#                request.meta['item'] = item
#                yield request

        for term in terms:
            term_url = 'https://www.uvic.ca/BAN2P/bwckgens.p_proc_term_date?p_calling_proc=bwckschd.p_disp_dyn_sched&p_term='+term
            request = Request(term_url,callback=self.parse_term)
            request.meta['term'] = term
            yield request
Example No. 13
 def getReposters(self,response):
     json_data = json.loads(response.text)
     data = json_data['data']
     max = data['max']
     reposts_data = data['data']
     pattern = r'page=(\d+)'
     result = re.search(pattern,response.url)
     page_id = result.group(1)
     for item in reposts_data:
         items=self.putItem(item)
         usrdetail_url = base_usrdetail_url % (items['usr_id'])
         yield Request(url=usrdetail_url, meta={"item": items}, callback=self.getUsrDetail)
     if int(page_id) < int(max):
         reposts_url = re.sub(pattern,'page='+str(int(page_id)+1),response.url)
         yield Request(reposts_url,callback=self.getReposters)
Example No. 14
    def parse(self, response):
        # URLs on the current page
        post_nodes = response.css("div.list_body_box1 .art_tit a")
        for post_node in post_nodes:
            # extract the post URL
            post_url = post_node.css("::attr(href)").extract_first("")
            yield Request(url=parse.urljoin(response.url, post_url),
                          callback=self.parse_detail)

        # extract the next page and hand it to scrapy to download
        # (the original marked this as unfinished; follow the href of the pager link)
        next_url = response.css(
            'div.list_body_box1 .pagingNormal a::attr(href)').extract_first("")
        if next_url:
            yield Request(url=parse.urljoin(response.url, next_url),
                          callback=self.parse)
Example No. 15
    def parse(self, response):
        self.log("正在解析第{}页".format(self.current_page))

        no_data = response.xpath(".//div[@class='ico_list']/div[@class='no_data']")
        if no_data or self.current_page > self.max_page:
            self.log("no data = {}".format(no_data))
            self.log("没有数据或超过指定页,爬虫退出!最大爬取页为:{}".format(self.max_page))
            return

        uris = response.xpath(".//div[@class='content']/a/@href").extract()
        for uri in uris:
            yield Request(self.domains + uri, self.parse_detail)

        self.current_page += 1
        yield Request(self.base_url.format(self.current_page), self.parse)
Example No. 16
    def parse(self, response):
        # URLs on the current page
        post_nodes = response.css("ul.listbox li a")
        for post_node in post_nodes:
            # extract the post URL
            post_url = post_node.css("::attr(href)").extract_first("")
            yield Request(url=parse.urljoin(response.url, post_url),
                          callback=self.parse_detail)

        # extract the next page and hand it to scrapy to download
        next_url = response.css(
            "span.next_page a::attr(href)").extract_first("")
        if next_url:
            yield Request(url=parse.urljoin(response.url, next_url),
                          callback=self.parse)
Example No. 17
 def parse(self, response):
     original_key = response.url.split('/')[-1]
     key = urllib.unquote(original_key.split('?')[0])
     texts = response.xpath('//tbody/tr/td/text()').extract()
     filename = os.getenv('RESULT_PATH')
     texts = [t.encode('utf-8') for t in texts if '\n' not in t]
     merged_texts = []
     for i in xrange(0, len(texts)):
         index = i % 2
         if index == 0:
             merged_texts.append(texts[i] + texts[i + 1] + '\n')
     # print 'lines num:', len(merged_texts)
     # not_200_path = os.getenv('NOT_200')
     # if response.status != 200:
     #     with open(key+'\t'+str(len(set(merged_texts)))+'\n')
     legacy_file_path = os.getenv('LEGACY_PATH')
     if len(merged_texts) == 100:
         with open(legacy_file_path, 'a') as legacy_file:
             legacy_file.write(key + '\n')
     with open(filename, 'a') as f:
         f.write(key + '\t' + str(len(set(merged_texts))) + '\n')
     if len(merged_texts) > 0:
         detail_urls = response.xpath('//tbody/tr/td/a/@href').extract()
         for d in detail_urls:
             print "detail url is %s \n" % d
             yield Request(url='http://bcc.blcu.edu.cn{0}'.format(d),
                           meta={
                               'dont_filter': True,
                               'dont_merge_cookies': True
                           },
                           callback=self.parse_detail)
Example No. 18
    def parse(self, response):
        item = {}
        imgurl = response.xpath(
            "//div[@id='waterfall']/div[@class='pin wfc wft']/a/img/@src"
        ).extract()
        for i in range(len(imgurl)):
            item["name"] = self.title
            item["imgurl"] = imgurl[i]

            item["imgherf"] = response.xpath(
                "//div[@id='waterfall']/div[@class='pin wfc wft']/a/@href"
            ).extract()[i]

            item["imgvisit"] = response.xpath(
                "//div[@id='waterfall']/div[@class='pin wfc wft']/p/span[@class='repin']/text()"
            ).extract()[i]
            try:
                item["imglike"] = response.xpath(
                    "//div[@id='waterfall']/div[@class='pin wfc wft']/p/span[@class='like']/text()"
                ).extract()[i]
            except Exception as e:
                item["imglike"] = "0"
            try:
                item["imgdiscrit"] = response.xpath(
                    "//div[@id='waterfall']/div[@class='pin wfc wft']/p[@class='description']/text()"
                ).extract()[i]
            except Exception as e:
                item["imgdiscrit"] = ""
            yield item
        for i in range(4):
            yield Request(url=response.url,
                          callback=self.next,
                          meta={"page": "2"},
                          dont_filter=True)
Example No. 19
 def parse(self, response):
     res = Selector(response)
     totalcount = res.xpath(
         '/html/body/div/div[3]/@totalcount').extract()[0]
     totalpage = int(int(totalcount) / 15 + 1)
     for i in range(1, totalpage + 1):
         url = response.url + '?page.pageNo=' + str(i)
         yield Request(url, callback=self.parse_page)
Example No. 20
 def parse(self, response):
     data = json.loads(response.body)['result']['products'].values()
     for i in data:
         item = Book_Product()
         item['title'] = i['title']
         item['subtitle'] = i['subtitle']
         item['uid'] = i['permanentProductPageUrl'].split('/')[-1].split('?')[0]
         item['fsp'] = i['fsp']
         item['mrp'] = i['mrp']
         self.items.append(item)
     print len(self.items)
     self.count += 10
     if self.count > 100:
         # < Python 3.3 doesn't allows mixing of return and yield statements in same function.
         # So, we yield another method self.return_data which then returns the result.
         yield Request("http://www.flipkart.com/m/store/buk/loadmore?store=buk&start=%d" % self.count, self.return_data)
     yield Request("http://www.flipkart.com/m/store/buk/loadmore?store=buk&start=%d" % self.count, self.parse)
Example No. 21
 def parse_link(self, response):
     based_url = "http://blog.csdn.net"
     soup = BeautifulSoup(response.body, 'html.parser')
     blog = soup.find_all("div", "list_item article_item")
     for item in blog:
         # print item.find("span", "link_title").find("a").get("href"), item.find("span", "link_title").find("a").get_text()
         href = based_url + item.find("span",
                                      "link_title").find("a").get("href")
         yield Request(href, callback=self.parse_get_blog_title)
Example No. 22
 def parse(self, response):
     res = Selector(response)
     totalcount = res.xpath('/html/body/script').re('pageCount": .*,')[0]
     pages = int(re.findall('.*(.\d).*', totalcount)[0])
     for i in range(1, pages + 1):
         if i == 1:
             url = response.url
         else:
             url = self.page_url.format(str(i))
         yield Request(url, callback=self.parse_page)
Example No. 23
 def parse_products(self, response):
     brand_name = ''.join(response.xpath(
         '//p[contains(@class, "category-image")]/img/@title').extract())
     products = response.xpath(
         '//ul[contains(@class, "products-grid")]//*[@class="product-name"]/a/@href').extract()
     for url in products:
         yield Request(response.urljoin(url),
                       callback=self.parse_product,
                       meta={'brand': brand_name})
Example No. 24
    def parse(self, response):
        """ parse first response
        """
        if self.spider_config['url_type'] == 'list_page':
            sel = Selector(response)
            box = sel.xpath(self.spider_config['list_xpath'])
            for x in box:
                item = DynamicItem(self.spider_config['item'])
                for key, value in self.spider_config['xpath']['keys'].iteritems():
                    result = x.xpath(value).extract()
                    if len(result) == 1:
                        # single value
                        item[key] = result[0]
                    else:
                        item[key] = result

                # construct follow request if configured
                if self.spider_config['xpath']['follow'] is not None:
                    # more to follow
                    follow_config = self.spider_config['xpath']['follow']
                    if len(follow_config['follow_info']['url'].keys()) >= 2:
                        # needs string formation
                        arguments = dict()
                        for key, value in follow_config['follow_info']['url'].iteritems():
                            # construct arguments
                            if not key == 'base_url':
                                arguments[key] = item[value]
                        url = follow_config['follow_info']['url']['base_url'].format(**arguments)
                    else:
                        url = follow_config['follow_info']['url']['base_url']

                    request = Request(url, callback=self.parse_follow)
                    request.meta['item'] = item
                    request.meta['config'] = follow_config

                    yield request
                else:
                    # no follow request, so save the item
                    get_model('barrow', 'SpiderResult').objects.add_result(spider_task=self.spider_task,
                                                                           item=item,
                                                                           unique=self.spider_config['unique_result'],
                                                                           unique_keys=self.spider_config[
                                                                               'unique_keys'])
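The parse() above is driven entirely by self.spider_config, whose structure is not shown in the source. A hypothetical config consistent with the keys the code reads could look like this (all names and URLs are illustrative assumptions):

# Hypothetical spider_config matching the lookups in parse() above; the real
# schema lives wherever the spider task loads its configuration from.
spider_config = {
    'url_type': 'list_page',
    'list_xpath': '//div[@class="entry"]',          # one node per item
    'item': 'NewsItem',                             # argument for DynamicItem()
    'unique_result': True,
    'unique_keys': ['slug'],
    'xpath': {
        'keys': {                                   # item field -> relative xpath
            'title': './/h2/text()',
            'slug': './/a/@data-slug',
        },
        'follow': {
            'follow_info': {
                'url': {
                    # two or more keys, so base_url is formatted with item fields
                    'base_url': 'http://example.com/detail/{slug}',
                    'slug': 'slug',                 # item field used as the argument
                },
            },
        },
    },
}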
Example No. 25
    def parse(self, response):
        # items = []
        hxs = HtmlXPathSelector(response)
        courses = hxs.select('//td[@class="nttitle"]/a')
        for c in courses:
            item = Course()

            url = c.select("@href").extract()[0]
            data = dict(e.split("=") for e in url.split("?")[1].split("&"))
            desc = c.select("text()").extract()[0].split(" - ")[1].strip()
            item["url"] = url
            item["number"] = data["crse_numb_in"]
            item["department"] = data["subj_code_in"]
            item["desc"] = desc

            request = Request("https://www.uvic.ca" + url, callback=self.parse_details)
            request.meta["item"] = item

            yield request
Example No. 26
    def parse_city_info(self, response):

        ct = Selector(response)

        # get the total number of pages
        total_pages = ct.xpath(
            '//*[@id="citylistpagination"]/div/a[7]/@data-page').extract()[0]

        for page in range(1, int(total_pages) + 1):
            yield Request(self.cities_url.format(page=page),
                          callback=self.parse)
Example No. 27
    def parse(self, response):
        pageinformation = response.xpath('//*[@id="threadlisttableid"]')
        hxs = HtmlXPathSelector(response)
        march_re = r'">\s*(.*)\<'

        #for eachstudent in pageinformation:
        item = AdmissionInformation()
        item['admission_time'] = hxs.xpath(
            '//*[contains(@id, "normalthread")]/tr/th/span/font[1]').re(
                r'">\s*(.*)\<')
        item['gre'] = hxs.xpath(
            '//*[contains(@id, "normalthread")]/tr/th/span/font[3]').re(
                r': \s*(.*)\</font>')
        item['gpa'] = hxs.xpath(
            '//*[contains(@id, "normalthread")]/tr/th/span/font[5]').re(
                r'">\s*(.*)\<')
        item['undergrad_school'] = hxs.xpath(
            '//*[contains(@id, "normalthread")]/tr/th/span/font[6]').re(
                r'>(.*)\</font>')
        item['major'] = hxs.xpath(
            '//*[contains(@id, "normalthread")]/tr/th/span/font[4]').re(
                r'color="green">\s*(.*)\<')
        item['english_grade'] = hxs.xpath(
            '//*[contains(@id, "normalthread")]/tr/th/span/font[2]').re(
                r'>:\s*(.*)\<')
        item['year'] = hxs.xpath(
            '//*[contains(@id, "normalthread")]/tr/th/span/u/font[1]').re(
                r'\[(.*)\<')
        item['admission_type'] = hxs.xpath(
            '//*[contains(@id, "normalthread")]/tr/th/span/u/font[2]').re(
                r'">\s*(.*)\<')
        item['admission_school'] = hxs.xpath(
            '//*[contains(@id, "normalthread")]/tr/th/span/u/font[5]').re(
                r'">\s*(.*)\<')
        item['admission_major'] = hxs.xpath(
            '//*[contains(@id,"normalthread")]/tr/th/span/u/font[4]/b').re(
                r'<b>\s*(.*)\<')
        item['title'] = hxs.xpath(
            '//*[contains(@id,"normalthread")]/tr/th/a[2]/text()').extract()
        item['status'] = hxs.xpath(
            '//*[contains(@id,"normalthread")]/tr/th/span/u/font[3]/b').re(
                '<b>\s*(.*)\<')
        links = hxs.xpath('//*[contains(@id,"normalthread")]/tr/th/a[2]').re(
            r'href\="([^\"]*)\"')
        urls_real = []
        for each in links:
            urls_real.append(each.replace('&amp;', '&'))
            #print('url is:' + each.replace('&amp;','&'))
        item['link'] = urls_real

        yield item
        next_url = self.get_next_url(response.url)
        if next_url is not None:
            yield Request(next_url)
Example No. 28
 def parse(self, response):
     """通过 xpath 获取热门电子书的链接"""
     sel = Selector(response)
     sites = sel.xpath(
         '//div[@class="section ebook-area"]//ul[@class="list-col list-col5"]/li//div[@class="title"]'
     )
     for site in sites:
         title = site.xpath('a/@title').extract()
         link = site.xpath('a/@href').extract()
         title, link = title[0], link[0]
         # print title, link
         yield Request(url=link, callback=self.parse2)
Example No. 29
 def parse_item(self, response):
     #print(response.url)
     item = MzituScrapyItem()
     item['url'] = response.url
     title = response.xpath('//h2[@class="main-title"]/text()').extract()[0]
     item['name'] = title
     max_num = response.xpath('//div[@class="pagenavi"]/a[last()-1]/span/text()').extract()[0]
     for i in range(1,int(max_num)):
         page_url = response.url+"/"+str(i)
         yield Request(page_url,callback= self.get_image_url)
     item['image_urls'] = self.img_urls
     yield item
Example No. 30
 def parse(self, response):
     based_url = "http://blog.csdn.net"
     list_result = ["http://blog.csdn.net/Temanm/article/list/1"]
     soup = BeautifulSoup(response.body, 'html.parser')
     pages = soup.find("div",
                       "list_item_new").find("div",
                                             "pagelist").find_all("a")
     for i in range(len(pages)):
         href = based_url + pages[i].get("href")
         if href not in list_result:
             list_result.append(href)
     for link in list_result:
         yield Request(link, callback=self.parse_link)
Example No. 31
    def parse(self, response):
        # URLs on the current page
        post_nodes = ((response.css("div.div_list .div_item .div_title a")) or
                      (response.css("div.st_div .div_item .div_itemtitle a")))

        for post_node in post_nodes:
            # extract the post URL
            post_url = post_node.css("::attr(href)").extract_first("")
            yield Request(url=parse.urljoin(response.url, post_url),
                          callback=self.parse_detail)

        # extract the next page and hand it to scrapy to download
        next_url = response.css(
            "div.myp2c_div_paging  ::attr(href)").extract_first("")
        # (not implemented) the next page is driven by javascript; parsing that
        # call is the key to extracting the following page automatically, e.g.
        #     javascript:fn_loaditems_id_6a4e96a3_7f4b_46f4_b383_5c6b27673ec3(2)
        # t_url = response.css("div.myp2c_div_paging  ::attr(href)").extract_first("")
        # next_url = document.execCommand(t_url)

        if next_url:
            yield Request(url=parse.urljoin(response.url, next_url),
                          callback=self.parse)
Example No. 32
    def parse(self, response):

        pattern = r'var \$render_data = \[((.|\s)*?})\]'
        raw_data = re.search(pattern, response.text)
        raw_data = raw_data.group(1)
        json_data = json.loads(raw_data)

        status = json_data['status']
        items = self.putItem(status)
        yield items
        if status['reposts_count']:
            reposts_url = base_reposts_url % (status['id'], str(1))
            yield Request(reposts_url, callback=self.getReposters)
Example No. 33
 def start_requests(self):
     keys = self._get_search_keys()
     for k in keys:
         yield Request(
             url='http://bcc.blcu.edu.cn/zh/search/0/{0}'.format(
                 urllib.quote(k)),
             meta={
                 'dont_filter': True,
                 'dont_merge_cookies': True
             }
             # 'dont_redirect': True,
             # 'handle_httpstatus_list': [302]}
         )
Example No. 34
 def parse_details(self, response):
     """
     Parse class prerequisites.
     """
     hxs = HtmlXPathSelector(response)
     prereqs = hxs.select("//span[text()='Faculty']/following-sibling::text() | //span[text()='Faculty']/following-sibling::a")
     self.log("parsing "+response.url,level=DEBUG)
     self.log('prereqs = '+str(prereqs),level = DEBUG)
     prereqs = self.parse_prereqs(prereqs)
     self.log('parsed prereqs = '+str(prereqs),level = DEBUG)
     
     item = response.meta['item']
     item['prereqs'] = prereqs
     
     yield item
     
     calendar_url = self.calendar_url_template.format(subject=item['subject'],
                                                      number=item['number'])
     request = Request(calendar_url,callback=self.parse_calendar)
     request.meta['handle_httpstatus_list'] = [404]
     request.meta['subject'] = item['subject']
     request.meta['number'] = item['number']
     yield request
Example No. 35
    def parse_term(self,response):
        """
        Parses the schedule search page for a particular term.
        
        Extracts the subject list from the first select box and generates
        requests for the schedule for each subject in the current term. These
        requests are handled by the parse_schedule callback.
        """
        hxs = HtmlXPathSelector(response)
        if not TEST_RUN:
            subjects = hxs.select('//select[@id="subj_id"]/child::option').select('@value').extract()
        else:
            subjects = TEST_SUBJECTS
        term = response.meta['term']
#        self.log('Got subjects; '+str(subjects))        
        
        for subj in subjects:
            url = self.schedule_url_template.format(term=term,subject=subj,number='')
            item = ScheduleItem()
            item['term'] = term
            item['subject'] = subj
            request = Request(url,callback=self.parse_schedule)
            request.meta['item'] = item
            yield request
Example No. 36
    def parse_city(self, response):

        ct = Selector(response)

        # get the link to the city guide page
        gonglve_link = ct.xpath('//*[@class="navbar-btn"]/@href').extract()[0]
        # get the city name
        city_name = response.meta.get('name')

        yield Request(self.domains_url + gonglve_link,
                      callback=self.gong_lve,
                      meta={
                          'name': city_name,
                          'href': gonglve_link
                      })
Example No. 37
 def parse(self, response):
     item = LianjiafItem()
     data = BeautifulSoup(response.text, 'lxml').find_all('li',
                                                          class_='clear')
     for tag in data:
         page_url = response.url
         title = tag.find('div', class_='title').get_text()
         url = tag.div.find('a', attrs={'data-el': 'ershoufang'})['href']
         type = tag.find('div', class_='houseInfo').get_text()
         price = tag.find('div',
                          class_='totalPrice').get_text().replace('万', '')
         for field in item.fields:
             item[field] = eval(field)
         yield item
     page = response.xpath('//div[@comp-module="page"]').re(
         'lPage\"\:(\d+)')[0]
     for u in range(1, int(page) + 1):
         urls = 'https://bj.lianjia.com/ershoufang/pg{}'.format(u)
         yield Request(urls, callback=self.parse)
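A side note on the last example: `item[field] = eval(field)` fills the item from same-named local variables. A sketch of an explicit mapping that avoids eval, assuming the item fields match the locals computed in the loop above:

# Sketch only: collect the loop-body locals into a dict and copy the fields the
# item actually declares, instead of eval()-ing field names.
values = {
    'page_url': page_url,
    'title': title,
    'url': url,
    'type': type,
    'price': price,
}
for field in item.fields:
    item[field] = values.get(field)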