def getReposters(self, response):

        pattern = r'page=(\d+)'
        result = re.search(pattern, response.url)
        page_id = result.group(1)
        try:
            json_data = json.loads(response.text)
            data = json_data['data']
            reposts_data = data['data']

            if int(page_id) == 1:
                self.max = data['max']

            for item in reposts_data:
                items = self.putItem(item)
                time_url = base_mblog_url % (items['mblog_id'])
                yield Request(url=time_url,
                              meta={"item": items},
                              callback=self.get_accurate_time)
        except (ValueError, KeyError):
            # malformed or unexpected response; skip this page's reposts
            pass

        if int(page_id) < int(self.max):
            reposts_url = re.sub(pattern, 'page=' + str(int(page_id) + 1),
                                 response.url)
            yield Request(reposts_url, callback=self.getReposters)
Example #2
    def start_requests(self):
        for row in self.tyre_sizes:
            if self.check_row_is_processed(row):
                continue

            self.add_row_to_history(row)

            meta = {'row': row}
            xl = ''
            if row['XL'] == 'XL':
                xl = 'Y'
                meta['xl'] = True

            run_flat = ''
            if row['Run Flat'] == 'RF':
                run_flat = 'Y'
                meta['run_flat'] = True

            url = ('http://www.point-s.co.uk/tyres?s=&width=' + row['Width'] +
                   '&profile=' + row['Aspect Ratio'] + '&size=' + row['Rim'] +
                   '&speed=' + row['Speed rating'] +
                   '&paginate=true&runflat=' + run_flat + '&extra_load=' + xl)
            yield Request(url, dont_filter=True, meta=meta)

            if row['Alt Speed']:
                url = ('http://www.point-s.co.uk/tyres?s=&width=' + row['Width'] +
                       '&profile=' + row['Aspect Ratio'] + '&size=' + row['Rim'] +
                       '&speed=' + row['Alt Speed'] +
                       '&paginate=true&runflat=' + run_flat + '&extra_load=' + xl)
                yield Request(url, dont_filter=True, meta=meta)
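A side note on the URL building above: under Python 3, the same query string could be assembled with urllib.parse.urlencode instead of manual concatenation. A minimal sketch (build_tyre_url and its defaults are illustrative, not part of the original spider):

from urllib.parse import urlencode

def build_tyre_url(row, run_flat='', xl=''):
    """Sketch: build the point-s.co.uk tyre search URL from a CSV row."""
    params = {
        's': '',
        'width': row['Width'],
        'profile': row['Aspect Ratio'],
        'size': row['Rim'],
        'speed': row['Speed rating'],
        'paginate': 'true',
        'runflat': run_flat,
        'extra_load': xl,
    }
    # urlencode handles the percent-quoting of each value.
    return 'http://www.point-s.co.uk/tyres?' + urlencode(params)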
Example #3
 def parse(self, response):
     """docstring for parse"""
     yield Request('http://www.ag.senate.gov/hearings', self.parse_data)
     for links in range(2, 10):
         yield Request(
             'http://www.ag.senate.gov/hearings?PageNum_rs=' + str(links),
             self.parse_data)
Example #4
    def parse(self, response):
        sel = Selector(response)

        # Pages from 1998 onwards, new format
        # These normally cover around a 2-6 year period
        proceedings_menu = sel.xpath(
            '//a[starts-with(text(),"Official Record of Proceedings")]/@href')
        if proceedings_menu:
            for url in proceedings_menu.extract():
                absolute_url = urlparse.urljoin(response.url, url.strip())
                req = Request(absolute_url,
                              callback=self.parse_hansard_index_page)
                yield req

        # Former Legislative Council (before 7/1997)
        table = sel.xpath(
            "//h3[contains(text(),'Former Legislative Council (before 7/1997)')]/following::table[1]"
        )
        if table:
            links = table[0].xpath(
                ".//td/a[contains(text(),'Session')]/@href").extract()
            if links:
                for url in links:
                    absolute_url = urlparse.urljoin(response.url, url.strip())
                    req = Request(absolute_url,
                                  callback=self.parse_hansard_index_page)
                    yield req
Example #5
    def parse(self, response):
        pattern = r'var \$render_data = \[((.|\s)*?})\]'
        raw_data = re.search(pattern, response.text)
        raw_data = raw_data.group(1)
        json_data = json.loads(raw_data)
        status = json_data['status']
        # If this is a repost, follow the original microblog instead and stop
        # here. A plain ``return Request(...)`` inside a generator would be
        # discarded by Scrapy, so the request is yielded first.
        if 'retweeted_status' in status:
            mblog_id = status['retweeted_status']['id']
            mblog_url = base_mblog_url % (mblog_id)
            yield Request(mblog_url, callback=self.getBlog)
            return

        items = self.putItem(status)
        usrdetail_url = base_usrdetail_url % (items['usr_id'])
        yield Request(url=usrdetail_url, meta={"item": items}, callback=self.getUsrDetail)

        if status['reposts_count']:
            reposts_url = base_reposts_url % (status['id'], str(1))
            yield Request(reposts_url, callback=self.getReposters)
Example #6
    def parse(self, response):
        for productxs in response.xpath(
                '//div[contains(@class, "products-list")]//div[@data-product]'
        ):
            yield Request(productxs.xpath(
                './/a[@class="product-card-link"]/@href').extract()[0],
                          callback=self.parse_product)

        next_page = response.xpath('//link[@rel="next"]/@href').extract()
        if next_page and 'Page.Next.Link' not in next_page[0]:
            yield Request(response.urljoin(next_page[0]))
Example #7
    def parse(self, response):
        item = BaiduItem()
        soup = BeautifulSoup(response.text, 'html.parser')

        title = soup.select('.lemmaWgt-lemmaTitle-title h1')[0].text
        subhead = soup.select('.lemmaWgt-lemmaTitle-title h2')
        if len(subhead) != 0:
            # print(subhead[0].text)
            title = title + subhead[0].text
        item['title'] = title

        info_list = soup.select('.lemma-summary div')
        info = ''
        for temp in info_list:
            # collect the text content
            info += temp.text
            # if there are hyperlinks, follow them and keep crawling
            a_list = temp.select('a')
            if len(a_list) != 0:
                for a in a_list:
                    if a.has_attr('href'):
                        yield Request(self.base_url + a['href'], headers=self.headers)
        item['info'] = info

        properties_list = soup.select('.basicInfo-block dt')
        properties = ''
        for pro in properties_list:
            properties += '###' + pro.text.strip().replace('\n', '')
            # if there are hyperlinks, follow them and keep crawling
            a_list = pro.select('a')
            if len(a_list) != 0:
                for a in a_list:
                    if a.has_attr('href'):
                        yield Request(self.base_url + a['href'], headers=self.headers)
        item['properties'] = properties

        values_list = soup.select('.basicInfo-block dd')
        values = ''
        for val in values_list:
            values += '###' + val.text.strip().replace('\n', '')
            # if there are hyperlinks, follow them and keep crawling
            a_list = val.select('a')
            if len(a_list) != 0:
                for a in a_list:
                    if a.has_attr('href'):
                        yield Request(self.base_url + a['href'], headers=self.headers)
        item['values'] = values

        if len(soup.select('.summary-pic img')) != 0:
            item['img'] = soup.select('.summary-pic img')[0]['src']

        print(item['title'])

        yield item
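Example #8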
    def parse_item(self, response):

        m2 = hashlib.md5()
        # hashlib works on bytes, so encode the URL before hashing
        m2.update(response.url.encode('utf-8'))

        youm = Myoum7Item()
        youm['url'] = response.url
        youm['page_name'] = m2.hexdigest()
        youm['do_main'] = 'm.youm7.com'


        # Header values are bytes and the Referer may be absent, so guard and decode.
        referer = response.request.headers.get('Referer', b'').decode('utf-8')

        reattr = re.findall(r"sectionID=(\w+)", referer)
        if reattr:
            youm['url'] = referer
            yield Request(referer, callback=self.parse_item)

        date_str = response.xpath('//div[@class="news-dev"]/@data-id')

        attr = re.findall(r"sectionID=(\w+)", response.url)

        if date_str and attr:
            sectionID = attr[0]
            date_str = date_str.extract()
            url_date = date_str[len(date_str)-1]
            newUrl = "https://m.youm7.com/Section/NewsSectionPaging?lastid="+url_date+"&sectionID="+str(sectionID)
            youm['url'] = newUrl
            yield Request(newUrl, callback=self.parse_item)

        title_str = response.xpath('//title/text()')

        content_str = response.xpath('//div[@class="text-cont"]//div[@id="articleBody"]//p/text()')

        type_str = response.xpath('//div[@class="container"]//div[@class="breadcumb"]//a/text()')  # category from the breadcrumb menu

        if content_str and title_str:
            content = "".join(content_str.extract())

            youm['title'] = title_str.extract()[0]
            youm['content'] = content
            youm['str_size'] = len(content)
            youm['type'] = type_str.extract()[1]  # category taken from the detail page

        yield youm
Example #9
 def getReposters(self, response):
     json_data = json.loads(response.text)
     data = json_data['data']
     max_page = data['max']
     reposts_data = data['data']
     pattern = r'page=(\d+)'
     result = re.search(pattern, response.url)
     page_id = result.group(1)
     for item in reposts_data:
         items = self.putItem(item)
         usrdetail_url = base_usrdetail_url % (items['usr_id'])
         yield Request(url=usrdetail_url, meta={"item": items}, callback=self.getUsrDetail)
     if int(page_id) < int(max_page):
         reposts_url = re.sub(pattern, 'page=' + str(int(page_id) + 1), response.url)
         yield Request(reposts_url, callback=self.getReposters)
Example #10
    def parse(self, response):
        # URLs used on the current page
        post_nodes = response.css("div.list_body_box1 .art_tit a")
        for post_node in post_nodes:
            # extract each post URL
            post_url = post_node.css("::attr(href)").extract_first("")
            yield Request(url=parse.urljoin(response.url, post_url),
                          callback=self.parse_detail)

        # Extract the next page and hand it to Scrapy for download.
        # The original flagged this step as unfinished; here the href is pulled
        # from the same paging selector and resolved against the response URL.
        next_url = response.css(
            'div.list_body_box1 .pagingNormal a::attr(href)').extract_first("")
        if next_url:
            yield Request(url=parse.urljoin(response.url, next_url),
                          callback=self.parse)
Example #11
    def parse(self, response):
        self.log("正在解析第{}页".format(self.current_page))

        no_data = response.xpath(".//div[@class='ico_list']/div[@class='no_data']")
        if no_data or self.current_page > self.max_page:
            self.log("no data = {}".format(no_data))
            self.log("没有数据或超过指定页,爬虫退出!最大爬取页为:{}".format(self.max_page))
            return

        uris = response.xpath(".//div[@class='content']/a/@href").extract()
        for uri in uris:
            yield Request(self.domains + uri, self.parse_detail)

        self.current_page += 1
        yield Request(self.base_url.format(self.current_page), self.parse)
Example #12
    def parse(self, response):
        # URLs used on the current page
        post_nodes = response.css("ul.listbox li a")
        for post_node in post_nodes:
            # extract each post URL
            post_url = post_node.css("::attr(href)").extract_first("")
            yield Request(url=parse.urljoin(response.url, post_url),
                          callback=self.parse_detail)

        # Extract the next page and hand it to Scrapy for download.
        # extract_first() avoids an IndexError on the last page, and the request
        # is issued once per page rather than once per post.
        next_url = response.css(
            "span.next_page a::attr(href)").extract_first("")
        if next_url:
            yield Request(url=parse.urljoin(response.url, next_url),
                          callback=self.parse)
Example #13
 def parse(self, response):
     original_key = response.url.split('/')[-1]
     key = urllib.unquote(original_key.split('?')[0])
     texts = response.xpath('//tbody/tr/td/text()').extract()
     filename = os.getenv('RESULT_PATH')
     texts = [t.encode('utf-8') for t in texts if '\n' not in t]
     merged_texts = []
     for i in xrange(0, len(texts)):
         index = i % 2
         if index == 0:
             merged_texts.append(texts[i] + texts[i + 1] + '\n')
     # print 'lines num:', len(merged_texts)
     # not_200_path = os.getenv('NOT_200')
     # if response.status != 200:
     #     with open(key+'\t'+str(len(set(merged_texts)))+'\n')
     legacy_file_path = os.getenv('LEGACY_PATH')
     if len(merged_texts) == 100:
         with open(legacy_file_path, 'a') as legacy_file:
             legacy_file.write(key + '\n')
     with open(filename, 'a') as f:
         f.write(key + '\t' + str(len(set(merged_texts))) + '\n')
     if len(merged_texts) > 0:
         detail_urls = response.xpath('//tbody/tr/td/a/@href').extract()
         for d in detail_urls:
             print "detail url is %s \n" % d
             # dont_filter is a Request argument, not a meta key
             yield Request(url='http://bcc.blcu.edu.cn{0}'.format(d),
                           meta={'dont_merge_cookies': True},
                           dont_filter=True,
                           callback=self.parse_detail)
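Example #14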
    def parse_comment(self, response):
        comments_json = json.loads(response.body_as_unicode())
        if comments_json['message'] == 'success':
            comments = comments_json['data']['comments']
            if len(comments) > 0:
                items = []
                for comment in comments:
                    item = CommentInfo({
                        'comment': comment['text'],
                        'likes': comment['digg_count'],
                        'time': comment['create_time'],
                        'comment_id': comment['id']
                    })
                    self.copy_article_info(response.meta, item)

                    if comment['reply_count'] > 0:
                        reply_to_comment_url = 'http://www.toutiao.com/api/comment/get_reply/?comment_id=' + str(
                            comment['id']) + '&dongtai_id=' + str(
                                comment['dongtai_id']
                            ) + '&offset=0&count=' + str(
                                comment['reply_count'])
                        reply_request = Request(reply_to_comment_url,
                                                callback=self.parse_reply,
                                                method='GET')
                        self.copy_article_info(response.meta,
                                               reply_request.meta)
                        reply_request.meta['reply_to_id'] = comment['id']
                        yield reply_request

                    items.append(item)
                # ``return items`` inside a generator is discarded by Scrapy,
                # so yield the collected items instead.
                for comment_item in items:
                    yield comment_item
Example #15
    def parse(self, response):
        item = {}
        imgurl = response.xpath(
            "//div[@id='waterfall']/div[@class='pin wfc wft']/a/img/@src"
        ).extract()
        for i in range(len(imgurl)):
            item["name"] = self.title
            item["imgurl"] = imgurl[i]

            item["imgherf"] = response.xpath(
                "//div[@id='waterfall']/div[@class='pin wfc wft']/a/@href"
            ).extract()[i]

            item["imgvisit"] = response.xpath(
                "//div[@id='waterfall']/div[@class='pin wfc wft']/p/span[@class='repin']/text()"
            ).extract()[i]
            try:
                item["imglike"] = response.xpath(
                    "//div[@id='waterfall']/div[@class='pin wfc wft']/p/span[@class='like']/text()"
                ).extract()[i]
            except Exception as e:
                item["imglike"] = "0"
            try:
                item["imgdiscrit"] = response.xpath(
                    "//div[@id='waterfall']/div[@class='pin wfc wft']/p[@class='description']/text()"
                ).extract()[i]
            except Exception as e:
                item["imgdiscrit"] = ""
            yield item
        for i in range(4):
            yield Request(url=response.url,
                          callback=self.next,
                          meta={"page": "2"},
                          dont_filter=True)
Example #16
 def parse(self, response):
     data = json.loads(response.body)['result']['products'].values()
     for i in data:
         item = Book_Product()
         item['title'] = i['title']
         item['subtitle'] = i['subtitle']
         item['uid'] = i['permanentProductPageUrl'].split('/')[-1].split('?')[0]
         item['fsp'] = i['fsp']
         item['mrp'] = i['mrp']
         self.items.append(item)
     print len(self.items)
     self.count += 10
     if self.count > 100:
         # Python < 3.3 doesn't allow mixing return and yield statements in the same
         # function, so we yield a request whose callback, self.return_data, returns the result.
         yield Request("http://www.flipkart.com/m/store/buk/loadmore?store=buk&start=%d" % self.count, self.return_data)
     yield Request("http://www.flipkart.com/m/store/buk/loadmore?store=buk&start=%d" % self.count, self.parse)
Example #17
 def parse(self, response):
     res = Selector(response)
     totalcount = res.xpath(
         '/html/body/div/div[3]/@totalcount').extract()[0]
     totalpage = int(int(totalcount) / 15 + 1)
     for i in range(1, totalpage + 1):
         url = response.url + '?page.pageNo=' + str(i)
         yield Request(url, callback=self.parse_page)
Example #18
 def parse_link(self, response):
     based_url = "http://blog.csdn.net"
     soup = BeautifulSoup(response.body, 'html.parser')
     blog = soup.find_all("div", "list_item article_item")
     for item in blog:
         # print item.find("span", "link_title").find("a").get("href"), item.find("span", "link_title").find("a").get_text()
         href = based_url + item.find("span",
                                      "link_title").find("a").get("href")
         yield Request(href, callback=self.parse_get_blog_title)
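Example #19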
 def parse(self, response):
     res = Selector(response)
     totalcount = res.xpath('/html/body/script').re('pageCount": .*,')[0]
     pages = int(re.findall(r'.*(.\d).*', totalcount)[0])
     for i in range(1, pages + 1):
         # page 1 is the current response; later pages come from the page_url template
         if i == 1:
             url = response.url
         else:
             url = self.page_url.format(str(i))
         yield Request(url, callback=self.parse_page)
Example #20
 def parse_products(self, response):
     brand_name = ''.join(response.xpath(
         '//p[contains(@class, "category-image")]/img/@title').extract())
     products = response.xpath(
         '//ul[contains(@class, "products-grid")]//*[@class="product-name"]/a/@href').extract()
     for url in products:
         yield Request(response.urljoin(url),
                       callback=self.parse_product,
                       meta={'brand': brand_name})
Example #21
    def parse_city_info(self, response):

        ct = Selector(response)

        # get the total number of pages
        total_pages = ct.xpath(
            '//*[@id="citylistpagination"]/div/a[7]/@data-page').extract()[0]

        for page in range(1, int(total_pages) + 1):
            yield Request(self.cities_url.format(page=page),
                          callback=self.parse)
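Example #22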
    def parse(self, response):
        pageinformation = response.xpath('//*[@id="threadlisttableid"]')
        hxs = HtmlXPathSelector(response)
        march_re = r'">\s*(.*)\<'

        #for eachstudent in pageinformation:
        item = AdmissionInformation()
        item['admission_time'] = hxs.xpath(
            '//*[contains(@id, "normalthread")]/tr/th/span/font[1]').re(
                r'">\s*(.*)\<')
        item['gre'] = hxs.xpath(
            '//*[contains(@id, "normalthread")]/tr/th/span/font[3]').re(
                r': \s*(.*)\</font>')
        item['gpa'] = hxs.xpath(
            '//*[contains(@id, "normalthread")]/tr/th/span/font[5]').re(
                r'">\s*(.*)\<')
        item['undergrad_school'] = hxs.xpath(
            '//*[contains(@id, "normalthread")]/tr/th/span/font[6]').re(
                r'>(.*)\</font>')
        item['major'] = hxs.xpath(
            '//*[contains(@id, "normalthread")]/tr/th/span/font[4]').re(
                r'color="green">\s*(.*)\<')
        item['english_grade'] = hxs.xpath(
            '//*[contains(@id, "normalthread")]/tr/th/span/font[2]').re(
                r'>:\s*(.*)\<')
        item['year'] = hxs.xpath(
            '//*[contains(@id, "normalthread")]/tr/th/span/u/font[1]').re(
                r'\[(.*)\<')
        item['admission_type'] = hxs.xpath(
            '//*[contains(@id, "normalthread")]/tr/th/span/u/font[2]').re(
                r'">\s*(.*)\<')
        item['admission_school'] = hxs.xpath(
            '//*[contains(@id, "normalthread")]/tr/th/span/u/font[5]').re(
                r'">\s*(.*)\<')
        item['admission_major'] = hxs.xpath(
            '//*[contains(@id,"normalthread")]/tr/th/span/u/font[4]/b').re(
                r'<b>\s*(.*)\<')
        item['title'] = hxs.xpath(
            '//*[contains(@id,"normalthread")]/tr/th/a[2]/text()').extract()
        item['status'] = hxs.xpath(
            '//*[contains(@id,"normalthread")]/tr/th/span/u/font[3]/b').re(
                r'<b>\s*(.*)\<')
        links = hxs.xpath('//*[contains(@id,"normalthread")]/tr/th/a[2]').re(
            r'href\="([^\"]*)\"')
        urls_real = []
        for each in links:
            urls_real.append(each.replace('&amp;', '&'))
            #print('url is:' + each.replace('&amp;','&'))
        item['link'] = urls_real

        yield item
        next_url = self.get_next_url(response.url)
        if next_url is not None:
            yield Request(next_url)
Example #23
 def parse_item(self, response):
     #print(response.url)
     item = MzituScrapyItem()
     item['url'] = response.url
     title = response.xpath('//h2[@class="main-title"]/text()').extract()[0]
     item['name'] = title
     max_num = response.xpath('//div[@class="pagenavi"]/a[last()-1]/span/text()').extract()[0]
     for i in range(1, int(max_num)):
         page_url = response.url + "/" + str(i)
         yield Request(page_url, callback=self.get_image_url)
     item['image_urls'] = self.img_urls
     yield item
Example #24
 def parse(self, response):
     """通过 xpath 获取热门电子书的链接"""
     sel = Selector(response)
     sites = sel.xpath(
         '//div[@class="section ebook-area"]//ul[@class="list-col list-col5"]/li//div[@class="title"]'
     )
     for site in sites:
         title = site.xpath('a/@title').extract()
         link = site.xpath('a/@href').extract()
         title, link = title[0], link[0]
         # print title, link
         yield Request(url=link, callback=self.parse2)
Example #25
 def parse(self, response):
     based_url = "http://blog.csdn.net"
     list_result = ["http://blog.csdn.net/Temanm/article/list/1"]
     soup = BeautifulSoup(response.body, 'html.parser')
     pages = soup.find("div",
                       "list_item_new").find("div",
                                             "pagelist").find_all("a")
     for i in range(len(pages)):
         href = based_url + pages[i].get("href")
         if href not in list_result:
             list_result.append(href)
     for link in list_result:
         yield Request(link, callback=self.parse_link)
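Example #26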
    def parse(self, response):

        pattern = r'var \$render_data = \[((.|\s)*?})\]'
        raw_data = re.search(pattern, response.text)
        raw_data = raw_data.group(1)
        json_data = json.loads(raw_data)

        status = json_data['status']
        items = self.putItem(status)
        yield items
        if status['reposts_count']:
            reposts_url = base_reposts_url % (status['id'], str(1))
            yield Request(reposts_url, callback=self.getReposters)
Example #27
    def parse(self, response):
        # URLs used on the current page
        post_nodes = ((response.css("div.div_list .div_item .div_title a")) or
                      (response.css("div.st_div .div_item .div_itemtitle a")))

        for post_node in post_nodes:
            # extract each post URL
            post_url = post_node.css("::attr(href)").extract_first("")
            yield Request(url=parse.urljoin(response.url, post_url),
                          callback=self.parse_detail)

            # extract the next page and hand it to Scrapy for download
            next_url = response.css(
                "div.myp2c_div_paging  ::attr(href)").extract_first("")
            # (not implemented) parsing the JavaScript "next page" call is the key to automatically extracting the next page, e.g.
            #     javascript:fn_loaditems_id_6a4e96a3_7f4b_46f4_b383_5c6b27673ec3(2)'
            # t _url = response.css("div.myp2c_div_paging  ::attr(href)").extract_first("")
            # next_url = document.execCommand(t_url)

            if next_url:
                yield Request(url=parse.urljoin(response.url, next_url),
                              callback=self.parse)
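The "next page" href in this spider is a javascript: call rather than a plain link, which is why the original marks the step as not implemented. As a hedged sketch (not from the original project), the page number can at least be pulled out of the javascript:fn_loaditems_...(n) value with a regular expression; the AJAX endpoint it would be sent to is a placeholder here and would have to be read from the site's own fn_loaditems_* JavaScript.

import re

# Placeholder only: not the site's real paging endpoint.
AJAX_PAGE_URL = 'https://example.invalid/loaditems?page={page}'

def next_page_url(response):
    """Sketch: turn a javascript:fn_loaditems_...(n) href into a page URL."""
    href = response.css("div.myp2c_div_paging  ::attr(href)").extract_first("")
    match = re.search(r'\((\d+)\)\s*$', href)  # trailing "(2)" -> page number
    if match:
        return AJAX_PAGE_URL.format(page=match.group(1))
    return None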
Example #28
 def start_requests(self):
     keys = self._get_search_keys()
     for k in keys:
         yield Request(
             url='http://bcc.blcu.edu.cn/zh/search/0/{0}'.format(
                 urllib.quote(k)),
             # dont_filter is a Request argument, not a meta key
             dont_filter=True,
             meta={
                 'dont_merge_cookies': True
             }
             # 'dont_redirect': True,
             # 'handle_httpstatus_list': [302]}
         )
Example #29
    def parse_city(self, response):

        ct = Selector(response)

        # get the link to the city guide (gonglve) page
        gonglve_link = ct.xpath('//*[@class="navbar-btn"]/@href').extract()[0]
        # get the city name
        city_name = response.meta.get('name')

        yield Request(self.domains_url + gonglve_link,
                      callback=self.gong_lve,
                      meta={
                          'name': city_name,
                          'href': gonglve_link
                      })
Example #30
 def parse(self, response):
     item = LianjiafItem()
     data = BeautifulSoup(response.text, 'lxml').find_all('li',
                                                          class_='clear')
     for tag in data:
         page_url = response.url
         title = tag.find('div', class_='title').get_text()
         url = tag.div.find('a', attrs={'data-el': 'ershoufang'})['href']
         type = tag.find('div', class_='houseInfo').get_text()
         price = tag.find('div',
                          class_='totalPrice').get_text().replace('万', '')
         for field in item.fields:
             item[field] = eval(field)
         yield item
     page = response.xpath('//div[@comp-module="page"]').re(
         r'lPage":(\d+)')[0]
     for u in range(1, int(page) + 1):
         urls = 'https://bj.lianjia.com/ershoufang/pg{}'.format(u)
         yield Request(urls, callback=self.parse)