Example #1
    def roadwork_data_parse(self, response):
        jdata = json.loads(response.body)
        events = jdata[u'data']

        for event in events:
            if len(event) < 1:  # skip empty records before touching their keys
                continue

            item = MyspiderItem()
            enc_event_text = event[u'remark'].strip().replace(
                '\n', ' ').replace('\r', '').encode('utf-8')

            X = event[u'coor_x']
            Y = event[u'coor_y']

            item['ID'] = event[u'eventid']
            item['POSTFROM'] = u'福建高速公路'
            item['CONTENT'] = enc_event_text
            item['TITLE'] = u'LOCATION AT {' + str(Y) + u' ,' + str(X) + u'}'
            item['DIRECTION'] = (event[u'occplace'] + event[u'startnodename'] +
                                 u'-' + event[u'endnodename']).encode('utf-8')
            item['POSTDATE'] = (event[u'intime']).encode('utf-8')
            item['EVENTTYPE'] = u'路况施工'
            item['START_TIME'] = (event[u'occtime']).encode('utf-8')
            item['END_TIME'] = (event[u'planovertime'])
            item['COLLECTDATE'] = datetime.datetime.today().strftime(
                '%Y-%m-%d')
            item['REF'] = self.start_urls[0][-1:] + '4'
            yield item
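
Every snippet in this listing fills a MyspiderItem whose definition is not shown. A minimal sketch of what the item class behind Example #1 might look like, assuming the field names used there (hypothetical; the real project may declare more fields):

    import scrapy


    class MyspiderItem(scrapy.Item):
        # hypothetical field declarations inferred from Example #1
        ID = scrapy.Field()
        POSTFROM = scrapy.Field()
        CONTENT = scrapy.Field()
        TITLE = scrapy.Field()
        DIRECTION = scrapy.Field()
        POSTDATE = scrapy.Field()
        EVENTTYPE = scrapy.Field()
        START_TIME = scrapy.Field()
        END_TIME = scrapy.Field()
        COLLECTDATE = scrapy.Field()
        REF = scrapy.Field()
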
Example #2
    def parse(self, response):
        logging.info("*" * 100)
        logging.info(u"爬虫开始")
        li_list = response.xpath("//ul[@class='wp-list clearfix']//li")
        for li in li_list:
            urls = []
            small_url = li.xpath(
                ".//div[@class='pic']/a/img/@src").extract_first()
            urls.append(small_url)
            item = MyspiderItem()
            item['image_urls'] = urls
            detail_href = li.xpath(
                ".//div[@class='pic']/a/@href").extract_first().replace(
                    "http", "https")
            yield scrapy.Request(
                url=detail_href,
                callback=self.parse_detail,
                meta={'item': item},
            )
        next_url = response.xpath(u"//a[text()='下一页1']/@href").extract_first()
        if next_url is not None:

            next_url = 'https://meizitu.com/a/' + next_url
            logging.info("*" * 100)
            logging.info("开如睡眠10分钟")
            logging.info(next_url)
            time.sleep(600)
            logging.info("睡眠结束继续爬行")
            logging.info("*" * 100)
            yield scrapy.Request(next_url, callback=self.parse)
        else:
            logging.info("=" * 100)
            logging.info("--------->spider close<---------")
Example #3
 def parse(self, response):
     jrlist = response.xpath("//div[@class='j-r-list']/ul/li")
     # print("************************************************************")
     print(type(jrlist))
     # print("************************************************************")
     for jrlistli in jrlist:
         author = jrlistli.xpath(
             ".//div[@class='j-list-user']//div[@class='u-txt']//a/text()"
         ).get()
         content = jrlistli.xpath(
             ".//div[@class='j-r-list-c']//div[@class='j-r-list-c-desc']//a/text()"
         ).get()
         # print(f"author:{author}")
         # print(f"content:{content}")
         item = MyspiderItem(author=author, content=content)
         # duanzi = {"author":author,"content":content}
         # print(duanzi)
         print(item)
         yield item
     next_url = response.xpath(
         "//div[@class='j-page']//a[@class='pagenxt']/@href").get()
     if next_url is None:  # no next-page link, stop crawling
         return
     next_url_full = "http://www.budejie.com/text/" + next_url
     print(next_url_full)
     if int(next_url) >= 50:  # the href is the page number; stop after page 50
         return
     yield scrapy.Request(next_url_full,
                          callback=self.parse,
                          dont_filter=True)
Example #4
    def parse_item(self, response):
        item = MyspiderItem()
        print("######################")
        #i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
        #i['name'] = response.xpath('//div[@id="name"]').extract()
        #i['description'] = response.xpath('//div[@id="description"]').extract()
        # return i
        print(response.url)
        selector = Selector(response)
        title = selector.xpath('//h4/em/text()').extract()[0]
        address = selector.xpath('//p/span[@class="pr5"]/text()').extract()[0].strip()
        price = selector.xpath('//*[@id="pricePart"]/div[1]/span/text()').extract()[0]
        lease_type = selector.xpath('//*[@id="introduce"]/li[1]/h6/text()').extract()[0]
        suggestion = selector.xpath('//*[@id="introduce"]/li[2]/h6/text()').extract()[0]
        bed = selector.xpath('//*[@id="introduce"]/li[3]/h6/text()').extract()[0]

        item['title'] = title
        item['address'] = address
        item['price'] = price
        item['lease_type'] = lease_type
        item['suggestion'] = suggestion
        item['bed'] = bed
        item['crawled_time'] = self.crawled_time

        yield item
Example #5
    def fill_in_items(self, response):
        # parse json and fill them into items

        data = json.loads(response.body)
        real_data = data[u'roadEvents'][u'roadEvents']
        strnow = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        for row in real_data:
            # fresh item per event so earlier yields are not mutated
            item = MyspiderItem()
            item['ID'] = row[u'eventId']
            item['ROADNAME'] = row[u'roadName'].encode('utf-8')
            item['COLLECTDATE'] = strnow
            item['EVENTTYPE'] = row[u'eventType']
            item['DIRECTION'] = row[u'dealCase'].encode('utf-8')

            item['START_TIME'] = row[u'occurTime']
            item['END_TIME'] = row[u'endTime']

            item['CONTENT'] = row[u'description'].strip().replace(
                '\n', ' ').replace('\r', '').encode('utf-8')
            item['TITLE'] = (row[u'roadName'] + u'-location at: ' +
                             row[u'lonlatData']).encode('utf-8')
            item['POSTDATE'] = strnow
            item['POSTFROM'] = u'北京市路政局公路出行信息服务站'
            item['REF'] = 'http://glcx.bjlzj.gov.cn/bjglwww/index.shtml'
            yield item
Example #6
 def parse(self, response):
     r = response.xpath('//div[@class="mw-parser-output"]//tr')
     for i in range(len(r)):
         try:
             # fresh item per row so queued requests do not share one mutable item
             pmItem = MyspiderItem()
             rlist = r[i].xpath('td')
             newlist = []
             for j in range(4):
                 if j == 0:
                     templist = []
                     templist.append(
                         rlist[j].xpath('text()').extract()[0].strip('\n'))
                     newlist.append(templist)
                 else:
                     newlist.append(rlist[j].xpath('a//text()').extract())
             pmItem["pm_dexId"] = newlist[0][0]
             pmItem["pm_name_cn"] = newlist[1][0]
             pmItem["pm_name_jp"] = newlist[2][0]
             pmItem["pm_name_en"] = newlist[3][0]
             pmItem["pm_url"] = self.domain + r[i].xpath(
                 'td/a/@href').extract_first()
             yield scrapy.Request(pmItem["pm_url"],
                                  callback=self.parse_detail,
                                  meta=pmItem)
         except IndexError:
             continue
Example #7
    def parse_item(self, response):
        items = []
        content = Selector(response)
        list1 = content.css('article.excerpt.excerpt-c3')
        # title = content.css('div.container')

        for each in list1:
            text = each.css('a::text').extract()
            for i in text:
                # fresh item per chapter line so the appended items stay distinct
                item = MyspiderItem()
                item['bookName'] = content.xpath('//head/title/text()').extract()
                item['chapterUrl'] = each.xpath('a/@href').extract()
                try:
                    item['bookTitle'] = i.split(' ')[0]
                    item['chapterNum'] = i.split(' ')[1]
                except IndexError:
                    continue

                try:
                    item['chapterName'] = i.split(' ')[2]
                except IndexError:
                    item['chapterName'] = i.split(' ')[1][-3:]
                items.append(item)
        return items  # hand the collected items back to Scrapy
Example #8
    def parse(self, response):
        data = json.loads(response.body.decode('gb18030').encode('utf8'))

        strn = datetime.datetime.today().strftime('%Y-%m-%d')
        for case in data[u'LUWSJSSB']:
            item = MyspiderItem()

            item['COLLECTDATE'] = strn

            item['ROADNAME'] = case[u'LUDMC']
            item['POSTFROM'] = u'江苏省交通运输厅'

            item['EVENTTYPE'] = self.event_type_switcher(case[u'SHIJLX'])
            item['DIRECTION'] = case[u'FANGX'] + u"-" + case[u'LUXBSM']
            item['START_TIME'] = case[u'SHIFSJ']
            item['POSTDATE'] = case[u'CHUANGJSJ']
            item['END_TIME'] = case[u'YUJHFSJ']

            item['CONTENT'] = case[u'SHIJNR'].strip().replace('\n', ' ').replace('\r', '')
            item['TITLE'] = (u'locate at x: ' + case[u'X'] + u' Y: ' + case[u'Y']).strip().replace('\n', ' ').replace(
                '\r', '')

            item['REF'] = self.start_urls[0]

            yield item
Example #9
    def parse_sale(self, response):
        item = MyspiderItem()
        item._values = response.meta  # hack: copies all meta fields straight into the item's backing dict

        tr_list = response.xpath("/html/body/div[5]/div[3]/div[2]/div/div[2]/div[1]/table/tr")
        item["S_S_date"] = []
        item["S_S_M_sale"] = []
        item["S_S_M_rank"] = []
        item["S_S_rank"] = []
        for tr in tr_list[1:]:  # skip the table header row
            item["S_S_date"].append(tr.xpath("./td/text()").extract()[0])  # dates
            item["S_S_M_sale"].append(tr.xpath("./td/text()").extract()[1])  # monthly sales for each date
            item["S_S_M_rank"].append(tr.xpath("./td/a/text()").extract()[0])  # monthly sales rank for each date
            item["S_S_rank"].append(tr.xpath("./td/text()").extract()[2])  # manufacturer share for each date

        yield item
Example #10
 def parse(self, response):
     self.log(response.headers)
     # grab every image on the freebuf front page and save the list in the image_urls field
     piclist = response.xpath("//div[@class='news-img']/a/img/@src").extract()
     if piclist:
         item = MyspiderItem()
         item['image_urls'] = piclist
         yield item
Example #11
    def parse(self, response):
        node_list = response.xpath('//div[@class="li_txt"]')
        for node in node_list:
            item = MyspiderItem()
            item['name'] = node.xpath('./h3/text()').extract_first().strip()
            item['title'] = node.xpath('./h4/text()').extract_first().strip()
            item['desc'] = node.xpath('./p/text()').extract_first().strip()

            yield item
Example #12
    def parse(self, response):

        result = json.loads(response.text)
        data = result.get('result').get('data')
        for entry in data[:10]:
            # fresh item per record so previously yielded items are not overwritten
            items = MyspiderItem()
            items['create_time'] = entry.get('createTime')
            items['title'] = entry.get('title')
            items['videoUrl'] = entry.get('videoUrl')
            items['author'] = entry.get('nick')
            print(items['title'])
            yield items
Example #13
    def parse(self, response):

        html = Selector(response)
        images_urls = html.xpath('//li/a[1]/img/@original').extract()
        images_name = html.xpath('//li/a[1]/p/text()').extract()

        for index in range(len(images_urls)):
            item = MyspiderItem()
            item['img_url'] = images_urls[index]
            item['img_name'] = images_name[index]
            #            print('name:%s--------url:%s' %(item['img_name'], item['img_url']))
            yield item
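
The index-based loop above can be written more compactly with zip, which pairs each URL with its name and stops at the shorter list; a sketch of the equivalent loop, assuming the same field names:

    for img_url, img_name in zip(images_urls, images_name):
        item = MyspiderItem()
        item['img_url'] = img_url
        item['img_name'] = img_name
        yield item
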
Example #14
 def parse(self, response):
     # handle the response for start_url
     res1 = response.xpath(
         "//div[@id='browse-journals-output']//div[@class='hide-body']//a")
     for url in res1:
         name = url.xpath("./text()").extract_first()
         tempurl = url.xpath(".//@href").extract()[0]
         if re.match('IEEE', name, re.IGNORECASE):
             # fresh item per request so concurrent callbacks do not share state
             item = MyspiderItem()
             item["journal"] = name
             yield scrapy.Request(url=tempurl,
                                  callback=self.parse_joural_url,
                                  meta={"item": item})
Example #15
    def parse_detail(self, response):

        # pull the concrete values out with CSS selectors
        title = response.css("h1>span:nth-child(1)::text").extract_first()
        rating = response.css('.rating_num::text').extract_first()

        # build the item
        subject_item = MyspiderItem()
        subject_item['title'] = title
        subject_item['douban_link'] = response.url
        subject_item['rating'] = rating

        # hand off to the pipeline
        yield subject_item
Example #16
 def parse(self, response):
     res_list = response.xpath('//div[@class="li_txt"]')
     arr = []
     for i in res_list:
         item = MyspiderItem()
         name = i.xpath('./h3/text()').extract()[0]
         title = i.xpath('./h4/text()').extract()[0]
         info = i.xpath('./p/text()').extract()[0]
         #print name
         item['name'] = name.encode('gbk')
         item['title'] = title.encode('gbk')
         item['info'] = info.encode('gbk')
         arr.append(item)
     return arr
Example #17
 def parse_1(self, response):
     #item = response.meta["item"]
     soup = BeautifulSoup(response.text,"html.parser")
     title = soup.select('.content > h1:nth-child(1)')[0].text
     title1 = "\r\n\r\n\r\n\r\n\r\n    "+title+"\r\n\r\n\r\n\r\n    "
     text = soup.select('#content')[0].text.split("https")[0]
     text = '\r\n\r\n    '.join(text.split())
     item = MyspiderItem()
     item['title'] = title
     item['text']  = text
     yield item
     #href = soup.select('.page_chapter > ul:nth-child(1) > li:nth-child(3) > a:nth-child(1)')[0]['href']
     #if not href.endswith('/'):
         #next_url = 'https://www.biqiuge8.com' + href
Example #18
 def parse(self, response):
     tittle = response.xpath(
         '//*[@id="js_origina_column"]/div/div/div/div/ul/li/div/ul/li/a/text()'
     ).extract()
     url = response.xpath(
         '//*[@id="js_origina_column"]/div/div/div/div/ul/li/div/ul/li/a/@href'
     ).extract()
     # pair titles with links; the original separate loops overwrote the
     # fields and yielded only the last value
     for t, u in zip(tittle, url):
         item = MyspiderItem()
         item['tittle'] = t
         item['url'] = u
         logger.warning(item)
         yield item
Example #19
    def parse_sale(self, response):
        item = MyspiderItem()
        item._values = response.meta  # hack: copies all meta fields straight into the item's backing dict

        tr_list = response.xpath(
            "/html/body/div[5]/div[3]/div[2]/div/div[2]/div[1]/table/tr")
        item["B_S_date"] = []
        item["B_S_sale"] = []
        item["B_S_share"] = []
        item["B_S_detail"] = []
        for tr in tr_list[1:]:  # skip the table header row
            item["B_S_date"].append(
                tr.xpath("./td/text()").extract()[0])  # dates
            item["B_S_sale"].append(
                tr.xpath("./td/text()").extract()[1])  # sales for each date
            item["B_S_share"].append(
                tr.xpath("./td/text()").extract()[2])  # market share for each date
            item["B_S_detail"].append(
                'https://xl.16888.com' +
                tr.xpath("./td/a/@href").extract()[0])  # market-share detail link for each date

        yield item
Example #20
 def parse(self, response):
     for item in response.css('.item'):
         movie = MyspiderItem()
         movie['Staring'] = item.css('.bd p::text').extract_first()
         movie['rank'] = item.css('.pic em::text').extract_first()
         movie['title'] = item.css('.hd span.title::text').extract_first()
         movie['start'] = item.css('.star span.rating_num::text').extract_first()
         movie['quote'] = item.css('.quote span.inq::text').extract_first()
         movie['url'] = item.css('.pic a::attr("href")').extract_first()
         movie['image_url'] = item.css('.pic img::attr("src")').extract_first()
         yield movie
     next_url = response.css('span.next a::attr("href")').extract_first()
     if next_url is not None:
         url = self.start_urls[0] + next_url
         yield scrapy.Request(url=url, callback=self.parse)
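
On Scrapy 1.4 and later, response.follow resolves relative pagination links against response.url, so the manual join with start_urls[0] is not needed; a sketch of that variant:

    next_url = response.css('span.next a::attr("href")').extract_first()
    if next_url is not None:
        # response.follow accepts a relative URL and builds the absolute Request itself
        yield response.follow(next_url, callback=self.parse)
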
Example #21
    def parse_item(self, response):
        node_list = response.xpath('//div[@class="li_txt"]')
        for node in node_list:
            item = MyspiderItem()
            item['name'] = node.xpath('./h3/text()').extract_first().strip()
            item['title'] = node.xpath('./h4/text()').extract_first().strip()
            item['desc'] = node.xpath('./p/text()').extract_first().strip()

            yield item
Example #22
    def parse(self, response):
        # grab all the nodes that hold teacher info
        node_list = response.xpath('//div[@class="li_txt"]')
        # walk every teacher node
        for node in node_list:
            # create a container for the data
            item = MyspiderItem()
            # extract the fields into the item
            item['name'] = node.xpath('./h3/text()').extract()[0]
            item['title'] = node.xpath('./h4/text()').extract()[0]
            item['desc'] = node.xpath('./p/text()').extract()[0]

            # hand the data back with yield
            yield item
Example #23
 def parse(self, response):
     # extract the data
     node_list = response.xpath('//*[@id="content"]/table/tr')
     # print(len(node_list))
     for i, node in enumerate(node_list):
         # if i != 5:
         item = MyspiderItem()
         item["date"] = node.xpath("./td[1]/a/text()").extract_first()
         item["link"] = response.urljoin(
             node.xpath("./td[1]/a/@href").extract_first())
         item["situation"] = node.xpath(
             "./td[2]/text()").extract_first().strip()
         item["temperature"] = node.xpath(
             "./td[3]/text()").extract_first().strip()
         item["wind"] = node.xpath("./td[4]/text()").extract_first().strip()
         yield item
Example #24
 def parse_news(self, response):
     item = MyspiderItem()
     #tmp=response.xpath("//div[@class='headlineTxt']/h2[@class='newsTitle']/a/text()|//div[@class='headlineTxt']/p[@class='hbody']/text()").extract()
     tmp = response.xpath("//p[@class='ynDetailText']/text()").extract()
     if len(tmp) == 0:
         tmp = response.xpath("//a[@class='newsLink']/text()").extract()
         if len(tmp) > 0:
             if tmp[0] == u"[記事全文]":
                 return item
             else:
                 tmp = response.xpath("//div[@class='headlineTxt']/h2[@class='newsTitle']/a/text()|//div[@class='headlineTxt']/p[@class='hbody']/text()").extract()
         else:
             return item
     
     word_str=""
     s=""
     for w in tmp:
         s+=w.encode("utf-8")
     #word_list = re.findall(u"[\u4E00-\u9FFF\u3040-\u309F\u30A0-\u30FF\n ]+",response.body_as_unicode(),re.U)
     
     url = response.xpath("//li[@class='next']/a/@href").extract()
     if len(url)>0:
         while url[0] != None:
             html = urlopen(url[0])
             soup = BeautifulSoup(html)
             s += soup.find('p',class_='ynDetailText').text.encode("utf-8")
             temp_soup = soup.find('li', class_='next').find('a')
             if temp_soup != None:
                 url[0]=temp_soup['href'].encode("utf-8")
             else:
                 url[0]=None
             
     for w in re.findall(u"[\u4E00-\u9FFF\u3040-\u309F\u30A0-\u30FF]+",s.decode("utf-8"),re.U):
        word_str+=w
     #lines=(line.strip() for line in s.splitlines())
     #chunks=(phrase.strip() for line in lines for phrase in line.split(" "))
     #s='\n'.join(line for line in lines if line)
     item['id'] = self.count
     item['content'] = word_str.encode('utf-8')
     self.count = self.count + 1
     
     return item
Example #25
    def parse(self, response):
        for each in response.xpath("//div[@class='li_txt']"):
            item = MyspiderItem()

            # extract() always returns a list of unicode strings
            name = each.xpath("h3/text()").extract()
            title = each.xpath("h4/text()").extract()
            info = each.xpath("p/text()").extract()

            # each xpath call returns a single-element list here
            item["name"] = name[0]
            item["title"] = title[0]
            item["info"] = info[0]

            yield item
Example #26
    def parse(self, response):
        # a Scrapy response object supports xpath directly
        names = response.xpath('//div[@class="tea_con"]//li/div/h3/text()')
        print(names)

        # getting at the actual text works as follows
        # group the nodes first
        li_list = response.xpath('//div[@class="tea_con"]//li')
        for li in li_list:
            # build a container for the data
            #item = {}
            item = MyspiderItem()
            # locate elements with Scrapy's xpath selectors; extract() or extract_first() pulls the result
            item['name'] = li.xpath('.//h3/text()').extract_first()  # teacher's name
            item['level'] = li.xpath('.//h4/text()').extract_first()  # teacher's rank
            item['text'] = li.xpath('.//p/text()').extract_first()  # teacher's bio
            yield item
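
The comment above distinguishes extract() from extract_first(); a quick illustration on the same selector (values are hypothetical):

    names = li.xpath('.//h3/text()')
    all_names = names.extract()         # list of every match, e.g. [u'Alice'], or [] when nothing matches
    first_name = names.extract_first()  # first match, e.g. u'Alice', or None when nothing matches
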
Example #27
    def parse(self, response):
        items = MyspiderItem()
        items['topics'] = response.xpath(
            r'//div[@class="body board-topics"]/div/@data-topic').extract()
        # print items['topics']
        for topicID in items['topics']:
            started_by = response.xpath(
                '//div[@data-topic="' + topicID +
                '"]/div[@class="topic-first-comment"]/p/text()').extract()
            author = response.xpath(
                '//div[@data-topic="' + topicID +
                '"]/div[@class="topic-first-comment"]/p/a/text()').extract()
            title = response.xpath(
                '//div[@data-topic="' + topicID +
                '"]/div/div/div[@class="raw-topic-title"]/text()').extract()
            link = "https://devtalk.nvidia.com/default/topic/" + topicID + "/" + title[
                0]
            views = response.xpath(
                '//div[@data-topic="' + topicID +
                '"]/div/p[@class="topic-views"]/text()').extract()
            replies = response.xpath(
                '//div[@data-topic="' + topicID +
                '"]/div/p[@class="topic-replies"]/text()').extract()

            # with open(r'./topic.txt', 'a+') as fp:
            #     if len(author):
            #         fp.write(author[0]+';')
            #     fp.write(views.pop()+';')
            #     fp.write(started_by[0]+'\n')
            # fp.write(title[0]+';')
            # fp.write(link+'\n')
            if len(author):
                author_ = author.pop()
            else:
                author_ = 'empty name'
            views_ = views[0].replace(',', '').replace(' Views', '')
            replies_ = replies[0].replace(',', '').replace(' Replies', '')
            yield {
                'author': author_,
                'views': int(views_),
                'replies': int(replies_),
                'title': title.pop(),
                'started_by': started_by.pop(),
                'link': link,
                'topicID': int(topicID)
            }
Example #28
    def data_parse(self, response):

        d = self.response_id_map
        int_realRoadID = int(response.request.body[-2:].replace("=", ""))
        realRoadName = d[int_realRoadID][0]

        jdata = json.loads(response.body)
        events = jdata[u'data']
        if len(events) < 1:
            # no events right now: yield the placeholder item rather than None
            item = MyspiderItem()
            item['ID'] = int_realRoadID
            item['ROADNAME'] = realRoadName
            item['POSTFROM'] = u'浙江智慧高速'
            item['CONTENT'] = u'目前无路况'
            item['TITLE'] = u'目前无路况'
            yield item
        else:
            for e in events:
                # fresh item per event so earlier yields are not mutated
                item = MyspiderItem()
                item['ID'] = int_realRoadID
                item['ROADNAME'] = realRoadName
                item['COLLECTDATE'] = datetime.datetime.today().strftime(
                    '%Y-%m-%d')

                str_passby_stations = e[u'startnodename'] + ' - ' + e[
                    u'endnodename']
                item['EVENTTYPE'] = e[u'eventtype']
                item['DIRECTION'] = (e[u'directionname'] + str_passby_stations)
                item['START_TIME'] = e[u'occtime']
                item['END_TIME'] = datetime.datetime.today().strftime(
                    '%Y-%m-%d')

                # strip content
                ecode_ctnt = (e[u'reportout'].strip().replace(
                    '\n', ' ').replace('\r', '')).encode('utf-8')
                ecode_title = (''.join(e[u'title'].split())).encode('utf-8')
                item['CONTENT'] = ecode_ctnt
                item['TITLE'] = ecode_title
                item['REF'] = ('http://app.zjzhgs.com/MQTTWechatAPIServer/'
                               'businessserver/showhighdetail/' +
                               str(int_realRoadID))
                item['POSTDATE'] = e[u'occtime'].encode('utf-8')
                item['POSTFROM'] = u'浙江智慧高速'

                yield item
Example #29
    def parse(self, response):
        selector = Selector(response)
        myspiderItem = MyspiderItem()
        myspiderItem['url'] = response.url
        myspiderItem['size'] = self.format_bytes(int(len(response.body)))

        # for cnblogs
        sel = selector.xpath(
            '//*[@id="mainContent"]/div/div/div/a/@href').extract()
        for url in sel:
            if url.endswith('.html'):
                self.redis_util.insert(url, 1)
                # yield response.follow(url, callback=self.parse)

        for url in selector.xpath(
                '//*[@id="cnblogs_post_body"]/p/a/@href').extract():
            self.redis_util.insert(url, 1)
            # yield response.follow(url, callback=self.parse)

        next_url = response.xpath(
            u'//*[@id="nav_next_page"]/a/@href').extract_first()
        if next_url is not None:
            self.redis_util.insert(next_url, 1)
            # yield scrapy.Request(next_url,callback=self.parse)

        next_url = response.xpath(
            u'//*[@id="homepage_bottom_pager"]/div/a/@href').extract()
        for url in next_url:
            if url.endswith("1") == False:
                self.redis_util.insert(url, 1)
                # yield scrapy.Request(url,callback=self.parse)

        # for most web page
        urls = response.css('a::attr(href)').re(r'^/.+?/$')
        for url in urls:
            self.redis_util.insert(url, 1)
            self.redis_util.insert(url, 2)
            # yield response.follow(url, callback=self.parse)

        # yield {
        #     "url": myspiderItem['url'],
        #     "size": myspiderItem['size']
        # }

        yield myspiderItem
Example #30
 def parse(self, response):
     print('*' * 80)
     print(response.meta['id'])
     np = NewsParser()
     result = np.extract_news(response.text)
     if not result:
         return
     item = MyspiderItem()
     item['id'] = response.meta['id']
     item['url'] = response.url
     item['title'] = result['title']
     item['publish_time'] = result['publish_time']
     item['author'] = result['author']
     item['content'] = result['content']
     item['langid'] = langid.classify(item['title'])[0]
     print(item['langid'])
     if item['langid'] in ['zh', 'en']:  # keep only Chinese- or English-language articles
         yield item