def parse_tweets(self, response):
    """Scrape the tweets on one profile page and queue the following page.

    For each ``body/div[@class="c" and @id]`` node a ``TweetsItem`` is
    yielded carrying the composite ``_id``, the user ``ID`` and, when
    present on the page, ``Content``, ``Co_oridinates``, ``Like``,
    ``Transfer``, ``Comment``, ``PubTime`` and ``Tools``.  After the items,
    a ``Request`` for the next page (same callback) is yielded if the pager
    contains a next-page link.

    A tweet node that fails to parse is logged and skipped; it does not
    abort the rest of the page.

    Args:
        response: The downloaded page. Its URL must contain
            ``<digits>/profile``; the digits are used as the user ID.

    Raises:
        IndexError: When iteration starts, if ``response.url`` has no
            ``<digits>/profile`` segment.
    """
    import logging  # function-scope import keeps this block self-contained

    logger = logging.getLogger(__name__)
    location_tag = u'[位置]'  # trailing "location" marker to drop from content

    selector = Selector(response)
    ID = re.findall(r'(\d+)/profile', response.url)[0]
    divs = selector.xpath('body/div[@class="c" and @id]')

    for div in divs:
        try:
            item = TweetsItem()
            tweet_id = div.xpath('@id').extract_first()
            content = div.xpath('div/span[@class="ctt"]//text()').extract()
            hrefs = div.xpath('div/a/@href').extract()
            # Serialize the node once; the three counters are searched in it.
            div_html = div.extract()
            like = re.findall(u'赞\\[(\\d+)\\]', div_html)
            transfer = re.findall(u'转发\\[(\\d+)\\]', div_html)
            comment = re.findall(u'评论\\[(\\d+)\\]', div_html)
            # Publication time, optionally followed by the client/tool name.
            others = div.xpath('div/span[@class="ct"]/text()').extract()

            item["_id"] = ID + "-" + tweet_id
            item["ID"] = ID

            if content:
                text = " ".join(content)
                # Remove only the trailing tag. str.strip() would treat the
                # tag as a set of characters and eat brackets at either end.
                if text.endswith(location_tag):
                    text = text[:-len(location_tag)]
                item["Content"] = text

            if hrefs:
                # Only the first link is inspected for a "center=" position.
                coords = re.findall(r'center=([\d.,]+)', hrefs[0])
                if coords:
                    item["Co_oridinates"] = coords[0]

            if like:
                item["Like"] = int(like[0])
            if transfer:
                item["Transfer"] = int(transfer[0])
            if comment:
                item["Comment"] = int(comment[0])

            if others:
                parts = others[0].split(u'来自')
                item["PubTime"] = parts[0].replace(u"\xa0", "")
                if len(parts) == 2:
                    item["Tools"] = parts[1].replace(u"\xa0", "")
        except Exception:
            # Best effort per tweet: record the failure and keep going.
            logger.exception("Failed to parse a tweet on %s", response.url)
        else:
            yield item

    next_hrefs = selector.xpath(
        u'body/div[@class="pa" and @id="pagelist"]/form/div/a[text()="下页"]/@href'
    ).extract()
    if next_hrefs:
        yield Request(url=self.host + next_hrefs[0],
                      callback=self.parse_tweets,
                      dont_filter=True)
def parse_tweets(self, response):
    """Scrape the tweets on one profile page, keeping those in the year range.

    For each ``body/div[@class="c" and @id]`` node a ``TweetsItem`` is built
    with the composite ``_id``, the user ``ID`` and, when present on the
    page, ``Content``, ``Co_oridinates``, ``Like``, ``Transfer``,
    ``Comment``, ``PubTime`` and ``Tools``.  The item is yielded only when
    the most recently parsed date has a year between ``MIN_YEAR`` and
    ``MAX_YEAR`` inclusive.

    A tweet node that fails to parse is logged and skipped; it does not
    abort the rest of the page.

    Args:
        response: The downloaded page. Its URL must contain
            ``<digits>/profile``; the digits are used as the user ID.

    Raises:
        IndexError: When iteration starts, if ``response.url`` has no
            ``<digits>/profile`` segment.
    """
    import logging  # function-scope import keeps this block self-contained

    logger = logging.getLogger(__name__)
    location_tag = u'[位置]'  # trailing "location" marker to drop from content

    selector = Selector(response)
    ID = re.findall(r'(\d+)/profile', response.url)[0]
    divs = selector.xpath('body/div[@class="c" and @id]')

    # NOTE(review): date_obj is not reset per tweet, so a node without a
    # time span is filtered using the previous node's date - confirm intent.
    date_obj = None
    # NOTE(review): tweets_min_year is overwritten by every year below
    # MAX_YEAR (not only smaller ones) and is not read in the visible part
    # of this function - verify against the code that consumes it.
    tweets_min_year = MAX_YEAR

    for div in divs:
        try:
            item = TweetsItem()
            tweet_id = div.xpath('@id').extract_first()
            content = div.xpath('div/span[@class="ctt"]//text()').extract()
            hrefs = div.xpath('div/a/@href').extract()
            # Serialize the node once; the three counters are searched in it.
            div_html = div.extract()
            like = re.findall(u'赞\\[(\\d+)\\]', div_html)
            transfer = re.findall(u'转发\\[(\\d+)\\]', div_html)
            comment = re.findall(u'评论\\[(\\d+)\\]', div_html)
            # Publication time, optionally followed by the client/tool name.
            others = div.xpath('div/span[@class="ct"]/text()').extract()

            item["_id"] = ID + "-" + tweet_id
            item["ID"] = ID

            if content:
                text = " ".join(content)
                # Remove only the trailing tag. str.strip() would treat the
                # tag as a set of characters and eat brackets at either end.
                if text.endswith(location_tag):
                    text = text[:-len(location_tag)]
                item["Content"] = text

            if hrefs:
                # Only the first link is inspected for a "center=" position.
                coords = re.findall(r'center=([\d.,]+)', hrefs[0])
                if coords:
                    item["Co_oridinates"] = coords[0]

            if like:
                item["Like"] = int(like[0])
            if transfer:
                item["Transfer"] = int(transfer[0])
            if comment:
                item["Comment"] = int(comment[0])

            if others:
                parts = others[0].split(u'来自')
                date_text = parts[0].replace(u"\xa0", "")
                date_obj = parse_date(date_text)
                if date_obj.year < MAX_YEAR:
                    tweets_min_year = date_obj.year
                item["PubTime"] = date_to_str(date_obj)
                if len(parts) == 2:
                    item["Tools"] = parts[1].replace(u"\xa0", "")

            in_range = bool(
                date_obj and MIN_YEAR <= date_obj.year <= MAX_YEAR)
        except Exception:
            # Best effort per tweet: record the failure and keep going.
            logger.exception("Failed to parse a tweet on %s", response.url)
        else:
            if in_range:
                yield item