Beispiel #1
0
    def parse_tweets(self, response):
        """Scrape the tweets listed on one page of a user's profile.

        Every ``div`` with class ``c`` and an ``id`` attribute directly under
        ``body`` is treated as one tweet.  For each of them a ``TweetsItem``
        is filled with whatever fields can be found; fields that are missing
        from the markup are simply left unset on the item.

        Args:
            response: The downloaded profile page.  Its URL must contain
                ``<digits>/profile``; the digits are used as the user ID.

        Yields:
            One ``TweetsItem`` per tweet ``div`` that parsed without error,
            then a ``Request`` for the next page (handled by this same
            callback, with ``dont_filter=True``) when a next-page link
            exists.

        Raises:
            IndexError: On first iteration, if ``response.url`` does not
                contain ``<digits>/profile``.
        """
        import logging

        selector = Selector(response)
        user_id = re.findall(r'(\d+)/profile', response.url)[0]

        # Loop-invariant text, decoded once.  The literals are UTF-8 byte
        # strings (Python 2), hence the explicit decode to unicode.
        like_pattern = r'赞\[(\d+)\]'.decode('utf8')
        transfer_pattern = r'转发\[(\d+)\]'.decode('utf8')
        comment_pattern = r'评论\[(\d+)\]'.decode('utf8')
        location_marker = '[位置]'.decode('utf8')
        source_marker = '来自'.decode('utf8')

        for div in selector.xpath('body/div[@class="c" and @id]'):
            try:
                item = TweetsItem()
                tweet_id = div.xpath('@id').extract_first()  # tweet ID
                div_html = div.extract()  # serialized once, searched 3 times
                content = div.xpath(
                    'div/span[@class="ctt"]//text()').extract()  # tweet text
                hrefs = div.xpath('div/a/@href').extract()  # map link
                like = re.findall(like_pattern, div_html)  # like count
                transfer = re.findall(transfer_pattern,
                                      div_html)  # repost count
                comment = re.findall(comment_pattern,
                                     div_html)  # comment count
                # Publish time and client tool (phone or platform).
                footer = div.xpath(
                    'div/span[@class="ct"]/text()').extract()

                item["_id"] = user_id + "-" + tweet_id
                item["ID"] = user_id
                if content:
                    text = " ".join(content)
                    # Drop only a trailing location marker.  str.strip()
                    # would treat the marker as a character set and eat any
                    # leading/trailing '[' or ']' of the tweet itself.
                    if text.endswith(location_marker):
                        text = text[:-len(location_marker)]
                    item["Content"] = text
                if hrefs:
                    # Only the first link is inspected for coordinates.
                    coordinates = re.findall(r'center=([\d.,]+)', hrefs[0])
                    if coordinates:
                        item["Co_oridinates"] = coordinates[0]
                if like:
                    item["Like"] = int(like[0])
                if transfer:
                    item["Transfer"] = int(transfer[0])
                if comment:
                    item["Comment"] = int(comment[0])
                if footer:
                    parts = footer[0].split(source_marker)
                    item["PubTime"] = parts[0].replace(u"\xa0", "")
                    if len(parts) == 2:
                        item["Tools"] = parts[1].replace(u"\xa0", "")
            except Exception:
                # Best effort: one malformed tweet must not abort the page,
                # but the failure is recorded instead of silently dropped.
                logging.getLogger(__name__).warning(
                    "Skipping tweet that failed to parse on %s",
                    response.url, exc_info=True)
                continue
            yield item

        next_href = selector.xpath(
            'body/div[@class="pa" and @id="pagelist"]/form/div/a[text()="下页"]/@href'
            .decode('utf8')).extract()
        if next_href:
            yield Request(url=self.host + next_href[0],
                          callback=self.parse_tweets,
                          dont_filter=True)
Beispiel #2
0
    def parse_tweets(self, response):
        """Scrape tweets from a profile page, filtered by publish year.

        Each ``div`` with class ``c`` and an ``id`` attribute directly under
        ``body`` is parsed into a ``TweetsItem``.  An item is yielded only
        when a parsed date is available and its year lies between
        ``MIN_YEAR`` and ``MAX_YEAR`` inclusive.  Any exception raised while
        handling a single ``div`` is swallowed and that ``div`` is skipped.

        ``parse_date``, ``date_to_str``, ``MIN_YEAR`` and ``MAX_YEAR`` are
        defined outside this method.

        NOTE(review): ``tweets_min_year`` is written but never read in the
        part of the method shown here -- presumably it is used further down
        (e.g. to decide about pagination); confirm.
        """
        selector = Selector(response)
        # User ID taken from the URL; raises IndexError if the URL has no
        # "<digits>/profile" part.
        ID = re.findall('(\d+)/profile', response.url)[0]
        divs = selector.xpath('body/div[@class="c" and @id]')
        # NOTE(review): date_obj lives across loop iterations, so a div
        # without a time span reuses the date of the previous div in the
        # year filter below -- confirm this is intended.
        date_obj = None
        tweets_min_year = MAX_YEAR
        for div in divs:
            try:
                tweetsItems = TweetsItem()
                id = div.xpath('@id').extract_first()  # tweet ID
                content = div.xpath(
                    'div/span[@class="ctt"]//text()').extract()  # tweet text
                cooridinates = div.xpath('div/a/@href').extract()  # location coordinates
                like = re.findall('赞\[(\d+)\]'.decode('utf8'),
                                  div.extract())  # like count
                transfer = re.findall('转发\[(\d+)\]'.decode('utf8'),
                                      div.extract())  # repost count
                comment = re.findall('评论\[(\d+)\]'.decode('utf8'),
                                     div.extract())  # comment count
                others = div.xpath('div/span[@class="ct"]/text()').extract(
                )  # publish time and client tool (phone or platform)

                tweetsItems["_id"] = ID + "-" + id
                tweetsItems["ID"] = ID
                if content:
                    # NOTE(review): str.strip() treats its argument as a set
                    # of characters, so this removes any leading/trailing
                    # '[', ']' and the two marker characters, not just a
                    # trailing marker.
                    tweetsItems["Content"] = " ".join(content).strip(
                        '[位置]'.decode('utf8'))  # remove the trailing location marker
                if cooridinates:
                    # Only the first link is searched for a "center=" value.
                    cooridinates = re.findall('center=([\d.,]+)',
                                              cooridinates[0])
                    if cooridinates:
                        tweetsItems["Co_oridinates"] = cooridinates[0]
                if like:
                    tweetsItems["Like"] = int(like[0])
                if transfer:
                    tweetsItems["Transfer"] = int(transfer[0])
                if comment:
                    tweetsItems["Comment"] = int(comment[0])
                if others:
                    # Text before the separator is the time, text after it
                    # is the client tool.
                    others = others[0].split('来自'.decode('utf8'))
                    date_text = others[0].replace(u"\xa0", "")
                    date_obj = parse_date(date_text)
                    # NOTE(review): this compares against MAX_YEAR, not
                    # against tweets_min_year, so the variable ends up
                    # holding the year of the last tweet below MAX_YEAR
                    # rather than a running minimum -- confirm intent.
                    if date_obj.year < MAX_YEAR:
                        tweets_min_year = date_obj.year
                    tweetsItems["PubTime"] = date_to_str(date_obj)
                    if len(others) == 2:
                        tweetsItems["Tools"] = others[1].replace(u"\xa0", "")
                # Keep only tweets whose year is within [MIN_YEAR, MAX_YEAR].
                if date_obj and date_obj.year >= MIN_YEAR and date_obj.year <= MAX_YEAR:
                    yield tweetsItems
            # Python 2-only except syntax; every per-div error is silently
            # ignored and the bound name `e` is unused.
            except Exception, e:
                pass