Example #1
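Parses a PTT post's author, title, date, body text and posting IP, then walks the push (comment) divs to build a comment list and an aggregate score; any parsing failure silently skips the post.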
    def parse_post(self, response):
        try:
            item = PostItem()
            item['author'] = response.xpath(
                '//div[@class="article-metaline"]/span[text()="作者"]/following-sibling::span[1]/text()'
            )[0].extract().split(' ')[0]
            item['title'] = response.xpath(
                '//meta[@property="og:title"]/@content')[0].extract()
            datetime_str = response.xpath(
                '//div[@class="article-metaline"]/span[text()="時間"]/following-sibling::span[1]/text()'
            )[0].extract()
            item['date'] = datetime.strptime(datetime_str,
                                             '%a %b %d %H:%M:%S %Y')
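            # the post body is the set of direct text nodes under #main-content;
            # push comments, the metadata header and the f2 footer live in child elements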
            content_elems = response.xpath(
                '//div[@id="main-content"]'
                '/text()['
                'not(contains(@class, "push")) and '
                'not(contains(@class, "article-metaline")) and '
                'not(contains(@class, "f2"))'
                ']')
            item['content'] = ''.join([c.extract() for c in content_elems])
            item['ip'] = response.xpath(
                '//div[@id="main-content"]/span[contains(text(),"發信站: 批踢踢實業坊(ptt.cc)")]/text()'
            )[0].extract().rstrip().split(' ')[-1]

            comments = []
            total_score = 0
            for comment in response.xpath('//div[@class="push"]'):
                push_tag = comment.css('span.push-tag::text')[0].extract()
                push_user = comment.css('span.push-userid::text')[0].extract()
                push_content = comment.css(
                    'span.push-content::text')[0].extract()

                # 推 (upvote) counts +1, 噓 (downvote) counts -1, → (neutral) counts 0
                if '推' in push_tag:
                    score = 1
                elif '噓' in push_tag:
                    score = -1
                else:
                    score = 0

                total_score += score
                comments.append({
                    'user': push_user,
                    'content': push_content,
                    'score': score
                })

            item['comments'] = comments
            item['score'] = total_score
            item['url'] = response.url
            print('%s  %-4s %-14s %s' %
                  (item['date'], item['score'], item['author'], item['title']))
            yield item
        except Exception:
            # skip posts that are missing the expected fields (e.g. deleted articles)
            return
Example #2
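Same structure as Example #1, but it takes the title from the article-metaline header instead of the og:title meta tag and stores only the first text node of the post body.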
    def parse_post(self, response):
        item = PostItem()
        item['author'] = response.xpath(
            '//div[@class="article-metaline"]/span[text()="作者"]/following-sibling::span[1]/text()'
        )[0].extract().split(' ')[0]
        item['title'] = response.xpath(
            '//div[@class="article-metaline"]/span[text()="標題"]/following-sibling::span[1]/text()'
        )[0].extract()
        datetime_str = response.xpath(
            '//div[@class="article-metaline"]/span[text()="時間"]/following-sibling::span[1]/text()'
        )[0].extract()
        item['date'] = datetime.strptime(datetime_str, '%a %b %d %H:%M:%S %Y')
        # note: this keeps only the first text node of the post body
        item['content'] = response.xpath(
            '//div[@id="main-content"]/text()')[0].extract()
        item['ip'] = response.xpath(
            '//div[@id="main-content"]/span[contains(text(),"發信站: 批踢踢實業坊(ptt.cc)")]/text()'
        )[0].extract().rstrip().split(' ')[-1]
        comments = []
        total_score = 0
        for comment in response.xpath('//div[@class="push"]'):
            push_tag = comment.css('span.push-tag::text')[0].extract()
            push_user = comment.css('span.push-userid::text')[0].extract()
            push_content = comment.css('span.push-content::text')[0].extract()

            if '推' in push_tag:
                score = 1
            elif '噓' in push_tag:
                score = -1
            else:
                score = 0

            total_score += score
            comments.append({
                'user': push_user,
                'content': push_content,
                'score': score
            })

        item['comments'] = comments
        item['score'] = total_score
        item['url'] = response.url
        print("%s  %-4s %-14s %s" %
              (item['date'], item['score'], item['author'], item['title']))
        yield item
Example #3
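Uses html2text to convert the whole #main-content block into plain text for the content field and omits the IP field; the metadata and push-comment handling match the previous examples.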
    def parse_post(self, response):
        item = PostItem()
        item['title'] = response.xpath(
            '//meta[@property="og:title"]/@content')[0].extract()
        item['author'] = response.xpath(
            '//div[@class="article-metaline"]/span[text()="作者"]/following-sibling::span[1]/text()'
        )[0].extract().split(' ')[0]
        datetime_str = response.xpath(
            '//div[@class="article-metaline"]/span[text()="時間"]/following-sibling::span[1]/text()'
        )[0].extract()
        item['date'] = datetime.strptime(datetime_str, '%a %b %d %H:%M:%S %Y')

        # html2text converts the rendered HTML of #main-content into plain text;
        # ignore_links drops hyperlink markup from the output
        converter = html2text.HTML2Text()
        converter.ignore_links = True
        item['content'] = converter.handle(
            response.xpath('//div[@id="main-content"]')[0].extract())

        comments = []
        total_score = 0
        for comment in response.xpath('//div[@class="push"]'):
            push_tag = comment.css('span.push-tag::text')[0].extract()
            push_user = comment.css('span.push-userid::text')[0].extract()
            push_content = comment.css('span.push-content::text')[0].extract()

            if '推' in push_tag:
                score = 1
            elif '噓' in push_tag:
                score = -1
            else:
                score = 0

            total_score += score

            comments.append({
                'user': push_user,
                'content': push_content,
                'score': score
            })

        item['comments'] = comments
        item['score'] = total_score
        item['url'] = response.url

        yield item
Example #4
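A filtering variant: most item fields are commented out; the spider only inspects posts whose push score reaches total_score_threshold, counts configured keywords in the body, and records the URL of an embedded .png or .jpg image when enough keywords match.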
    def parse_post(self, response):
     
        item = PostItem()      
#        item['title'] =  response.xpath(
#            '//meta[@property="og:title"]/@content')[0].extract() 
#        item['author'] = response.xpath(
#            u'//div[@class="article-metaline"]/span[text()="作者"]/following-sibling::span[1]/text()')[
#                0].extract().split(' ')[0]
#        datetime_str = response.xpath(
#            u'//div[@class="article-metaline"]/span[text()="時間"]/following-sibling::span[1]/text()')[
#                0].extract()
#        item['date'] = datetime.datetime.strptime(datetime_str, '%a %b %d %H:%M:%S %Y')

#        item['content'] = response.xpath('//div[@id="main-content"]/text()')[0].extract()

        comments = []
        total_score = 0
        for comment in response.xpath('//div[@class="push"]'):
            push_tag = comment.css('span.push-tag::text')[0].extract()
            push_user = comment.css('span.push-userid::text')[0].extract()
            push_content = comment.css('span.push-content::text')[0].extract()

            if u'推' in push_tag:
                score = 1
            elif u'噓' in push_tag:
                score = -1
            else:
                score = 0

            total_score += score

            comments.append({'user': push_user,
                             'content': push_content,
                             'score': score})

#        item['comments'] = comments
#        item['score'] = total_score
#        item['url'] = response.url
        
        # only posts that reach the push-score threshold are inspected further
        if total_score >= self.total_score_threshold:
            check_content = response.xpath(
                '//div[@id="main-content"]/text()')[0].extract()

            # count how many of the configured keywords appear in the post body
            keyword_count = 0
            for keyword in self.keyword:
                if re.search(keyword, check_content):
                    keyword_count += 1

            if keyword_count >= self.keyword_count_threshold:
                # record embedded .png/.jpg image links (the last match wins)
                incontent = response.xpath('//div[@id="main-content"]')
                for incontent_href in incontent.css('a::attr(href)'):
                    href = incontent_href.extract()
                    if re.search(r'\.png', href):
                        item['incontent_url'] = href
                        item['incontent_url_type'] = '.png'
                    if re.search(r'\.jpg', href):
                        item['incontent_url'] = href
                        item['incontent_url_type'] = '.jpg'

        yield item
Example #5
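Tailored to restaurant (food board) posts: besides the usual metadata and comments, it pulls structured fields (restaurant name, address, phone, opening hours, average price, recommended dishes) and the first blog link out of the post body with regular expressions.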
    def parse_post(self, response):
        try:
            item = PostItem()
            item['author'] = response.xpath(
                '//div[@class="article-metaline"]/span[text()="作者"]/following-sibling::span[1]/text()')[0].extract().split(' ')[0] 
            item['title'] = response.xpath(
                '//div[@class="article-metaline"]/span[text()="標題"]/following-sibling::span[1]/text()')[0].extract()
            datetime_str = response.xpath(
                '//div[@class="article-metaline"]/span[text()="時間"]/following-sibling::span[1]/text()')[0].extract()
            item['date'] = datetime.strptime(datetime_str, '%a %b %d %H:%M:%S %Y')
            # item['content'] = response.xpath(
            #     '//div[@id="main-content"]/text()')[0].extract()

            # each field appears in the post body as "<label><value>\n"; extract the
            # value with a regex and fall back to an empty string when it is missing
            main_text = response.xpath(
                '//div[@id="main-content"]/text()')[0].extract()

            def extract_field(label):
                match = re.search(label + '(.+?)\n', main_text, re.S)
                return match.group(1) if match else ''

            item['name'] = extract_field('餐廳名稱:')
            item['address'] = extract_field('地址:')
            item['phone'] = extract_field('電話:')
            item['time'] = extract_field('營業時間:')
            item['money'] = extract_field('每人平均價位:')
            item['dish'] = extract_field('推薦菜色:')
            try:
                item['blog'] = response.xpath(
                    '//div[@id="main-content"]/a/@href')[0].extract()
            except IndexError:
                item['blog'] = ''

            item['ip'] = response.xpath(
                '//div[@id="main-content"]/span[contains(text(),"發信站: 批踢踢實業坊(ptt.cc)")]/text()')[0].extract().rstrip().split(' ')[-1]
            comments = []
            total_score = 0
            for comment in response.xpath('//div[@class="push"]'):
                push_tag = comment.css('span.push-tag::text')[0].extract()
                push_user = comment.css('span.push-userid::text')[0].extract()
                push_content = comment.css('span.push-content::text')[0].extract()

                if '推' in push_tag:
                    score = 1
                elif '噓' in push_tag:
                    score = -1
                else:
                    score = 0

                total_score += score
                comments.append({'user': push_user,
                                 'content': push_content,
                                 'score': score})

            item['comments'] = comments
            item['score'] = total_score
            item['url'] = response.url
            # print ( "%s  %-4s %-14s %s" % (item['date'],item['score'],item['author'],item['title']) )
            print(item)
            yield item
        except IndexError as e:
            # deleted or malformed posts are missing the expected fields; skip them
            print(e)
        except Exception:
            raise
Example #6
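Adds a date filter driven by settings.START_DATE / settings.END_DATE: only posts whose date falls inside the configured range are yielded, and each push comment is given a reconstructed timestamp (using the post's year, since PTT does not show a comment's year).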
    def parse_post(self, response):
        # parse the post's metadata and content
        try:
            item = PostItem()

            authorId = response.xpath('//div[@class="article-metaline"]/span[text()="作者"]/following-sibling::span[1]/text()')[0].extract().split(' ')[0]
            title = response.xpath('//meta[@property="og:title"]/@content')[0].extract()
            
            # first check whether the post date falls inside the date range we want to crawl
            datetime_str = response.xpath('//div[@class="article-metaline"]/span[text()="時間"]/following-sibling::span[1]/text()')[0].extract()
            
            # datetime_str example: Sat Feb 29 11:52:22 2020
            # the post's full datetime (named post_datetime to avoid shadowing datetime)
            post_datetime = datetime.strptime(datetime_str, '%a %b %d %H:%M:%S %Y')

            print('%s %-14s %s' % (post_datetime, authorId, title))
            
            year = datetime_str.split(' ')[-1]
            month = datetime_str.split(' ')[1]
            day = datetime_str.split(' ')[2]

            # the post's date (date only, no time)
            date_str = f'{year}-{month}-{day}'
            date = datetime.strptime(date_str, '%Y-%b-%d')  # the post's date as a datetime
            start_date = datetime.strptime(settings.START_DATE, '%Y-%m-%d')  # start of the date range as a datetime
            end_date = datetime.strptime(settings.END_DATE, '%Y-%m-%d')  # end of the date range as a datetime

            print(f'Post date: {date}', end=',')
            print(f'Date range: {start_date}~{end_date}')

            # compare by date only; the time of day is not considered
            if start_date <= date <= end_date:
                # the post falls inside the configured date range

                # fill in the item fields
                item['authorId'] = authorId
                name_beforeRegex = response.xpath('//div[@class="article-metaline"]/span[text()="作者"]/following-sibling::span[1]/text()')[0].extract().split(' ')
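                # the author field looks like "userid (nickname)"; join the remaining
                # tokens and strip the surrounding parentheses to get the nickname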
                name = ''.join(name_beforeRegex[1:])
                item['authorName'] = name[1:-1]
                item['title'] = title
                
                item['publishedTime'] = post_datetime.timestamp()
                content_elems = response.xpath(
                    '//div[@id="main-content"]'
                    '/text()['
                    'not(contains(@class, "push")) and '
                    'not(contains(@class, "article-metaline")) and '
                    'not(contains(@class, "f2"))'
                    ']')
                item['content'] = ''.join([c.extract() for c in content_elems])
                item['canonicalUrl'] = response.url
                item['createdTime'] = post_datetime
                item['updateTime'] = post_datetime

                # parse the push comments
                comments = []
                for comment in response.xpath('//div[@class="push"]'):
                    push_user = comment.css('span.push-userid::text')[0].extract()
                    push_content = comment.css('span.push-content::text')[0].extract()
                    push_ipdatetime = comment.css('span.push-ipdatetime::text')[0].extract()
                    comment_date = push_ipdatetime.strip().split(' ')[1] # mm/dd
                    comment_time = push_ipdatetime.strip().split(' ')[2] # hh:mm
                    comment_month, comment_day = comment_date.split('/')
                    comment_hour, comment_minute = comment_time.split(':')
                    # PTT pages do not record a comment's year, so fall back to the post's year
                    comment_year = year
                    comment_datetime_str = f'{comment_year} {comment_month} {comment_day} {comment_hour}:{comment_minute}'
                    push_time = datetime.strptime(comment_datetime_str, '%Y %m %d %H:%M')

                    comments.append({'commentId': push_user,
                                    'commentContent': push_content,
                                    'commentTime': push_time})

                item['comments'] = comments

                # track how many posts have been crawled
                # self._post += 1
                # logging.warning(f'crawled {self._post} posts so far\n')
                
                yield item
            else:
                # the post is outside the date range
                print(f'{post_datetime} is not in the date range. Please check settings.py\n')
                return
            
        except Exception:
            # silently skip posts that cannot be parsed (e.g. deleted articles)
            return