def parse_post(self, response):
    """Parse one PTT article page into a PostItem and yield it.

    Extracts author, title, post datetime, body text, the poster's IP
    from the "發信站" line, and the push (comment) list with a running
    up/down score.
    """
    try:
        item = PostItem()
        # Author line looks like "userid (nickname)"; keep only the id.
        item['author'] = response.xpath(
            '//div[@class="article-metaline"]/span[text()="作者"]/following-sibling::span[1]/text()'
        )[0].extract().split(' ')[0]
        item['title'] = response.xpath(
            '//meta[@property="og:title"]/@content')[0].extract()
        datetime_str = response.xpath(
            '//div[@class="article-metaline"]/span[text()="時間"]/following-sibling::span[1]/text()'
        )[0].extract()
        # Format example: "Sat Feb 29 11:52:22 2020"
        item['date'] = datetime.strptime(datetime_str, '%a %b %d %H:%M:%S %Y')
        # NOTE(review): text() nodes have no @class attribute, so the
        # not(contains(@class, ...)) predicates are always true — this
        # selects every direct text node of #main-content. Kept as-is
        # to preserve behavior.
        content_elems = response.xpath(
            '//div[@id="main-content"]'
            '/text()['
            'not(contains(@class, "push")) and '
            'not(contains(@class, "article-metaline")) and '
            'not(contains(@class, "f2"))'
            ']')
        item['content'] = ''.join([c.extract() for c in content_elems])
        # Last token of the "發信站" line is the origin IP.
        item['ip'] = response.xpath(
            '//div[@id="main-content"]/span[contains(text(),"發信站: 批踢踢實業坊(ptt.cc)")]/text()'
        )[0].extract().rstrip().split(' ')[-1]
        comments = []
        total_score = 0
        for comment in response.xpath('//div[@class="push"]'):
            push_tag = comment.css('span.push-tag::text')[0].extract()
            push_user = comment.css('span.push-userid::text')[0].extract()
            push_content = comment.css(
                'span.push-content::text')[0].extract()
            if '推' in push_tag:
                score = 1
            elif '噓' in push_tag:
                score = -1
            else:
                score = 0
            total_score += score
            comments.append({
                'user': push_user,
                'content': push_content,
                'score': score
            })
        item['comments'] = comments
        item['score'] = total_score
        item['url'] = response.url
        print('%s %-4s %-14s %s' % (item['date'], item['score'],
                                    item['author'], item['title']))
        yield item
    except (IndexError, ValueError) as e:
        # Deleted/malformed posts miss expected nodes (IndexError) or have
        # unparseable dates (ValueError). Skip them, but leave a trace
        # instead of the old bare `except:` that swallowed every error.
        print('parse_post failed for %s: %r' % (response.url, e))
        return
def parse_post(self, response):
    """Extract one PTT article (metadata, body, IP, pushes) as a PostItem."""
    item = PostItem()

    author_text = response.xpath(
        '//div[@class="article-metaline"]/span[text()="作者"]/following-sibling::span[1]/text()'
    )[0].extract()
    item['author'] = author_text.split(' ')[0]  # "userid (nickname)" -> id only

    item['title'] = response.xpath(
        '//div[@class="article-metaline"]/span[text()="標題"]/following-sibling::span[1]/text()'
    )[0].extract()

    posted_at = response.xpath(
        '//div[@class="article-metaline"]/span[text()="時間"]/following-sibling::span[1]/text()'
    )[0].extract()
    # Format example: "Sat Feb 29 11:52:22 2020"
    item['date'] = datetime.strptime(posted_at, '%a %b %d %H:%M:%S %Y')

    # Only the first direct text node of the article body.
    item['content'] = response.xpath(
        '//div[@id="main-content"]/text()')[0].extract()

    station_line = response.xpath(
        '//div[@id="main-content"]/span[contains(text(),"發信站: 批踢踢實業坊(ptt.cc)")]/text()'
    )[0].extract()
    item['ip'] = station_line.rstrip().split(' ')[-1:][0]

    pushes = []
    score_sum = 0
    for push in response.xpath('//div[@class="push"]'):
        tag = push.css('span.push-tag::text')[0].extract()
        user = push.css('span.push-userid::text')[0].extract()
        text = push.css('span.push-content::text')[0].extract()
        # 推 = upvote, 噓 = downvote, anything else is neutral.
        vote = 1 if '推' in tag else (-1 if '噓' in tag else 0)
        score_sum += vote
        pushes.append({'user': user, 'content': text, 'score': vote})

    item['comments'] = pushes
    item['score'] = score_sum
    item['url'] = response.url

    print("%s %-4s %-14s %s" % (item['date'], item['score'],
                                item['author'], item['title']))
    yield item
def parse_post(self, response):
    """Yield a PostItem for one PTT article, converting the HTML body to
    plain text with html2text (links stripped)."""
    item = PostItem()

    item['title'] = response.xpath(
        '//meta[@property="og:title"]/@content')[0].extract()

    author_line = response.xpath(
        '//div[@class="article-metaline"]/span[text()="作者"]/following-sibling::span[1]/text()'
    )[0].extract()
    item['author'] = author_line.split(' ')[0]  # keep the user id only

    posted_str = response.xpath(
        '//div[@class="article-metaline"]/span[text()="時間"]/following-sibling::span[1]/text()'
    )[0].extract()
    # Format example: "Sat Feb 29 11:52:22 2020"
    item['date'] = datetime.strptime(posted_str, '%a %b %d %H:%M:%S %Y')

    # Render the whole #main-content element as plain text, no links.
    to_text = html2text.HTML2Text()
    to_text.ignore_links = True
    main_html = response.xpath('//div[@id="main-content"]')[0].extract()
    item['content'] = to_text.handle(main_html)

    pushes = []
    running_score = 0
    for push in response.xpath('//div[@class="push"]'):
        tag = push.css('span.push-tag::text')[0].extract()
        user = push.css('span.push-userid::text')[0].extract()
        text = push.css('span.push-content::text')[0].extract()
        if '推' in tag:
            vote = 1
        elif '噓' in tag:
            vote = -1
        else:
            vote = 0
        running_score += vote
        pushes.append({'user': user, 'content': text, 'score': vote})

    item['comments'] = pushes
    item['score'] = running_score
    item['url'] = response.url
    yield item
def parse_post(self, response):
    """Filter PTT posts by push score and keyword hits, capturing an
    in-content image link.

    A post is yielded only when its total push score reaches
    ``self.total_score_threshold`` AND at least
    ``self.keyword_count_threshold`` of the patterns in ``self.keyword``
    match the body text. When yielded, the item carries
    'incontent_url' / 'incontent_url_type' for the last matching
    .png/.jpg link found in the body (fields absent if none matched).
    """
    item = PostItem()

    comments = []
    total_score = 0
    for comment in response.xpath('//div[@class="push"]'):
        push_tag = comment.css('span.push-tag::text')[0].extract()
        push_user = comment.css('span.push-userid::text')[0].extract()
        push_content = comment.css('span.push-content::text')[0].extract()
        if u'推' in push_tag:
            score = 1
        elif u'噓' in push_tag:
            score = -1
        else:
            score = 0
        total_score += score
        comments.append({'user': push_user,
                         'content': push_content,
                         'score': score})

    if total_score >= self.total_score_threshold:
        check_content = response.xpath(
            '//div[@id="main-content"]/text()')[0].extract()
        # How many configured keyword patterns appear in the body.
        keyword_count = sum(
            1 for kw in self.keyword if re.search(kw, check_content))

        if keyword_count >= self.keyword_count_threshold:
            incontent = response.xpath('//div[@id="main-content"]')
            for href_sel in incontent.css('a::attr(href)'):
                href = href_sel.extract()
                # BUG FIX: the dot was unescaped before ('.png'), so any
                # character followed by "png"/"jpg" matched. Escape it so
                # only a literal ".png"/".jpg" in the URL counts.
                if re.search(r'\.png', href):
                    item['incontent_url'] = href
                    item['incontent_url_type'] = '.png'
                if re.search(r'\.jpg', href):
                    item['incontent_url'] = href
                    item['incontent_url_type'] = '.jpg'
            yield item
def parse_post(self, response):
    """Parse a PTT restaurant-review post into a PostItem.

    Pulls the standard metadata (author/title/date), the labelled
    restaurant fields from the post body (name, address, phone, hours,
    average price, recommended dishes), the first linked blog URL, the
    poster's IP and the push comments, then yields the item.
    """
    try:
        item = PostItem()
        # Author line looks like "userid (nickname)"; keep only the id.
        item['author'] = response.xpath(
            '//div[@class="article-metaline"]/span[text()="作者"]/following-sibling::span[1]/text()')[0].extract().split(' ')[0]
        item['title'] = response.xpath(
            '//div[@class="article-metaline"]/span[text()="標題"]/following-sibling::span[1]/text()')[0].extract()
        datetime_str = response.xpath(
            '//div[@class="article-metaline"]/span[text()="時間"]/following-sibling::span[1]/text()')[0].extract()
        item['date'] = datetime.strptime(datetime_str, '%a %b %d %H:%M:%S %Y')

        # Fetch the body text ONCE. The original re-ran the same XPath
        # (and the same regex) twice per field — 12 extractions in all.
        body = response.xpath(
            '//div[@id="main-content"]/text()')[0].extract()

        def field(label):
            # Text after `label` up to and including the next newline
            # (the trailing '\n' is kept, matching the original
            # group(0).replace(label, '') behavior), or '' when absent.
            m = re.search(label + '(.+?)\n', body, re.S)
            return m.group(0).replace(label, '') if m else ''

        item['name'] = field('餐廳名稱:')
        item['address'] = field('地址:')
        item['phone'] = field('電話:')
        item['time'] = field('營業時間:')
        item['money'] = field('每人平均價位:')
        item['dish'] = field('推薦菜色:')

        try:
            item['blog'] = response.xpath(
                '//div[@id="main-content"]/a/@href')[0].extract()
        except IndexError:
            # No link in the body -> no blog URL.
            item['blog'] = ''

        # Last token of the "發信站" line is the origin IP.
        item['ip'] = response.xpath(
            '//div[@id="main-content"]/span[contains(text(),"發信站: 批踢踢實業坊(ptt.cc)")]/text()')[0].extract().rstrip().split(' ')[-1]

        comments = []
        total_score = 0
        for comment in response.xpath('//div[@class="push"]'):
            push_tag = comment.css('span.push-tag::text')[0].extract()
            push_user = comment.css('span.push-userid::text')[0].extract()
            push_content = comment.css('span.push-content::text')[0].extract()
            if '推' in push_tag:
                score = 1
            elif '噓' in push_tag:
                score = -1
            else:
                score = 0
            total_score += score
            comments.append({'user': push_user,
                             'content': push_content,
                             'score': score})
        item['comments'] = comments
        item['score'] = total_score
        item['url'] = response.url
        print(item)
        yield item
    except IndexError as e:
        # Deleted/malformed posts lack the expected nodes; skip them.
        # (The old `except Exception as e: raise(e)` was a no-op re-raise
        # and has been removed — other exceptions now propagate naturally.)
        print(e)
def parse_post(self, response):
    """Parse a PTT post, yielding it only if its date falls inside the
    [settings.START_DATE, settings.END_DATE] range (dates compared
    without time-of-day).

    The yielded PostItem carries author id/name, title, publish
    timestamp, body text, canonical URL and the parsed comment list.
    """
    try:
        item = PostItem()
        # Author line looks like "userid (nickname)".
        authorId = response.xpath('//div[@class="article-metaline"]/span[text()="作者"]/following-sibling::span[1]/text()')[0].extract().split(' ')[0]
        title = response.xpath('//meta[@property="og:title"]/@content')[0].extract()
        # datetime_str example: "Sat Feb 29 11:52:22 2020"
        datetime_str = response.xpath('//div[@class="article-metaline"]/span[text()="時間"]/following-sibling::span[1]/text()')[0].extract()
        # Full post datetime (named post_datetime to avoid clashing
        # with the datetime class).
        post_datetime = datetime.strptime(datetime_str, '%a %b %d %H:%M:%S %Y')
        print('%s %-14s %s' % (post_datetime, authorId, title))

        # BUG FIX: the date used to be rebuilt from datetime_str.split(' '),
        # which breaks on PTT's double-spaced single-digit days
        # ("Sat Feb  9 ..." yields an empty token). Derive the midnight
        # date directly from the already-parsed datetime instead.
        date = datetime(post_datetime.year, post_datetime.month,
                        post_datetime.day)
        # Date-range bounds from settings (format "YYYY-MM-DD").
        start_date = datetime.strptime(settings.START_DATE, '%Y-%m-%d')
        end_date = datetime.strptime(settings.END_DATE, '%Y-%m-%d')
        print(f'本篇貼文日期:{date}', end=',')
        print(f'規範區間:{start_date}~{end_date}')

        # Compare by date only; time-of-day is not considered.
        if start_date <= date <= end_date:
            item['authorId'] = authorId
            # Everything after the id is "(nickname)"; strip the parens.
            name_beforeRegex = response.xpath('//div[@class="article-metaline"]/span[text()="作者"]/following-sibling::span[1]/text()')[0].extract().split(' ')
            name = ''.join(name_beforeRegex[1:])
            item['authorName'] = name[1:-1]
            item['title'] = title
            item['publishedTime'] = post_datetime.timestamp()
            # NOTE(review): text() nodes carry no @class, so these
            # predicates are always true — this selects every direct text
            # node of #main-content. Kept as-is to preserve behavior.
            content_elems = response.xpath(
                '//div[@id="main-content"]'
                '/text()['
                'not(contains(@class, "push")) and '
                'not(contains(@class, "article-metaline")) and '
                'not(contains(@class, "f2"))'
                ']')
            item['content'] = ''.join([c.extract() for c in content_elems])
            item['canonicalUrl'] = response.url
            item['createdTime'] = post_datetime
            item['updateTime'] = post_datetime

            # Parse the push comments.
            comments = []
            for comment in response.xpath('//div[@class="push"]'):
                push_user = comment.css('span.push-userid::text')[0].extract()
                push_content = comment.css('span.push-content::text')[0].extract()
                push_ipdatetime = comment.css('span.push-ipdatetime::text')[0].extract()
                # Typically " <ip> mm/dd hh:mm" — assumes the IP field is
                # present; some boards omit it. TODO confirm.
                parts = push_ipdatetime.strip().split(' ')
                comment_month, comment_day = parts[1].split('/')
                comment_hour, comment_minute = parts[2].split(':')
                # PTT does not record the comment's year; borrow the
                # post's year as an approximation.
                push_time = datetime.strptime(
                    f'{post_datetime.year} {comment_month} {comment_day} '
                    f'{comment_hour}:{comment_minute}',
                    '%Y %m %d %H:%M')
                comments.append({'commentId': push_user,
                                 'commentContent': push_content,
                                 'commentTime': push_time})
            item['comments'] = comments
            yield item
        else:
            # Post falls outside the configured date range.
            print(f'{post_datetime} is not in the date range. Please check settings.py\n')
            return
    except (IndexError, ValueError) as e:
        # Missing nodes (deleted posts) or unparseable date/time strings.
        # Log and skip instead of the old bare `except:` that hid errors.
        print('parse_post failed for %s: %r' % (response.url, e))
        return