def test_crawler(self):
    """Crawl PublicServan index pages 1-2 and check the dumped article count.

    The crawler writes its results to '<board>-<start>-<end>.json' in the
    current directory; the dump is removed afterwards so reruns start clean.
    """
    crawler(['-b', 'PublicServan', '-i', '1', '2'])
    filename = 'PublicServan-1-2.json'
    try:
        with codecs.open(filename, 'r', encoding='utf-8') as f:
            data = json.load(f)
        # M.1127808641.A.C03.html is empty, so decrease 1 from 40 articles
        self.assertEqual(len(data['articles']), 39)
    finally:
        # Remove the dump even when the assertion fails, so a failed run
        # does not leave a stale file that breaks later tests.
        os.remove(filename)
def test_crawler(self):
    """Crawl PublicServan pages 1-2; verify both the raw dump and crawler.get().

    Asserts the same article count twice: once from parsing the JSON dump
    directly, and once via the crawler's own get() accessor, so the two
    read paths cannot silently diverge.
    """
    crawler(['-b', 'PublicServan', '-i', '1', '2'])
    filename = 'PublicServan-1-2.json'
    try:
        with codecs.open(filename, 'r', encoding='utf-8') as f:
            data = json.load(f)
        # M.1127808641.A.C03.html is empty, so decrease 1 from 40 articles
        self.assertEqual(len(data['articles']), 39)
        # crawler.get() must parse the same dump to the same article count.
        data = crawler.get(filename)
        self.assertEqual(len(data['articles']), 39)
    finally:
        # Clean up even on assertion failure so reruns start from scratch.
        os.remove(filename)
def test_crawler(self):
    """Crawl one index page of PublicServan and verify the dump file naming.

    The crawler is expected to write '<board>-<article_id>-<message_count>.json';
    this checks that at least one such file exists, that its basename matches
    the data it contains, and that the recorded board is correct.  All matching
    files are removed afterwards.
    """
    self.board = 'PublicServan'
    crawler(['-b', self.board, '-i', '1', '1'])
    filename_prefix = self.board + '-'
    matched_files = glob.glob(os.path.join('.', filename_prefix + '*'))
    self.assertGreater(len(matched_files), 0)
    try:
        with codecs.open(matched_files[0], 'r', encoding='utf-8') as f:
            data = json.load(f)
        # Check that the saved file name matches the expected pattern.
        expected_name = '{0}-{1}-{2}.json'.format(
            self.board, data['article_id'], data['message_count']['all'])
        self.assertEqual(ntpath.basename(matched_files[0]), expected_name)
        self.assertEqual(data['board'], self.board)
    finally:
        # Remove every matched dump even when an assertion fails, so a
        # failed run cannot leak files into the next one.
        for fn_path in matched_files:
            os.remove(fn_path)
def test_crawler(self):
    """Crawl PublicServan pages 1-2 and count the per-article JSON dumps.

    This variant of the crawler writes one JSON file per article under
    'data/PublicServan'; the whole tree is removed afterwards.
    """
    crawler(['-b', 'PublicServan', '-i', '1', '2'])
    dirname = 'data/PublicServan'
    try:
        # NOTE(review): '**' without recursive=True matches exactly one
        # directory level — confirm the crawler's output layout is
        # data/PublicServan/<subdir>/<file>.json.
        filenames = glob.glob(dirname + '/**/*.json')
        # M.1127808641.A.C03.html is empty, so 40 articles minus 1.
        self.assertEqual(len(filenames), 39)
    finally:
        # Tear down the output tree even when the assertion fails, so the
        # next run is not contaminated by stale dumps.
        shutil.rmtree(dirname)
token = "KXwzqEGtIp1JEkS5GjqXqRAT0D4BdQQvCNcqOa7ySfz" headers = {"Authorization": "Bearer " + token} message = item['date'] + "\n" + item[u'article_title'] + "\n" + item[u'author'] + "\n" + \ item[u'content'] + "\n" + item[u'content'] + "\n" + item[u'url'] + "\n" #message = item[u'b_作者'] + "\n" payload = {"message": message} r = requests.post(url, headers=headers, params=payload) if __name__ == "__main__": KeyWord = (sys.argv[1]) KeyWord = KeyWord.lower() # (設為負數則以倒數第幾頁計算) ptt = crawler(['-b', 'forsale', '-i', '-1', '2']) #filename = 'forsale--1-5.json' print(ptt.json_filename) with codecs.open(ptt.json_filename, 'r', encoding='utf-8') as f: #with open('forsale-0-2.json', 'r') as f: json_data = json.load(f) for item in json_data['articles']: if (KeyWord in item[u'article_title'].lower() or \ KeyWord in item[u'content'].lower()) and \ (u'看板規則' not in item[u'content'].lower() and \ u'公告' not in item[u'article_title'].lower() and \ u'市集' not in item[u'article_title'].lower()): diff_time = moment.utc( time.asctime(time.localtime(
#msg = "Hello Python" #picURI = 'C:\\Users\\jonson\Desktop\\ptt_beauty_LineNotify\\a.jpg' #picURI = 'https://i.imgur.com/eCNafC4.jpg' # lineNotifyPic(token, msg, picURI) history_list = [] if __name__ == '__main__': token = "ddddddddddddddddd" board = "Beauty" push_num = 10 #推文數門檻 last_page = crawler.getLastPage(board) index_start = last_page - 1 index_end = last_page filename = '{0}-{1}-{2}.json'.format(board, index_start, index_end) crawler(['-b', board, '-i', str(index_start), str(index_end)]) # with codecs.open(filename, 'r', encoding='utf-8') as f: # data = json.load(f) # M.1127808641.A.C03.html is empty, so decrease 1 from 40 articles #self.assertEqual(len(data['articles']), 39) data = crawler.get(filename) os.remove(filename) articles = data['articles'] for a in articles: title = a['article_title'] article_id = a['article_id'] url = a['url'] content = a['content'] push = a['message_count']['push'] if push >= push_num and article_id not in history_list: