def main():
    """The main routine."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=(
            'A crawler for the web version of PTT, the largest online community in Taiwan.\n'
            'Input: board name and page indices (or article ID)\n'
            'Output: BOARD_NAME-START_INDEX-END_INDEX.json (or BOARD_NAME-ID.json)'
        ))
    parser.add_argument('-b', metavar='BOARD_NAME', help='board name', required=True)
    parser.add_argument('-o', '--output-dir', help='output directory', default='data')
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('-i', metavar=('START_INDEX', 'END_INDEX'), type=int, nargs=2,
                       help='start and end index')
    group.add_argument('-a', metavar='ARTICLE_ID', help='article id')
    group.add_argument('-n', metavar='N', type=int, help='number of most recent pages to retrieve')
    parser.add_argument('-v', '--version', action='version',
                        version='%(prog)s ' + __version__)
    args = parser.parse_args()
    ptt = PttWebCrawler(args.output_dir)
    board = args.b
    if args.i:
        start = args.i[0]
        # -1 as the end index means "crawl through the last page".
        end = ptt.getLastPage(board) if args.i[1] == -1 else args.i[1]
        ptt.parse_articles(start, end, board)
    elif args.n:
        # Crawl the N most recent pages.
        end = ptt.getLastPage(board)
        start = end - args.n + 1
        ptt.parse_articles(start, end, board)
    elif args.a:
        ptt.parse_article(args.a, board)
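# Usage sketch (the entry-point filename below is an assumption, not shown in
# this excerpt). Given the flags defined above, invocations would look like:
#
#     python crawler.py -b PublicServan -i 1 2    # crawl pages 1-2 of a board
#     python crawler.py -b PublicServan -i 1 -1   # -1 means "up to the last page"
#     python crawler.py -b Gossiping -a M.1433091897.A.1C5   # one article by ID
#     python crawler.py -b Beauty -n 5            # the 5 most recent pages
#
# The tests and scripts below drive the same code path as a library by passing
# an argv-style list, e.g. crawler(['-b', 'PublicServan', '-i', '1', '2']).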
def crawler(title):
    r = requests.get('https://www.ptt.cc/bbs/' + title + '/index.html',
                     cookies={'over18': '1'}, verify=True)
    sp = bs(r.text, 'html.parser')
    end = 999999
    for i in sp.find_all('a'):
        href = i.get('href')
        if href is not None and re.match(r'/bbs/' + title + r'/index(\d+)\.html', href):
            # Skip the "oldest" link (index1.html); the "previous page" link
            # points at index N-1, so adding 1 recovers the last page N.
            if int(re.findall(r'\d+', href)[0]) != 1:
                end = int(re.findall(r'\d+', href)[0]) + 1
    print(end)
    c = PttWebCrawler(as_lib=True)
    c.parse_articles(1, 1, title)
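# getLastPage is called throughout this excerpt but not defined in it. A
# minimal sketch, assuming the same index-page scraping approach as crawler()
# above (the helper name below is hypothetical):

import re
import requests

def get_last_page_sketch(board):
    """Hypothetical helper: return the last index page number of a board."""
    resp = requests.get('https://www.ptt.cc/bbs/' + board + '/index.html',
                        cookies={'over18': '1'}, verify=True)
    # Navigation links look like /bbs/<board>/index1234.html; the largest
    # number among them (the "previous page" link) is one less than the
    # current, i.e. last, page.
    pages = [int(n) for n in re.findall(r'index(\d+)\.html', resp.text)]
    return max(pages) + 1 if pages else 1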
def test_getLastPage(self):
    boards = ['NBA', 'Gossiping', 'b994060work']  # b994060work for 6259fc0 (pull/6)
    for board in boards:
        try:
            _ = crawler.getLastPage(board)
        except Exception:
            self.fail("getLastPage() raised Exception.")
def test_parse_with_push_without_contents(self):
    self.link = 'https://www.ptt.cc/bbs/Gossiping/M.1433091897.A.1C5.html'
    self.article_id = 'M.1433091897.A.1C5'
    self.board = 'Gossiping'
    jsondata = json.loads(crawler.parse(self.link, self.article_id, self.board))
    self.assertEqual(jsondata['article_id'], self.article_id)
    self.assertEqual(jsondata['board'], self.board)
def test_parse_without_metalines(self):
    self.link = 'https://www.ptt.cc/bbs/NBA/M.1432438578.A.4B0.html'
    self.article_id = 'M.1432438578.A.4B0'
    self.board = 'NBA'
    jsondata = json.loads(crawler.parse(self.link, self.article_id, self.board))
    self.assertEqual(jsondata['article_id'], self.article_id)
    self.assertEqual(jsondata['board'], self.board)
def test_parse_with_structured_push_contents(self):
    self.link = 'https://www.ptt.cc/bbs/Gossiping/M.1119222660.A.94E.html'
    self.article_id = 'M.1119222660.A.94E'
    self.board = 'Gossiping'
    jsondata = crawler.parse(self.link, self.article_id, self.board)
    self.assertEqual(jsondata['article_id'], self.article_id)
    self.assertEqual(jsondata['board'], self.board)
def test_parse(self):
    self.link = 'https://www.ptt.cc/bbs/PublicServan/M.1409529482.A.9D3.html'
    self.article_id = 'M.1409529482.A.9D3'
    self.board = 'PublicServan'
    jsondata = json.loads(crawler.parse(self.link, self.article_id, self.board))
    self.assertEqual(jsondata['article_id'], self.article_id)
    self.assertEqual(jsondata['board'], self.board)
    self.assertEqual(jsondata['message_count']['count'], 57)
def test_crawler(self):
    crawler(['-b', 'PublicServan', '-i', '1', '2'])
    filename = 'PublicServan-1-2.json'
    with codecs.open(filename, 'r', encoding='utf-8') as f:
        data = json.load(f)
    # M.1127808641.A.C03.html is empty, so decrease 1 from 40 articles
    self.assertEqual(len(data['articles']), 39)
    data = crawler.get(filename)
    self.assertEqual(len(data['articles']), 39)
    os.remove(filename)
def test_parse_with_structured_push_contents(self):
    self.link = 'https://www.ptt.cc/bbs/Gossiping/M.1119222660.A.94E.html'
    self.article_id = 'M.1119222660.A.94E'
    self.board = 'Gossiping'
    jsondata = json.loads(crawler.parse(self.link, self.article_id, self.board))
    self.assertEqual(jsondata['article_id'], self.article_id)
    self.assertEqual(jsondata['board'], self.board)
    is_caught = False
    for msg in jsondata['messages']:
        if u'http://tinyurl.com/4arw47s' in msg['push_content']:
            is_caught = True
    self.assertTrue(is_caught)
def test_parse(self):
    self.link = 'https://www.ptt.cc/bbs/PublicServan/M.1409529482.A.9D3.html'
    self.article_id = 'M.1409529482.A.9D3'
    self.board = 'PublicServan'
    jsondata = crawler.parse(self.link, self.article_id, self.board)
    self.assertIn('article_title', jsondata)
    self.assertIn('content', jsondata)
    self.assertIn('date', jsondata)
    self.assertIn('url', jsondata)
    self.assertIn('ip', jsondata)
    self.assertIn('board', jsondata)
    self.assertEqual(jsondata['article_id'], self.article_id)
    self.assertEqual(jsondata['board'], self.board)
    self.assertEqual(jsondata['message_count']['count'], 57)
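# Taken together, the assertions above pin down the shape of a parsed article.
# A minimal sketch of the expected structure (field names come from the tests;
# the values are illustrative placeholders only):
#
# {
#     "article_id": "M.1409529482.A.9D3",
#     "article_title": "...",
#     "board": "PublicServan",
#     "content": "...",
#     "date": "...",
#     "ip": "...",
#     "url": "https://www.ptt.cc/bbs/PublicServan/M.1409529482.A.9D3.html",
#     "message_count": {"count": 57, ...},
#     "messages": [{"push_content": "...", ...}, ...]
# }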
def run_ptt_give_crawler():
    board = "give"
    # last_page = PttWebCrawler.getLastPage(board)
    # start_no, end_no = last_page - 1, last_page
    start_no, end_no = 0, 0
    loadfile_before = f"{board}-{start_no}-{end_no}-save.json"

    # Defaults in case either file is missing or unreadable.
    articles_before = []
    articles_after = []

    # Load the snapshot from the previous run, if any.
    if os.path.isfile(loadfile_before):
        with open(loadfile_before, "r") as reader:
            remove = False
            try:
                jf_before = json.loads(reader.read())
                articles_before = jf_before["articles"]
                print_log(f"articles_before len({len(articles_before)})")
            except JSONDecodeError as e:
                print_exception(e)
                remove = True
        if remove:
            os.remove(loadfile_before)

    # Crawl the current pages and load the fresh result.
    loadfile_after = PttWebCrawler(as_lib=True).parse_articles(start_no, end_no, board)
    if os.path.isfile(loadfile_after):
        jf_after = None
        with open(loadfile_after, "r") as reader:
            try:
                jf_after = json.loads(reader.read())
                articles_after = jf_after["articles"]
                print_log(f"articles_after len({len(articles_after)})")
            except JSONDecodeError as e:
                print_exception(e)
                print_log(jf_after, "jf_after")
                raise
        # The fresh crawl becomes the snapshot for the next run.
        os.replace(loadfile_after, loadfile_before)

    # diff_v1(articles_before, articles_after)
    diff_v2(articles_after)
    return True
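# diff_v1/diff_v2 are referenced above but not defined in this excerpt. A
# hypothetical sketch of the before/after comparison, assuming articles are
# dicts keyed by 'article_id' as in the parse output above:

def diff_v1_sketch(articles_before, articles_after):
    """Hypothetical: return articles that appeared since the last run."""
    seen = {a['article_id'] for a in articles_before}
    return [a for a in articles_after if a['article_id'] not in seen]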
# msg = "Notify from Python \nHave a nice day 張小捲"
# lineNotify(token, msg)

# Send a picture
# msg = "Hello Python"
# picURI = 'C:\\Users\\jonson\\Desktop\\ptt_beauty_LineNotify\\a.jpg'
# picURI = 'https://i.imgur.com/eCNafC4.jpg'
# lineNotifyPic(token, msg, picURI)

history_list = []

if __name__ == '__main__':
    token = "ddddddddddddddddd"
    board = "Beauty"
    push_num = 10  # push count threshold
    last_page = crawler.getLastPage(board)
    index_start = last_page - 1
    index_end = last_page
    filename = '{0}-{1}-{2}.json'.format(board, index_start, index_end)
    crawler(['-b', board, '-i', str(index_start), str(index_end)])
    # with codecs.open(filename, 'r', encoding='utf-8') as f:
    #     data = json.load(f)
    # M.1127808641.A.C03.html is empty, so decrease 1 from 40 articles
    # self.assertEqual(len(data['articles']), 39)
    data = crawler.get(filename)
    os.remove(filename)
    articles = data['articles']
    for a in articles:
        title = a['article_title']
        article_id = a['article_id']
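# lineNotify/lineNotifyPic are referenced above but not defined in this
# excerpt. A minimal sketch against the LINE Notify REST endpoint, assuming
# the token is a LINE Notify personal access token (helper name hypothetical):

import requests

def line_notify_sketch(token, msg, pic_url=None):
    """Hypothetical helper: push a text (and optional image URL) notification."""
    headers = {'Authorization': 'Bearer ' + token}
    payload = {'message': msg}
    if pic_url is not None:
        # LINE Notify accepts publicly reachable image URLs.
        payload['imageThumbnail'] = pic_url
        payload['imageFullsize'] = pic_url
    return requests.post('https://notify-api.line.me/api/notify',
                         headers=headers, data=payload)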
def run_ptt_give_crawler_v2():
    board = "give"
    start_no, end_no = 0, 0
    PttWebCrawler(as_lib=True).crawl_articles(start_no, end_no, board, on_crawled)
def run_ptt_give_crawler_v3():
    for board, watch in WATCH_CONFIG.items():
        start_no, end_no = 0, 0
        PttWebCrawler(as_lib=True).crawl_articles(
            start_no, end_no, board, partial(on_crawled_v2, watch=watch)
        )
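# on_crawled / on_crawled_v2 and WATCH_CONFIG are not shown in this excerpt.
# A hypothetical sketch of the callback protocol, assuming crawl_articles
# invokes the callback once per parsed article dict (names and config shape
# below are illustrative, not the library's API):

WATCH_CONFIG_EXAMPLE = {
    "give": ["keyboard", "monitor"],  # board -> keywords to watch
}

def on_crawled_sketch(article, watch=None):
    """Hypothetical callback: flag articles whose title matches a watch list."""
    title = article.get('article_title', '')
    if watch and any(keyword in title for keyword in watch):
        print(f"matched: {article.get('article_id')} {title}")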