Example #1
import argparse

# PttWebCrawler and __version__ are assumed to be defined in the same module.


def main():
    """The main routine."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=(
            "A crawler for the web version of PTT, the largest online community in Taiwan.\n"
            "Input: board name and page indices (or article ID)\n"
            "Output: BOARD_NAME-START_INDEX-END_INDEX.json (or BOARD_NAME-ID.json)"))
    parser.add_argument('-b',
                        metavar='BOARD_NAME',
                        help='Board name',
                        required=True)
    parser.add_argument('-o',
                        '--output-dir',
                        help='output directory',
                        default='data')
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('-i',
                       metavar=('START_INDEX', 'END_INDEX'),
                       type=int,
                       nargs=2,
                       help="Start and end index")
    group.add_argument('-a', metavar='ARTICLE_ID', help='article id')
    group.add_argument('-n', metavar='N', help='number of pages to retrieve')
    parser.add_argument('-v',
                        '--version',
                        action='version',
                        version='%(prog)s ' + __version__)

    args = parser.parse_args()

    ptt = PttWebCrawler(args.output_dir)
    board = args.b  # -b is required, so the board name is always present

    if args.i:
        start = args.i[0]
        if args.i[1] == -1:
            end = ptt.getLastPage(board)
        else:
            end = args.i[1]

        ptt.parse_articles(start, end, board)

    if args.n:
        end = ptt.getLastPage(board)
        start = end - int(args.n) + 1
        ptt.parse_articles(start, end, board)

    if args.a:
        article_id = args.a
        ptt.parse_article(article_id, board)
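A quick usage sketch for this entry point (assuming main() lives in a crawler.py next to PttWebCrawler; the board and indices below are only illustrative):

# Crawl pages 1-2 of PublicServan into data/PublicServan-1-2.json:
#   python crawler.py -b PublicServan -i 1 2
# Crawl up to the newest page by passing -1 as the end index:
#   python crawler.py -b PublicServan -i 1 -1
# Crawl one article by its ID, or the newest N pages:
#   python crawler.py -b PublicServan -a M.1409529482.A.9D3
#   python crawler.py -b PublicServan -n 3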
Example #2
import re

import requests
from bs4 import BeautifulSoup as bs

from PttWebCrawler.crawler import PttWebCrawler  # assumed import path


def crawler(title):
    r = requests.get('https://www.ptt.cc/bbs/' + title + '/index.html',
                     cookies={'over18': '1'},  # bypass the over-18 confirmation page
                     verify=True)
    sp = bs(r.text, 'html.parser')
    end = 999999
    # index.html is the newest page; its numbered pagination links point at
    # neighbouring pages, so the largest index found plus one is the newest index.
    for i in sp.find_all('a'):
        href = i.get('href')
        if href is None:
            continue
        # Read the page number from the match group so digits in the board
        # name cannot be picked up by mistake.
        m = re.match(r'/bbs/' + title + r'/index(\d+)\.html', href)
        if m is not None and int(m.group(1)) != 1:
            end = int(m.group(1)) + 1
    print(end)
    c = PttWebCrawler(as_lib=True)
    c.parse_articles(1, 1, title)
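The function above only estimates the newest index from the pagination links and prints it before crawling the first page. A minimal invocation (board name illustrative); following the BOARD-START-END.json convention from Example #1, it should produce PublicServan-1-1.json:

crawler('PublicServan')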
Example #3
    def test_getLastPage(self):
        boards = ['NBA', 'Gossiping', 'b994060work']  # b994060work for 6259fc0 (pull/6)
        for board in boards:
            try:
                _ = crawler.getLastPage(board)
            except Exception:
                self.fail("getLastPage() raised Exception.")
Example #4
    def test_parse_with_push_without_contents(self):
        self.link = 'https://www.ptt.cc/bbs/Gossiping/M.1433091897.A.1C5.html'
        self.article_id = 'M.1433091897.A.1C5'
        self.board = 'Gossiping'

        jsondata = json.loads(crawler.parse(self.link, self.article_id, self.board))
        self.assertEqual(jsondata['article_id'], self.article_id)
        self.assertEqual(jsondata['board'], self.board)
Example #5
    def test_parse_without_metalines(self):
        self.link = 'https://www.ptt.cc/bbs/NBA/M.1432438578.A.4B0.html'
        self.article_id = 'M.1432438578.A.4B0'
        self.board = 'NBA'

        jsondata = json.loads(crawler.parse(self.link, self.article_id, self.board))
        self.assertEqual(jsondata['article_id'], self.article_id)
        self.assertEqual(jsondata['board'], self.board)
Example #6
    def test_parse_with_structured_push_contents(self):
        self.link = 'https://www.ptt.cc/bbs/Gossiping/M.1119222660.A.94E.html'
        self.article_id = 'M.1119222660.A.94E'
        self.board = 'Gossiping'

        jsondata = json.loads(crawler.parse(self.link, self.article_id, self.board))
        self.assertEqual(jsondata['article_id'], self.article_id)
        self.assertEqual(jsondata['board'], self.board)
Example #7
    def test_parse(self):
        self.link = 'https://www.ptt.cc/bbs/PublicServan/M.1409529482.A.9D3.html'
        self.article_id = 'M.1409529482.A.9D3'
        self.board = 'PublicServan'

        jsondata = json.loads(crawler.parse(self.link, self.article_id, self.board))
        self.assertEqual(jsondata['article_id'], self.article_id)
        self.assertEqual(jsondata['board'], self.board)
        self.assertEqual(jsondata['message_count']['count'], 57)
Example #8
    def test_crawler(self):
        crawler(['-b', 'PublicServan', '-i', '1', '2'])
        filename = 'PublicServan-1-2.json'
        with codecs.open(filename, 'r', encoding='utf-8') as f:
            data = json.load(f)
            # M.1127808641.A.C03.html is empty, so one of the 40 articles is dropped
            self.assertEqual(len(data['articles']), 39)
        data = crawler.get(filename)
        self.assertEqual(len(data['articles']), 39)
        os.remove(filename)
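On Python 3, the codecs.open call above is equivalent to the built-in open with an encoding argument, which is the more common spelling today:

with open(filename, 'r', encoding='utf-8') as f:
    data = json.load(f)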
Example #9
    def test_parse_with_structured_push_contents(self):
        self.link = 'https://www.ptt.cc/bbs/Gossiping/M.1119222660.A.94E.html'
        self.article_id = 'M.1119222660.A.94E'
        self.board = 'Gossiping'

        jsondata = json.loads(crawler.parse(self.link, self.article_id, self.board))
        self.assertEqual(jsondata['article_id'], self.article_id)
        self.assertEqual(jsondata['board'], self.board)
        isCatched = False
        for msg in jsondata['messages']:
            if u'http://tinyurl.com/4arw47s' in msg['push_content']:
                isCatched = True
        self.assertTrue(isCatched)
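The membership loop above can be collapsed into a single assertion with any(); the behaviour is identical:

self.assertTrue(any(u'http://tinyurl.com/4arw47s' in msg['push_content']
                    for msg in jsondata['messages']))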
Example #10
    def test_parse(self):
        self.link = 'https://www.ptt.cc/bbs/PublicServan/M.1409529482.A.9D3.html'
        self.article_id = 'M.1409529482.A.9D3'
        self.board = 'PublicServan'

        jsondata = json.loads(crawler.parse(self.link, self.article_id, self.board))
        self.assertIn('article_title', jsondata)
        self.assertIn('content', jsondata)
        self.assertIn('date', jsondata)
        self.assertIn('url', jsondata)
        self.assertIn('ip', jsondata)
        self.assertIn('board', jsondata)

        self.assertEqual(jsondata['article_id'], self.article_id)
        self.assertEqual(jsondata['board'], self.board)
        self.assertEqual(jsondata['message_count']['count'], 57)
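Taken together, these tests pin down the minimal shape of a parsed article. A small helper distilled from the assertions (check_article_shape is a name introduced here, not part of the crawler):

def check_article_shape(jsondata):
    # Keys asserted in Example #10; 'messages' entries carry
    # 'push_content' per Example #9.
    for key in ('article_id', 'article_title', 'content', 'date',
                'url', 'ip', 'board', 'message_count'):
        assert key in jsondata, key
    assert 'count' in jsondata['message_count']
    for msg in jsondata.get('messages', []):
        assert 'push_content' in msg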
Example #11
def run_ptt_give_crawler():
    board = "give"
    # last_page = PttWebCrawler.getLastPage(board)
    # start_no, end_no = last_page - 1, last_page
    start_no, end_no = 0, 0
    loadfile_before = f"{board}-{start_no}-{end_no}-save.json"

    # set default
    articles_before = []
    articles_after = []

    # read file
    if os.path.isfile(loadfile_before):
        with open(loadfile_before, "r", encoding="utf-8") as reader:
            remove = False
            try:
                jf_before = json.loads(reader.read())
                articles_before = jf_before["articles"]
                print_log(f"articles_before len({len(articles_before)})")
            except JSONDecodeError as e:
                print_exception(e)
                remove = True
        if remove:
            os.remove(loadfile_before)

    loadfile_after = PttWebCrawler(as_lib=True).parse_articles(start_no, end_no, board)

    if os.path.isfile(loadfile_after):
        jf_after = None
        with open(loadfile_after, "r", encoding="utf-8") as reader:
            try:
                jf_after = json.loads(reader.read())
                articles_after = jf_after["articles"]
                print_log(f"articles_after len({len(articles_after)})")
            except JSONDecodeError as e:
                print_exception(e)
                print_log(jf_after, "jf_after")
                raise
        os.replace(loadfile_after, loadfile_before)

    # diff_v1(articles_before, articles_after)
    diff_v2(articles_after)
    return True
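The commented-out diff_v1 call suggests a before/after comparison step. A purely illustrative sketch of such a helper (the real diff_v1 is not shown in this excerpt) could compare article IDs between the saved crawl and the fresh one:

def diff_v1(articles_before, articles_after):
    # Illustrative only: report articles that appeared or disappeared,
    # keyed on 'article_id'; print_log is the module's own logger above.
    before_ids = {a['article_id'] for a in articles_before}
    after_ids = {a['article_id'] for a in articles_after}
    print_log(f"added: {sorted(after_ids - before_ids)}")
    print_log(f"removed: {sorted(before_ids - after_ids)}")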
Example #12
#msg = "Notify from Python \nHave a nice day 張小捲"
# lineNotify(token, msg)

# Send a picture
#msg = "Hello Python"
#picURI = 'C:\\Users\\jonson\\Desktop\\ptt_beauty_LineNotify\\a.jpg'
#picURI = 'https://i.imgur.com/eCNafC4.jpg'
# lineNotifyPic(token, msg, picURI)

history_list = []

if __name__ == '__main__':
    token = "ddddddddddddddddd"
    board = "Beauty"
    push_num = 10  # push-count threshold
    last_page = crawler.getLastPage(board)
    index_start = last_page - 1
    index_end = last_page
    filename = '{0}-{1}-{2}.json'.format(board, index_start, index_end)
    crawler(['-b', board, '-i', str(index_start), str(index_end)])
    #    with codecs.open(filename, 'r', encoding='utf-8') as f:
    #        data = json.load(f)
    # M.1127808641.A.C03.html is empty, so decrease 1 from 40 articles
    #self.assertEqual(len(data['articles']), 39)

    data = crawler.get(filename)
    os.remove(filename)
    articles = data['articles']
    for a in articles:
        title = a['article_title']
        article_id = a['article_id']
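The loop body is cut off in this excerpt. Given the push_num threshold and the lineNotify helpers commented out above, a plausible continuation (hypothetical, not the original code) would filter by message count and notify once per article:

        # Hypothetical continuation of the truncated loop body.
        count = a.get('message_count', {}).get('count', 0)
        if count >= push_num and article_id not in history_list:
            history_list.append(article_id)
            lineNotify(token, title)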
Example #13
def run_ptt_give_crawler_v2():
    board = "give"
    start_no, end_no = 0, 0
    PttWebCrawler(as_lib=True).crawl_articles(start_no, end_no, board, on_crawled)
Example #14
def run_ptt_give_crawler_v3():
    for board, watch in WATCH_CONFIG.items():
        start_no, end_no = 0, 0
        PttWebCrawler(as_lib=True).crawl_articles(
            start_no, end_no, board, partial(on_crawled_v2, watch=watch)
        )
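Both v2 and v3 hand a callback to crawl_articles, whose contract is not shown in these excerpts. Assuming the callback receives each parsed article as a dict (an assumption, as is the shape of WATCH_CONFIG), minimal implementations might look like:

def on_crawled(article):
    # Assumption: called once per crawled article with the parsed dict.
    print(article.get('article_title'))

def on_crawled_v2(article, watch):
    # Assumption: 'watch' carries per-board settings from WATCH_CONFIG,
    # e.g. a keyword to look for in the title.
    if watch.get('keyword', '') in article.get('article_title', ''):
        print(article.get('url'))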