Example #1
0
def main():
    """Parse command-line arguments and run the PTT web crawler.

    Modes (mutually exclusive, exactly one required):
      -i START END  crawl pages START..END (END == -1 means "up to last page")
      -n N          crawl the last N pages of the board
      -a ID         crawl a single article by its id

    Output is written by PttWebCrawler into the directory given by -o
    (default: 'data').
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=
        ("A crawler for the web version of PTT, the largest online community in Taiwan."
         "Input: board name and page indices (or article ID)"
         "Output: BOARD_NAME-START_INDEX-END_INDEX.json (or BOARD_NAME-ID.json)"
         ))
    parser.add_argument('-b',
                        metavar='BOARD_NAME',
                        help='Board name',
                        required=True)
    parser.add_argument('-o',
                        '--output-dir',
                        help='output directory',
                        default='data')
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('-i',
                       metavar=('START_INDEX', 'END_INDEX'),
                       type=int,
                       nargs=2,
                       help="Start and end index")
    group.add_argument('-a', metavar='URL', help='article id')
    group.add_argument('-n', metavar='N', help='number of pages to retrieve')
    parser.add_argument('-v',
                        '--version',
                        action='version',
                        version='%(prog)s ' + __version__)

    args = parser.parse_args()

    ptt = PttWebCrawler(args.output_dir)
    # -b is required=True, so args.b is always set; assign unconditionally to
    # guarantee `board` is bound before any of the branches below use it.
    board = args.b

    # The three modes are mutually exclusive (enforced by argparse), so an
    # elif chain makes the one-of-three contract explicit.
    if args.i:
        start = args.i[0]
        if args.i[1] == -1:
            # -1 is a sentinel meaning "crawl through the board's last page".
            end = ptt.getLastPage(board)
        else:
            end = args.i[1]
        ptt.parse_articles(start, end, board)
    elif args.n:
        # Crawl the most recent N pages, ending at the last page.
        end = ptt.getLastPage(board)
        start = end - int(args.n) + 1
        ptt.parse_articles(start, end, board)
    elif args.a:
        ptt.parse_article(args.a, board)
Example #2
0
 def test_getLastPage(self):
     """getLastPage() should not raise for known-good boards.

     Narrowed the original bare ``except:`` to ``except Exception`` so that
     KeyboardInterrupt/SystemExit are not swallowed and reported as a
     test failure.
     """
     boards = ['NBA', 'Gossiping', 'b994060work']  # b994060work for 6259fc0 (pull/6)
     for board in boards:
         try:
             _ = crawler.getLastPage(board)
         except Exception:
             self.fail("getLastPage() raised Exception.")
Example #3
0
 def test_getLastPage(self):
     """getLastPage() should not raise for known-good boards.

     Narrowed the original bare ``except:`` to ``except Exception`` so that
     KeyboardInterrupt/SystemExit are not swallowed and reported as a
     test failure.
     """
     boards = ['NBA', 'Gossiping', 'b994060work']  # b994060work for 6259fc0 (pull/6)
     for board in boards:
         try:
             _ = crawler.getLastPage(board)
         except Exception:
             self.fail("getLastPage() raised Exception.")
Example #4
0
#msg = "Notify from Python \nHave a nice day 張小捲"
# lineNotify(token, msg)

# 發圖片
#msg = "Hello Python"
#picURI = 'C:\\Users\\jonson\Desktop\\ptt_beauty_LineNotify\\a.jpg'
#picURI = 'https://i.imgur.com/eCNafC4.jpg'
# lineNotifyPic(token, msg, picURI)

# Tracks articles already processed/notified — appears unused in the visible
# lines; presumably appended to further down. TODO confirm against full file.
history_list = []

if __name__ == '__main__':
    # Line Notify access token (placeholder value — must be replaced with a
    # real token for notifications to work).
    token = "ddddddddddddddddd"
    board = "Beauty"
    push_num = 10  # push-count (upvote) threshold for an article to qualify
    # Crawl only the two most recent index pages of the board.
    last_page = crawler.getLastPage(board)
    index_start = last_page - 1
    index_end = last_page
    # The crawler writes its results to BOARD-START-END.json in the cwd.
    filename = '{0}-{1}-{2}.json'.format(board, index_start, index_end)
    crawler(['-b', board, '-i', str(index_start), str(index_end)])
    #    with codecs.open(filename, 'r', encoding='utf-8') as f:
    #        data = json.load(f)
    # M.1127808641.A.C03.html is empty, so decrease 1 from 40 articles
    #self.assertEqual(len(data['articles']), 39)

    # Load the freshly written JSON, then delete the file immediately so
    # repeated runs don't accumulate output.
    data = crawler.get(filename)
    os.remove(filename)
    articles = data['articles']
    # NOTE(review): loop body continues beyond this excerpt; only the field
    # extraction is visible here.
    for a in articles:
        title = a['article_title']
        article_id = a['article_id']