Example #1
0
def main(argv=None):
    """Parse command-line options and run the PTT web crawler.

    Exactly one of the following modes must be selected:

    * ``-i START_INDEX END_INDEX``: crawl an index-page range; an
      END_INDEX of -1 is replaced by ``ptt.getLastPage(board)``.
    * ``-n N``: crawl the last N index pages of the board.
    * ``-a``: crawl the single article with the given id.

    Args:
        argv: Optional list of argument strings. When ``None`` (the
            default) argparse reads ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        # RawDescriptionHelpFormatter prints the text verbatim, so each
        # sentence needs its own explicit line break.
        description=(
            "A crawler for the web version of PTT, the largest online community in Taiwan.\n"
            "Input: board name and page indices (or article ID)\n"
            "Output: BOARD_NAME-START_INDEX-END_INDEX.json (or BOARD_NAME-ID.json)"
        ))
    parser.add_argument('-b',
                        metavar='BOARD_NAME',
                        help='Board name',
                        required=True)
    parser.add_argument('-o',
                        '--output-dir',
                        help='output directory',
                        default='data')
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('-i',
                       metavar=('START_INDEX', 'END_INDEX'),
                       type=int,
                       nargs=2,
                       help="Start and end index")
    group.add_argument('-a', metavar='URL', help='article id')
    # type=int lets argparse reject non-numeric input with a usage error
    # instead of a ValueError traceback further down.
    group.add_argument('-n',
                       metavar='N',
                       type=int,
                       help='number of pages to retrieve')
    parser.add_argument('-v',
                        '--version',
                        action='version',
                        version='%(prog)s ' + __version__)

    args = parser.parse_args(argv)

    ptt = PttWebCrawler(args.output_dir)
    # -b is required, so it is always present once parsing succeeded.
    board = args.b

    # The group is mutually exclusive and required: exactly one branch runs.
    if args.i is not None:
        start, end = args.i
        if end == -1:
            # -1 is the sentinel for "up to the board's last page".
            end = ptt.getLastPage(board)
        ptt.parse_articles(start, end, board)
    elif args.n is not None:
        end = ptt.getLastPage(board)
        start = end - args.n + 1
        ptt.parse_articles(start, end, board)
    elif args.a is not None:
        ptt.parse_article(args.a, board)
Example #2
0
def crawler(title):
    """Print the last index-page number of a PTT board, then parse page 1.

    Downloads ``https://www.ptt.cc/bbs/<title>/index.html`` (sending the
    ``over18=1`` cookie), scans every ``<a>`` tag for links of the form
    ``/bbs/<title>/index<N>.html`` and, for each such link with N != 1,
    records ``N + 1``. The value from the last matching link is printed;
    if no link matches, the placeholder 999999 is printed instead.

    Args:
        title: Board name as it appears in the PTT URL.
    """
    r = requests.get('https://www.ptt.cc/bbs/' + title + '/index.html',
                     cookies={'over18': '1'},
                     verify=True)
    sp = bs(r.text, 'html.parser')

    # Raw strings avoid invalid-escape warnings; re.escape keeps characters
    # in the board name from being read as regex syntax. Capturing the page
    # number here (instead of re-scanning the href for the first digit run)
    # stays correct when the board name itself contains digits.
    index_link = re.compile(r'/bbs/' + re.escape(title) + r'/index(\d+)\.html')

    end = 999999
    for anchor in sp.find_all('a'):
        href = anchor.get('href')
        if href is None:
            continue
        found = index_link.match(href)
        if found is None:
            continue
        page = int(found.group(1))
        # index1.html is skipped; any other index link yields page + 1.
        if page != 1:
            end = page + 1
    print(end)

    # NOTE(review): `end` is only printed; the crawl below is fixed to
    # page 1. Confirm whether parse_articles(1, end, title) was intended.
    c = PttWebCrawler(as_lib=True)
    c.parse_articles(1, 1, title)