import logging
import os
import sys
import traceback

# Scrape one whole bitcointalk board: every topic page, every topic on it.
boardId = 74

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s:%(message)s',
    datefmt='%m/%d/%Y %I:%M:%S %p')

# Make sure we don't rescrape information already in the DB
memoizer.remember()

# BUG FIX: the original message had no {0} placeholder, so .format(boardId)
# silently dropped the board ID from the log line.
logging.info("Beginning scrape of board ID {0}...".format(boardId))
board = memoizer.scrapeBoard(boardId)
logging.info("Found {0} topic pages in board...".format(
    board['num_pages']))
for boardPageNum in range(1, board['num_pages'] + 1):
    logging.info(">Scraping page {0}...".format(boardPageNum))
    topicIds = memoizer.scrapeTopicIds(boardId, boardPageNum)
    for topicId in topicIds:
        logging.info(">>Starting scrape of topic ID {0}...".format(topicId))
        try:
            topic = memoizer.scrapeTopic(topicId)
        except Exception:
            # One unreachable topic must not abort the whole board scrape:
            # dump the traceback for debugging and move on to the next ID.
            print('-' * 60)
            print("Could not request URL for topic {0}:".format(topicId))
            print(traceback.format_exc())
            print('-' * 60)
            # BUG FIX: this statement was truncated mid-call in the source;
            # closed and followed by `continue`, mirroring the identical
            # handler in the sibling script below.
            logging.info(">>Could not request URL for topic {0}:".format(
                topicId))
            continue
# --- Beispiel #2 (example separator left over from scraping; commented out so the file parses) ---
# Scrape every topic ID listed (one per line) in data.txt, plus each
# topic's board, its message pages, and each message's author.
# BUG FIX: use a context manager instead of a bare open()/close() pair, and
# strip each line -- readlines() keeps the trailing "\n", so the original
# passed IDs like "12345\n" to the scraper and into the log output.
with open('data.txt') as f:
    topicIdList = [line.strip() for line in f if line.strip()]

for topicId in topicIdList:
    logging.info(">Starting scrape of topic ID {0}...".format(topicId))
    try:
        topic = memoizer.scrapeTopic(topicId)
    except Exception:
        # One unreachable topic must not abort the whole run: dump the
        # traceback for debugging and move on to the next ID.
        print('-' * 60)
        print("Could not request URL for topic {0}:".format(topicId))
        print(traceback.format_exc())
        print('-' * 60)
        logging.info(">Could not request URL for topic {0}:".format(topicId))
        continue
    logging.info(">Scraping related board...")
    memoizer.scrapeBoard(topic['board'])
    # BUG FIX: the loop below visits num_pages pages, so log that count
    # instead of the original off-by-one "num_pages - 1".
    logging.info(">Found {0} message pages...".format(topic['num_pages']))
    for pageNum in range(1, topic['num_pages'] + 1):
        logging.info(">>Scraping page {0}...".format(pageNum))
        messages = memoizer.scrapeMessages(topic['id'], pageNum)
        for message in messages:
            # member id 0 appears to denote a guest/anonymous poster --
            # nothing to fetch (TODO confirm against memoizer.scrapeMember).
            if message['member'] > 0:
                memoizer.scrapeMember(message['member'])
        logging.info(">>Done with page {0}.".format(pageNum))
    logging.info(">Done scraping topic ID {0}.".format(topicId))

logging.info("All done.")
logging.info("Made {0} requests in total.".format(bitcointalk.countRequested))
# --- Beispiel #3 (example separator left over from scraping; commented out so the file parses) ---
import traceback
import multiprocessing
from multiprocessing import Pool, freeze_support
from tqdm import tqdm
import argparse
from forumlist import *

if __name__ == '__main__':
    # Send log output to a file so long-running scrapes keep a history.
    logging.basicConfig(filename="output.log",
                        level=logging.INFO,
                        format='%(asctime)s %(levelname)s:%(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S %p')

    # --boards takes one or more board names, looked up in forumIDs below.
    parser = argparse.ArgumentParser(description='Scrape bitcointalk.org')
    parser.add_argument('--boards',
                        nargs='+',
                        dest="boards",
                        required=True,
                        help='Set the forum boards you want to collect from')
    parsedArgs = parser.parse_args()

    freeze_support()
    # Make sure we don't rescrape information already in the DB
    memoizer.remember()

    for boardName in parsedArgs.boards:
        memoizer.scrapeBoard(forumIDs[boardName])

    logging.info("Made {0} requests in total.".format(
        bitcointalk.countRequested))