def __init__(self, db_info, id, url_structure):
    """Prepare the thread that scans pages for torrent information links.

    db_info       -- connection settings, handed unchanged to
                     database.connect()
    id            -- identifier stored on the thread as self.id
    url_structure -- page URL structure
                     ( e.g. http://thepiratebay.org/browse/101/TPAGE/3 )
    """
    # The base class must be initialised before the thread can be used.
    Thread.__init__(self)

    # Thread ID.
    self.id = id

    # self.dbc holds whatever connect() returns; self.db is a separate,
    # freshly constructed database object.
    self.dbc = database().connect(db_info)
    self.db = database()

    # Pattern whose single group captures the href of a torrent
    # information link inside a "detName" div (case-insensitive).
    detail_link = r'<div class="detName"><a href="([a-zA-Z0-9\_\-\(\)/\.\[\]]+)"'
    self.torrent_url_pattern = re.compile(detail_link, re.IGNORECASE)

    # Page URL structure, then position the crawler on the first page.
    self.url_structure = url_structure
    self.first_page()

    # Number of pages to parse before the thread is put to sleep so the
    # program stops, and the running count of pages parsed so far.
    self.max_parse, self.num_parsed = 100, 0
def __init__(self, db_info, url):
    """Prepare a thread that works on a single torrent information page.

    db_info -- connection settings, handed unchanged to database.connect()
    url     -- URL stored on the thread as self.url
    """
    threading.Thread.__init__(self)

    # self.dbc holds whatever connect() returns; self.db is a separate,
    # freshly constructed database object.
    self.dbc = database()
    self.dbc = self.dbc.connect(db_info)
    self.db = database()

    # Matches the download anchor; the group captures everything between
    # "http://" and the ".TPB.torrent" suffix. The dots of the suffix are
    # escaped so they only match a literal "." (unescaped, they would
    # accept any character in those two positions).
    self.download_url_pattern = re.compile(
        r'<a href="http://([a-zA-Z0-9\_\-\(\)/\.\[\]]+)\.TPB\.torrent" title="',
        re.I,
    )

    self.url = url
import re
import random
# Bot driver: load the settings, connect to the database, then keep pulling
# batches of rows from the to_crawl collection and build one worker thread
# per row.
config = ConfigParser.RawConfigParser()
# Both files are read into the same parser, database settings first.
config.read(['./config/database.cfg', './config/bot.cfg'])

# Connection settings, in the order they are handed to database.connect().
db_info = [
    config.get('database', 'host'),
    config.get('database', 'port'),
    config.get('database', 'user'),
    config.get('database', 'passw'),
    config.get('database', 'name'),
]

max_threads = config.getint('bot', 'max_threads')

db = database().connect(db_info)

infinite = 1
while infinite == 1:
    count = 0
    threads = []

    # Rows whose "hash" field is 0, capped at one batch.
    # NOTE(review): presumably these are the entries not yet crawled - confirm.
    rows = db.to_crawl.find({"hash": 0}).limit(max_threads)

    # count(True) takes the limit() above into account; ask the server
    # once instead of once per use.
    available = rows.count(True)

    if available < max_threads:
        # Less than a full batch is waiting: pause two minutes, then poll
        # again. Parenthesised so the line parses on Python 2 and 3 alike.
        print("sleeping")
        time.sleep(120)
    else:
        # One worker per fetched row. Iterating the cursor visits every
        # row; the previous range( count - 1 ) loop never built a worker
        # for the last row of the batch, and indexing the cursor issued a
        # separate query per access.
        for row in rows:
            thread = get_site(row['site'])(db_info, row['info_url'])
            threads.append(thread)