def fetch(email='', password='', status=False, gossip=False, album=False,
          blog=False, refresh_count=False, uid=0):
    if not email:
        email = input(
            "Input renren account email (aka. [email protected]): ")
    if not password:
        password = getpass.getpass("Input renren password (will not show): ")
    prepare_db()
    config.crawler = Crawler(email, password, Crawler.load_cookie())
    uid = uid or config.crawler.uid
    fetched = fetch_user(uid, fetch_status=status, fetch_gossip=gossip,
                         fetch_album=album, fetch_blog=blog)
    if not fetched:
        logger.info('nothing to fetch, just test login')
    if fetched or refresh_count:
        update_fetch_info(uid)
def run(self):
    self.dataBase = createProdDataBase()
    self.renrenAccountPool = createProdRenrenAccountPool()
    for i in range(0, self.ROUND_NUMBER):
        log.info('>>>>>>>> Main Crawl Thread Round(%s) <<<<<<<<' % (i + 1))
        if self.dataBase.needMoreStartNode():
            startNodeCrawler = StartNodeCrawler(
                self.dataBase, self.renrenAccountPool)
            startNodeCrawler.startCrawling()
        self.startMultiThreadCrawling(self.THREAD_NUMBER)
        #self.startMultiThreadCrawlingWithProxy(1)
        #manager.startSignleThreadCrawling()
        try:
            Crawler.detectStopSignal()
        except Exception, e:
            break
        log.info('>>>>>> Router disconnect PPPoE <<<<<<')
        router.disconnectPPPoE()
        time.sleep(2)
        log.info('>>>>>> Router connect PPPoE <<<<<<')
        router.connectPPPoE()
        # Wait for the connection to be established.
        time.sleep(10)
def test():
    log.config(GC.LOG_FILE_DIR + 'crawler_test', 'info', 'info')
    db = createConnection()
    createTables(db)
    dropTables(db)
    createTables(db)
    pool = renrenaccountpool.createProdRenrenAccountPool()
    accounts = pool.getAccounts(1)
    account = accounts[0]
    global crawler
    try:
        crawler = Crawler(db)
        agent = RenrenAgent(account)
        agent.login()
        crawler.setAgent(agent)
        id = "322601086"
        crawler.crawl(id, 30)
    except CrawlerException, e:
        log.info("Crawler end, reason: " + str(e))
        if e.errorCode == CrawlerErrorCode.DETECT_STOP_SIGNAL:
            print "detect int signal"
            return
def checkforconfig(self, dir):
    filename = 'config.ini'
    filepath = os.path.join(dir, filename)
    if os.path.isfile(filepath):
        # Pass the config file on for crawling.
        print('Config file found...')
        sleep(2)
        print('Crawling has begun...')
        Crawler(filepath)
    else:
        # Create a config file here, then pass it on for crawling.
        print('No config file')
        sleep(2)
        print('Creating new config file with default settings...')
        config = configparser.ConfigParser()
        config['DEFAULT'] = {'StandardMedia': 'http://www.standardmedia.co.ke/'}
        # Write at filepath so the Crawler below finds the file it is given.
        with open(filepath, mode='w') as configfile:
            config.write(configfile)
        print('Config File created...')
        sleep(2)
        print('Crawling has begun...')
        Crawler(filepath)
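For reference, a minimal sketch of reading the generated config.ini back with configparser; the section and key match the defaults written above, while the standalone-script framing is an assumption.

import configparser

# Read the default crawl target back out of the config file written above.
config = configparser.ConfigParser()
config.read('config.ini')
print(config['DEFAULT']['StandardMedia'])  # http://www.standardmedia.co.ke/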
class Getter():
    def __init__(self):
        self.crawler = Crawler()
        self.redis = RedisClient()

    def is_over_limit(self):
        # Stop adding proxies once the pool reaches its maximum size.
        return self.redis.get_count() > POOL_MAX_COUNT

    def run(self):
        if not self.is_over_limit():
            for crawFunc_label in self.crawler.__CrawFunc__:
                proxies = self.crawler.get_proxies(crawFunc_label)
                for proxy in proxies:
                    if not self.redis.exist(proxy):
                        self.redis.add(proxy)
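A minimal sketch of driving the Getter above on a schedule; the POOL_CYCLE interval and the __main__ wrapper are assumptions, not part of the original snippet.

import time

POOL_CYCLE = 60  # assumed interval (seconds) between pool refills

if __name__ == '__main__':
    getter = Getter()
    while True:
        # Each round tops up Redis with fresh proxies unless the pool is full.
        getter.run()
        time.sleep(POOL_CYCLE)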
def crawl(self):
    soup = Crawler.get_soup(self.url)
    self.url = 'http://www.gsmarena.com/' + soup.find(
        'a', text='Read all opinions')['href']
    soup = Crawler.get_soup(self.url)
    review_page_count = int(
        soup.find('div', {'id': 'user-pages'}).findAll('a')[-2].getText())
    url = self.url
    for i in range(2, review_page_count):
        reviews = soup.findAll('p', {'class': 'uopin'})
        for r in reviews:
            # Strip links and spans so only the review text remains.
            for tag in r.findAll('a'):
                tag.replaceWith('')
            for tag in r.findAll('span'):
                tag.replaceWith('')
            print(r.getText().strip())
        url = self.url.replace('.php', 'p%d.php' % i)
        soup = Crawler.get_soup(url)
def __init__(self):
    self.crawler = Crawler()
    self.redis = RedisClient()
def prepare_crawler(args):
    from crawl.crawler import Crawler
    config.crawler = Crawler(args.email, args.password, Crawler.load_cookie())
    return config.crawler
from crawl.crawler import Crawler

start_url = "http://stzb.163.com/card_list.html"
demo = Crawler(start_url)
demo.startCrawl()
def detectSignal(a, b):
    print "INT signal detected"
    Crawler.setStopSignal()
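detectSignal(a, b) has the (signum, frame) shape that Python's signal module expects from a handler, so it is presumably registered for SIGINT; a minimal sketch of that wiring (the registration line itself is an assumption):

import signal

# Route Ctrl-C to detectSignal so the crawler can stop gracefully.
signal.signal(signal.SIGINT, detectSignal)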
def run(self):
    log.info('>>>>>> Thread %s start. <<<<<<' % self.threadId)
    crawler = Crawler(self.dataBase)
    dataBase = self.dataBase
    agent = None
    account = None
    startNode = None
    startNodeRowId = None
    try:
        while True:
            # Prepare the agent, account and start node.
            if not startNode:
                startNode, startNodeRowId = dataBase.getStartNode()
                log.info('Thread %s, startnode: %s, %s' %
                         (self.threadId, startNode, startNodeRowId))
                if not startNode or not startNodeRowId:
                    # No available start node, exit crawling.
                    log.error(
                        'No start node for thread %s, exit crawling.' %
                        (self.threadId, ))
                    break
            if not agent or not account:
                agent, account = self.getAgentWithAccount()
                if not agent or not account:
                    # No available account, exit crawling.
                    log.warning(
                        'No available agent for thread %s, exit crawling.' %
                        (self.threadId, ))
                    break
            # One crawling process.
            crawler.setAgent(agent)
            try:
                crawler.crawl(startNode)
            except CrawlerException, e:
                log.info('Thread %s gets exception: %s' %
                         (self.threadId, str(e)))
                if e.errorCode == CrawlerErrorCode.DETECT_STOP_SIGNAL:
                    log.info("Thread " + str(self.threadId) +
                             " stops crawling because of stop signal.")
                    break
                if (e.errorCode == CrawlerErrorCode.GET_EXPANDING_NODE_FAILED or
                        e.errorCode == CrawlerErrorCode.EXPAND_EXPANDED_NODE or
                        e.errorCode == CrawlerErrorCode.NO_NODE_TO_EXPAND):
                    # The start node is bad.
                    log.warning('Thread %s, bad start node: %s, %s' %
                                (self.threadId, startNode, startNodeRowId))
                    dataBase.deleteFromStartList(startNode)
                    startNode = startNodeRowId = None
                if e.errorCode == CrawlerErrorCode.REQUEST_FAILED:
                    # Still the start node's fault.
                    # TODO: Implement invalid usernode test support in the
                    # database to change this.
                    log.warning('Thread %s, bad start node: %s, %s' %
                                (self.threadId, startNode, startNodeRowId))
                    dataBase.deleteFromStartList(startNode)
                    startNode = startNodeRowId = None
                if e.errorCode == CrawlerErrorCode.REACH_REQUEST_LIMIT:
                    # Use a new account.
                    account.finishUsing()
                    account = agent = None
    finally:
        # The start node changes every time crawler.expand() is called, so it
        # cannot be reused after an exception. Release it and use a new one.
        if startNodeRowId:
            dataBase.releaseStartNode(startNodeRowId)
            startNode = startNodeRowId = None
def prepare_crawler(args):
    from crawl.crawler import Crawler
    config.crawler = Crawler(args.email, args.password)
    return config.crawler
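A minimal sketch of wiring prepare_crawler() to argparse; the --email/--password flag names and the surrounding script are assumptions for illustration, chosen only because prepare_crawler() reads args.email and args.password.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--email', default='')
parser.add_argument('--password', default='')
args = parser.parse_args()

# args exposes .email and .password, which prepare_crawler() expects.
crawler = prepare_crawler(args)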