def fetch(email='', password='', status=False, gossip=False, album=False,
          blog=False, refresh_count=False, uid=0):
    if not email:
        email = input(
            "Input renren account email (aka. [email protected]): ")
    if not password:
        password = getpass.getpass("Input renren password (will not show): ")
    prepare_db()
    config.crawler = Crawler(email, password, Crawler.load_cookie())
    uid = uid or config.crawler.uid
    fetched = fetch_user(uid, fetch_status=status, fetch_gossip=gossip,
                         fetch_album=album, fetch_blog=blog)
    if not fetched:
        logger.info('nothing to fetch, just test login')
    if fetched or refresh_count:
        update_fetch_info(uid)
def run(self):
    self.dataBase = createProdDataBase()
    self.renrenAccountPool = createProdRenrenAccountPool()
    for i in range(0, self.ROUND_NUMBER):
        log.info('>>>>>>>> Main Crawl Thread Round(%s) <<<<<<<<' % (i + 1))
        if self.dataBase.needMoreStartNode():
            startNodeCrawler = StartNodeCrawler(
                self.dataBase, self.renrenAccountPool)
            startNodeCrawler.startCrawling()
        self.startMultiThreadCrawling(self.THREAD_NUMBER)
        #self.startMultiThreadCrawlingWithProxy(1)
        #manager.startSignleThreadCrawling()
        try:
            Crawler.detectStopSignal()
        except Exception, e:
            break
        log.info('>>>>>> Router disconnect PPPoE <<<<<<')
        router.disconnectPPPoE()
        time.sleep(2)
        log.info('>>>>>> Router connect PPPoE <<<<<<')
        router.connectPPPoE()
        # Wait for the connection to be established.
        time.sleep(10)
def test():
    log.config(GC.LOG_FILE_DIR + 'crawler_test', 'info', 'info')
    db = createConnection()
    createTables(db)
    dropTables(db)
    createTables(db)
    pool = renrenaccountpool.createProdRenrenAccountPool()
    accounts = pool.getAccounts(1)
    account = accounts[0]
    global crawler
    try:
        crawler = Crawler(db)
        agent = RenrenAgent(account)
        agent.login()
        crawler.setAgent(agent)
        id = "322601086"
        crawler.crawl(id, 30)
    except CrawlerException, e:
        log.info("Crawler end, reason: " + str(e))
        if e.errorCode == CrawlerErrorCode.DETECT_STOP_SIGNAL:
            print "detect int signal"
            return
def checkforconfig(self, dir):
    filename = 'config.ini'
    filepath = os.path.join(dir, filename)
    if os.path.isfile(filepath):
        # Pass the config file on for crawling.
        print('Config file found...')
        sleep(2)
        print('Crawling has begun...')
        Crawler(filepath)
    else:
        # Create a config file here, then pass it on for crawling.
        print('No config file')
        sleep(2)
        print('Creating new config file with default settings...')
        config = configparser.ConfigParser()
        config['DEFAULT'] = {'StandardMedia': 'http://www.standardmedia.co.ke/'}
        # Write at filepath so the Crawler below finds the file it is given.
        with open(filepath, mode='w') as configfile:
            config.write(configfile)
        print('Config File created...')
        sleep(2)
        print('Crawling has begun...')
        Crawler(filepath)
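For reference, a minimal sketch of reading the generated config.ini back with configparser; the section and key match the defaults written above, while the standalone-script framing is an assumption.

import configparser

# Read the default crawl target back out of the config file written above.
config = configparser.ConfigParser()
config.read('config.ini')
print(config['DEFAULT']['StandardMedia'])  # http://www.standardmedia.co.ke/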
class Getter():
    def __init__(self):
        self.crawler = Crawler()
        self.redis = RedisClient()

    def is_over_limit(self):
        # Stop adding proxies once the pool reaches its maximum size.
        return self.redis.get_count() > POOL_MAX_COUNT

    def run(self):
        if not self.is_over_limit():
            for crawFunc_label in self.crawler.__CrawFunc__:
                proxies = self.crawler.get_proxies(crawFunc_label)
                for proxy in proxies:
                    if not self.redis.exist(proxy):
                        self.redis.add(proxy)
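A minimal sketch of driving the Getter above on a schedule; the POOL_CYCLE interval and the __main__ wrapper are assumptions, not part of the original snippet.

import time

POOL_CYCLE = 60  # assumed interval (seconds) between pool refills

if __name__ == '__main__':
    getter = Getter()
    while True:
        # Each round tops up Redis with fresh proxies unless the pool is full.
        getter.run()
        time.sleep(POOL_CYCLE)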
def crawl(self):
    soup = Crawler.get_soup(self.url)
    self.url = 'http://www.gsmarena.com/' + soup.find(
        'a', text='Read all opinions')['href']
    soup = Crawler.get_soup(self.url)
    review_page_count = int(
        soup.find('div', {'id': 'user-pages'}).findAll('a')[-2].getText())
    url = self.url
    for i in range(2, review_page_count):
        reviews = soup.findAll('p', {'class': 'uopin'})
        for r in reviews:
            # Strip links and spans so only the review text remains.
            for tag in r.findAll('a'):
                tag.replaceWith('')
            for tag in r.findAll('span'):
                tag.replaceWith('')
            print(r.getText().strip())
        url = self.url.replace('.php', 'p%d.php' % i)
        soup = Crawler.get_soup(url)
def __init__(self):
    self.crawler = Crawler()
    self.redis = RedisClient()
def prepare_crawler(args):
    from crawl.crawler import Crawler
    config.crawler = Crawler(args.email, args.password, Crawler.load_cookie())
    return config.crawler
from crawl.crawler import Crawler

start_url = "http://stzb.163.com/card_list.html"
demo = Crawler(start_url)
demo.startCrawl()
def detectSignal(a, b):
    print "INT signal detected"
    Crawler.setStopSignal()
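detectSignal(a, b) has the (signum, frame) shape that Python's signal module expects from a handler, so it is presumably registered for SIGINT; a minimal sketch of that wiring (the registration line itself is an assumption):

import signal

# Route Ctrl-C to detectSignal so the crawler can stop gracefully.
signal.signal(signal.SIGINT, detectSignal)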
def run(self):
    log.info('>>>>>> Thread %s start. <<<<<<' % self.threadId)
    crawler = Crawler(self.dataBase)
    dataBase = self.dataBase
    agent = None
    account = None
    startNode = None
    startNodeRowId = None
    try:
        while True:
            # Prepare the agent, account and start node.
            if not startNode:
                startNode, startNodeRowId = dataBase.getStartNode()
                log.info('Thread %s, startnode: %s, %s' %
                         (self.threadId, startNode, startNodeRowId))
                if not startNode or not startNodeRowId:
                    # No available start node, exit crawling.
                    log.error(
                        'No start node for thread %s, exit crawling.' %
                        (self.threadId, ))
                    break
            if not agent or not account:
                agent, account = self.getAgentWithAccount()
                if not agent or not account:
                    # No available account, exit crawling.
                    log.warning(
                        'No available agent for thread %s, exit crawling.' %
                        (self.threadId, ))
                    break
            # One crawling process.
            crawler.setAgent(agent)
            try:
                crawler.crawl(startNode)
            except CrawlerException, e:
                log.info('Thread %s gets exception: %s' %
                         (self.threadId, str(e)))
                if e.errorCode == CrawlerErrorCode.DETECT_STOP_SIGNAL:
                    log.info("Thread " + str(self.threadId) +
                             " stops crawling because of stop signal.")
                    break
                if (e.errorCode == CrawlerErrorCode.GET_EXPANDING_NODE_FAILED or
                        e.errorCode == CrawlerErrorCode.EXPAND_EXPANDED_NODE or
                        e.errorCode == CrawlerErrorCode.NO_NODE_TO_EXPAND):
                    # The start node is bad.
                    log.warning('Thread %s, bad start node: %s, %s' %
                                (self.threadId, startNode, startNodeRowId))
                    dataBase.deleteFromStartList(startNode)
                    startNode = startNodeRowId = None
                if e.errorCode == CrawlerErrorCode.REQUEST_FAILED:
                    # Still the start node's fault.
                    # TODO: Implement invalid usernode test support in the
                    # database to change this.
                    log.warning('Thread %s, bad start node: %s, %s' %
                                (self.threadId, startNode, startNodeRowId))
                    dataBase.deleteFromStartList(startNode)
                    startNode = startNodeRowId = None
                if e.errorCode == CrawlerErrorCode.REACH_REQUEST_LIMIT:
                    # Use a new account.
                    account.finishUsing()
                    account = agent = None
    finally:
        # The start node changes every time crawler.expand() is called, so it
        # cannot be reused after an exception. Release it and use a new one.
        if startNodeRowId:
            dataBase.releaseStartNode(startNodeRowId)
            startNode = startNodeRowId = None
def prepare_crawler(args):
    from crawl.crawler import Crawler
    config.crawler = Crawler(args.email, args.password)
    return config.crawler
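A minimal sketch of wiring prepare_crawler() to argparse; the --email/--password flag names and the surrounding script are assumptions for illustration, chosen only because prepare_crawler() reads args.email and args.password.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--email', default='')
parser.add_argument('--password', default='')
args = parser.parse_args()

# args exposes .email and .password, which prepare_crawler() expects.
crawler = prepare_crawler(args)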