Beispiel #1
0
    def crawl_follows(self):
        """Crawl every follows-page of user ``self.uid`` and store the result.

        Page 1 is fetched synchronously to learn the total page count
        (capped by ``settings.PAGE_LIMIT`` when non-zero); the remaining
        pages are fetched by a 5-thread pool.  Returns None.
        """
        def _crawl(parser, uid, page, num_pages=''):
            # Fetch and parse one follows-page; returns the page count
            # reported by the parser, or 0 when parsing fails.
            msg = 'Crawl user(%s)\'s follows-page: %s:%s' %(self.uid, num_pages, page)
            write_message(msg, self.window)

            url  = 'http://weibo.com/%s/follow?page=%s' %(uid, page)
            html = self._fetch(url, query=settings.QUERY_FOLLOWS)

            try:
                pq_doc = pq(html)
                return parser.parse(pq_doc)
            except Exception:   # narrowed from a bare except (kept KeyboardInterrupt/SystemExit alive)
                return 0

        msg = 'Checking: whether user(%s) exists or not...' %self.uid
        write_message(msg, self.window)
        is_exist = self.fetcher.check_user(self.uid)

        if is_exist is None:    # check itself failed -- nothing more to do
            return

        if not is_exist:
            msg = 'Not exist: %s.' %(self.uid)
            logger.info(msg)
            write_message(msg, self.window)

            return

        self.storage = FileStorage(self.uid, settings.MASK_FOLLOW, self.store_path)

        start_time = time.time()

        parser = ComFollowsParser(self.storage, uids_storage=self.uids_storage)

        # The first page doubles as the probe for the total page count.
        num_pages = _crawl(parser, self.uid, page=1)
        if settings.PAGE_LIMIT != 0:
            if num_pages > settings.PAGE_LIMIT:
                msg = 'For sina policy, reduce page count from %s to %s' %(num_pages, settings.PAGE_LIMIT)
                write_message(msg, self.window)

                num_pages = settings.PAGE_LIMIT

        pages = [i for i in xrange(2, num_pages+1)]
        if len(pages) > 0:
            n_threads = 5

            worker_manager = WorkerManager(n_threads)

            for pg in pages:
                worker_manager.add_job(_crawl, parser, self.uid, pg, num_pages)

            worker_manager.wait_all_complete()

        cost_time = int(time.time() - start_time)

        msg = ('Crawl user(%s)\'s follows: total page=%s,'
               ' cost time=%s sec, connections=%s'
               %(self.uid, num_pages, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)
Beispiel #2
0
 
def test():
   """Smoke-test the WorkerManager with 10 small jobs.

   NOTE(review): every other example in this file calls
   ``wait_all_complete()``; confirm that ``wait_for_complete`` is the
   correct method name for this WorkerManager implementation.
   """
   import socket
   # Keep the test from hanging forever on dead connections.
   socket.setdefaulttimeout(10)
   print 'start testing'
   wm = WorkerManager(3)
   # Queue 10 jobs; ``test_job`` is presumably defined elsewhere in
   # this module -- TODO confirm.
   for i in range(1,11):
       wm.add_job( test_job, i, i*0.001 )
   wm.wait_for_complete()
Beispiel #3
0
    def crawl_weibos(self):
        """Crawl every weibos-page of user ``self.uid`` and store the result.

        Page 1 is fetched synchronously to learn the total page count;
        the remaining pages are fetched by a 5-thread pool.  Returns None.
        """

        def _crawl(parser, uid, page, num_pages=""):
            # Fetch and parse one weibos-page; returns the parser's page
            # count, or 0 when fetching/parsing fails.
            msg = "Crawl user(%s)'s weibos-page: %s:%s" % (self.uid, num_pages, page)
            write_message(msg, self.window)

            html = self._fetch_weibo(uid, page)

            try:
                pq_doc = pq(html)
                return parser.parse(pq_doc)
            except Exception:  # narrowed from a bare except
                return 0

        msg = "Checking: whether user(%s) exists or not..." % self.uid
        write_message(msg, self.window)

        is_exist = self.fetcher.check_user(self.uid)

        if is_exist is None:  # check itself failed -- nothing more to do
            return

        if not is_exist:
            msg = "Not exist: %s." % self.uid
            logger.info(msg)
            write_message(msg, self.window)

            return

        self.storage = FileStorage(self.uid, settings.MASK_WEIBO, self.store_path)

        start_time = time.time()

        parser = ComWeibosParser(self.uid, self.storage, weibos_storage=self.weibos_storage)

        # The first page doubles as the probe for the total page count.
        num_pages = _crawl(parser, self.uid, page=1)

        pages = [i for i in xrange(2, num_pages + 1)]
        if len(pages) > 0:
            n_threads = 5

            worker_manager = WorkerManager(n_threads)

            for pg in pages:
                worker_manager.add_job(_crawl, parser, self.uid, pg, num_pages)

            worker_manager.wait_all_complete()

        cost_time = int(time.time() - start_time)
        msg = "Crawl user(%s)'s weibos: total page=%s," " cost time=%s sec, connections=%s" % (
            self.uid,
            num_pages,
            cost_time,
            self.fetcher.n_connections,
        )
        logger.info(msg)
        write_message(msg, self.window)
Beispiel #4
0
    def crawl_msg_reposts(self):
        """Crawl all repost pages of the message behind ``self.msg_url``."""

        def _crawl(parser, msg_id, page, num_pages=""):
            # Fetch one reposts-page; parsing is best-effort, the page
            # count reported by the fetcher is returned either way.
            msg = "Crawl message(%s)'s reposts-page:%s:%s" % (self.msg_id, num_pages, page)
            write_message(msg, self.window)

            html, num_pages = self._fetch_msg_repost(msg_id, page)

            try:
                pq_doc = pq(html)
                parser.parse(pq_doc)
            except Exception:  # narrowed from a bare except; parse is best-effort
                pass

            return num_pages

        msg = "Checking: whether message exists or not..."
        write_message(msg, self.window)
        msg_id = self.fetcher.check_message(self.msg_url)

        if msg_id is None:
            msg = "Not exist: %s." % self.msg_url
            logger.info(msg)
            write_message(msg, self.window)

            return

        self.msg_id = msg_id
        self.storage = FileStorage(self.msg_id, settings.MASK_REPOST, self.store_path)

        start_time = time.time()

        parser = ComRepostsParser(msg_id, self.storage)
        # The first page doubles as the probe for the total page count.
        num_pages = _crawl(parser, self.msg_id, 1)
        pages = [i for i in xrange(2, num_pages + 1)]
        if len(pages) > 0:
            n_threads = 5

            worker_manager = WorkerManager(n_threads)

            for pg in pages:
                worker_manager.add_job(_crawl, parser, self.msg_id, pg, num_pages)

            worker_manager.wait_all_complete()

        cost_time = int(time.time() - start_time)

        msg = "Crawl message(%s)'s reposts: total page=%s," " cost time=%s sec, connections=%s" % (
            self.msg_id,
            num_pages,
            cost_time,
            self.fetcher.n_connections,
        )
        logger.info(msg)
        write_message(msg, self.window)
Beispiel #5
0
 def crawl_weibos(self):
     """Crawl every weibos-page of user ``self.uid`` and store the result.

     Page 1 is fetched synchronously to learn the total page count;
     pages 2..N are crawled by a 5-thread pool.  Returns None.
     """
     def _crawl(parser, uid, page, num_pages=''):
         # Fetch and parse one page; returns page count, or 0 on failure.
         msg = 'Crawl user(%s)\'s weibos-page: %s:%s' %(self.uid, num_pages, page)
         write_message(msg, self.window)

         html = self._fetch_weibo(uid, page)

         try:
             pq_doc = pq(html)
             return parser.parse(pq_doc)
         except Exception:   # narrowed from a bare except
             return 0

     msg = 'Checking: whether user(%s) exists or not...' %self.uid
     write_message(msg, self.window)

     is_exist = self.fetcher.check_user(self.uid)

     if is_exist is None:    # check itself failed -- nothing more to do
         return

     if not is_exist:
         msg = 'Not exist: %s.' %self.uid
         logger.info(msg)
         write_message(msg, self.window)

         return

     self.storage = FileStorage(self.uid, settings.MASK_WEIBO, self.store_path)

     start_time = time.time()

     parser = ComWeibosParser(self.uid, self.storage, weibos_storage=self.weibos_storage)

     # The first page doubles as the probe for the total page count.
     num_pages = _crawl(parser, self.uid, page=1)

     pages = [i for i in xrange(2, num_pages+1)]
     if len(pages) > 0:
         n_threads = 5

         worker_manager = WorkerManager(n_threads)

         for pg in pages:
             worker_manager.add_job(_crawl, parser, self.uid, pg, num_pages)

         worker_manager.wait_all_complete()

     cost_time = int(time.time() - start_time)
     msg = ('Crawl user(%s)\'s weibos: total page=%s,'
            ' cost time=%s sec, connections=%s'
            %(self.uid, num_pages, cost_time, self.fetcher.n_connections))
     logger.info(msg)
     write_message(msg, self.window)
Beispiel #6
0
 def crawl_msg_reposts(self):
     """Crawl all repost pages of the message behind ``self.msg_url``."""
     def _crawl(parser, msg_id, page, num_pages=''):
         # Parsing is best-effort; the page count from the fetcher is
         # returned either way.
         msg = 'Crawl message(%s)\'s reposts-page:%s:%s' %(self.msg_id, num_pages, page)
         write_message(msg, self.window)

         html, num_pages = self._fetch_msg_repost(msg_id, page)

         try:
             pq_doc = pq(html)
             parser.parse(pq_doc)
         except Exception:   # narrowed from a bare except
             pass

         return num_pages

     msg = 'Checking: whether message exists or not...'
     write_message(msg, self.window)
     msg_id = self.fetcher.check_message(self.msg_url)

     if msg_id is None:
         msg = 'Not exist: %s.' %self.msg_url
         logger.info(msg)
         write_message(msg, self.window)

         return

     self.msg_id = msg_id
     self.storage = FileStorage(self.msg_id, settings.MASK_REPOST, self.store_path)

     start_time = time.time()

     parser = ComRepostsParser(msg_id, self.storage)
     # The first page doubles as the probe for the total page count.
     num_pages = _crawl(parser, self.msg_id, 1)
     pages = [i for i in xrange(2, num_pages+1)]
     if len(pages) > 0:
         n_threads = 5

         worker_manager = WorkerManager(n_threads)

         for pg in pages:
             worker_manager.add_job(_crawl, parser, self.msg_id, pg, num_pages)

         worker_manager.wait_all_complete()

     cost_time = int(time.time() - start_time)

     msg = ('Crawl message(%s)\'s reposts: total page=%s,'
            ' cost time=%s sec, connections=%s'
            %(self.msg_id, num_pages, cost_time, self.fetcher.n_connections))
     logger.info(msg)
     write_message(msg, self.window)
 def crawl_fans(self):
     """Crawl all fans pages of user ``self.uid``.

     Returns True on success, False when the user does not exist, and
     None on error (partially written output is deleted in that case).
     """
     def _crawl(parser, uid, page, num_pages='?'):
         # Returns the parsed page count, or None on fetch/parse error.
         msg = 'Crawl user(%s)\'s fans-page: %s:%s' %(self.uid, num_pages, page)
         write_message(msg, self.window)

         url  = 'http://weibo.cn/%s/fans?page=%s' %(uid, page)
         html = self._fetch(url)

         if html is None:
             return None

         try:
             pq_doc = pq(html)
             return parser.parse(pq_doc)
         except Exception:   # narrowed from a bare except
             return None

     msg = 'Checking: whether user(%s) exists or not...' %self.uid
     write_message(msg, self.window)
     is_exist = self.fetcher.check_user(self.uid)

     if is_exist is None:    #error occur
         msg = 'Error'
         logger.info(msg)
         write_message(msg, self.window)

         return None

     if not is_exist:
         msg = 'Not exist: %s.' %(self.uid)
         logger.info(msg)
         write_message(msg, self.window)

         return False

     self.storage = FileStorage(self.uid, settings.MASK_FAN, self.store_path)

     start_time = time.time()

     parser = CnFansParser(self.storage)

     num_pages = _crawl(parser, self.uid, page=1)

     if num_pages is None:    #error occur
         msg = 'Error'
         logger.info(msg)
         write_message(msg, self.window)

         # Best-effort cleanup of the partially written fans file.
         try:
             self.storage.delete(self.storage.fans_fp, self.storage.fans_f_name)
         except Exception:
             pass

         return None

     pages = [i for i in xrange(2, num_pages+1)]
     if len(pages) > 0:
         n_threads = 5

         worker_manager = WorkerManager(n_threads)

         for pg in pages:
             worker_manager.add_job(_crawl, parser, self.uid, pg, num_pages)

         worker_manager.wait_all_complete()
         is_None = worker_manager.get_result()
         worker_manager.stop()

         if is_None:    #error occur
             msg = 'Error'
             logger.info(msg)
             write_message(msg, self.window)

             # Best-effort cleanup of the partially written fans file.
             try:
                 self.storage.delete(self.storage.fans_fp, self.storage.fans_f_name)
             except Exception:
                 pass

             return None

     cost_time = int(time.time() - start_time)

     msg = ('Crawl user(%s)\'s fans: total page=%s,'
            ' cost time=%s sec, connections=%s'
            %(self.uid, num_pages, cost_time, self.fetcher.n_connections))
     logger.info(msg)
     write_message(msg, self.window)

     return True
 def crawl_msg_comments(self):
     """Crawl all comment pages of the message behind ``self.msg_url``.

     Returns True on success, False when the message does not exist,
     and None on error (partially written output is deleted).
     """
     def _crawl(parser, msg_id, page, num_pages='?'):
         # Returns the fetched page count, or None on fetch/parse error.
         msg = 'Crawl message(%s)\'s comments-page:%s:%s' %(msg_id, num_pages, page)
         write_message(msg, self.window)

         html, num_pages = self._fetch_msg_comment(msg_id, page)

         if html is None:
             return None

         try:
             pq_doc = pq(html)
             parser.parse(pq_doc)

             return num_pages
         except Exception:   # narrowed from a bare except
             return None

     msg = 'Checking: whether message exists or not...'
     write_message(msg, self.window)
     msg_id = self.fetcher.check_message(self.msg_url)

     if msg_id is None:      #error occur
         msg = 'Error'
         logger.info(msg)
         write_message(msg, self.window)

         return None

     if msg_id is False:
         msg = 'Not exist: %s.' %self.msg_url
         logger.info(msg)
         write_message(msg, self.window)

         return False

     self.msg_id = msg_id
     self.storage = FileStorage(self.msg_id, settings.MASK_COMMENT, self.store_path)

     start_time = time.time()

     parser = ComCommentsParser(msg_id, self.storage)
     num_pages = _crawl(parser, self.msg_id, 1)

     if num_pages is None:    #error occur
         msg = 'Error'
         logger.info(msg)
         write_message(msg, self.window)

         # Best-effort cleanup of the partially written comments file.
         try:
             self.storage.delete(self.storage.comments_fp, self.storage.comments_f_name)
         except Exception:
             pass

         return None

     pages = [i for i in xrange(2, num_pages+1)]
     if len(pages) > 0:
         n_threads = 5

         worker_manager = WorkerManager(n_threads)

         for pg in pages:
             worker_manager.add_job(_crawl, parser, self.msg_id, pg, num_pages)

         worker_manager.wait_all_complete()
         is_None = worker_manager.get_result()
         worker_manager.stop()

         if is_None:    #error occur
             msg = 'Error'
             logger.info(msg)
             write_message(msg, self.window)

             # Best-effort cleanup of the partially written comments file.
             try:
                 self.storage.delete(self.storage.comments_fp, self.storage.comments_f_name)
             except Exception:
                 pass

             return None

     cost_time = int(time.time() - start_time)

     msg = ('Crawl message(%s)\'s comments: total page=%s,'
            ' cost time=%s sec, connections=%s'
            %(self.msg_id, num_pages, cost_time, self.fetcher.n_connections))
     logger.info(msg)
     write_message(msg, self.window)

     return True
# encoding: utf-8

from thread_pool import WorkerManager
# import sys
import time

def do_job(msg):
    """Identity job used to exercise the WorkerManager: echoes *msg* back."""
    result = msg
    return result
    
    
if __name__ == '__main__':
    # Demo/benchmark: push 100 identity jobs through a 5-thread pool
    # and print the collected results plus the elapsed wall time.
    st = time.time()
    wm = WorkerManager(5, 5)
    
    # A job whose argument is None still travels through the pool.
    wm.add_job(do_job, None)
    
    for i in range(1, 100):
        wm.add_job(do_job, i)
    
    wm.wait_all_complete()
    res = wm.get_result()
    wm.stop()
    print 'res:', res
    ed = time.time()
    
    print 'cost time: %s' %(ed - st)
Beispiel #10
0
        uids_storage=follows)
sw.main(fetcher,
        fetch_data='fans',
        store_path='./file/',
        uids=memstorage.users_id_moniterd,
        uids_storage=fans)

friends_list = list(set(fans) | set(follows))

print friends_list
#host's weibo
sw.main(fetcher,
        fetch_data='weibos',
        store_path='./file/',
        uids=memstorage.users_id_moniterd)
#friends' weibo
n_threads = 10
n_paritions = 10
len_partition = len(friends_list) / n_paritions

worker_manager = WorkerManager(n_threads)
for i in range(0, len(friends_list), len_partition):
    worker_manager.add_job(
        sw.main,
        fetcher,
        fetch_data='weibos',
        store_path='./file/',
        uids=friends_list[i:min(i + len_partition, len(friends_list))])

worker_manager.wait_all_complete()
Beispiel #11
0
fetcher = ComWeiboFetcher(username=account.user, password=account.pwd)

login_ok = fetcher.check_cookie()

if not login_ok:
    print 'login failed.'
    sys.exit()

fans = []
follows = []

sw.main(fetcher, fetch_data='follows', store_path='./file/', uids=memstorage.users_id_moniterd, uids_storage=follows)
sw.main(fetcher, fetch_data='fans', store_path='./file/', uids=memstorage.users_id_moniterd, uids_storage=fans)

friends_list = list(set(fans)|set(follows))

print friends_list
#host's weibo
sw.main(fetcher,fetch_data='weibos',store_path='./file/',uids=memstorage.users_id_moniterd)
#friends' weibo
n_threads = 10
n_paritions = 10
len_partition = len(friends_list)/n_paritions

worker_manager = WorkerManager(n_threads)
for i in range(0,len(friends_list),len_partition):
	worker_manager.add_job(sw.main, fetcher, fetch_data='weibos',store_path='./file/',
		uids=friends_list[i:min(i+len_partition,len(friends_list))] )  

worker_manager.wait_all_complete()
Beispiel #12
0
    def crawl_fans(self):
        """Crawl all fans pages of user ``self.uid``.

        Returns True on success, False when the user does not exist, and
        None on error (partially written output is deleted in that case).
        """
        def _crawl(parser, uid, page, num_pages='?'):
            # Returns the parsed page count, or None on fetch/parse error.
            msg = 'Crawl user(%s)\'s fans-page: %s:%s' % (self.uid, num_pages,
                                                          page)
            write_message(msg, self.window)

            url = 'http://weibo.cn/%s/fans?page=%s' % (uid, page)
            html = self._fetch(url)

            if html is None:
                return None

            try:
                pq_doc = pq(html)
                return parser.parse(pq_doc)
            except Exception:  # narrowed from a bare except
                return None

        msg = 'Checking: whether user(%s) exists or not...' % self.uid
        write_message(msg, self.window)
        is_exist = self.fetcher.check_user(self.uid)

        if is_exist is None:  #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)

            return None

        if not is_exist:
            msg = 'Not exist: %s.' % (self.uid)
            logger.info(msg)
            write_message(msg, self.window)

            return False

        self.storage = FileStorage(self.uid, settings.MASK_FAN,
                                   self.store_path)

        start_time = time.time()

        parser = CnFansParser(self.storage)

        num_pages = _crawl(parser, self.uid, page=1)

        if num_pages is None:  #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)

            # Best-effort cleanup of the partially written fans file.
            try:
                self.storage.delete(self.storage.fans_fp,
                                    self.storage.fans_f_name)
            except Exception:
                pass

            return None

        pages = [i for i in xrange(2, num_pages + 1)]
        if len(pages) > 0:
            n_threads = 5

            worker_manager = WorkerManager(n_threads)

            for pg in pages:
                worker_manager.add_job(_crawl, parser, self.uid, pg, num_pages)

            worker_manager.wait_all_complete()
            is_None = worker_manager.get_result()
            worker_manager.stop()

            if is_None:  #error occur
                msg = 'Error'
                logger.info(msg)
                write_message(msg, self.window)

                # Best-effort cleanup of the partially written fans file.
                try:
                    self.storage.delete(self.storage.fans_fp,
                                        self.storage.fans_f_name)
                except Exception:
                    pass

                return None

        cost_time = int(time.time() - start_time)

        msg = ('Crawl user(%s)\'s fans: total page=%s,'
               ' cost time=%s sec, connections=%s' %
               (self.uid, num_pages, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)

        return True
Beispiel #13
0
    def crawl_msg_comments(self):
        """Crawl all comment pages of the message behind ``self.msg_url``.

        Returns True on success, False when the message does not exist,
        and None on error (partially written output is deleted).
        """
        def _crawl(parser, msg_id, page, num_pages='?'):
            # Returns the fetched page count, or None on fetch/parse error.
            msg = 'Crawl message(%s)\'s comments-page:%s:%s' % (
                msg_id, num_pages, page)
            write_message(msg, self.window)

            html, num_pages = self._fetch_msg_comment(msg_id, page)

            if html is None:
                return None

            try:
                pq_doc = pq(html)
                parser.parse(pq_doc)

                return num_pages
            except Exception:  # narrowed from a bare except
                return None

        msg = 'Checking: whether message exists or not...'
        write_message(msg, self.window)
        msg_id = self.fetcher.check_message(self.msg_url)

        if msg_id is None:  #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)

            return None

        if msg_id is False:
            msg = 'Not exist: %s.' % self.msg_url
            logger.info(msg)
            write_message(msg, self.window)

            return False

        self.msg_id = msg_id
        self.storage = FileStorage(self.msg_id, settings.MASK_COMMENT,
                                   self.store_path)

        start_time = time.time()

        parser = ComCommentsParser(msg_id, self.storage)
        num_pages = _crawl(parser, self.msg_id, 1)

        if num_pages is None:  #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)

            # Best-effort cleanup of the partially written comments file.
            try:
                self.storage.delete(self.storage.comments_fp,
                                    self.storage.comments_f_name)
            except Exception:
                pass

            return None

        pages = [i for i in xrange(2, num_pages + 1)]
        if len(pages) > 0:
            n_threads = 5

            worker_manager = WorkerManager(n_threads)

            for pg in pages:
                worker_manager.add_job(_crawl, parser, self.msg_id, pg,
                                       num_pages)

            worker_manager.wait_all_complete()
            is_None = worker_manager.get_result()
            worker_manager.stop()

            if is_None:  #error occur
                msg = 'Error'
                logger.info(msg)
                write_message(msg, self.window)

                # Best-effort cleanup of the partially written comments file.
                try:
                    self.storage.delete(self.storage.comments_fp,
                                        self.storage.comments_f_name)
                except Exception:
                    pass

                return None

        cost_time = int(time.time() - start_time)

        msg = ('Crawl message(%s)\'s comments: total page=%s,'
               ' cost time=%s sec, connections=%s' %
               (self.msg_id, num_pages, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)

        return True
# encoding: utf-8

from thread_pool import WorkerManager
# import sys
import time


def do_job(msg):
    """Return *msg* unchanged (identity job for the thread-pool demo)."""
    return msg


if __name__ == '__main__':
    # Demo/benchmark: push 100 identity jobs through a 5-thread pool
    # and print the collected results plus the elapsed wall time.
    st = time.time()
    wm = WorkerManager(5, 5)

    # A job whose argument is None still travels through the pool.
    wm.add_job(do_job, None)

    for i in range(1, 100):
        wm.add_job(do_job, i)

    wm.wait_all_complete()
    res = wm.get_result()
    wm.stop()
    print 'res:', res
    ed = time.time()

    print 'cost time: %s' % (ed - st)