Example #1
def jp2_to_jpeg(
    _threads,
    _app,
    _source,
    _destination,
    _broken,
    _jpegs,
    _verbose
    ):
    testApp(_app)

    t = ThreadPool(_threads)

    for (root, dirs, files) in os.walk(_destination):
        subpath = root.replace(_destination, '').lstrip('/')
        if _broken not in subpath:
            if any(".jp2" in s for s in files):
                print >>emaillog, 'Converting contents of ' + subpath + ' from JP2 to JPEG'
            for (output_file, size) in _jpegs:
                for file in files:
                    if file.endswith('.jp2'):
                        jp2 = os.path.join(root, file)
                        newfile = os.path.join(root,
                                  os.path.splitext(file)[0]) + '_' \
                                  + output_file
                        command = _app + ' -size ' + size + " " + jp2 \
                                  + ' -resize ' + size + ' ' + newfile
                        if _verbose == True:
                            print 'Creating ' + newfile
                        t.add_task(executeConversion, command, None, jp2,
                                   _source, _broken, file, newfile)
                t.await_completion()
Example #2
 def testcase_ThreadPool_init_thread_pool_success(self):
     """Test case 2: the constructor initializes the thread pool"""
     jobs = [str(i) for i in xrange(2)]
     pool = ThreadPool(3, test_function, jobs, 0)
     thread_count = len(pool.threads)
     self.assertEqual(3, thread_count)
     pool.wait_allcomplete()
Example #3
def tif_to_jp2(
    _threads,
    _app,
    _source,
    _destination,
    _broken,
    _options,
    _verbose
    ):
 
    testApp(_app)

    t = ThreadPool(_threads)

    for (root, dirs, files) in os.walk(_source):
        subpath = root.replace(_source, '').lstrip('/')
        if _broken not in subpath:  
            jp2Path = os.path.join(_destination,subpath)
            makeDir(jp2Path)
            if any(".tif" in s for s in files):
                print >>emaillog, 'Converting contents of ' + subpath + ' from TIF to JP2'
            for file in files:
                if file.endswith('.tif'):
                    tiff = os.path.join(root, file)
                    jp2 = os.path.join(_destination, subpath,
                                       os.path.splitext(file)[0] + '.jp2')
                    tiffcopy = os.path.join(_destination, subpath, file)
                    command = _app + ' -i ' + tiff + ' -o ' + jp2 + ' ' \
                        + _options
                    command_post = 'shutil.move(\'' + tiff + '\',\'' + tiffcopy + '\')'
                    if _verbose == True:
                        print 'Creating ' + jp2
                    t.add_task(executeConversion, command, command_post, tiff,
                               _destination, _broken, file, jp2)
        t.await_completion()
Example #4
def test():
    print 'start testing'
    wm = ThreadPool(10)
    for i in range(1):
        wm.add_job(test_job, i, i*0.001)
    wm.wait_for_complete()
    print 'end testing'
Example #5
def iterate(_source,_ignore,_patron,_patron_zip,_threads):
    print 'Descend into ' + _source
 
    t = ThreadPool(_threads)

    for (root, dirs, files) in os.walk(_source):
        t.add_task(patron_bundle, _patron, _patron_zip, root)
    t.await_completion()
Example #6
def articles():
    error = None
    if request.method == 'POST':
        query = request.form['query'].lower()
        query = query.replace(" ", "%20")
        if query in app.cache:
            return Response(json.dumps(app.cache[query]), mimetype='application/json')
        url = GOOGLE_NEWS_RSS+query
        response = requests.get(url).text

        # Get article links from the RSS
        links = []
        utils.find_links(links, response)
        if len(links) > 2:
            links = links[2:]

        # Get article titles from the RSS
        titles = []
        utils.find_titles(titles, response)
        if len(titles) > 2:
            titles = titles[2:]

        num_links = len(links)
        num_titles = len(titles)
        if num_titles < num_links:
            links = links[:num_titles]
        if num_links < num_titles:
            titles = titles[:num_links]

        articles = {}
        pool = ThreadPool(num_links)
        for i in range(num_links):
            pool.add_task(
                utils.get_article_sentiment, links[i], titles[i], articles)
        pool.wait_completion()

        sentiments = []

        result = {}
        result["articles"] = []
        index = 0
        for key in articles:
            info = {}
            info["id"] = index
            index += 1
            info["title"] = articles[key]["title"]
            info["link"] = key
            sentiments.append(articles[key]["sentiment"])
            info["sentiment"] = articles[key]["sentiment"]
            info["snippet"] = articles[key]["snippet"]
            result["articles"].append(info)

        average_sentiment = utils.average_sentiment(
            sentiments,
            len(sentiments))
        result["sentiment"] = average_sentiment
        app.cache[query] = deepcopy(result)
        return Response(json.dumps(result), mimetype='application/json')
Example #7
 def __init__(self, config_name="config.json"):
     self.__config = ServerConfig(config_name)
     logging.basicConfig(filename=self.__config.log_file,
                         level=logging.DEBUG,
                         format='%(asctime)s %(message)s')
     self.thread_pool = ThreadPool()
     cache_dir = Path.cwd() / self.__config.cache_dir
     self.cache = CacheStorage(cache_dir)
     self.request_handler = RequestHandler(self.cache)
Example #8
def sentiment():
    error = None
    if request.method == 'POST':
        query = request.form['query']
        query = query.replace(" ", "%20")
        if query in app.sentiment_cache:
            return Response(
                json.dumps(app.sentiment_cache[query]),
                mimetype='application/json')
        url = GOOGLE_NEWS_RSS+query
        response = requests.get(url).text

        # Get article links from the RSS
        links = []
        utils.find_links(links, response)
        if len(links) > 2:
            links = links[2:]

        # Get article titles from the RSS
        titles = []
        utils.find_titles(titles, response)
        if len(titles) > 2:
            titles = titles[2:]

        num_links = len(links)
        num_titles = len(titles)
        if num_titles > 3:
            num_titles = 3
        if num_titles < num_links:
            links = links[:num_titles]
        if num_links > 3:
            num_links = 3
        if num_links < num_titles:
            titles = titles[:num_links]

        articles = {}
        pool = ThreadPool(num_links)
        for i in range(num_links):
            pool.add_task(
                utils.get_article_sentiment, links[i], titles[i], articles)
        pool.wait_completion()

        sentiments = []

        result = {}
        for key in articles:
            print query, ':', articles[key]["sentiment"]
            sentiments.append(articles[key]["sentiment"])

        average_sentiment = utils.average_sentiment(
            sentiments,
            len(sentiments))
        print average_sentiment
        result["sentiment"] = average_sentiment
        result["query"] = request.form['query']
        app.sentiment_cache[query] = deepcopy(result)
        return Response(json.dumps(result), mimetype='application/json')
Example #9
 def test_results(self):
     def my_add(a,b):
         return a+b
     
     tp = ThreadPool(5)
     for i in range(5):
         tp.add_task(my_add, i, i)
     d = tp.wait_completion()
     vals = d.values()
     vals.sort()
     assert vals == [0, 2, 4, 6, 8]
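
The test above assumes a ThreadPool whose wait_completion() returns a dict mapping task ids to task results. A minimal, standard-library-only sketch of a pool with that interface (an illustrative assumption, not the implementation these examples actually import) could look like this:

import threading
import Queue  # use "queue" on Python 3


class ThreadPool(object):
    def __init__(self, num_threads):
        self.tasks = Queue.Queue()
        self.results = {}
        self._lock = threading.Lock()
        self._next_id = 0
        for _ in range(num_threads):
            worker = threading.Thread(target=self._work)
            worker.daemon = True
            worker.start()

    def _work(self):
        # each worker pulls (task_id, func, args) tuples and records the return value
        while True:
            task_id, func, args = self.tasks.get()
            try:
                result = func(*args)
                with self._lock:
                    self.results[task_id] = result
            finally:
                self.tasks.task_done()

    def add_task(self, func, *args):
        # queue the callable under a fresh task id
        self._next_id += 1
        self.tasks.put((self._next_id, func, args))

    def wait_completion(self):
        # block until the queue is drained, then hand back the collected results
        self.tasks.join()
        return self.results
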
Example #10
def start_tasks():
    stores = load_stores()
    thread_pool = ThreadPool(size=20)
    total = len(stores)
    pos = 0
    for store in stores:
        pos += 1
        task = UnderLoadSlotZeroTask(store=store, total=total, pos=pos)
        thread_pool.push_task(task)
    thread_pool.init_pool()
    thread_pool.start()
    print('Waiting for task exit!')
    thread_pool.join()
def main():
    store_list = load_stores()
    thread_pool = ThreadPool(size=20)
    pos = 0
    total = len(store_list)
    for store in store_list:
        pos += 1
        task = SlotStateFetchTask(store, pos=pos, total=total)
        thread_pool.push_task(task)
    thread_pool.init_pool()
    thread_pool.start()
    print('Waiting for tasks exit!!!')
    thread_pool.join()
Example #12
 def testcase_ThreadPool_set_work_queue_success(self):
     """Test case 1: the constructor sets up the work queue successfully"""
     jobs = [str(i) for i in xrange(2)]
     pool = ThreadPool(2, test_function, jobs, 0)
     while True:
         try:
             func, param = pool.work_queue.get(block=False)
             res = func(param)
             self.assertEqual(str(0), res)
         except Queue.Empty as e:
             self.logging.info(e)
             break
     pool.wait_allcomplete()
Example #13
def start_tasks():
    thread_pool = ThreadPool(size=20)
    store_list = load_stores()
    total_count = len(store_list)
    count = 0
    for store in store_list:
        count += 1
        task = FetcherTask(store=store, num=count, total=total_count)
        thread_pool.push_task(task)
    thread_pool.init_pool()
    thread_pool.start()
    print('Waiting Task Finished......')
    thread_pool.join()
def main():
    store_list = load_stores()
    thread_pool = ThreadPool(size=20)
    index = 0
    total = len(store_list)
    for store in store_list:
        index += 1
        task = CompensationDisableTask(store=store, index=index, total=total)
        thread_pool.push_task(task)
    thread_pool.init_pool()
    print('Starting tasks...')
    thread_pool.start()
    print('Waiting for task exit!')
    thread_pool.join()
Example #15
def main ():
    logging.basicConfig(filename='debug.log', filemode='w')
    with Imap(url) as imap:
        imap.login(address, password)
        logging.info('Logged in')
        folders = imap.get_folders()
        # print('Found folders:', len(folders))
        # all_uids = get_uids_and_count(imap, folders)
    ui = UI()
    # 15 - imap simultaneous connections limit

    tasks = [(process_messages, (folder, ui)) for folder in folders]
    pool = ThreadPool(max=15)
    pool.run(tasks, delay=1)
Example #16
    def start(self):
        if not self.db_oper.is_enabled():
            return

        repo_list = self.db_oper.get_repo_list()
        if repo_list is None:
            self.db_oper.close_db()
            return

        thread_pool = ThreadPool(self.scan_virus, self.settings.threads)
        thread_pool.start()

        for row in repo_list:
            repo_id, head_commit_id, scan_commit_id = row

            if head_commit_id == scan_commit_id:
                logger.debug('No change occur for repo %.8s, skip virus scan.',
                             repo_id)
                continue

            thread_pool.put_task(
                ScanTask(repo_id, head_commit_id, scan_commit_id))

        thread_pool.join()

        self.db_oper.close_db()
Example #17
class BaseBot:
    def __init__(self, service):
        self.service = service
        self.error_count = 0
        self.thread_pool = ThreadPool()

    # Described as thread-safe, but whether that is actually guaranteed needs more thought
    # Consider switching to a callback pattern
    def handle_event(self, events):
        for event in events:
            if isinstance(event, ChatInitEvent):
                #print(event.user_index)
                self.handle_entered_room(event.team_index, event.room_index)

            elif isinstance(event, ChatMessageEvent):
                chat = self.service.get_chat_summary(event.room_index,
                                                     event.msg_index)
                if chat:
                    self.handle_chat(event.team_index, event.room_index, chat)

            elif isinstance(event, UserDropEvent) \
                    and self.service.my_index == event.user_index:
                logger.error("봇 계정이 탈퇴되었습니다.")
                sys.exit()
            elif isinstance(event, UserPasswordChangedEvent) \
                    and self.service.my_index == event.user_index:
                logger.error("봇 계정의 비밀번호가 바뀌었습니다.")
                sys.exit()

    def run(self):
        while True:
            try:
                events = self.service.get_events()
                if events:
                    self.thread_pool.add_task(self.handle_event, events)
                time.sleep(self.service.config['lp_idle_time'])
            except Exception as e:
                self.error_count += 1
                if self.error_count > 3:
                    logger.error("오류가 발생했습니다. 프로그램을 종료합니다.")
                    #sys.exit()
                else:
                    logger.error("오류가 발생했습니다.:" + str(e))
                    time.sleep(5)

    def handle_entered_room(self, team_index, room_index):
        raise NotImplementedError()

    def handle_chat(self, team_index, room_index, chat):
        raise NotImplementedError()
Example #18
 def testcase_ThreadPool_get_result_success(self):
     """Test case 3: get_result; after all tasks complete the summed result is 1"""
     jobs = [i for i in xrange(2)]
     pool = ThreadPool(3, test_function, jobs, 0)
     pool.wait_allcomplete()
     sum = 0
     while True:
         try:
             res = pool.get_result()
             arr_res = json.loads(res)
             sum += int(arr_res['url'])
         except Queue.Empty as e:
             self.logging.info(e)
             break
     self.assertEqual(1, sum)
Example #19
class Listener():
    def __init__(self, redis_conn, channels):
        self.redis_conn = redis_conn
        self.pubsub = self.redis_conn.pubsub()
        self.pubsub.subscribe(channels)
        self.thread_pool = ThreadPool(size=10)
    
    def work(self, item):
        # replace this with your own processing logic
        print item["channel"], item["data"]
    
    def run(self):
        self.thread_pool.start()
        for item in self.pubsub.listen():
            self.thread_pool.append_job(self.work, item)
 def run(self):
     cx, cu = self.db_connection()
     pool = ThreadPool(size=20)
     pool.start()
     file_submission_id = open(FILE_SUBMISSION_ID)
     finished_submissions = [int(item[0]) for item in cu.execute("select submission_id from code")]
     all_submissions = [int(item) for item in file_submission_id.readlines()]
     for line in list(set(all_submissions).difference(set(finished_submissions))):
         sleep(0.2)
         pool.append_job(s.job, line)
     pool.join()
     pool.stop()
Example #21
    def __init__(self, thread_count: int, host: str, port: str, db_name: str,
                 user: str, channel_name: str) -> None:
        """
        Class constructor

        Initializes:
         - the number of worker threads
         - the database connection
         - a single thread pool

        :param thread_count: number of threads in the thread pool
        :param host: hostname on which the database is deployed
        :param port: port used to connect to the database
        :param db_name: name of the database
        :param user: role used to connect to the database
        :param channel_name: name of the channel on which the database sends notifications
        """
        self._host = host
        self._port = port
        self._db_name = db_name
        self._user = user
        self._thread_count = thread_count
        self._channel_name = channel_name
        self._e = self.connect()
        self.pool_task = ThreadPool(self._thread_count)
def main(event, lambdacontext):  
    starttime = time.time()    
    queue_url = event.get(c.KEY_SQS_QUEUE_URL, None)        
    print "Started consumer with queue url '{}'".format(queue_url)    
    context = event.get("context", {})        
    context[c.KEY_SQS_QUEUE_URL] = queue_url        
    context[c.KEY_LAMBDA_FUNCTION] = lambdacontext.function_name if hasattr(lambdacontext, 'function_name') else None
    context[c.KEY_REQUEST_ID] = lambdacontext.aws_request_id if hasattr(lambdacontext, 'aws_request_id') else None
    context[c.KEY_IS_LAMBDA_ENV] = context[c.KEY_REQUEST_ID] is not None
      
    prefix = util.get_stack_name_from_arn(os.environ[c.ENV_DEPLOYMENT_STACK_ARN])    

    context[c.KEY_STACK_PREFIX] = prefix
    context[c.KEY_SQS] = Sqs(context, "{0}_".format(prefix))
    context[c.KEY_SQS_AMOEBA] = Sqs(context, "{0}{1}_".format(prefix, c.KEY_SQS_AMOEBA_SUFFIX))
    context[c.KEY_SQS_AMOEBA].set_queue_url(lowest_load_queue=True)    
    context[c.KEY_LAMBDA] = Lambda(context)
    context[c.KEY_CLOUDWATCH] = CloudWatch(context)
    
    context[c.KEY_THREAD_POOL] = ThreadPool(context, 8)               
    context[c.KEY_METRIC_BUCKET] = os.environ[c.RES_S3_STORAGE]            
    
    context[c.KEY_START_TIME] = starttime
    context[c.CW_ATTR_SAVE_DURATION] = context[c.KEY_CLOUDWATCH].avg_save_duration(util.get_cloudwatch_namespace(os.environ[c.ENV_DEPLOYMENT_STACK_ARN]))
    context[c.CW_ATTR_DELETE_DURATION] = context[c.KEY_CLOUDWATCH].avg_delete_duration(util.get_cloudwatch_namespace(os.environ[c.ENV_DEPLOYMENT_STACK_ARN]))    
          
    context[c.KEY_SUCCEEDED_MSG_IDS] = []
    process(context)    
    del context
    gc.collect()
    return {        
        'StatusCode': 200        
    }
Example #23
class DouBanPipeline(object):
    pool = ThreadPool(1)
    mongo_client = MongoDBApi()

    @classmethod
    def save_item(cls, item):
        cls.pool.callInThread(cls.__save_itme, item)
        pass

    @classmethod
    def __save_itme(cls, item):
        try:
            comments = []
            if item.has_key('comments'):
                comments = item.pop('comments')
            insert_id = cls.mongo_client.insert_one(item)
            if insert_id:
                insert_id = ObjectId(insert_id)
                for index, comment in enumerate(comments):
                    comment['movie_id'] = insert_id
                    comments[index] = comment
                if comments:
                    insert_ids = cls.mongo_client.insert_many(
                        comments, 'movie_comments')
                logging.warn(u'========保存一条信息=======\n')
            else:
                logging.warn(u'========一条信息保存失败=======\n')
        except Exception as err:
            logging.error(traceback.format_exc())
Example #24
class DnsServer:
    def __init__(self, config_name="config.json"):
        self.__config = ServerConfig(config_name)
        logging.basicConfig(filename=self.__config.log_file,
                            level=logging.DEBUG,
                            format='%(asctime)s %(message)s')
        self.thread_pool = ThreadPool()
        cache_dir = Path.cwd() / self.__config.cache_dir
        self.cache = CacheStorage(cache_dir)
        self.request_handler = RequestHandler(self.cache)

    def run(self):
        """Binds, listens, and processes DNS requests on the socket"""
        signal.signal(signal.SIGINT, self.__handle_exit)
        s = socket(AF_INET, SOCK_DGRAM)
        s.bind((self.__config.server_host, self.__config.server_port))
        s.settimeout(self.__config.server_timeout)
        logging.info(
            f'Launched at {self.__config.server_host}:{self.__config.server_port}'
        )
        while True:
            try:
                data, addr = s.recvfrom(self.__config.recv_buff_size)
            except SystemExit as e:
                s.close()
                break
            except Exception as e:
                logging.info(str(e))
                s.close()
                break

            self.thread_pool.add_task(self.__process_request, data, addr, s)

    def __process_request(self, data, addr, s_socket):
        response = self.request_handler.handle_query(data)
        s_socket.sendto(response, addr)

    def __handle_exit(self, signal, frame):
        logging.info("Received SIGINT, shutting down threads...")
        print("shutting down...")
        self.thread_pool.tasks.join()
        self.thread_pool.terminate_all_workers()
        logging.info("Threads stopped, updating cache")
        self.cache.cleanup()
        sys.exit(0)
Example #25
 def __init__(self, bot_jid, stream):
     self.bot_jid = bot_jid
     self._stream = stream
     self.cmd_handler = CommandHandler(message_bus = self)
     self.admin_cmd_handler = AdminCMDHandler(message_bus = self)
     self._thread_pool = ThreadPool(5)
     self._thread_pool.start()         # start the thread pool
     self.logger = get_logger()
     return
Example #26
def download_feed(feed, feed_tag):
	new_subtask(len(feed) * 2)
	item_thread_pool = ThreadPool()
	for entry in feed.get_entries():
		increment_subtask()
	
		app_globals.STATS['items'] += 1

		if entry is None:
			app_globals.STATS['failed'] += 1
			error(" ** FAILED **")
			debug("(entry is None)")
			continue
		
		item = Item(entry, feed_tag)
		process_item(item, item_thread_pool)
		item_thread_pool.collect()
	item_thread_pool.collect_all()
Example #27
def main1():
    thread_pool = ThreadPool(20)
    thread_pool.start()
    session = get_session()
    topic_query = \
        session.query(VideoTopic).filter(VideoTopic.video_type == 1)
    for topic in topic_query:
        thread_pool.add_task(job, topic.henhen_id)
    session.close()
    thread_pool.wait_done()
Example #28
def test():
    print 'start testing'
    wm = ThreadPool(10)
    for i in range(1):
        wm.add_job(test_job, i, i * 0.001)
    wm.wait_for_complete()
    print 'end testing'
Example #29
def download():
    lines = ["Topics"]
    thread_pool = ThreadPool()
    d = False

    if enable_proxie[0]:
        refresh_proxie()

    filename = datetime.datetime.now().strftime("%d-%m-%Y %H-%M-%S") + '.txt'
    for i, enable in enumerate(download_enables):
        if enable:
            thread_pool.give_task(download_concrete_page,
                                  args=(download_hrefs[i], lines))
            d = True

    thread_pool.join()
    if d:
        file = open(filename, 'w')
        file.write('\n'.join(lines))
        file.close()

        print(f'All chosen topics are saved to {filename}')
        to_main_menu()
    else:
        print("Nothing is chosen")
    input("Press <Enter> to continue")

    return True
Example #30
def jp2_to_jpeg(_threads, _app, _source, _destination, _broken, _jpegs,
                _verbose):
    testApp(_app)

    t = ThreadPool(_threads)

    for (root, dirs, files) in os.walk(_destination):
        subpath = root.replace(_destination, '').lstrip('/')
        if _broken not in subpath:
            if any(".jp2" in s for s in files):
                print >> emaillog, 'Converting contents of ' + subpath + ' from JP2 to JPEG'
            for (output_file, size) in _jpegs:
                for file in files:
                    if file.endswith('.jp2'):
                        jp2 = os.path.join(root, file)
                        newfile = os.path.join(root,
                                  os.path.splitext(file)[0]) + '_' \
                                  + output_file
                        command = _app + ' -size ' + size + " " + jp2 \
                                  + ' -resize ' + size + ' ' + newfile
                        if _verbose == True:
                            print 'Creating ' + newfile
                        t.add_task(executeConversion, command, None, jp2,
                                   _source, _broken, file, newfile)
                t.await_completion()
Example #31
 def pool_time(thread_num):
     start = time.clock()
     tp = ThreadPool(thread_num)
     for i in range(5):
         tp.add_task(time.sleep, i)
     tp.wait_completion()
     return time.clock() - start
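
A possible driver for this helper (treating pool_time as a module-level function, in the same Python 2 style as the example) would simply compare a few pool sizes:

if __name__ == '__main__':
    for n in (1, 2, 5):
        print 'pool of %d threads finished in %.2f seconds' % (n, pool_time(n))
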
Example #32
class Interface(object):
    def __init__(self):

        self._read_config()
        self._init_threadpool()

    def _read_config(self):
        self.pipe_file = Base.get_config("QUEUE", "PIPE_FILE")
        self.queue_size = Base.get_config("QUEUE", "QUEUE_SIZE")
        self.thread_pool_num = Base.get_config("THREADPOOL", "NUM")
        self.pipe_fd = os.open(self.pipe_file, os.O_NONBLOCK | os.O_CREAT | os.O_RDWR)

    def _init_threadpool(self):
        self.pool = ThreadPool(int(self.thread_pool_num), int(self.queue_size))

    def write(self, string):
        print string

    def transcode(self, string):
        self.pool.add_job(self._transcode, string)

    def _transcode(self, filepath):

        print filepath
        time.sleep(10)
        print "ok"

    def __getattribute__(self, name):

        try:
            res = object.__getattribute__(self, name)

        except:
            res = None

        return res

    def __del__(self):

        os.close(self.pipe_fd)
Example #33
def tif_to_jp2(_threads, _app, _source, _destination, _broken, _options,
               _verbose):

    testApp(_app)

    t = ThreadPool(_threads)

    for (root, dirs, files) in os.walk(_source):
        subpath = root.replace(_source, '').lstrip('/')
        if _broken not in subpath:
            jp2Path = os.path.join(_destination, subpath)
            makeDir(jp2Path)
            if any(".tif" in s for s in files):
                print >> emaillog, 'Converting contents of ' + subpath + ' from TIF to JP2'
            for file in files:
                if file.endswith('.tif'):
                    tiff = os.path.join(root, file)
                    jp2 = os.path.join(_destination, subpath,
                                       os.path.splitext(file)[0] + '.jp2')
                    tiffcopy = os.path.join(_destination, subpath, file)
                    command = _app + ' -i ' + tiff + ' -o ' + jp2 + ' ' \
                        + _options
                    command_post = 'shutil.move(\'' + tiff + '\',\'' + tiffcopy + '\')'
                    if _verbose == True:
                        print 'Creating ' + jp2
                    t.add_task(executeConversion, command, command_post, tiff,
                               _destination, _broken, file, jp2)
        t.await_completion()
 def _init(self):
     # thread pool with the configured number of threads
     self.thread_pool = ThreadPool(self.thread_num)
     self.depth = 2
     # marks the current crawl depth, starting from 1
     self.current_depth = 1
     # links that have already been visited
     self.visited_hrefs = set()
     # links still waiting to be visited
     self.unvisited_hrefs = deque()
     # flag marking whether the crawler has started working
     self.is_crawling = False
     self.resource_details = ResourceDetailCollection()
Example #35
def main():
    pool = ThreadPool(20)
    pool.start()
    session = get_session()
    topic_query = session.query(PicTopic).filter(PicTopic.pic_type
            == 'dongmantupian').order_by(PicTopic.id.desc())
    for pic_topic in topic_query:
        pool.add_task(dump_job, pic_topic)
    session.close()

    pool.wait_done()
Example #36
    def threadstart(self):
        self.tp = ThreadPool(10)
        self.tpool = Thread(target=self.startThreadPool, args=())
        self.tpool.daemon = True
        self.tpool.start()

        self.tdetect = Thread(target=self.detectConnect, args=())
        self.tdetect.daemon = True
        self.tdetect.start()

        self.ttimeout = Thread(target=self.detectConnectTimeOut, args=())
        self.ttimeout.daemon = True
        self.ttimeout.start()
Example #37
def iterate(_source, _ignore, _patron, _patron_zip, _threads):
    print 'Descend into ' + _source

    t = ThreadPool(_threads)

    for (root, dirs, files) in os.walk(_source):
        t.add_task(patron_bundle, _patron, _patron_zip, root)
    t.await_completion()
Example #38
def proxy_server(host, port):
    addr = host, port

    tp = ThreadPool(WORKERS)

    try:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)

            sock.bind(addr)
            sock.listen(WORKERS)

            print('Listening on', addr)

            while True:
                if not select.select([sock], [], [], 1)[0]:
                    continue

                conn, client_addr = sock.accept()
                tp.add_task(proxy_connection, conn, client_addr)
    except socket.error:
        print('Cannot run the server')

    tp.wait_completion()
Example #39
def main():
    thread_pool = ThreadPool(50)
    thread_pool.start()
    video_type = '7'
    base_url = 'http://www.toutoulu.com/vodlist/%s_%s.html'

    # init task

    for page_num in range(1, page_info[video_type] + 1):
        url = base_url % (video_type, page_num)
        print 'add task %s' % url
        thread_pool.add_task(thread_pool_job, url, video_type)

    thread_pool.wait_done()
Example #40
 def __init__(self, myconfig):
     # thread pool
     self.thread_pool = ThreadPool(myconfig.threadnum)
     # set of URLs that have already been visited
     self.visited_urls = set()
     # set is not thread-safe, so guard it with a lock
     self.visited_urls_lock = threading.Lock()
     # URLs that have not been visited yet
     self.will_visited_urls = deque()
     self.will_visited_urls.append(myconfig.url)
     self.temp_q = deque()
     self.cur_depth = 0
     self.status = ""
     self.myconfig = myconfig
     MyLogger(myconfig.logfile, myconfig.loglevel)
     #MyLogger(myconfig.logfile, loglevel = 5)  # debug
     self.db = Db()
Example #41
def test_thread_pool_with_exception():
    """
    thread pool should be able to handle task processing
    even if there were exceptions in some tasks
    """
    thread_pool = ThreadPool()
    result = []

    def throw_ex_task():
        raise Exception()

    def populate_result_task():
        result.extend([i for i in range(0, 10)])
        return

    thread_pool.add_task(throw_ex_task)
    thread_pool.add_task(populate_result_task)

    thread_pool.tasks.join()
    thread_pool.terminate_all_workers()

    assert result == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Example #42
def test_thread_pool():
    """
    thread pool should be able to handle task processing
    """
    thread_pool = ThreadPool()
    result = []

    def populate_result_task():
        result.extend([i for i in range(0, 10)])
        return

    thread_pool.add_task(populate_result_task)
    thread_pool.tasks.join()
    thread_pool.terminate_all_workers()
    assert result == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
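
Examples #41 and #42 assume a pool that exposes its tasks queue, keeps its workers alive when a task raises, and offers terminate_all_workers(). A minimal standard-library sketch with that shape (an illustration under those assumptions, not the module the tests import) might be:

import threading
import Queue  # use "queue" on Python 3


class ThreadPool(object):
    def __init__(self, num_threads=4):
        self.tasks = Queue.Queue()
        self._stop = threading.Event()
        self._workers = []
        for _ in range(num_threads):
            worker = threading.Thread(target=self._work)
            worker.daemon = True
            worker.start()
            self._workers.append(worker)

    def _work(self):
        # poll the queue so the worker can notice the stop flag between tasks
        while not self._stop.is_set():
            try:
                func, args = self.tasks.get(timeout=0.1)
            except Queue.Empty:
                continue
            try:
                func(*args)
            except Exception:
                pass  # a failing task must not kill the worker thread
            finally:
                self.tasks.task_done()

    def add_task(self, func, *args):
        self.tasks.put((func, args))

    def terminate_all_workers(self):
        # ask every worker to leave its loop and wait for them to finish
        self._stop.set()
        for worker in self._workers:
            worker.join()
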
Example #43
    def compile(self, jobs=0):
        if 0 == jobs:
            jobs = cpu_count()

        self.print_msg('BS', 'Using %s parallel job(s)' % colored(str(jobs), 'yellow'))
        for target in self.targets:
            self.print_msg('BS', 'Building target %s' % colored(target.name, 'yellow'))
            pool = ThreadPool(jobs)
            for source in target.sources:
                args = (target, source, None)
                pool.apply_async(self.compile_object, args=args, callback=self.compile_object_done)
            try:
                self._wait_for_compilation(target)
            except BuildError as e:
                raise
            finally:
                pool.close()
                pool.join()
            self.run_prefinal(target)
            target.final(self)
def scrap_connections():
    areas=set()
    airports = []
    with open('csv_files/top_100_airports.csv', 'rb') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            for airport in row:
                airports.append(airport)

    origins = dests = airports

    dates = []

    for i in range(100):
        new_date = datetime.date(2014, 11, 02) + timedelta(days=i)
        dates.append(new_date)

    pool = ThreadPool(20)
    pool.start()
    for dest in dests:
        for origin in origins:
            if dest != origin:
                date = dates[randint(0, len(dates)-1)]
                pool.add_task(get_connections, origin, dest, date)
                areas.add(get_area(origin, dest))

    pool.wait_completion()

    #arrange all connections in single set per area instead of list
    areas_conn = dict()
    for area in areas:
        conn_list = flight_resp_dal.get_connections_in_area(area)
        connections = set()
        for conn in conn_list:
            connections.update(set(conn))
        areas_conn[area] = connections

    flight_resp_dal.clean_areas_to_connections_table()

    for area in areas:
        flight_resp_dal.add_connections_to_area(area, areas_conn[area])
Example #45
def generate_threads(functionid, threads_count, iterations_per_thread, events_per_iteration, sleep_duration, use_lambda, event_type, sensitivity_type, compression_mode):
    start = time.time()       
    context = {}            
    threadpool = ThreadPool(context, threads_count)  
    context=dict({})        
    db = DynamoDb(context) 
    print "Sleep durations: ", sleep_duration
    print "Number of threads: ", threads_count
    print "Number of iterations per thread: ", iterations_per_thread
    print "Number of events per iteration: ", events_per_iteration
    print "Using event type: ", event_type
    print "Using sensitivity type: ", sensitivity_type
    print "Using compression mode: ", compression_mode
    for i in range(0, threads_count):          
        threadpool.add(thread_job, functionid, iterations_per_thread, events_per_iteration, use_lambda, context, sleep_duration, event_type, sensitivity_type, compression_mode)                                                    
    threadpool.wait()      
    print "A total of {} metrics have been sent to the FIFO queues.".format((iterations_per_thread*events_per_iteration)*threads_count)    
    print "The overall process took {} seconds.".format(time.time() - start)
Example #46
def main(event, request):
    context = dict({})
    context[c.KEY_LAMBDA_FUNCTION] = request.function_name if hasattr(
        request, 'function_name') else None
    context[c.KEY_REQUEST_ID] = request.aws_request_id if hasattr(
        request, 'aws_request_id') else None
    stackid = os.environ[c.ENV_DEPLOYMENT_STACK_ARN]

    context[c.KEY_DB] = DynamoDb(context)
    context[c.KEY_ATHENA_QUERY] = Query(stackid)
    context[c.KEY_GLUE_CRAWLER] = Glue()
    thread_pool = ThreadPool(size=3)
    crawler_name = context[c.KEY_GLUE_CRAWLER].get_crawler_name(stackid)
    crawler = Crawler(context, os.environ[c.ENV_S3_STORAGE])
    glue = Glue()
    events = glue.get_events()

    start = datetime.datetime.utcnow() - datetime.timedelta(hours=2)
    now = datetime.datetime.utcnow()

    found = False
    for type in events:
        dt = start
        while dt <= now:
            prefix = metric_schema.s3_key_format().format(
                context[c.KEY_SEPERATOR_PARTITION], dt.year, dt.month, dt.day,
                dt.hour, type, dt.strftime(util.partition_date_format()))
            found = crawler.exists(prefix)
            if found:
                print "FOUND new events=>", prefix
                break
            dt += timedelta(hours=1)
        if found:
            break

    if found:
        thread_pool.add(crawl, context, crawler_name,
                        context[c.KEY_ATHENA_QUERY].execute_with_format)
        thread_pool.wait()

    return custom_resource_response.success_response({}, "*")
Example #47
def launch(event, lambdacontext):
    print "Start"
    hours_delta = 36
    context = dict({})
    context[c.KEY_LAMBDA_FUNCTION] = lambdacontext.function_name if hasattr(
        lambdacontext, 'function_name') else None
    context[c.KEY_REQUEST_ID] = lambdacontext.aws_request_id if hasattr(
        lambdacontext, 'aws_request_id') else None
    global threadpool
    global is_lambda
    threadpool = ThreadPool(context, 8)
    is_lambda = context[c.KEY_REQUEST_ID] is not None
    available_amoeba_lambdas = []
    available_amoeba_lambdas.append(c.ENV_AMOEBA_1)
    available_amoeba_lambdas.append(c.ENV_AMOEBA_2)
    available_amoeba_lambdas.append(c.ENV_AMOEBA_3)
    available_amoeba_lambdas.append(c.ENV_AMOEBA_4)
    available_amoeba_lambdas.append(c.ENV_AMOEBA_5)
    db = DynamoDb(context)
    crawler = Crawler(context, os.environ[c.ENV_S3_STORAGE])
    glue = Glue()

    events = glue.get_events()
    #TODO: adjust the amoeba tree depth so that we have fully utilized all available amoebas; len(available_amoeba_lambdas) * 1000
    #since the number of leaf nodes for the metric partitions can quickly get very large we use a 5 lambda pool to ensure we don't hit the 1000 invocation limit.

    start = datetime.datetime.utcnow() - datetime.timedelta(hours=hours_delta)
    now = datetime.datetime.utcnow()

    for type in events:
        dt = start
        while dt <= now:
            prefix = metric_schema.s3_key_format().format(
                context[c.KEY_SEPERATOR_PARTITION], dt.year, dt.month, dt.day,
                dt.hour, type, dt.strftime(util.partition_date_format()))
            threadpool.add(crawler.crawl, prefix, available_amoeba_lambdas,
                           invoke_lambda)
            dt += timedelta(hours=1)

    threadpool.wait()
    return custom_resource_response.success_response({"StatusCode": 200}, "*")
Example #48
#!/usr/bin/env python
# coding:utf-8
from thread_pool import ThreadPool
import hackhttp
import re
import os

hh = hackhttp.hackhttp(hackhttp.httpconpool(500))
tp = ThreadPool(500)
package = "wooyun"

if not os.path.exists(package):
    os.mkdir(package)


def vlun(wid):
    print "[+]%s" % wid
    if os.path.isfile(wid + ".html"):
        return
    _, _, html, _, _ = hh.http(
        url="http://wooyun.org/bugs/%s" % wid, cookcookie=False)
    open(package + "/" + wid + '.html', 'wb').write(html)


def catalog(page):
    _, _, html, _, _ = hh.http(
        url="http://wooyun.org/bugs/new_public/page/%d" % page,
        cookcookie=False)
    for wid in re.findall(r'href="/bugs/(wooyun-\d+-\d+)">', html):
        tp.add_task(vlun, wid)
    if page > 0:
            for i in range(0, len(item)):
                c = item[i].decode("gb2312")
                if i == 0:
                    l.append(c)
                else:
                    if c[0] == "&":
                        l.append(0)
                    else:
                        l.append(1)
            rooms.append(l)
        with open(
                "data/" + campus + "." + building + "." + week + "." +
                week_day + ".json", "w") as f:
            f.write(json.dumps(rooms))
        print "finish: week:" + week + " week_day:" + week_day
        return "success"


if __name__ == "__main__":
    s = Spider()
    s.cookies = {"JSESSIONID": "8B7DA565F71772D37B04170241A757A8.TAB2;"}
    pool = ThreadPool(size=20)
    pool.start()

    for week in range(1, 21):
        for week_day in range(1, 8):
            print "start week:" + str(week) + " week_day:" + str(week_day)
            # make sure the campus id and building id in info.py are correct
            # then change the campus and building ids here according to the data in info.py
            pool.append_job(s.craw, "1709", "1783", str(week), str(week_day))
    pool.join()
from thread_pool import ThreadPool
from db.flights_resp import FlightsRespDAL
from flights_data.flight_checks import FlightChecker
from datetime import date

airports_error = []

def check_flights(origin, dest,  depart_date, flight_checker):
    if not flight_checker.pricer.get_price_one_way(origin, dest, str(depart_date))[0]:
        airports_error.append(dest)


pool = ThreadPool(20, "flight_checker", FlightChecker)

flight_resp_dal = FlightsRespDAL()

depart = date(2014, 8, 02)
dests = flight_resp_dal.get_all_airports()
print len(dests)
for dest in dests:
    pool.add_task(check_flights, "JFK", dest, depart)
pool.start()
pool.wait_completion()

print airports_error

for y in airports_error:
    flight_resp_dal.airport_collection.remove({"airport_code": y})
Example #51
    # In range, start, end(plus 1 to include end) and steps
    pool_size = [x for x in range(min_thread, max_thread, thread_step)]
    # Create dict with thread sizes to keep track of time
    for thread_count in pool_size:
        times[thread_count] = []

    for i in pool_size:
        if ovrld.overloaded:
            i = ovrld.opt_work_threads
        if need_count and i == pool_size[-1] \
                or need_count and ovrld.overloaded and i == ovrld.opt_work_threads:
            clients = i * calculate_needed()
        else:
            clients = i * count

        pool = ThreadPool(i)

        # Clients is the final goal, it'll run the thread count for "count" iterations
        # count is from config
        sched_clients += clients
        while clients:
            # Change to your desired function...
            pool.add_task(time_event, xmlrpc_call, i)
            clients -= 1
            total_clients += 1
            if errors.error_count > errors_threshold:
                quit()
        pool.wait_completion()
        avg_time = sum(times[i]) / len(times[i])
        ovrld.calc_time(avg_time, i)
                    "submissions": self.re_search("<td>Submissions</td><td align=center>(\d+)</td>", html),
                    "accepted": self.re_search("<td>Accepted</td><td align=center>(\d+)</td>", html)}
        except Exception as e:
            logging.error(e)
            return {"website": "hduoj",
                    "rank": 0, "problems_submitted": 0,
                    "problems_solved": 0, "submissions": 0,
                    "accepted": 0}


d = DBHandler()
s = Spider()

user_list = d.get_user_list()

pool = ThreadPool(size=10)
pool.start()


def add_username(func, username, oj_username):
    data = func(oj_username)
    data["username"] = username
    return data


for user in user_list:
    pool.append_job(add_username, s.bestcoder, user[0], user[1])
    pool.append_job(add_username, s.codefoces, user[0], user[2])
    pool.append_job(add_username, s.hduoj, user[0], user[3])
pool.join()
pool.stop()
Example #53
class MessageBus(object):
    """ Message bus
        Sends messages and bridges the bot and its commands.
        Dispatches received messages to group members.
        Handles message commands, delegating them to the matching command handler.
        Lets command handlers return or broadcast command results.
    """
    def __init__(self, bot_jid, stream):
        self.bot_jid = bot_jid
        self._stream = stream
        self.cmd_handler = CommandHandler(message_bus = self)
        self.admin_cmd_handler = AdminCMDHandler(message_bus = self)
        self._thread_pool = ThreadPool(5)
        self._thread_pool.start()         # start the thread pool
        self.logger = get_logger()
        return

    def make_message(self, to, typ, body):
        """ Build a message
            `to` - recipient JID
            `typ` - message type
            `body` - message body
        """
        if typ not in ['normal', 'chat', 'groupchat', 'headline']:
            typ = 'normal'
        m = Message(from_jid = self.bot_jid, to_jid = to, stanza_type = typ,
                    body = body)
        return m

    def send_to_admin(self, stanza, body):
        """ Send a message to the administrators """
        [self.send_message(stanza, admin, body, True) for admin in ADMINS]

    def send_private_msg(self, stanza, to, body):
        """ Send a private message """
        frm = stanza.from_jid
        nick = get_nick(frm)
        body = "[%s 悄悄对你说] %s" % (nick, body)
        self.send_message(stanza, to, body, True)

    def send_message(self, stanza, to, body, log = False):
        """ Send a message
            `stanza`   - message stanza
            `to`       - recipient; if the recipient is offline, the message is stored for later delivery
            `body`     - message body
            `log`      - whether to record the message in the history
        """
        if log:
            add_history(stanza.from_jid, to, body)
        if is_online(to):
            mode = get_info('mode', to)
            if mode == 'talk' or not mode:
                if isinstance(to, (str, unicode)):
                    to = JID(to)
                self.logger.debug("send '{0}' to {1!r}".format(body, to))
                typ = stanza.stanza_type
                self._stream.send(self.make_message(to, typ, body))
        else:
            body = NOW() + ' ' + body
            self.logger.debug("store offline message'{0}' for {1!r}"
                                    .format(body, to))
            offline_message = get_info('offline_message', to, '')
            offline_message += '\n' +  body
            add_info('offline_message', offline_message, to)

    def send_offline_message(self, stanza):
        """ Deliver messages stored while the user was offline """
        show = stanza.show
        frm = stanza.from_jid
        offline_message = get_info('offline_message', frm)
        if offline_message:
            offline_message = "离线期间的消息:\n" + offline_message
            m = self.make_message(frm, 'normal', offline_message)
            self._stream.send(m)
            set_online(frm, show)
            add_info('offline_message', '', frm)

    def send_all_msg(self, stanza, body):
        """ Send a message to every member except the sender """
        if cityid(body.strip()):
            return self.send_command(stanza, '-_tq ' + body.strip())
        if body.strip() == 'help':
            return self.send_command(stanza, '-help')
        if body.strip() == 'ping':
            return self.send_command(stanza, '-_ping')
        mode = get_info('mode', stanza.from_jid)
        if mode == 'quiet':
            body = u'你处于{0},请使用-cd命令切换到 {1} '\
                    u'后发言'.format(MODES[mode], MODES['talk'])
            return self.send_back_msg(stanza, body)

        add_history(stanza.from_jid, 'all', body)
        members = get_members(stanza.from_jid)
        current = get_info('channel', stanza.from_jid, 'main')
        members = [m for m in members
                   if get_info('channel', m, 'main') == current]
        self.logger.info("{0} send message {1} to {2!r}"
                            .format(stanza.from_jid, body, members))
        nick = get_nick(stanza.from_jid)
        body = "[{0}] {1}".format(nick, body)
        [self.send_message(stanza, m, body) for m in members]

    def send_back_msg(self, stanza, body):
        """ Send a reply back to the sender """
        to = stanza.from_jid.bare().as_string()
        typ = stanza.stanza_type
        self._stream.send(self.make_message(to, typ, body))

    def send_sys_msg(self, stanza, body):
        """ Send a system message """
        members = get_members()
        [self.send_message(stanza, m, body) for m in members]

    def send_command(self, stanza,  body):
        """ Handle a command
            Commands are processed through the thread pool to avoid blocking.
        """
        email = get_email(stanza.from_jid)
        self.logger.info("{0} run command {1}".format(stanza.from_jid, body))
        if email in ADMINS:
            target = self.admin_cmd_handler._run_cmd
        else:
            target = self.cmd_handler._run_cmd
        self._thread_pool.add_job(target, stanza, body)

    def send_status(self, statustext, to = None):
        if to:
            to_jid = JID(to)
            p = Presence(status=statustext, to_jid = to_jid)
        else:
            p = Presence(status = statustext)
        self._stream.send(p)

    def send_subscribe(self, jid):
        """ Send a subscription request """
        p1 = Presence(from_jid = self.bot_jid, to_jid = jid,
                      stanza_type = 'subscribe')
        p = Presence(from_jid = self.bot_jid, to_jid = jid,
                     stanza_type = 'subscribed')
        self._stream.send(p)
        self._stream.send(p1)

    def send_unsubscribe(self, jid):
        p1 = Presence(from_jid = self.my_jid, to_jid = jid,
                      stanza_type = 'unsubscribe')
        p = Presence(from_jid = self.my_jid, to_jid = jid,
                     stanza_type = 'unsubscribed')
        self._stream.send(p)
        self._stream.send(p1)
Example #54
def entities():
    error = None
    if request.method == 'POST':
        query = request.form['query'].lower()
        query = query.replace(" ", "%20")
        if query in app.entity_cache:
            return Response(json.dumps(app.entity_cache[query]), mimetype='application/json')
        url = GOOGLE_NEWS_RSS+query
        response = requests.get(url).text

        # Get article links from the RSS
        links = []
        utils.find_links(links, response)
        if len(links) > 1:
            links = links[2:]

        # Get article titles from the RSS
        titles = []
        utils.find_titles(titles, response)
        if len(titles) > 1:
            titles = titles[2:]

        num_links = len(links)
        entities = []
        pool = ThreadPool(num_links)
        for i in range(num_links):
            pool.add_task(
                utils.get_article_entities, links[i], entities)
        pool.wait_completion()

        result = {}
        num_entities = len(entities)


        entity_dict = defaultdict(int)
        for entity_list in entities:
            for entity in entity_list:
                entity_dict[entity] += 1
        entities_list = reversed(list(sorted(entity_dict.items(), key=lambda x: x[1])))
        entities_list = [x[0] for x in entities_list]
        if len(entities_list) > 5:
            entities_list = entities_list[:5]

        fix_entities = False
        if not entities_list:
            fix_entities = True

        if num_entities > 0:
            entities_set = entities[0]
            if fix_entities:
                if len(entities[0]) > 3:
                    entities_list = [word.title() for word in list(entities[0])[:3]]
                else:
                    entities_list = [word.title() for word in entities[0]]
            result[0] = [word.title() for word in entities[0]]
            for i in range(1, num_entities):
                if fix_entities and len(entities_list) < 3:
                    if len(entities[i]) + len(entities_list) > 3:
                        entities_list.extend([word.title() for word in list(entities[i])[:3-len(entities_list)]])
                    else:
                        entities_list.extend([word.title() for word in entities[i]])
                result[i] = [word.title() for word in entities[i]]

        result["entities"] = entities_list
        app.entity_cache[query] = deepcopy(result)


        return Response(json.dumps(result), mimetype='application/json')
class Crawler(object):

    def __init__(self, args):
        self.thread_num = args.thread_num
        self.output = args.output
        if not os.path.exists(self.output):
            os.mkdir(self.output)

        self.domain_pattern = re.compile(
            r"^([0-9a-zA-Z][0-9a-zA-Z-]{0,62}\.)+([0-9a-zA-Z][0-9a-zA-Z-]{0,62})\.?$")

    def _init(self):
        # thread pool with the configured number of threads
        self.thread_pool = ThreadPool(self.thread_num)
        self.depth = 2
        # marks the current crawl depth, starting from 1
        self.current_depth = 1
        # links that have already been visited
        self.visited_hrefs = set()
        # links still waiting to be visited
        self.unvisited_hrefs = deque()
        # flag marking whether the crawler has started working
        self.is_crawling = False
        self.resource_details = ResourceDetailCollection()

    def _format_url(self, raw_value):
        raw_value_str = raw_value.strip().strip('\n')
        if len(raw_value_str) <= 0:
            return ''
        if not self.domain_pattern.match(raw_value_str):
            return ''
        if not raw_value_str.startswith('http'):
            value = 'http://' + raw_value_str
        else:
            value = raw_value_str
        return value

    def crawl(self, url):
        self._init()
        formatted_url = self._format_url(url)
        self.resource_details.set_main_frame_url(formatted_url)
        self.unvisited_hrefs.append(formatted_url)
        print '\nStart Crawling url %s\n' % formatted_url
        self.is_crawling = True
        self.thread_pool.start_threads()
        while self.current_depth < self.depth + 1:
            # assign tasks; the pool downloads every page of the current depth concurrently (non-blocking)
            self._assigin_current_depth_tasks()
            # wait until the pool has finished all tasks; once the queue drains, one crawl depth is complete
            # self.thread_pool.task_join() could replace the loop below, but then Ctrl-C cannot interrupt it
            while self.thread_pool.get_task_left():
                time.sleep(8)
            print 'Depth %d Finish. Totally visited %d links. \n' % (
                self.current_depth, len(self.visited_hrefs))
            log.info('Depth %d Finish. Total visited Links: %d\n' % (
                self.current_depth, len(self.visited_hrefs)))
            self.current_depth += 1
        # After finishing all the tasks, stop this crawling.
        print "all Tasks has finished"
        self._on_all_tasks_finished()
        self.stop()

    def stop(self):
        self.is_crawling = False
        self.thread_pool.stop_threads()

    def get_already_visited_num(self):
        # visited_hrefs holds links already handed to the task queue, some of which may still be in flight.
        # The true number of visited links is therefore len(visited_hrefs) minus the tasks still pending.
        return len(self.visited_hrefs) - self.thread_pool.get_task_left()

    def _on_all_tasks_finished(self):
        resource_detail_data = unicode(json.dumps(
            self.resource_details.to_json_data(), indent=4))
        hashed_file_name = hashlib.new("md5",
                                       self.resource_details.main_frame_url).hexdigest() + ".json"
        resource_detail_dataPath = os.path.join(self.output, hashed_file_name)
        with io.open(resource_detail_dataPath, 'w') as file:
            file.write(unicode(resource_detail_data))

    def _assigin_current_depth_tasks(self):
        mylock.acquire()
        copied_unvisited_hrefs = deque()
        while self.unvisited_hrefs:
            copied_unvisited_hrefs.append(self.unvisited_hrefs.popleft())
        mylock.release()
        while copied_unvisited_hrefs:
            url = copied_unvisited_hrefs.popleft()
            # mark this link as visited (or about to be visited) so the same link is not fetched twice
            self.visited_hrefs.add(url)
            # hand the task to the task queue
            self.thread_pool.put_task(self._task_handler, url)

    def _task_handler(self, url):
        # fetch the page source and then save it; both operations block heavily, so leave them to the worker threads
        url_fetcher = URLFetcher(url)
        retry = 1
        if url_fetcher.fetch(retry):
            self._save_task_results(url, url_fetcher)
            self._add_unvisited_hrefs(url_fetcher)

    def _save_task_results(self, url, url_fetcher):
        print 'Visited URL : %s \n' % url
        response_headers = url_fetcher.get_response_headers()
        response_detail = ResourceDetail(url,
                                         url_fetcher.request_time,
                                         url_fetcher.response_time,
                                         response_headers)
        mylock.acquire()
        self.resource_details.add_detail(response_detail)
        mylock.release()

    def _add_unvisited_hrefs(self, url_fetcher):
        '''Add unvisited links: put every valid url into the unvisited_hrefs list.'''
        # filter the links: 1. only fetch http or https pages; 2. make sure each link is visited only once
        url, page_source = url_fetcher.get_data()
        hrefs = self.get_all_resource_hrefs(url, page_source)
        mylock.acquire()
        for href in hrefs:
            if self._is_http_or_https_protocol(href):
                if not self._is_href_repeated(href):
                    self.unvisited_hrefs.append(href)
        mylock.release()

    def get_all_resource_hrefs(self, url, page_source):
        '''Parse the HTML source and collect every link on the page. Returns a list of links.'''
        hrefs = []
        soup = BeautifulSoup(page_source)
        results = soup.find_all(True)

        for tag in results:
            href = None
            if tag.name == 'a':
                continue
            # links must be encoded as utf8: Chinese file links such as http://aa.com/文件.pdf
            # are not url-encoded automatically by bs4, which would otherwise cause an encode exception
            if tag.has_attr('href'):
                href = tag.get('href').encode('utf8')
            elif tag.has_attr('src'):
                href = tag.get('src').encode('utf8')
            if href is not None:
                if not href.startswith('http'):
                    href = urljoin(url, href)  # resolve relative links
                hrefs.append(href)
        return hrefs

    def _is_http_or_https_protocol(self, href):
        protocal = urlparse(href).scheme
        if protocal == 'http' or protocal == 'https':
            return True
        return False

    def _is_href_repeated(self, href):
        if href in self.visited_hrefs or href in self.unvisited_hrefs:
            return True
        return False
Example #56
def main():

    url_task_queue = Queue.Queue()
    thread_searcher_pool = ThreadPool(500, url_task_queue)
    thread_searcher_pool.start()
    topic_searcher_urls_pool = ThreadPool(200)
    topic_searcher_urls_pool.start()
    topic_threader = TopicSearcherThreader(
        thread_searcher_pool,
        topic_searcher_urls_pool,
        build_url_job,
        HenHenLuTopicUrlExer,
        BASE_URL,
        START_NUM,
        END_NUM,
        PIC_TYPE,
        )
    topic_threader.start()

    while True:
        print '#######################################'
        print 'URL has task count: %s' \
            % str(thread_searcher_pool.current_task_count())
        print '#######################################'
        print '#######################################'
        print 'TOPIC has task count: %s' \
            % str(topic_searcher_urls_pool.current_task_count())
        print '#######################################'

        time.sleep(5)

    topic_threader.join()
    topic_searcher_urls_pool.wait_done()
    thread_searcher_pool.wait_done()
Example #57
            l = []
            for i in range(0, len(item)):
                c = item[i].decode("gb2312")
                if i == 0:
                    l.append(c)
                else:
                    if c[0] == "&":
                        l.append(0)
                    else:
                        l.append(1)
            rooms.append(l)
        f = open("data/" + campus + "." + building + "." + week + "." + week_day + ".json", "w")
        f.write(json.dumps(rooms))
        f.close()
        print "finish: week:" + week + " week_day:" + week_day
        return "success"


if __name__ == "__main__":
    s = Spider()
    s.cookies = {"JSESSIONID": "8B7DA565F71772D37B04170241A757A8.TAB2;"}
    pool = ThreadPool(size=20)
    pool.start()

    for week in range(1, 21):
        for week_day in range(1, 8):
            print "start week:" + str(week) + " week_day:" + str(week_day)
            # make sure the campus id and building id in info.py are correct
            # then change the campus and building ids here according to the data in info.py
            pool.append_job(s.craw, "1709", "1783", str(week), str(week_day))
    pool.join()
Example #58
class Crawler():
    def __init__(self, myconfig):
        # thread pool
        self.thread_pool = ThreadPool(myconfig.threadnum)
        # set of URLs that have already been visited
        self.visited_urls = set()
        # set is not thread-safe, so guard it with a lock
        self.visited_urls_lock = threading.Lock()
        # URLs that have not been visited yet
        self.will_visited_urls = deque()
        self.will_visited_urls.append(myconfig.url)
        self.temp_q = deque()
        self.cur_depth = 0
        self.status = ""
        self.myconfig = myconfig
        MyLogger(myconfig.logfile, myconfig.loglevel)
        #MyLogger(myconfig.logfile, loglevel = 5)  # debug
        self.db = Db()
        
    
    def start(self):
        self.status = "start"
        while self.cur_depth < self.myconfig.depth:
            if self.status == "stop":
                break
            try:
                while self.will_visited_urls:
                    url = self.will_visited_urls.popleft()
                    # enqueue the job; this barely blocks because the main thread only
                    # enqueues work, while the actual work is done in the worker threads
                 
                    self.thread_pool.add_job(self.handler, url)
                #
                # TODO:
                # notify the threads that there is work to do; note that the workers are only
                # notified after every url in will_visited_urls has been enqueued, which seems
                # rather coarse-grained. To save time, the pool could be notified as soon as the
                # number of urls reaches the initial thread count, and only wait for the whole
                # batch when there are fewer urls than threads.

                #print ">>>>>>>>  give event to threads in thread pool"
                # notify the threads in the pool to start a new round of crawling
                self.thread_pool.event_do_job()
                # yield the scheduler so the worker threads get a chance to run
                time.sleep(3)
            except Empty:
                # no more urls left to visit right now
                logging.info("no url right now")
            finally:
                
                # the current depth only counts as finished once every thread in the pool is done.
                # The handling here: while the pool still has busy threads, sleep for a while and check again,
                # stopping only when the number of working threads drops to zero;
                # only then is this depth of the crawl complete
                while True:
                    #print "thread waiting num is %d, config thread num is %d" % (self.thread_pool.get_thread_waiting_num(), self.myconfig.thread)
                    if self.thread_pool.get_thread_waiting_num() == self.myconfig.threadnum:
                        # if the number of waiting threads equals the initial thread count,
                        # every thread has finished its work, so break
                        break
                    else:
                        # some threads are still running, so this depth is not finished yet;
                        # sleep and keep waiting
                        time.sleep(10)
                # this depth is finished; increase the depth by one
                self.cur_depth += 1
                logging.info("crawler depth now is %s" % str(self.cur_depth))
                if self.cur_depth > self.myconfig.depth:
                    break
                # pages discovered while crawling the urls were all collected in temp_q;
                # hand temp_q back to will_visited_urls and continue
                self.will_visited_urls = self.temp_q
                self.temp_q = deque()
                
                
        # every depth has been crawled, or the crawler is shutting down
        self.thread_pool.stop_threads()
        logging.info("crawler exit")
        return
        
            
    def handler(self, url):
        content = self.get_html_content(url)
        if content == "" or content == None:
            # the content could not be fetched, just return
            return
        # mark this url as visited
        self.add_url_to_visited(url)
        if content.find(self.myconfig.key) != -1:
            self.db.save_data(url, self.myconfig.key, content)
        try:
            hrefs = self.get_hrefs(content, url)
        except StandardError, se:
            logging.error("error: %s" % (se))
            print se
            # log
            # hrefs could not be extracted
            return
        # if hrefs were extracted
        if hrefs:
            # add the hrefs to temp_q; they will be visited once the current depth is finished
            for link in hrefs:
                # the final check
                if not self.is_url_visited(link) \
                            and link not in self.will_visited_urls \
                            and link not in self.temp_q:
                    #print "put %s into temp_q" % link 
                    self.temp_q.append(link)
Example #59
# Get article titles from the RSS
titles = []
utils.find_titles(titles, response)
if len(titles) > 2:
    titles = titles[2:]

num_links = len(links)
num_titles = len(titles)
if num_titles < num_links:
    links = links[:num_titles]
if num_links < num_titles:
    titles = titles[:num_links]

articles = {}
pool = ThreadPool(num_links)
for i in range(num_links):
    pool.add_task(
        utils.get_article_sentiment, links[i], titles[i], articles)
pool.wait_completion()

sentiments = []

result = {}
result["articles"] = []
for key in articles:
    info = {}
    info["title"] = articles[key]["title"]
    info["link"] = key
    sentiments.append(articles[key]["sentiment"])
    info["sentiment"] = articles[key]["sentiment"]
Example #60
 def _init_threadpool(self):
     self.pool = ThreadPool(int(self.thread_pool_num), int(self.queue_size))