Example #1
 def port_test(self, port):
     # start testing the specified port.
     sub_record = []
     # this should use map to reduce the time taken.
     pool = ThreadPool(len(self.urls))
     results = pool.map(self.url_test, self.urls)
     return sum(results)
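The pattern above (map a test function over self.urls with one worker thread per URL, then sum the results) is easier to see in a minimal, self-contained sketch; url_test below is a hypothetical stand-in, not the original method:

from multiprocessing.dummy import Pool as ThreadPool

def url_test(url):
    # hypothetical stand-in for self.url_test: 1 if the URL "passes", 0 otherwise
    return 1 if url.startswith("https") else 0

urls = ["https://example.com", "http://example.org", "https://example.net"]
pool = ThreadPool(len(urls))        # one worker thread per URL, as in the snippet above
results = pool.map(url_test, urls)  # blocks until every url_test call has returned
pool.close()
pool.join()
print(sum(results))                 # number of URLs that passed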
    async def scrawl(self, threads=5):

        logger.log('Scrawling Trackemon..', 'green')
        await self.client.wait_until_ready()

        # collect the ids of the channels to post to
        shout_out_channels = []
        for server in self.client.servers:
            for channel in server.channels:
                if channel.name in self.config.get('scrawl_channels', []):
                    shout_out_channels.append(discord.Object(channel.id))

        if len(shout_out_channels) == 0:
            raise Exception("No channel to shout out!")

        while not self.client.is_closed:
            logger.log('Scrawling Trackemon..', 'green')

            self._retrieve_session_id()

            # use a thread pool for the lookups
            if 'pokemons' in self.config.get('scrawl_trackemon'):
                pokemon_names = self.config.get('scrawl_trackemon')['pokemons']

                pool = ThreadPool(threads)
                messages = pool.starmap(self.scrawl_trackemon, zip(
                    pokemon_names, itertools.repeat(self.session_id)))

                for message in messages:
                    if len(message):
                        for channel in shout_out_channels:
                            await self.client.send_message(channel, message)

            # wait for the configured delay before the next crawl
            await asyncio.sleep(self.config.get('delay_scrawl', 300))
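The starmap call above pairs every pokemon name with the same session id via zip and itertools.repeat. A minimal sketch of that pattern, with purely illustrative names and values:

import itertools
from multiprocessing.dummy import Pool as ThreadPool

def build_message(name, session_id):
    # illustrative worker; the snippet above would query Trackemon here instead
    return "{} (session {})".format(name, session_id)

names = ["pikachu", "snorlax", "dratini"]
pool = ThreadPool(3)
messages = pool.starmap(build_message, zip(names, itertools.repeat("abc123")))
pool.close()
pool.join()
print(messages)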
	def __init__(self,records,outfile):
		#cat_xml = "../itma.cat.soutron_20160216.xml"
		print 'Parsing XML data...'
		#records = etree.parse(cat_xml)
		self.counter = 0
		self.roles = ['performer']
		self.locations = []
		cfields = []
		refnos = []

		self.records = records
		[[cfields.append(y) for y in x.xpath('*[self::Creator or self::Contributors]')] for x in records]
		[[self.locations.append(y) for y in x.xpath('GeographicalLocation/text()')] for x in records]
		
		map(self.parse_roles,cfields)
		self.roles = list(set([x.replace('\n','').strip() for x in filter(lambda x:len(x) > 1,self.roles)]))
		self.locations = list(set(self.locations))

		self.role_list = etree.Element("NamedRoles")

		self.cache = {}

		print 'extracting roles...'
		pool = Pool(processes=4)
		pool.map(self.process_recordlist,records)
			
		return etree.ElementTree(self.role_list).write(outfile,pretty_print=True)
 def assync_users_proceed(self, users_pool, threads):
     pool = ThreadPool(threads)
     try:
         full_users = pool.map(self.get_user_info, users_pool)
     except Exception, e:
         print e
         full_users = []
def download_urls_to_zip(zf, urls):

    urls = set(urls)

    pool = ThreadPool(10)
    download_to_zip_func = lambda url: download_url_to_zip(zf, url)
    pool.map(download_to_zip_func, urls)
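download_url_to_zip is not shown in this example. Below is an assumed sketch of what such a helper could look like (not the original implementation); since writes to a shared ZipFile are not thread-safe, the sketch serializes them with a lock, and the __main__ block calls the download_urls_to_zip function defined above:

import threading
import zipfile
import urllib.request
from multiprocessing.dummy import Pool as ThreadPool  # also needed by the snippet above

_zip_lock = threading.Lock()

def download_url_to_zip(zf, url):
    # assumed helper: fetch the URL body and store it in the zip under a derived name
    data = urllib.request.urlopen(url, timeout=30).read()
    name = url.rstrip("/").split("/")[-1] or "index.html"
    with _zip_lock:  # zipfile writes are not thread-safe, so serialize them
        zf.writestr(name, data)

if __name__ == "__main__":
    with zipfile.ZipFile("pages.zip", "w") as zf:
        download_urls_to_zip(zf, ["https://example.com/", "https://example.org/"])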
Example #6
 def generate_k_clusters(self,folder,size):
   pool = ThreadPool(cpu_count())
   self.size = size
   result = pool.map(self.read_image, folder)
   self.cluster = [r[0] for r in result if r[0]]
   self.data = [r[1] for r in result if r[1]]
   self.end = [r[2] for r in result if r[2]]
Example #7
def test_multi_threading():
    import time
    import random
    from multiprocessing.dummy import Pool

    def op_a(a, b):
        time.sleep(random.random()*.02)
        return a+b

    def op_b(c, b):
        time.sleep(random.random()*.02)
        return c+b

    def op_c(a, b):
        time.sleep(random.random()*.02)
        return a*b

    pipeline = compose(name="pipeline", merge=True)(
        operation(name="op_a", needs=['a', 'b'], provides='c')(op_a),
        operation(name="op_b", needs=['c', 'b'], provides='d')(op_b),
        operation(name="op_c", needs=['a', 'b'], provides='e')(op_c),
    )

    def infer(i):
        # data = open("616039-bradpitt.jpg").read()
        outputs = ["c", "d", "e"]
        results = pipeline({"a": 1, "b":2}, outputs)
        assert tuple(sorted(results.keys())) == tuple(sorted(outputs)), (outputs, results)
        return results

    N = 100
    for i in range(20, 200):
        pool = Pool(i)
        pool.map(infer, range(N))
        pool.close()
Example #8
def put_from_manifest(
    s3_bucket,
    s3_connection_host,
    s3_ssenc,
    s3_base_path,
    aws_access_key_id,
    aws_secret_access_key,
    manifest,
    bufsize,
    concurrency=None,
    incremental_backups=False,
):
    """
    Uploads files listed in a manifest to Amazon S3.
    To support files larger than 5GB, multipart upload is used (chunks of 60MB).
    Files are uploaded compressed with lzop; the .lzo suffix is appended.
    """
    bucket = get_bucket(s3_bucket, aws_access_key_id, aws_secret_access_key, s3_connection_host)
    manifest_fp = open(manifest, "r")
    buffer_size = int(bufsize * MBFACTOR)
    files = manifest_fp.read().splitlines()
    pool = Pool(concurrency)
    for _ in pool.imap(
        upload_file, ((bucket, f, destination_path(s3_base_path, f), s3_ssenc, buffer_size) for f in files)
    ):
        pass
    pool.terminate()

    if incremental_backups:
        for f in files:
            os.remove(f)
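The loop above iterates pool.imap purely for its side effects, feeding each worker a tuple of arguments from a generator. A stripped-down sketch of that pattern, with the real S3 upload replaced by a print so it stays self-contained:

from multiprocessing.dummy import Pool

def upload_file(args):
    # stand-in for the real uploader: unpack the tuple yielded by the generator below
    bucket, path, destination = args
    print("uploading {} -> {}/{}".format(path, bucket, destination))

files = ["a.db", "b.db", "c.db"]
pool = Pool(4)
# imap is lazy, so the loop below is what actually drives the uploads
for _ in pool.imap(upload_file, (("my-bucket", f, "backups/" + f) for f in files)):
    pass
pool.terminate()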
Example #9
def main():
    pool = ThreadPool(4)
    terms_to_articles = {}

    t0 = time()

    for term in search_terms:
        print("Getting articles for {}...".format(term))
        article_urls = get_articles_urls_for(term)
        articles = pool.map(get_article, article_urls)
        terms_to_articles[term] = articles

    print("Fetching articles took {:.1f} seconds".format(time() - t0))

    for term in search_terms:
        articles = terms_to_articles[term]
        print("Articles for {} ({}):".format(term, len(articles)))
        for article in articles:
            print(u"  == {} ==".format(article.title))
            print(u"  {}...".format(article.text[:70]))
            print(u"  - {}".format(article.url))
            print

    with open('articles.pickle', 'wb') as f:
        pickle.dump(terms_to_articles, f)
Example #10
def test_upload_chunk__expired_url():
    upload_parts = [{'uploadPresignedUrl': 'https://www.fake.url/fake/news',
                     'partNumber': 420},
                    {'uploadPresignedUrl': 'https://www.google.com',
                     'partNumber': 421},
                    {'uploadPresignedUrl': 'https://rito.pls/',
                     'partNumber': 422},
                    {'uploadPresignedUrl': 'https://never.lucky.gg',
                     'partNumber': 423}
                    ]

    value_doesnt_matter = None
    expired = Value(c_bool, False)
    mocked_get_chunk_function = MagicMock(side_effect=[1, 2, 3, 4])

    with patch.object(multipart_upload, "_put_chunk",
                      side_effect=SynapseHTTPError("useless message",
                                                   response=MagicMock(status_code=403))) as mocked_put_chunk, \
         patch.object(warnings, "warn") as mocked_warn:
        def chunk_upload(part):
            return _upload_chunk(part, completed=value_doesnt_matter, status=value_doesnt_matter, syn=syn,
                                 filename=value_doesnt_matter, get_chunk_function=mocked_get_chunk_function,
                                 fileSize=value_doesnt_matter, partSize=value_doesnt_matter,
                                 t0=value_doesnt_matter, expired=expired, bytes_already_uploaded=value_doesnt_matter)
        # 4 threads, all with urls that have expired
        mp = Pool(4)
        mp.map(chunk_upload, upload_parts)
        assert_true(expired.value)

        # assert warnings.warn was only called once
        mocked_warn.assert_called_once_with("The pre-signed upload URL has expired. Restarting upload...\n")

        # assert _put_chunk was called at least once
        assert_greater_equal(len(mocked_put_chunk.call_args_list), 1)
Example #11
def BurstDz(host, path, user, passfile):
    hostuser = host.split('.')
    hostuser = hostuser[len(hostuser)-2]
    hostdir = [hostuser,hostuser+hostuser,'admin'+hostuser,hostuser+'123','manage'+hostuser,hostuser+'123456',hostuser+'admin','123'+hostuser]

    opts_list = []

    f = open(passfile, 'r')
    password = f.read().split()
    dic = password+hostdir
    pool = ThreadPool(10)
    host1 = host+path

    for x in range(len(dic)):
        mima = dic[x]
        opts = {
            'host': host1,
            'user': user,
            'password': mima
        }
        opts_list.append(opts)

    #print hostr
    #print result
    pool.map(LoginDisCuz, opts_list)
    #pool.join()
    print 'All PassWord Run Over'
Example #12
def format_data(dealer_data):
     start_time = time.time()
     pool = Pool(1)
     dealers=[]
     today = datetime.now()
     for data in dealer_data:
         temp = {}
         temp['id'] = data[0]
         temp['service_advisor_id'] = data[1]
         temp['name'] = data[2]
         temp['phone_number'] = data[3]
         temp['order'] = data[4]
         temp['password'] = data[1]+'@123'
         temp['last_login'] = today
         temp['is_superuser'] = 0
         temp['username'] = data[1]
         temp['first_name'] = ' '
         temp['last_name'] = ' '
         temp['email'] = ''
         temp['is_staff'] = 0
         temp['is_active'] = 1
         temp['date_joined'] = today
         dealers.append(temp)
     pool.map(process_query, dealers)
     end_time = time.time()
     print "..........Total TIME TAKEN.........", end_time-start_time
Example #13
    def _get_item_data(self, itemids, threads=-1):
        """
        Get metadata for many items.

        :param itemids: item numbers
        :param threads: number of cpu threads to use

        :type itemids: list
        :type threads: int
        """
        self.info('getting data')
        self.info('threads = %d', threads)

        # threads make it faster but I've seen it freeze so disabling this for now
        if threads > 1:
            threads = 1
            self.error('multiprocessing seems fishy')
            self.error('setting threads=1')

        # get data from itemids
        if threads > 1:
            from multiprocessing.dummy import Pool as ThreadPool
            import itertools
            params = zip(itemids, range(len(itemids)), itertools.repeat(len(itemids)))
            pool = ThreadPool(threads)
            data = pool.map(self._get_item_data_for_itemid_map, params)
            data = {d['itemid'] : d for d in data}
        else:
            data = {}
            for i, itemid in enumerate(itemids):
                data[itemid] = self._get_item_data_for_itemid(itemid, index=i, total=len(itemids))

        return data
Example #14
 def getAllProducts(self):
     """
     multithreaded
     returns a dictionary of information
         {skus}
         skus is a dictionary with many keys and values
         refer to output.txt to see what information it holds
     """
     skus = {}
     page = 1
     num_pages = 8
     r = None
     found_empty = False
     pool = ThreadPool(num_pages)
     while not found_empty:
         pages = range(page, page + num_pages)
         results = pool.map(lambda x: self._listProducts(page=x), pages)
     # print(results)
         for r in results:
             if str(r.status_code) == "204":
                 found_empty = True
                 break
             if str(r.status_code).startswith("4"):
                 raise Exception("Error {}: {}.".format(r.status_code, BigCommerce.error_codes[int(r.status_code)]))
             temp_data = r.json()
             for item in temp_data:
                 sku = item["sku"]
                 skus[sku] = item
             page += 1
     return {"skus": skus}
Example #15
def update_proxy_pool(test_url, timeout, proxy_pool, ready_flag, interval):
    """
    Task run by the daemon process: periodically refresh the proxy pool.
    Note that each refresh itself takes ten-odd seconds,
    so "periodically" here refers to the interval between refreshes.
    """
    while 1:
        proxy_list = get_proxies(test_url, timeout)  # fetch the new proxy list
        # pick out the old proxies that are not in the new proxy list
        pre_test_list = proxy_pool.keys()
        pre_test_list.remove(None)
        for proxy in proxy_list:
            if proxy in proxy_pool:  # if this old proxy is also in the new list, skip testing it
                pre_test_list.remove(proxy)
        # test the remaining old proxies and drop any that respond too slowly
        if len(pre_test_list) > 0:
            pool = Pool(16)  # create a thread pool
            kwargs = [{'test_url': test_url, 'proxy': proxy, 'timeout': timeout} for proxy in pre_test_list]  # pack the arguments
            response_time_list = pool.map(multi_test_wrapper, kwargs)  # test in parallel
            for i in xrange(len(pre_test_list)):  # drop old proxies that respond too slowly
                if response_time_list[i] > timeout:
                    del proxy_pool[pre_test_list[i]]
        # merge the new and old proxy lists
        for proxy in proxy_list:
            if proxy not in proxy_pool:  # if a new proxy is not yet in the pool, initialize it
                proxy_pool[proxy] = 0
        ready_flag.value = True
        print('Proxy pool updated; it now holds', len(proxy_pool), 'proxies')
        sleep(interval)  # wait for the interval before the next refresh
Example #16
def main():
    mht_list = get_mht_list()
    if not mht_list:
        print u'Please make sure there are mht files in this directory\n'
        return
    print u'Images from %s mht files need to be backed up\n'%len(mht_list)

    print u'Please enter your QQ number (6-10 digits):'
    qq = raw_input()
    print u'Searching the mht files for images to back up, please wait....'
    get_mht_pic(mht_list)
    if not mht_pic_md5:
        print u'The mht files contain no images to back up\n'
        return
    print u'Found %s images to back up'%len(mht_pic_md5)
    # QQ image folder
    documents_path = os.getenv('USERPROFILE') + os.sep + '\Documents'
    img_path = documents_path + os.sep + 'Tencent Files/' + qq + '/Image'
    print u'Collecting QQ chat history images, please wait....'
    pic_list = get_pic_list(img_path)
    if not pic_list:
        print u'QQ chat image folder not found; please make sure you entered the correct QQ number\n'
        main()

    pool = ThreadPool(thread_num)
    print u'Backing up....'
    pool.map(backup, pic_list)
    print u'Backup complete\nImages are saved in the bak folder under the current path\n'
def runLocalCommands(args, outputDir, commands):
    # NOTE: this is going to BREAK meff optimisation if we re-cycle histograms.
    # Needs to be updated to run in successive order if we implement that.
    N = len(commands)

    if N > 50:
        print("")
        print("Are you sure you want to run %d commands locally?" % N)
        if args.dry_run:
            print("[NB: this is a dry run]")
        var = input("Press enter to continue")
        print("")

    cmds = []
    for i, x in enumerate(commands):
        (cuts, name, cmd) = x
        cmd = "cd %s && echo '%d/%d\t%s' && %s 2>&1 >/dev/null" % (outputDir, i+1, N, cmd, cmd)
        cmds.append(cmd)

    if args.dry_run:
        print("Would run following commands:")
        for cmd in cmds:
            print("   %s" % cmd)
        return

    pool = Pool(10) # concurrent commands at a time
    for i, returncode in enumerate(pool.imap(partial(subprocess.call, shell=True), cmds)):
        if returncode != 0:
           print(("%d command failed: %d" % (i, returncode)))
def test_generate_in_progress_resizer_option_true(
    redis_cache,
    resizetarget_opts,
    image1_data,
    image1_name,
    tmpdir
):
    config = Config(
        root=str(tmpdir),
        url='/',
        redis_host=redis_cache.redis,
        raise_on_generate_in_progress=True
    )
    resizer = flask_resize.make_resizer(config)

    # Save original file
    resizer.storage_backend.save(image1_name, image1_data)

    def run(x):
        return resizer(image1_name)

    pool = Pool(2)

    with pytest.raises(flask_resize.exc.GenerateInProgress):
        pool.map(run, [None] * 2)
Example #19
    def test_multithread(self):
        logger = HitLogger(300)

        def single_thread_process(logger):
            time.time = mock.Mock(return_value=0)
            logger.log_hit()
            time.sleep(1)
            time.time = mock.Mock(return_value=10)
            logger.get_hits()
            logger.log_hit()
            logger.get_hits()
            logger.log_hit()
            time.sleep(1)
            time.time = mock.Mock(return_value=11)
            logger.get_hits()
            time.sleep(1)
            time.time = mock.Mock(return_value=100)
            logger.log_hit()
            time.sleep(1)
            time.time = mock.Mock(return_value=200)
            logger.log_hit()
            logger.get_hits()
            time.sleep(1)
            time.time = mock.Mock(return_value=300)
            logger.log_hit()
            logger.get_hits()
            time.sleep(1)
            time.time = mock.Mock(return_value=310)
            logger.get_hits()

        pool = Pool(5)
        pool.map(single_thread_process, [ logger ] * 5)

        input('Press any key to exit...')
Example #20
def run(node):
    """
    Primary entry-point for running this module.

    :param node: dict
    {
        "url": "https://some-site.com"
    }

    :return:
    {
        document_url: metadata,
        ...
    }
    :rtype:  dict
    """
    mapper    = lambda x: redis_load(x, r)
    url       = node.get('url', 'http://www.cic.gc.ca')
    pool      = ThreadPool(32)
    docs      = redis_docs(url, r)
    metadata  = pool.map(mapper, docs)
    return {
        url2pathname(k): v
            for k,v in metadata if v
    }
Example #21
    def test_threading(self):
        pool = ThreadPool(4)
        results = pool.map(self.parser.parse, Dictionary().version.terms)
        self.assertSetEqual({str(t) for t in results}, {'[{0}]'.format(str(t)) for t in Dictionary().version.terms})

        results = pool.map(script, Dictionary().version.terms)
        self.assertSetEqual({str(t) for t in results}, set(Dictionary().version.terms))
Example #22
def multithread(function, items, extra_variable, threads=2):
    """ Takes the main function to run in parallel, inputs the variable(s) and returns the results.
    :param function: The main function to process in parallel.
    :param items: A list of strings that are passed into the function for each thread.
    :param extra_variable: One additional variable that can be passed into the function.
    :param threads: The number of threads to use. The default is 2, but the threads are not CPU core bound.
    :return: The results of the function passed into this function.
    """

    if __name__ == '__main__':

        # """ A CPU core dependent multiprocessing technique.
        # The synchronized variant, which locks the main program until a process is finished. Order is retained. """
        # pool = Pool(threads)
        # results = [pool.apply(function, args=(item, extra_variable)) for item in items]
        # pool.close()
        # pool.join()

        # """ A thread dependent multiprocessing technique. Theoretically, an unlimited number of threads can be used.
        # The synchronized variant, which locks the main program until a process is finished. Order is retained. """
        # pool = ThreadPool(threads)
        # results = [pool.apply(function, args=(item, extra_variable)) for item in items]
        # pool.close()
        # pool.join()

        """ A thread dependent multiprocessing technique. Theoretically, an unlimited number of threads can be used.
        The async variant, which submits all processes at once and retrieves the results as soon as they finish. """
        pool = ThreadPool(threads)
        output = [pool.apply_async(function, args=(item, extra_variable)) for item in items]
        results = [p.get() for p in output]

        return results
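A hedged usage sketch for the function above; fetch_length is made up purely for illustration, and this assumes it lives in the same script as multithread (which itself needs ThreadPool imported). Because of the __name__ guard inside multithread, results only come back when run as a script:

from multiprocessing.dummy import Pool as ThreadPool

def fetch_length(item, suffix):
    # illustrative worker: combine the item with the extra variable
    return len(item + suffix)

if __name__ == '__main__':
    items = ['alpha', 'beta', 'gamma']
    lengths = multithread(fetch_length, items, '_checked', threads=4)
    print(lengths)  # e.g. [13, 12, 13]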
Example #23
def test_thread(data_array, word_list):

    def test_update_line(line):
        if len(line) == 1:
            return line
        else:
            for i in range(len(word_list)):
                for j in range(len(line)-1):
                    if line[j] == word_list[i][0] and line[j+1] == word_list[i][1]:
                        line[j] = line[j] + line[j+1]
                        line[j+1] = ''
            return line

    print data_array
    IS_MUTI_THREAD = True
    MUTI_THREAD_NUM = 3
    if IS_MUTI_THREAD:
        from multiprocessing.dummy import Pool as ThreadPool
    if IS_MUTI_THREAD:
        pool = ThreadPool(MUTI_THREAD_NUM)
        pool.map(test_update_line, data_array)
        data_array = [filter(lambda x:x!='',line) for line in data_array]
    else:
        # for i in range(len(data_array)):
            # data_array[i] = filter(lambda x:x!='', test_update_line(data_array[i]))
        data_array = [filter(lambda x:x!='', test_update_line(line)) for line in data_array]

    print data_array
Example #24
def main():
    mht_list = get_mht_list()
    if not mht_list:
        print u'Please make sure there are mht files in this directory\n'
        return
    print u'Images from %s mht files need to be backed up\n'%len(mht_list)

    print u'Please enter your QQ number (6-10 digits):'
    qq = raw_input()
    print u'Searching the mht files for images to back up, please wait....'
    get_mht_pic(mht_list)
    if not mht_pic_md5:
        print u'The mht files contain no images to back up\n'
        return
    print u'Found %s images to back up'%len(mht_pic_md5)
    # QQ image folder
    key = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER, r'Software\Microsoft\Windows\CurrentVersion\Explorer\User Shell Folders')
    documents_path = _winreg.QueryValueEx(key, 'Personal')[0]
    img_path = documents_path + os.sep + 'Tencent Files/' + qq + '/Image'
    print u'Collecting QQ chat history images, please wait....'
    pic_list = get_pic_list(img_path)
    if not pic_list:
        print u'QQ chat image folder not found; please make sure you entered the correct QQ number\n'
        main()

    pool = ThreadPool(thread_num)
    print u'Backing up....'
    pool.map(backup, pic_list)
    print u'Backup complete\nImages are saved in the bak folder under the current path\n'
Example #25
def audio_convert(filename):
    # This combines the cutting and the conversions

    cut_files = {}
    text = {}
    
    error_file = open('error.txt', 'w')
    error_file.write(filename)
    for speed in ['slow', 'fast']:
        if speed == 'slow':
            cut_files[speed] = cut_wave(filename, 0.70)
        else:
            cut_files[speed] = cut_wave(filename, 0.85) 
        # assert(False)
        pool = ThreadPool(processes = len(cut_files[speed]))
        text[speed] = pool.map(chunk_convert, cut_files[speed])
        pool.close()
        # text[speed] = [chunk_convert(x) for x in cut_files[speed]]
        print "Closed a pool"
        # Clear out the temporary files created
        for x in cut_files[speed]:
            os.remove(x)

    text = text['slow'] + text['fast']
    text = [x for x in text if len(x) > 0]
    return(text)
Example #26
def find_process_files(root_dir):
    lock = Lock()
    pool = Pool()

    hash_db = load_hashes(HASH_FILE)
    # Keep changed .pxi hashes in a separate dict until the end
    # because if we update hash_db and multiple files include the same
    # .pxi file the changes won't be detected.
    pxi_hashes = {}

    jobs = []

    for cur_dir, dirs, files in os.walk(root_dir):
        for filename in files:
            in_file = os.path.join(cur_dir, filename + ".in")
            if filename.endswith('.pyx') and os.path.isfile(in_file):
                continue
            for fromext, function in rules.items():
                if filename.endswith(fromext):
                    toext = ".c"
                    with open(os.path.join(cur_dir, filename), 'rb') as f:
                        data = f.read()
                        m = re.search(br"^\s*#\s*distutils:\s*language\s*=\s*c\+\+\s*$", data, re.I|re.M)
                        if m:
                            toext = ".cxx"
                    fromfile = filename
                    tofile = filename[:-len(fromext)] + toext
                    jobs.append((cur_dir, fromfile, tofile, function, hash_db, pxi_hashes, lock))

    for result in pool.imap(lambda args: process(*args), jobs):
        pass

    hash_db.update(pxi_hashes)
    save_hashes(hash_db, HASH_FILE)
Example #27
def abortable_func(func, *args, **kwargs):
	"""
	The abortable_func is a wrapper function: it wraps the given function "func", calls
	  it in a background thread (multiprocessing.dummy.Thread), and terminates it after
	  "timeout" seconds.
	This function is inspired by 
	  http://stackoverflow.com/questions/29494001/how-can-i-abort-a-task-in-a-multiprocessing-pool-after-a-timeout
	  but is an improvement over the original solution, since the original solution is only 
	  applicable to a function that takes positional arguments.

	Parameters of the function:
	  func - the function that will be called, and terminated if it does not return within "timeout" seconds
	  *args - positional arguments of "func"
	  **kwargs - named arguments of "func" + "timeout" value
	"""
	
	#- Get "timeout" value and create a ThreadPool (multiprocessing.dummy.Pool) 
	#  with only 1 worker. 
	#- Use functools.partial (https://docs.python.org/3/library/functools.html)
	#  to fit all the arguments of the func into the interface of
	#  Pool.apply_async function
	timeout = kwargs.pop('timeout', None);
	p = ThreadPool(1);
	partial_func = partial(func,**kwargs);
	res = p.apply_async(partial_func,args);

	#- Terminate the thread if it does not return after "timeout" seconds
	#  otherwise return the returned value of func
	try:
		out = res.get(timeout);
		return out
	except TimeoutError:
		p.terminate()
		return "{}:Timeout exceeded. Process terminated.\r\n".format(args[0]);
Example #28
def start():

    CSVFile(header=['Artist', 'Album', 'Genre', 'Style', 'Year', 'Rating'])
    page = 1
    page_not_found = None
    while page_not_found == None:

        try:
            print('Page', page)

            pitchfork_page = Grab()
            pitchfork_page.go(PITC_URL + str(page))
            soup = Soup(pitchfork_page.doc.select('//div[@id="main"]/ul[@class="object-grid "]').html(), 'lxml')
            albums_on_page = []

            for link in soup.find_all('a', href=True):
                albums_on_page.append('http://pitchfork.com' + link['href'])

            pool = ThreadPool(THREADS)

            pool.map(pitchfork, albums_on_page)

            page += 1

            # if page > 1:
            #   page_not_found = True

        except IndexError as error:
            print(error)
            page_not_found = True
Example #29
def main():
    n = 100000
    m = 10
    m2 = 1000
    
    create_db()

    pool = Pool(processes=5)
    start = time.time()
    fill(n)
    fill_time = time.time() - start
    print('{} inserts in {}s'.format(n,fill_time))
    db = get_db()
    print(db.directories.find().count(),'directories')

    start = time.time()
    results = []
    for _ in range(m):
        results.append(pool.apply_async(read, ()))
#        results.append(pool.apply_async(read_dataset, ()))
        for i in range(m2):
            results.append(pool.apply_async(read_one, ()))
#            if i%10 == 0:
#                results.append(pool.apply_async(fill, (1,)))
    for r in results:
        r.get(timeout=1000000)
    read_time = time.time() - start
    pool.terminate()

    print('{}.{} reads in {}s'.format(m,m2,read_time))
Example #30
def spider():
	# initialize the count
	global COUNT
	global TOTAL
	COUNT = 0


	# connect the local mongodb
	conn = pymongo.Connection( )
	db = conn.adsdata

	logfile = open("./runtime.log", "a")

	enzyme_content = KEGG.getList('enzyme')
	enzyme_lines = enzyme_content.split('\n')

	TOTAL = len(enzyme_lines)
	print('TOTAL:  ' + str(TOTAL))

	enzyme_ids = map(lambda line: line.split('\t')[0], enzyme_lines)

	## multithread inserting
	pool = ThreadPool(10)
	try:
		pool.map(lambda id:insertEnzymeTreeWith_safe(id, db), enzyme_ids )
	except Exception,e:
		print("Error: " + e.message)
Example #31

# Main
start = timeit.default_timer()

session1 = requests.Session()

response1 = session1.get('http://www.supremenewyork.com/shop/all')
soup1 = bs(response1.text, 'html.parser')
links1 = soup1.find_all('a', href=True)
links_by_keyword1 = []
for link in links1:
    for keyword in keywords_category:
        if keyword in link['href']:
            links_by_keyword1.append(link['href'])

pool1 = ThreadPool(len(links_by_keyword1))

nosession = True
while nosession:
    print('Finding matching products...')
    result1 = pool1.map(product_page, links_by_keyword1)
    for session in result1:
        if not session is None:
            nosession = False
            checkout(session)
            break

stop = timeit.default_timer()
print(stop - start)  # Get the runtime
Example #32
class QiushiSpider():
    def __init__(self):
        self.url = 'https://www.qiushibaike.com/8hr/page/{}/'
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
        }
        self.url_q = Queue()
        self.pool = Pool(5)
        self.total_response_nums = 0
        self.total_request_nums = 0
        self.is_running = True

    def makeUrlList(self):
        """构造所有的url放入队列"""
        for i in range(1, 14):
            self.url_q.put(self.url.format(i))
            self.total_request_nums += 1  # 请求数 + 1 (url数 + 1)

    def getHtml(self, url):
        """对一个url发送请求获取响应并返回"""
        resp = requests.get(url, headers=self.headers)
        return resp.text

    def parseItem(self, html_str):
        """提取一个响应中的数据,并返回多条数据构成的list"""
        html = etree.HTML(html_str)
        # 先分组,再提取
        div_list = html.xpath('//div[@id="content-left"]/div')
        result_list = []  # 构造最终返回的结果列表
        for div in div_list:
            item = {}
            item['name'] = div.xpath('.//h2/text()')[0].strip()  # 用户昵称
            item['text'] = div.xpath(
                './/div[@class="content"]/span/text()')  # 主要内容
            # print(item)
            result_list.append(item)
        # print(result_list)
        return result_list

    def saveResultList(self, result_list):
        """保存一个响应中的多条数据组成的列表"""
        # print(result_list)
        for item in result_list:
            print(item)

    def excute_requests_item_save(self):
        """从队列中拿出一个url,直到处理完成"""
        url = self.url_q.get()
        html_str = self.getHtml(url)
        result_list = self.parseItem(html_str)
        self.saveResultList(result_list)

        self.total_response_nums += 1  # total response count + 1

    def _callback(self, xxx):  # the callback function must accept one argument, even if it goes unused!
        print(xxx)
        # xxx is the return value of excute_requests_item_save
        """Once the function run asynchronously via apply_async finishes, its return value is passed as the argument to the callback function"""
        if self.is_running:
            self.pool.apply_async(self.excute_requests_item_save,
                                  callback=self._callback)

    def run(self):
        """爬虫运行逻辑"""
        # 构造url队列
        self.makeUrlList()
        # 利用线程池中的线程异步的不断的去执行:处理一个url直到处理完毕
        for i in range(5):  # 这才是控制并发的规模!
            self.pool.apply_async(self.excute_requests_item_save,
                                  callback=self._callback)

        while 1:
            time.sleep(1)  # be sure to sleep a little here, otherwise the loop spins too fast!
            # exit condition: total responses == total urls --> the program exits
            if self.total_response_nums >= self.total_request_nums:
                print('=')
                print(self.total_request_nums)
                print(self.total_response_nums)
                print(self.url_q.qsize())
                print('=')
                self.is_running = False
                break

    print('The program has finished!')
Example #33
def crawlToCSV(URLrecord):
    OpenSomeSiteURL = urllib2.urlopen(URLrecord)
    Soup_SomeSite = BeautifulSoup(OpenSomeSiteURL, "lxml")
    OpenSomeSiteURL.close()

    tbodyTags = Soup_SomeSite.find("tbody")
    trTags = tbodyTags.find_all("tr", class_="result-item ")

    placeHolder = []

    for trTag in trTags:
        tdTags = trTag.find("td", class_="result-value")
        tdTags_string = tdTags.string
        placeHolder.append(tdTags_string)

    return placeHolder


if __name__ == "__main__":
    fileName = "SomeSiteValidURLs.csv"
    pool = Pool(cpu_count() * 2)  # Creates a Pool with cpu_count * 2 threads.
    with open(fileName, "rb") as f:
        results = pool.map(
            crawlToCSV, f
        )  # results is a list of all the placeHolder lists returned from each call to crawlToCSV
    with open("Output.csv", "ab") as f:
        writeFile = csv.writer(f)
        for result in results:
            writeFile.writerow(result)
Example #34
def parallel_apply(func,
                   iterable,
                   workers,
                   max_queue_size,
                   callback=None,
                   dummy=False):
    """多进程或多线程地将func应用到iterable的每个元素中。
    注意这个apply是异步且无序的,也就是说依次输入a,b,c,但是
    输出可能是func(c), func(a), func(b)。
    参数:
        dummy: False是多进程/线性,True则是多线程/线性;
        callback: 处理单个输出的回调函数;
    """
    if dummy:
        from multiprocessing.dummy import Pool, Queue
    else:
        from multiprocessing import Pool, Queue

    in_queue, out_queue = Queue(max_queue_size), Queue()

    def worker_step(in_queue, out_queue):
        # wrap the single-step function so it runs in a loop
        while True:
            d = in_queue.get()
            r = func(d)
            out_queue.put(r)

    # start the worker processes/threads
    pool = Pool(workers, worker_step, (in_queue, out_queue))

    if callback is None:
        results = []

    # post-processing function
    def process_out_queue():
        out_count = 0
        for _ in range(out_queue.qsize()):
            d = out_queue.get()
            out_count += 1
            if callback is None:
                results.append(d)
            else:
                callback(d)
        return out_count

    # feed in the data and collect the results
    in_count, out_count = 0, 0
    for d in iterable:
        in_count += 1
        while True:
            try:
                in_queue.put(d, block=False)
                break
            except six.moves.queue.Full:
                out_count += process_out_queue()
        if in_count % max_queue_size == 0:
            out_count += process_out_queue()

    while out_count != in_count:
        out_count += process_out_queue()

    pool.terminate()

    if callback is None:
        return results
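A small usage sketch for parallel_apply (square is illustrative only); with dummy=True the work runs in threads, and the function above additionally needs six imported:

import six  # parallel_apply above catches six.moves.queue.Full

def square(x):
    # illustrative worker
    return x * x

if __name__ == '__main__':
    results = parallel_apply(square, range(20), workers=4,
                             max_queue_size=8, dummy=True)
    print(sorted(results))  # output order is not guaranteed, so sort for display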
Example #35
def downloader(game_log_id):
    url = f'http://game.thronemaster.net/?game={game_log_id}&show=log'
    try:
        file_name = 'game_logs/' + str(
            url[url.find('game=') + 5:].split('&')[0])
        r = requests.get(url)
        with open(file_name, 'wb') as file:
            file.write(r.content)
    except Exception as e:
        if game_log_id % 100 == 0:
            print('FAILED')
            print(e)
    time.sleep(0.33)
    if game_log_id % 100 == 0:
        print('finished')
    return


def listdir_nohidden(path):
    for f in os.listdir(path):
        if not f.startswith('.'):
            yield f


if __name__ == '__main__':
    pool = ThreadPool(4)
    game_ids = range(80000, 140000)
    downloaded_logs = listdir_nohidden(os.getcwd() + '/game_logs/')
    downloaded_logs = [int(log) for log in downloaded_logs]
    game_ids = [x for x in game_ids if x not in downloaded_logs]
    print(len(game_ids))
    results = pool.map(downloader, game_ids)
Example #36
print('-' * 60)
print('Please wait, scanning remote host ', remote_server_ip)
print('-' * 60)


socket.setdefaulttimeout(0.5)


def scan_port(port):
    try:
        s = socket.socket(2, 1)
        res = s.connect_ex((remote_server_ip, port))
        if res == 0:  # if the port is open, send hello to grab the banner
            print('Port {}: OPEN'.format(port))
        s.close()
    except Exception as e:
        print(str(e))


ports = [i for i in range(1, 1025)]
# Check what time the scan started
t1 = datetime.now()


pool = ThreadPool(processes=16)
results = pool.map(scan_port, ports)
pool.close()
pool.join()

print('Multiprocess Scanning Completed in  ', datetime.now() - t1)
Example #37
def fetch_data(term):
    try:
        response = requests.get(
            BASE_URL + term,
            params={'key': API_KEY},
        )  # <6>
    except requests.HTTPError as err:
        print(err)
        return []
    else:
        data = response.json()  # <7>
        parts_of_speech = []
        for entry in data:  # <5>
            if isinstance(entry, dict):
                meta = entry.get("meta")
                if meta:
                    part_of_speech = entry.get("fl")
                    if part_of_speech:
                        parts_of_speech.append(part_of_speech)
        return sorted(set(parts_of_speech))  # <8>


p = Pool(POOL_SIZE)  # <9>

results = p.map(fetch_data, search_terms)  # <10>

for search_term, result in zip(search_terms, results):  # <11>
    print("{}:".format(search_term.upper()))
    if result:
        print(result)
    else:
        print("** no results **")
Example #38
def run_speedtest(args, conf):
    """Initializes all the data and threads needed to measure the relays.

    It launches or connects to Tor in a thread.
    It initializes the list of relays seen in the Tor network.
    It starts a thread to read the previous measurements and to wait for new
    measurements so it can write them to disk.
    It initializes a class that will be used to order the relays depending
    on their measurements age.
    It initializes the list of destinations that will be used for the
    measurements.
    It initializes the thread pool that will launch the measurement threads.
    The pool starts 3 other threads that are not the measurement (worker)
    threads.
    Finally, it calls the function that will manage the measurement threads.

    """
    global rd, pool, controller
    controller, _ = stem_utils.init_controller(
        path=conf.getpath('tor', 'control_socket'))
    if not controller:
        controller = stem_utils.launch_tor(conf)
    else:
        log.warning(
            'Is sbws already running? '
            'We found an existing Tor process at %s. We are not going to '
            'launch Tor, nor are we going to try to configure it to behave '
            'like we expect. This might work okay, but it also might not. '
            'If you experience problems, you should try letting sbws launch '
            'Tor for itself. The ability to use an already running Tor only '
            'exists for sbws developers. It is expected to be broken and may '
            'even lead to messed up results.',
            conf.getpath('tor', 'control_socket'))
        time.sleep(15)

    # When conf becomes global in a future refactor, this can be removed
    # from here.
    state = State(conf.getpath('paths', 'state_fname'))
    # XXX: tech-debt: create new function to obtain the controller and to
    # write the state, so that a unit test to check the state tor version can
    # be created
    # Store tor version whenever the scanner starts.
    state['tor_version'] = str(controller.get_version())
    # Call only once to initialize http_headers
    settings.init_http_headers(conf.get('scanner', 'nickname'), state['uuid'],
                               state['tor_version'])
    # To avoid having to pass args and conf to RelayList, pass an extra
    # argument with the data_period
    measurements_period = conf.getint('general', 'data_period')
    rl = RelayList(args, conf, controller, measurements_period, state)
    cb = CB(args, conf, controller, rl)
    rd = ResultDump(args, conf)
    rp = RelayPrioritizer(args, conf, rl, rd)
    destinations, error_msg = DestinationList.from_config(
        conf, cb, rl, controller)
    if not destinations:
        fail_hard(error_msg)
    max_pending_results = conf.getint('scanner', 'measurement_threads')
    pool = Pool(max_pending_results)
    try:
        main_loop(args, conf, controller, rl, cb, rd, rp, destinations, pool)
    except KeyboardInterrupt:
        log.info("Interrupted by the user.")
        stop_threads(signal.SIGINT, None)
    # Any exception not caught at this point would make the scanner stall.
    # Log it and exit gracefully.
    except Exception as e:
        log.critical(FILLUP_TICKET_MSG)
        log.exception(e)
        stop_threads(signal.SIGTERM, None, 1)
Example #39
def main(artist_list, output_dir):
    start = time.time()
    global num_images
    print('gathering links to images...')

    # Create a threadpool with one less than the total number of CPUs of the machine;
    # this allows fast multiprocessing w/out overloading the machine
    threadpool = Pool(multiprocessing.cpu_count() - 1)

    # Use imap to run the get_painting_list_by_artist function;
    # each thread gets a different artist from artist_list
    wikiart_pages = threadpool.imap(get_painting_list_by_artist,
                                    zip(artist_list))

    # Close and join the threadpool as per recommended usage
    threadpool.close()
    threadpool.join()

    # Convert the wikiart iterator into pages and then items list
    pages = [page for page in wikiart_pages if page]
    items = [item for sublist in pages for item in sublist]
    num_images = len(items)

    # Create the output_dir
    if not os.path.isdir('%s/' % (output_dir)):
        os.mkdir('%s/' % (output_dir))

    threadpool = Pool(multiprocessing.cpu_count() - 1)

    # Download images
    failed_urls = []
    print('attempting to download %d images' % num_images)
    threadpool.starmap(
        downloader,
        zip(enumerate(items), itertools.repeat(output_dir),
            itertools.repeat(failed_urls)))

    threadpool.close()
    threadpool.join()

    print("Took %d seconds to complete the first run." % (time.time() - start))
    print("Attempting to gather the failed URLs")

    # Fix failed urls
    corrected_urls = []
    for i, url in enumerate(failed_urls):

        #Show progress
        if i % 100 == 0:
            print(str(i) + " out of " + str(len(failed_urls)))

        # Extract the correct part of the url
        url = url[1]
        extracted = re.findall(r'/images/(.*?).jpg', url)
        full_url = "https://www.wikiart.org/en/" + extracted[0]

        # Get the soup from the page
        soup = BeautifulSoup(urllib.request.urlopen(full_url), "lxml")

        # Try finding the correct link with .jpg
        regex = r'<meta content="https://uploads[0-9].wikiart.org(.*?).jpg'
        corrected_link = re.search(regex, str(soup.html()))

        success = False
        if corrected_link:
            # Get the url part of the corrected link (drop <meta content... etc.)
            full_correct_url = corrected_link.group(0)[15:]
            corrected_urls.append(full_correct_url)
            success = True
        # If jpg doesn't work
        else:
            # Try other file formats
            reg_list = [
                r'<meta content="https://uploads[0-9].wikiart.org(.*?).png',
                r'<meta content="https://uploads[0-9].wikiart.org(.*?).jpeg',
                r'<meta content="https://uploads[0-9].wikiart.org(.*?).Jpeg',
                r'<meta content="https://uploads[0-9].wikiart.org(.*?).JPG',
                r'<meta content="https://uploads[0-9].wikiart.org(.*?).PNG',
            ]
            for regex in reg_list:
                if success == False:
                    corrected_link = re.search(regex, str(soup.html()))
                    if corrected_link:
                        full_correct_url = corrected_link.group(0)[15:]
                        corrected_urls.append(full_correct_url)
                        success = True

        # If it fails again, move on
        if success == False:
            print("fail: " + full_url)

    print("Downloading corrected URLs")
    threadpool = Pool(multiprocessing.cpu_count() - 1)

    failed_urls_v2 = []
    threadpool.starmap(
        downloader,
        zip(enumerate(corrected_urls), itertools.repeat(output_dir),
            itertools.repeat(failed_urls_v2)))

    # close and join the threadpool as per recommended usage
    threadpool.close()
    threadpool.join()

    print("Took %d seconds to complete." % (time.time() - start))
Example #40
                            conn = MySQLdb.connect(host='localhost', user='******', passwd='', port=3306, charset='utf8')
                            cur = conn.cursor()
                            conn.select_db('bili')
                            cur.execute('INSERT INTO video VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
                                                [str(av), str(av), cid, title, tminfo, time, click, danmu, coins, favourites, duration,
                                                 mid, name, article, fans, tag1, tag2, tag3, str(common), honor_click, honor_coins, honor_favourites])

                            print "Succeed: av" + str(av)
                        except MySQLdb.Error, e:
                            print "Mysql Error %d: %s" % (e.args[0], e.args[1])
                    else:
                        print "Error_Json: " + url
            else:
                print "Error_noCid:" + url
        else:
            print "Error_404: " + url


pool = ThreadPool(10)
# results = pool.map(spider, urls)
try:
    results = pool.map(spider, urls)
except Exception, e:
    # print 'ConnectionError'
    print e
    time.sleep(300)
    results = pool.map(spider, urls)

pool.close()
pool.join()
Example #41
def proxy(num):
    #	get_proxy(proxy_urls[1])
    pool = Pool(num)
    pool.map(get_proxy, proxy_urls)
Example #42
        content = ''.join(con)
        # walk through the string, writing no more than 50 characters per line
        for i in range(0,len(content),50):
            f.write(content[i:i+50] + "\n")
def get_content(html):
    '''
    Crawl and save the data in parallel
    '''
    urls = []
 
    for con in html:
        url = con['url']
        name = con['title']
        urls.append({'name':name,'url':url})
    # number of threads
    pool = Pool(4)
    # use map to crawl in parallel; save_content is the fetch-and-save function,
    # and urls is a list holding each url together with the name to save it under
    pool.map(save_content,urls)
    pool.close()
    pool.join()
 
 
 
 
 
def main():
    selector = get_response(chapter_url)
 
    html = get_chapter_content(selector)
 
Example #43
def get_current_match_details(s, region, champion_id):
    matchlist = cass.get_match_history(summoner=s, champions=[champion_id], begin_index=0, end_index=10)
    len(matchlist) # to fill matchlist

    try:
        pool = Pool(10)
        pool.map(load_match, matchlist)
        pool.close()
        pool.join()
    except:
        pool.close()
        return HttpResponse(status=500)

    response = {}

    q = {}
    leagues = cass.get_league_positions(summoner=s, region=region)
    for league in leagues:
        q[league.queue.value] = {
            'tier': league.tier.value,
            'division': league.division.value,
            'points': league.league_points
        }
        if league.promos is not None:
            q[league.queue.value]['promos'] = league.promos.progress
            q[league.queue.value]['notPlayed'] = league.promos.not_played

    # summoner stats for the past 10 matches on a champion
    stats = {
        "kills": 0,
        "deaths": 0,
        "assists": 0,
        "totalCs": 0,
        "cs10": 0,
        "cs20": 0,
        "cs30": 0,
        "gold10": 0,
        "gold20": 0,
        "gold30": 0,
        "wins": 0,
        "losses": 0
    }
    match_history = []

    games10 = 0
    games20 = 0
    games30 = 0
    cs10 = 0
    cs20 = 0
    cs30 = 0
    gold10 = 0
    gold20 = 0
    gold30 = 0
    match_count = 0
    for match in matchlist:
        if (match.region.value == region):
            match_count += 1

            participants = match.participants
            for participant in participants:
                if participant.summoner.id == s.id:
                    user = participant
                    break

            stats["kills"] += user.stats.kills
            stats["deaths"] += user.stats.deaths
            stats["assists"] += user.stats.assists
            stats["totalCs"] += user.stats.total_minions_killed

            if user.stats.win:
                stats["wins"] += 1
                match_history.append(1)
            else:
                stats["losses"] += 1
                match_history.append(0)

            dur = match.duration.seconds
            try:
                if dur > 10 * 60:
                    gold10 += user.timeline.gold_per_min_deltas['0-10'] * 10
                    cs10 += user.timeline.creeps_per_min_deltas['0-10'] * 10
                    games10 += 1
                if dur > 20 * 60:
                    gold20 += (user.timeline.gold_per_min_deltas['0-10'] + user.timeline.gold_per_min_deltas['10-20']) * 10
                    cs20 += (user.timeline.creeps_per_min_deltas['0-10'] + user.timeline.creeps_per_min_deltas['10-20']) * 10
                    games20 += 1
                if dur > 30 * 60:
                    gold30 += (user.timeline.gold_per_min_deltas['0-10'] + user.timeline.gold_per_min_deltas['10-20'] + user.timeline.gold_per_min_deltas['20-30']) * 10
                    cs30 += (user.timeline.creeps_per_min_deltas['0-10'] + user.timeline.creeps_per_min_deltas['10-20'] + user.timeline.creeps_per_min_deltas['20-30']) * 10
                    games30 += 1
            except:
                log.warn('user timeline data does not exist', match.id)

    stats["kills"] /= 10
    stats["deaths"] /= 10
    stats["assists"] /= 10
    stats["totalCs"] /= 10

    try:
        stats["cs10"] = round(cs10 / games10, 2)
        stats["cs20"] = round(cs20 / games20, 2)
        stats["cs30"] = round(cs30 / games30, 2)
        stats["gold10"] = round(gold10 / games10, 2)
        stats["gold20"] = round(gold20 / games20, 2)
        stats["gold30"] = round(gold30 / games30, 2)
    except:
        # divide by 0
        pass

    build = {}
    boots = {}
    core = {}
    situational = {}
    all_items = {}

    # get recommended build
    if match_count > 0:
        try:
            champ_items = ChampionItems.objects.get(champ_id=user.champion.id)
            items_blob = ujson.loads(champ_items.item_blob)

            blob_items = items_blob.items()
            for item, occurence in blob_items:
                if int(item) in Items.boots:
                    boots[item] = occurence
                elif int(item) in Items.full_items:
                    all_items[item] = occurence

            sorted_all = sorted(all_items, key=all_items.get, reverse=True) 
            core_arr = sorted_all[:3]
            situational_arr = sorted_all[3:8]

            for item in core_arr:
                core[item] = all_items[item]

            for item in situational_arr:
                situational[item] = all_items[item]
        except:
            pass

    build['boots'] = boots
    build['core'] = core
    build['situational'] = situational

    response['stats'] = stats
    response['build'] = build
    response['leagues'] = q

    return response
Example #44
                     parse_dates=True,
                     usecols=['Date', 'Close', '10_MAC', '100_MAC'])

    result = init_dict()

    for date in df.index:
        if gain_lose(df, date):
            update_result(df, date, result)

    result_x = sorted(result.items(), key=operator.itemgetter(1), reverse=True)
    result = []
    for row in result_x:
        if row[1] > THRESH:
            result.append(row)
    if len(result) > 0:
        writer = csv.writer(open(OUTPATH + symbol + '.csv', 'wb'))
        writer.writerows(result)
    return


if __name__ == '__main__':
    if os.path.exists('out'):
        shutil.rmtree('out')
    os.makedirs('out')

    files = os.listdir(PATH)
    pool = ThreadPool(4)
    pool.map(calculation, files)
    pool.close()
    pool.join()
Example #45
    def ping(self, targets=list(), filename=str(), status=str()):
        """
        Attempt to ping a list of hosts or networks (can be a single host)
        :param targets: List - Name(s) or IP(s) of the host(s).
        :param filename: String - name of the file containing hosts to ping
        :param status: String - if one of ['alive', 'dead', 'noip'] then only
        return results that have that status. If this is not specified,
        then all results will be returned.
        :return: Type and results depends on whether status is specified:
                 if status == '': return dict: {targets: results}
                 if status != '': return list: targets if targets == status
        """

        if targets and filename:
            raise SyntaxError("You must specify only one of either targets=[] "
                              "or filename=''.")
        elif not targets and not filename:
            raise SyntaxError("You must specify either a list of targets or "
                              "filename='', but not both.")
        elif filename:
            targets = self.read_file(filename)

        my_targets = {'hosts': [], 'nets': []}
        addresses = []

        # Check for valid networks and add hosts and nets to my_targets
        for target in targets:
            # Targets may include networks in the format "network mask", or,
            # a file could contain multiple hosts or IP's on a single line.
            if len(target.split()) > 1:
                target_items = target.split()
                for item in target_items:
                    try:
                        ip = IPAddress(item)
                        # If it is an IPv4 address or mask put in in addresses
                        if ip.version == 4:
                            addresses.append(str(ip))
                    except AddrFormatError:
                        # IP Address not detected, so assume it's a host name
                        my_targets['hosts'].append(item)
                    except ValueError:
                        # CIDR network detected
                        net = IPNetwork(item)
                        # Make sure it is a CIDR address acceptable to fping
                        if net.ip.is_unicast() and net.version == 4 and \
                                net.netmask.netmask_bits() in range(8, 31):
                            my_targets['nets'].append(target_items[0])
                        else:
                            msg = str(str(net) + ':Only IPv4 unicast addresses'
                                      ' with bit masks\n               '
                                      ' from 8 to 30 are supported.')
                            raise AttributeError(msg)
                # Iterate over the IP strings in addresses
                while len(addresses) > 1:
                    ip = IPAddress(addresses[0])
                    mask = IPAddress(addresses[1])
                    # Test to see if IP is unicast, and mask is an actual mask
                    if ip.is_unicast() and mask.is_netmask():
                        net = IPNetwork(str(ip) + '/' + str(
                            mask.netmask_bits()))
                        # Convert ip and mask to CIDR and remove from addresses
                        my_targets['nets'].append(str(net.cidr))
                        addresses.pop(0)
                        addresses.pop(0)
                    elif ip.is_unicast() and not ip.is_netmask():
                        # mask was not a mask so only remove IP and start over
                        my_targets['hosts'].append(str(ip))
                        addresses.pop(0)
                # There could be one more item in addresses, so check it
                if addresses:
                    ip = IPAddress(addresses[0])
                    if ip.is_unicast() and not ip.is_netmask():
                        my_targets['hosts'].append(addresses[0])
                        addresses.pop()
            # target has only one item, so check it
            else:
                try:
                    ip = IPAddress(target)
                    if ip.version == 4 and ip.is_unicast() and \
                            not ip.is_netmask():
                        my_targets['hosts'].append(target)
                    else:
                        msg = str(target + 'Only IPv4 unicast addresses are '
                                  'supported.')
                        raise AttributeError(msg)
                except AddrFormatError:
                    # IP Address not detected, so assume it's a host name
                    my_targets['hosts'].append(target)
                except ValueError:
                    # CIDR network detected
                    net = IPNetwork(target)
                    if net.ip.is_unicast() and net.version == 4 and \
                            net.netmask.netmask_bits() in range(8, 31):
                        my_targets['nets'].append(target)
                    else:
                        msg = str(str(net) + ':Only IPv4 unicast addresses'
                                  ' with bit masks\n               '
                                  ' from 8 to 30 are supported.')
                        raise AttributeError(msg)

        """
        Build the list of commands to run.
        """
        commands = []
        if len(my_targets['hosts']) != 0:
            for target in range(len(my_targets['hosts'])):
                commands.append([self.fping, '-nV', my_targets['hosts'][
                    target]])
        if len(my_targets['nets']) != 0:
            for target in range(len(my_targets['nets'])):
                commands.append([self.fping, '-ngV', my_targets['nets'][
                    target]])

        """
        Start pinging each item in my_targets and return the requested results
        when done.
        """
        pool = ThreadPool(self.num_pools)
        raw_results = pool.map(self.get_results, commands)
        pool.close()
        pool.join()
        self.results = {host: result for host, result in csv.reader(
            ''.join(raw_results).splitlines())}
        if not status:
            return self.results
        elif status == 'alive':
            return self.alive
        elif status == 'dead':
            return self.dead
        elif status == 'noip':
            return self.noip
        else:
            raise SyntaxError("Valid status options are 'alive', 'dead' or "
                              "'noip'")
Exemple #46
0
 def run(self):
     pool = ThreadPool(4)
     pool.map(self.retrieve_pic, self.process_multiple())
     pool.close()
     pool.join()
Exemple #47
0
# -*- coding: utf-8 -*-
import urllib2
import urllib 
from multiprocessing.dummy import Pool as ThreadPool
 

def postHttp(i): 
	name                 = urllib.quote('我日') 
	url                  = 'https://hd.ysfaisco.cn/ajax/hdgame_h.jsp?cmd=setMbPlayerVotes&aid=11590276&gameId=1&openId='+i+'&style=49&name='+name+'&playerId=10604&otherPlayerId=6860'
	header               = {}
	header['User-Agent'] = 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 MicroMessenger/5.0.1' 
	request              = urllib2.Request(url,headers=header)
	response             = urllib2.urlopen(request)
	result               = response.read()
	result               = result.strip()
	print result

if __name__ == '__main__':
	openid_list = ["oHRvawPnAW3RIxAElC8EKHtyCl1Y", "oHRvawA-pq-3dL3WkVAwOFM4y4Ss", "oHRvawLEo_hYamjiHGK_QFxr3vRU", "oHRvawBVAuoD1AqeK_ptmU9QqTU0", "oHRvawMGC7M97rHf-IS68odntdY8", "oHRvawE-wN6Ju7Qt9RWBOxWrogck", "oHRvawKHhGo8aNMRGd4HrF75x7bg", "oHRvawDQGSsGGjiCgvJCts0lXmQQ", "oHRvawG1jWXDaC5fNx4GOgroHepk", "oHRvawF-06-oX6yPOh65vWU0ORwM", "oHRvawByQkdkqcQdPjLT9TxXwXSc", "oHRvawDH1o82EfC_LFAt9W-l9olU", "oHRvawPfwBSC59WpDPlxIiGGVb6w", "oHRvawCPx4jbjrPi3jL2etQBMu7M", "oHRvawHgyW3DS8YNs44k0pCLspPk", "oHRvawDifKhMsHnWUiyrX6saAAdA", "oHRvawPyQbRCHNpQkFOn3_XLcc6E", "oHRvawNyKI-CSTeJP5o3G_46_Xww", "oHRvawOHjbl5xCJciA6fHNqmzQMo", "oHRvawEP6seYQw9C47gL6PHSFn0Q", "oHRvawI3summPXydFMLO46axcG-c", "oHRvawHhGSI-KvkagqOsztoxPfLc", "oHRvawCALKFm1_YgwpgrEaJ_Twq8", "oHRvawJwaHE__eV69PpS79-mAqEk", "oHRvawCRcdBEtG_0D6HHTUc6_X6k", "oHRvawOrvvdOjgVD9QSZ0trElJKM", "oHRvawPr0tPe7CTf2DXdaG_CDdv0", "oHRvawACRyrjIwC5THma5KBVhvA4", "oHRvawHsSLmvRslxC9PC6TX0FbPU", "oHRvawFNfDbRJM4DbIx4IDu6cngU", "oHRvawEX-eNOX44o7LT9zPMYJ9j0", "oHRvawPBnASXJnqY-2VVYpmnlMEU", "oHRvawKy-RW-8Q0sLcst5bY2kA_8", "oHRvawMaIy8Q4H0qBb7nP8Yy3olQ", "oHRvawI9btgFpo7S3Jqw31x-UeoQ", "oHRvawPLpqEFli7diR0QXCdbSR20", "oHRvawASK0KTR7venPHHsXbZJCJo", "oHRvawIT3HBMlY5ZH9oT2fjS7_e4", "oHRvawMaoaXJC5AlS_Rh59n9hlb8", "oHRvawODU7wOWORip_38sizUda_I", "oHRvawHfYM0eHTwieeVEOcoCgkXk", "oHRvawAqCBY5LBCL897-xVOGtEgw", "oHRvawHwu3ekeyrC9Y1GKixkNDqQ", "oHRvawJEjULKo711VQM7H_IG0Eo8", "oHRvawA5o_8Olo8tan5AEJv3SyDE", "oHRvawLHh_n-BTu9G-B0XOzi7eyA", "oHRvawFzkPfjbfX9P-sF04EDd4tc", "oHRvawAvcfeCNHHqi42cjkxwGQ_w", "oHRvawKGiht7E0ooJr1k1LowHesA", "oHRvawBEw0-33b8BJGWlzGQAbbGs", "oHRvawJhRz_VlsHK-SQzYcN05BXM", "oHRvawOPPgmNvjlxHkQ5RcIQRYts", "oHRvawFD2TRk1PAxIZ-mWlpjMKdM", "oHRvawMCd3Q_DSxGgbXYLeU76tTI", "oHRvawOxrhLb2O8Nav_Wu_i5B8g4", "oHRvawOLSfQKRn_zNowgdGD45mew", "oHRvawLgXcCrLQUj2qhuN-7EU4nA", "oHRvawPSsBuHNXiiff_R2jJHJ3xU", "oHRvawN0S5-9al2X70oBxjUjqRWY", "oHRvawKKFywMNfNURTfDcq2ec5wE", "oHRvawDAzdiZzZz5wYUSH27esKxU", "oHRvawCAVxf1u3i0rN62Webn4RKg", "oHRvawDW7htenhAyzVItYrZ6Idec", "oHRvawFZtr5qmR_-XXm2Fb8zkbIA", "oHRvawHCVPt-m9snFkzrQ4rQodh0", "oHRvawDYgh1_0snMXVKcaSYOqc6g", "oHRvawLXic5Qt3C6G-TuheoxturQ", "oHRvawLijvw8ILfQAw3utTy7ppOA", "oHRvawH2auViUvHdBRkL6D3bumpU", "oHRvawBLeZ0MGu2-uUUsAyElFCWE", "oHRvawPhr4x3Vp_A0DJ9k_XA9WV4", "oHRvawCqRp91-1_odIlEetc6m9L4", "oHRvawBsZffzjANt7JCR7NKB9hTM", "oHRvawAnKxvCxqLFl_Vjg5NqSxY0", "oHRvawPLEpkc-MoGFaSoEuVBNTyc", "oHRvawFHxKXFBOtKWIpiBf0qyXiA", "oHRvawLkC4KcghW1S950J2k6NDAQ", "oHRvawM9aahiJtRKkTMDvXb2d8sI", "oHRvawJ_1VBkCdXb8XlBdywOoDoU", "oHRvawITcJLg-pjkl1xRIZi-Iz_Q", "oHRvawDVhDGf2t50Qk-qh3Zd8Wgc"]  
	# postHttp()
	total = len(openid_list)
	page_pool = ThreadPool(total)  	 
	page_pool.map_async(postHttp, (openid_list))
	page_pool.close()
	page_pool.join() 
Exemple #48
0
import re

import requests
from lxml import etree
from multiprocessing.dummy import Pool

url = 'https://www.pearvideo.com/category_5'
page_text = requests.get(url=url).text

tree = etree.HTML(page_text)
li_list = tree.xpath('//ul[@id="listvideoListUl"]/li')
url_list = []
for i in li_list:
    detail_url = "https://www.pearvideo.com/" + i.xpath('./div/a/@href')[0]
    name = i.xpath('./div/a/div[2]/text()')[0] + '.mp4'
    detail_page = requests.get(url=detail_url).text
    ex = 'srcUrl="(.*?)",vdoUrl'
    video_url = re.findall(ex, detail_page)[0]
    dic = {'name': name, 'url': video_url}
    url_list.append(dic)


def get_video_data(d):
    url = d['url']
    data = requests.get(url=url).content
    print(d['name'], "正在下载。。。")
    with open(d['name'], 'wb') as f:
        f.write(data)
        print(d['name'], "下载成功。。。")


pool = Pool(4)
pool.map(get_video_data, url_list)
pool.close()
pool.join()
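
A hedged variant of get_video_data (not in the original snippet) that streams the download in chunks with requests' stream=True / iter_content API instead of holding the whole video in memory:

def get_video_data_streamed(d):
    # Stream the response so large videos are written chunk by chunk.
    with requests.get(url=d['url'], stream=True) as resp:
        resp.raise_for_status()
        with open(d['name'], 'wb') as f:
            for chunk in resp.iter_content(chunk_size=256 * 1024):
                f.write(chunk)
    print(d['name'], "downloaded successfully")
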
Exemple #49
0
    def execute_nodes(self, linker, Runner, manifest, node_dependency_list):
        adapter = get_adapter(self.config)

        num_threads = self.config.threads
        target_name = self.config.target_name

        text = "Concurrency: {} threads (target='{}')"
        concurrency_line = text.format(num_threads, target_name)
        dbt.ui.printer.print_timestamped_line(concurrency_line)
        dbt.ui.printer.print_timestamped_line("")

        schemas = list(Runner.get_model_schemas(manifest))
        node_runners = self.get_runners(Runner, adapter, node_dependency_list)

        pool = ThreadPool(num_threads)
        node_results = []
        for node_list in node_dependency_list:
            runners = self.get_relevant_runners(node_runners, node_list)

            args_list = []
            for runner in runners:
                args_list.append({'manifest': manifest, 'runner': runner})

            try:
                for result in pool.imap_unordered(self.call_runner, args_list):
                    is_ephemeral = Runner.is_ephemeral_model(result.node)
                    if not is_ephemeral:
                        node_results.append(result)

                    node = CompileResultNode(**result.node)
                    node_id = node.unique_id
                    manifest.nodes[node_id] = node

                    if result.errored:
                        dependents = self.get_dependent(linker, node_id)
                        self._mark_dependent_errors(node_runners, dependents,
                                                    result, is_ephemeral)

            except KeyboardInterrupt:
                pool.close()
                pool.terminate()

                adapter = get_adapter(self.config)

                if not adapter.is_cancelable():
                    msg = ("The {} adapter does not support query "
                           "cancellation. Some queries may still be "
                           "running!".format(adapter.type()))

                    yellow = dbt.ui.printer.COLOR_FG_YELLOW
                    dbt.ui.printer.print_timestamped_line(msg, yellow)
                    raise

                for conn_name in adapter.cancel_open_connections():
                    dbt.ui.printer.print_cancel_line(conn_name)

                dbt.ui.printer.print_run_end_messages(node_results,
                                                      early_exit=True)

                pool.join()
                raise

        pool.close()
        pool.join()

        return node_results
Exemple #50
0
# from multiprocessing import Pool
# import multiprocessing as mp
from multiprocessing.dummy import Pool  # thread-backed "dummy" pool
import time
import os


def longTimeTask(i):
    print('task: {}, PID: {}'.format(i, os.getpid()))
    time.sleep(2)
    result = 10**30
    return result


if __name__ == '__main__':
    start_time = time.time()
    print('Parent process PID', os.getpid())

    # Observe the worker PIDs
    p = Pool(4)
    # data is a list collecting the return value of each call
    data = p.map(longTimeTask, iterable=[2, 4, 6,
                                         8])  # iterable=range(4) or any list works too
    p.close()
    p.join()

    print(data)

    end_time = time.time()
    print('Took {} seconds'.format(end_time - start_time))
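
For comparison only (not part of the original snippet): the same timing test with the process-based multiprocessing.Pool, where the worker PIDs differ from the parent's, unlike the thread-backed pool above.

from multiprocessing import Pool
import os
import time


def long_time_task(i):
    print('task: {}, PID: {}'.format(i, os.getpid()))
    time.sleep(2)
    return 10 ** 30


if __name__ == '__main__':
    start = time.time()
    with Pool(4) as p:
        data = p.map(long_time_task, [2, 4, 6, 8])
    print(data)
    print('Took {} seconds'.format(time.time() - start))
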
Exemple #51
0
class RenderLocaleBatch(object):
    """Handles the rendering and threading of the controllers."""

    BATCH_DEFAULT_SIZE = 300  # Default number of documents in a batch.

    def __init__(self, jinja_env, profile, tick=None, batch_size=None):
        self.batch_size = batch_size or self.BATCH_DEFAULT_SIZE
        self.jinja_env = jinja_env
        self.profile = profile
        self.tick = tick
        self.batches = [[]]
        self._is_rendering = False
        self._results = None
        self._thread_pool = None

    def __len__(self):
        count = 0
        for batch in self.batches:
            count = count + len(batch)
        return count

    def _get_batch(self):
        # Ensure that batch is not over the max size.
        batch = self.batches[len(self.batches) - 1]
        if len(batch) >= self.batch_size:
            self.batches.append([])
            batch = self.batches[len(self.batches) - 1]
        return batch

    def add(self, controller, *args, **kwargs):
        """Add an item to be rendered to the batch."""
        batch = self._get_batch()

        batch.append({
            'controller': controller,
            'jinja_env': self.jinja_env,
            'args': args,
            'kwargs': kwargs,
        })

    def render_start(self):
        """Start the batches rendering."""
        self._thread_pool = ThreadPool(len(self.batches))
        self._results = self._thread_pool.imap_unordered(
            render_func, self.batches)
        self._is_rendering = True

    def render_finish(self):
        """Finish in progress batches rendering."""
        if not self._is_rendering:
            raise RenderNotStartedError('Rendering was never started')

        render_errors = []
        rendered_docs = []

        for batch_result in self._results:
            render_errors = render_errors + batch_result.render_errors
            rendered_docs = rendered_docs + batch_result.rendered_docs
            if self.tick:
                for _ in batch_result.render_errors:
                    self.tick()
                for _ in batch_result.rendered_docs:
                    self.tick()
            for result in batch_result.rendered_docs:
                self.profile.add_timer(result.render_timer)

        self._thread_pool.close()
        self._thread_pool.join()
        self._is_rendering = False

        return rendered_docs, render_errors

    def render_sync(self):
        """Syncronous rendering for non-threaded rendering."""
        render_errors = []
        rendered_docs = []

        for batch in self.batches:
            batch_result = render_func(batch, tick=self.tick)
            render_errors = render_errors + batch_result.render_errors
            rendered_docs = rendered_docs + batch_result.rendered_docs

        return rendered_docs, render_errors
Exemple #52
0
def bc2pg(
    dataset,
    db_url,
    table,
    schema,
    query,
    bounds,
    bounds_crs,
    pagesize,
    max_workers,
    dim,
    fid,
    append,
    promote_to_multi,
    no_timestamp,
    verbose,
    quiet,
):
    """Download a DataBC WFS layer to postgres - an ogr2ogr wrapper.

     \b
      $ bcdata bc2pg bc-airports --db_url postgresql://postgres:postgres@localhost:5432/postgis

    The default target database can be specified by setting the $DATABASE_URL
    environment variable.
    https://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls
    """

    # for this command, default to INFO level logging
    # (echo the ogr2ogr commands by default)
    verbosity = verbose - quiet
    log_level = max(10, 20 - 10 * verbosity)
    logging.basicConfig(stream=sys.stderr, level=log_level)
    log = logging.getLogger(__name__)
    src = bcdata.validate_name(dataset)
    src_schema, src_table = [i.lower() for i in src.split(".")]
    if not schema:
        schema = src_schema
    if not table:
        table = src_table
    # always upper
    if fid:
        fid = fid.upper()
    # create schema if it does not exist
    conn = pgdata.connect(db_url)
    if schema not in conn.schemas:
        click.echo("Schema {} does not exist, creating it".format(schema))
        conn.create_schema(schema)

    # if table does not exist already, remove the -append flag
    if schema + "." + table not in conn.tables and append:
        append = False
        click.echo("Table does not exist, creating")

    # build parameters for each required request
    param_dicts = bcdata.define_request(
        dataset,
        query=query,
        sortby=fid,
        pagesize=pagesize,
        bounds=bounds,
        bounds_crs=bounds_crs,
    )

    # run the first request / load
    payload = urlencode(param_dicts[0], doseq=True)
    url = bcdata.WFS_URL + "?" + payload
    db = parse_db_url(db_url)
    db_string = "PG:host={h} user={u} dbname={db} port={port}".format(
        h=db["host"],
        u=db["user"],
        db=db["database"],
        port=db["port"],
    )
    if db["password"]:
        db_string = db_string + " password={pwd}".format(pwd=db["password"])
    # create the table
    command = [
        "ogr2ogr",
        "-f",
        "PostgreSQL",
        db_string,
        "-t_srs",
        "EPSG:3005",
        "-nln",
        schema + "." + table,
        url,
    ]
    if append:
        command = command + ["-append"]
    else:
        command = command + ["-overwrite", "-lco", "GEOMETRY_NAME=geom"]
    if dim:
        command = command + ["-dim", dim]
    # for speed with big loads - unlogged, no spatial index
    if not append:
        command = command + ["-lco", "UNLOGGED=ON"]
        command = command + ["-lco", "SPATIAL_INDEX=NONE"]
    if promote_to_multi:
        command = command + ["-nlt", "PROMOTE_TO_MULTI"]
    log.info(" ".join(command))
    subprocess.run(command)

    # write to additional separate tables if data is larger than 10k recs
    if len(param_dicts) > 1:
        commands = []
        for n, paramdict in enumerate(param_dicts[1:]):
            # create table to load to (so types are identical)
            sql = """
            CREATE TABLE {schema}.{table}_{n}
            (LIKE {schema}.{table}
            INCLUDING ALL)
            """.format(schema=schema, table=table, n=str(n))
            conn.execute(sql)
            payload = urlencode(paramdict, doseq=True)
            url = bcdata.WFS_URL + "?" + payload
            command = [
                "ogr2ogr",
                "-update",
                "-append",
                "-f",
                "PostgreSQL",
                db_string + " active_schema=" + schema,
                "-t_srs",
                "EPSG:3005",
                "-nln",
                table + "_" + str(n),
                url,
            ]
            if dim:
                command = command + ["-dim", dim]
            if promote_to_multi:
                command = command + ["-nlt", "PROMOTE_TO_MULTI"]
            commands.append(command)
        # log all requests, not just the first one
        for c in commands:
            log.info(c)
        # https://stackoverflow.com/questions/14533458
        pool = Pool(max_workers)
        with click.progressbar(pool.imap(partial(call), commands),
                               length=len(param_dicts)) as bar:
            for returncode in bar:
                if returncode != 0:
                    click.echo("Command failed: {}".format(returncode))

        # once loaded, combine & drop
        for n, _x in enumerate(param_dicts[1:]):
            sql = """INSERT INTO {schema}.{table} SELECT * FROM {schema}.{table}_{n}""".format(
                schema=schema, table=table, n=str(n))
            conn.execute(sql)
            sql = "DROP TABLE {}.{}_{}".format(schema, table, n)
            conn.execute(sql)

    # Deal with the primary key
    # First, drop ogc_fid - because we load into many tables, it is not unique
    sql = "ALTER TABLE {}.{} DROP COLUMN ogc_fid CASCADE".format(schema, table)
    conn.execute(sql)

    # if provided with a fid to use as pk, assign it
    if fid:
        sql = "ALTER TABLE {}.{} ADD PRIMARY KEY ({})".format(
            schema, table, fid)
        conn.execute(sql)
        # make fid auto-increment in case we want to add records
        sql = """
            CREATE SEQUENCE {schema}.{table}_{fid}_seq
            OWNED BY {schema}.{table}.{fid};

            SELECT
              setval('{schema}.{table}_{fid}_seq',
              coalesce(max({fid}), 0) + 1, false)
            FROM {schema}.{table};

            ALTER TABLE {schema}.{table}
            ALTER COLUMN {fid}
            SET DEFAULT nextval('{schema}.{table}_{fid}_seq');
        """.format(schema=schema, table=table, fid=fid)
        conn.execute(sql)
    # otherwise, create a new serial ogc_fid
    else:
        sql = "ALTER TABLE {}.{} ADD COLUMN ogc_fid SERIAL PRIMARY KEY".format(
            schema, table)
        conn.execute(sql)

    if not append:
        conn.execute("ALTER TABLE {}.{} SET LOGGED".format(schema, table))
        log.info("Indexing geometry")
        conn.execute("CREATE INDEX ON {}.{} USING GIST (geom)".format(
            schema, table))

    # once complete, note date/time of completion in public.bcdata
    if not no_timestamp:
        conn.execute(
            "CREATE TABLE IF NOT EXISTS public.bcdata (table_name text PRIMARY KEY, date_downloaded timestamp WITH TIME ZONE);"
        )
        conn.execute(
            """INSERT INTO public.bcdata (table_name, date_downloaded)
                        SELECT %s as table_name, NOW() as date_downloaded
                        ON CONFLICT (table_name) DO UPDATE SET date_downloaded = NOW();
                     """, (schema + '.' + table, ))

    log.info("Load of {} to {} in {} complete".format(src,
                                                      schema + "." + table,
                                                      db_url))
Exemple #53
0
def mergeFilesByRegion(filesByRegion, grid, outputDir):
    # Merge a set of files by region into the specified dir
    # Key is up/down/nominal etc
    N = 0
    filesToWrite = {}
    for r in filesByRegion:
        for key in filesByRegion[r]:
            if filesByRegion[r][key] == []:
                if key == "Nominal":
                    print("WARNING: no input files for region {0} key {1}".
                          format(r, key))
                continue

            filePrefix = "%s_%s" % (r, grid)
            filename = os.path.join(outputDir, "%s.root" % (filePrefix))
            if os.path.exists(filename):
                print("Output file {0} exists - skipping".format(
                    os.path.basename(filename)))
                continue

            filesToWrite[filename] = {
                "region": r,
                "files": filesByRegion[r][key]
            }
            N += 1

    # Got anything?
    if filesToWrite == {}:
        return

    # build the pool arguments
    args = []
    for filename in filesToWrite:
        N -= 1
        args.append((
            filename,
            filesToWrite[filename]['files'],
            False,
            filesToWrite[filename]['region'],
            N,
        ))

    pool = ThreadPool(8)
    try:
        #results = pool.map(mergeFiles, args)
        results = pool.imap_unordered(mergeFiles, args)
        pool.close()
        pool.join()
    except KeyboardInterrupt:
        print "Caught KeyboardInterrupt, terminating workers"
        pool.terminate()
        pool.join()

    return

    # Legacy code below (relying on hadd) is to be removed

    for r in filesByRegion:
        for key in filesByRegion[r]:
            if filesByRegion[r][key] == []: continue

            N -= 1

            # Merge the files in chunks of 50, and then merge these chunks

            # The whole idea behind this exercise is to avoid exceeding the maximum length of
            # of a command allowed in bash.

            filePrefix = "%s_%s" % (r, grid)
            filename = os.path.join(outputDir, "%s.root" % (filePrefix))
            if os.path.exists(filename):
                print("Output file {0} exists - skipping".format(
                    os.path.basename(filename)))
                continue

            mergeFiles(filename, filesByRegion[r][key])

            #fileMerger = ROOT.TFileMerger()
            #fileMerger.OutputFile(filename)
            #for f in filesByRegion[r][key]:
            #    fileMerger.AddFile(f)
            #fileMerger.Merge()

            #i=1
            #print("Attempting to make file {0}".format(filename))
            #for subset in chunks(filesByRegion[r][key], 50):
            #    print("Merging subset {0:d}...".format(i))
            #    filename = os.path.join(outputDir, "%s_%03d.root" % (filePrefix, i) )
            #    outputFiles.append(filename)
            #
            #    if len(subset) == 1:
            #        shutil.copy(subset[0], filename)
            #    else:
            #        cmd = "hadd -f %s %s" % (filename, " ".join(subset))
            #        subprocess.call(cmd, shell=True)
            #
            #    i+=1

            #print("Merging all subsets")
            #filename = os.path.join(outputDir, "%s.root" % (filePrefix) )

            #if len(outputFiles) == 1:
            #    # only 1 file, so just rename it
            #    os.rename(outputFiles[0], filename)
            #else:
            #    cmd = "hadd -f %s %s" % (filename, " ".join(outputFiles))
            #    subprocess.call(cmd, shell=True)

            #print("Done merging subsets; removing temporary files")
            #for f in outputFiles:
            #    if not os.path.exists(f): continue
            #    os.remove(f)

            print("=> Created file for {0}; {1} files remaining".format(r, N))
Exemple #54
0
 def render_start(self):
     """Start the batches rendering."""
     self._thread_pool = ThreadPool(len(self.batches))
     self._results = self._thread_pool.imap_unordered(
         render_func, self.batches)
     self._is_rendering = True
Exemple #55
0
def main():
    expected_arg = "[A valid PAGE_URL or IMAGE_URL]"
    num_args = len(sys.argv)
    if num_args < 2 or num_args > 2:
        print("\n* INVALID RUN COMMAND! *  Usage:")
        print("classify %s\n" % expected_arg)
    elif num_args == 2:
        url = sys.argv[1]
        valid_url = web_core.is_valid_url(url)
        if not valid_url:
            file_path = url
            if isfile(file_path):
                best_guess = image_base.classify_local_image(file_path)
            elif isdir(file_path):
                best_guess = image_base.classify_folder_images(
                    file_path, return_dict=True)
            else:
                raise Exception("Error: %s is not a valid image path!" % url)
            print("\n*** Best match classification: ***")
            print(best_guess)
            print("")
            return
        content_type = web_core.get_content_type(url)
        if content_type == 'other':
            raise Exception(
                "Error: %s does not evaluate to %s" % (url, expected_arg))
        elif content_type == 'image':
            best_guess = image_base.classify_image_url(url)
            print("\n*** Best match classification: ***")
            print(best_guess)
            print("")
        elif content_type == 'html':
            global images_classified
            image_list = image_base.get_all_images_on_page(url)

            if 'linux2' not in sys.platform and settings.MAX_THREADS > 1:
                # Multi-threading the work when not using Docker Linux
                pool = ThreadPool(settings.MAX_THREADS)
                pool.map(download_and_classify_image, image_list)
                pool.close()
                pool.join()
            else:
                # Single-threading the image classification work
                min_w_h = settings.MIN_W_H  # Minimum size for classification
                for image in image_list:
                    web_core.save_file_as(image, "temp_image.png")
                    image_base.convert_image_file_to_jpg(
                        "downloads_folder/temp_image.png")
                    width, height = image_base.get_image_file_dimensions(
                        "downloads_folder/temp_image.jpg")
                    if width >= min_w_h and height >= min_w_h:
                        best_guess = classify_image.external_run(
                            "downloads_folder/temp_image.jpg")
                        if images_classified == 0:
                            print("\n*** "
                                  "Best match classifications for page images:"
                                  " ***")
                        images_classified += 1
                        print(best_guess)
                        if images_classified >= settings.MAX_IMAGES_PER_PAGE:
                            break

            if images_classified >= settings.MAX_IMAGES_PER_PAGE:
                print("\n(NOTE: Exceeded page classification limit "
                      "of %d images per URL! Stopping early.)" % (
                        settings.MAX_IMAGES_PER_PAGE))

            if images_classified == 0:
                print("\nCould not find images to classify on the page! "
                      "(Min size = %dx%d pixels)" % (
                        settings.MIN_W_H, settings.MIN_W_H))
            print("")
        else:
            raise Exception(
                "Unexpected content type %s. Fix the code!" % content_type)
Exemple #56
0
                    self.urls.append(make_url(self.domain, url))


urls = []


def test(url):

    print("URLS : ", len(urls))
    ax = Spider(url)
    urls.extend(ax.urls)
    return url


if __name__ == '__main__':
    content = ""
    url = input('Enter the url : ')
    urls.append(url)
    spider = Spider(url)

    print("Will visit :" + str(len(spider.urls)) + " urls")
    pool = ThreadPool(10)
    results = pool.map(test, spider.urls)
    """for i in spider.urls:
        print("URLS : ", len(urls))
        ax = Spider(i)
        urls.extend(ax.urls)
    print(len(urls))"""
    #spider.find_urls(content)
    #print(content)
Exemple #57
0
authwarpper = partial(auth_account, )


def spider_kaiyuan(start, end, password):
    file_name = 'accounts_{0}-{1}.txt'.format(start, end)
    with open(file_name, 'a+') as f:
        for account in range(int(start), int(end)):
            if (auth_account(account=account, password=password)):
                f.write('account: {} password: {}\n'.format(account, password))
                f.flush()


def generate_func_args(start, end, password):
    return (start, end, password)


if __name__ == '__main__':

    pool = ThreadPool(5)
    args = []
    account_start = 888800000000
    password = input("输入测试密码 测试区间在{} -{}\n".format(
        account_start + 5 * 10000, account_start + 9 * 10000 + 9999))
    for i in range(5, 9):
        args_map = generate_func_args(account_start + i * 10000,
                                      account_start + i * 10000 + 9999,
                                      password)
        args.append(args_map)

    pool.starmap(spider_kaiyuan, args)
Exemple #58
0
def trigger_tasks(tasks, thread_count):
    pool = ThreadPool(int(thread_count))
    pool.map(Task.run_tests, tasks)
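
pool.map already blocks until every Task.run_tests call returns; a hedged variant (hypothetical name, same assumed Task API) that also closes and joins the pool so its worker threads are released, and that returns the results:

from multiprocessing.dummy import Pool as ThreadPool


def trigger_tasks_blocking(tasks, thread_count):
    pool = ThreadPool(int(thread_count))
    try:
        results = pool.map(Task.run_tests, tasks)
    finally:
        pool.close()
        pool.join()
    return results
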
Exemple #59
0
def paths_to_bids(path_to_dataset, path_to_csv, bids_dir, modality):
    """
         This method converts all the T1 images found in the AIBL dataset
         downloaded in BIDS

         :param path_to_dataset: path_to_dataset
         :param path_to_csv: path to the csv file containing clinical data
         :param bids_dir: path to save the AIBL-T1-dataset converted in a
         BIDS format
         :param modality: string 't1', 'av45', 'flute' or 'pib'

         :return: list of all the images that are potentially converted in a
         BIDS format and saved in the bids_dir. This does not guarantee
         existence
     """
    from os.path import join, exists
    from numpy import nan
    import pandas as pds
    from clinica.utils.stream import cprint
    from multiprocessing.dummy import Pool
    from multiprocessing import cpu_count, Value
    import glob

    if modality.lower() not in ['t1', 'av45', 'flute', 'pib']:
        # This should never be reached
        raise RuntimeError(modality.lower() +
                           ' is not supported for conversion')

    counter = None

    def init(args):
        """ store the counter for later use """
        global counter
        counter = args

    def create_file(image):
        global counter
        subject = image.Subjects_ID
        session = image.Session_ID
        name_of_path = {
            't1': 'Path_to_T1',
            'av45': 'Path_to_pet',
            'flute': 'Path_to_pet',
            'pib': 'Path_to_pet'
        }
        # depending on the dataframe, there are different ways of accessing
        # the image object
        image_path = image[name_of_path[modality]]
        with counter.get_lock():
            counter.value += 1
        if image_path is nan:
            cprint('No path specified for ' + subject + ' in session ' +
                   session)
            return nan
        cprint('[' + modality.upper() + '] Processing subject ' +
               str(subject) + ' - session ' + session + ', ' +
               str(counter.value) + ' / ' + str(total))
        session = viscode_to_session(session)
        # creation of the path
        if modality == 't1':
            output_path = join(bids_dir, 'sub-AIBL' + subject,
                               'ses-' + session, 'anat')
            output_filename = 'sub-AIBL' + subject + '_ses-' + session + '_T1w'
        elif modality in ['flute', 'pib', 'av45']:
            output_path = join(bids_dir, 'sub-AIBL' + subject,
                               'ses-' + session, 'pet')
            output_filename = 'sub-AIBL' + subject + '_ses-' + session \
                              + '_task-rest_acq-' + modality + '_pet'
        # image is saved following BIDS specifications

        if exists(join(output_path, output_filename + '.nii.gz')):
            cprint('Subject ' + str(subject) + ' - session ' + session +
                   ' already processed.')
            output_image = join(output_path, output_filename + '.nii.gz')
        else:
            output_image = dicom_to_nii(subject, output_path, output_filename,
                                        image_path)
        return output_image

    # it reads the dataframe where subject_ID, session_ID and path are saved
    if modality == 't1':
        images = find_path_to_T1(path_to_dataset, path_to_csv)
    else:
        path_to_csv_pet_modality = glob.glob(
            join(path_to_csv, 'aibl_' + modality + 'meta_*.csv'))[0]
        if not exists(path_to_csv_pet_modality):
            raise FileNotFoundError(path_to_csv_pet_modality +
                                    ' file not found in clinical data folder')
        # Latest version of Flutemetamol CSV file (aibl_flutemeta_01-Jun-2018.csv)
        # has an extra column for some rows. However, each CSV file (regarding PET tracers)
        # contains the same columns. The usecols fixes this issue.
        df_pet = pds.read_csv(path_to_csv_pet_modality,
                              sep=',|;',
                              usecols=list(range(0, 36)))
        images = find_path_to_pet_modality(path_to_dataset, df_pet)
    images.to_csv(join(bids_dir, modality + '_paths_aibl.tsv'),
                  index=False,
                  sep='\t',
                  encoding='utf-8')

    counter = Value('i', 0)
    total = images.shape[0]
    # Reshape inputs to give it as a list to the workers
    images_list = []
    for i in range(total):
        images_list.append(images.iloc[i])

    # The initializer is used with the counter variable to keep track of how
    # many files have been processed
    poolrunner = Pool(cpu_count(), initializer=init, initargs=(counter, ))
    output_file_treated = poolrunner.map(create_file, images_list)
    del counter
    return output_file_treated
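
The shared-progress-counter pattern above (a multiprocessing.Value handed to every worker through the pool initializer) as a minimal, self-contained sketch with illustrative names:

from multiprocessing import Value, cpu_count
from multiprocessing.dummy import Pool  # thread pool, as in the snippet above

counter = None


def init(shared_counter):
    # Store the counter in a module-level global for the workers to use.
    global counter
    counter = shared_counter


def work(item):
    with counter.get_lock():
        counter.value += 1
        done = counter.value
    return item, done


if __name__ == '__main__':
    total = Value('i', 0)
    pool = Pool(cpu_count(), initializer=init, initargs=(total,))
    print(pool.map(work, range(10)))
    pool.close()
    pool.join()
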
Exemple #60
0
 def initiate_threads():
     _pool = Pool(5)
     _pool.map(traverse_directory, self.valid_directories)
     _pool.close()
     _pool.join()