Example #1
def generate(config, dnat=False, test=True):
    public_ip = config["public_ip"]
    current_ip = config["base_ip"]
    dnsmasq_content = ""
    for group in config["groups"].values():
        if not dnat:
            c = chunks([proxy["domain"] for proxy in group["proxies"]], 5)
        else:
            c = chunks([proxy["domain"] for proxy in group["proxies"] if proxy["dnat"]], 5)

        for chunk in c:
            if not dnat:
                dnsmasq_content += generate_dns(chunk, public_ip)
            else:
                dnsmasq_content += generate_dns(chunk, current_ip)

    if test:
        if not dnat:
            dnsmasq_content += generate_dns('ptest.verdandi.is', public_ip)
            dnsmasq_content += generate_dns('ptest2.verdandi.is', public_ip)
        else:
            dnsmasq_content += generate_dns('ptest.verdandi.is', current_ip)
            dnsmasq_content += generate_dns('ptest2.verdandi.is', current_ip)

    if dnat:
        for group in config["groups"].values():
            for proxy in group["proxies"]:
                if not proxy["dnat"]:
                    current_ip = long2ip(ip2long(current_ip) + 1)
                    dnsmasq_content += generate_dns(proxy["domain"], current_ip)

    return dnsmasq_content
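
Note: every example on this page calls a project-local chunks helper rather than anything from the standard library, and the exact signature differs between projects (some take only a sequence and a size, others accept extra flags, and some return a list that is indexed directly). Purely as a point of reference, a minimal sketch of the size-based form most of these call sites appear to assume:

def chunks(seq, size):
    # Split seq into successive pieces of at most `size` items; the last piece may be shorter.
    return [seq[start:start + size] for start in range(0, len(seq), size)]

With this sketch, chunks([1, 2, 3, 4, 5], 2) gives [[1, 2], [3, 4], [5]].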
Example #2
    def train(self, X_train, X_val):

        train_true = filter(lambda x: x[2]==1, X_train)
        train_false = filter(lambda x: x[2]==0, X_train)

        val_true = filter(lambda x: x[2]==1, X_val)
        val_false = filter(lambda x: x[2]==0, X_val)

        n_train_true = len(train_true)
        n_val_true = len(val_true)

        make_epoch_helper = functools.partial(make_epoch, train_true=train_true, train_false=train_false, val_true=val_true, val_false=val_false)

        logging.info("Starting training...")
        epoch_iterator = ParallelBatchIterator(make_epoch_helper, range(P.N_EPOCHS), ordered=False, batch_size=1, multiprocess=False, n_producers=1)

        for epoch_values in epoch_iterator:
            self.pre_epoch()
            train_epoch_data, val_epoch_data = epoch_values

            train_epoch_data = util.chunks(train_epoch_data, P.BATCH_SIZE_TRAIN)
            val_epoch_data = util.chunks(val_epoch_data, P.BATCH_SIZE_VALIDATION)

            self.do_batches(self.train_fn, train_epoch_data, self.train_metrics)
            self.do_batches(self.val_fn, val_epoch_data, self.val_metrics)

            self.post_epoch()
            logging.info("Setting learning rate to {}".format(P.LEARNING_RATE  * ((0.985)**self.epoch)))
            self.l_r.set_value(P.LEARNING_RATE  * ((0.985)**self.epoch))
Example #3
 def __call__(self, message, state=None, *, pad=True):
     state = state or self.initial_state
     prepared_message = message + (self.padding(len(message)) if pad else b"")
     assert len(prepared_message) % self.block_size == 0
     for block in chunks(prepared_message, self.block_size):
         state = self.compress(state, block)
     return state
Example #4
 def getstatusforfids(self, fids):
     status = {}
     
     for chunk in chunks(fids, 50):
         for f in arlalow.fetchbulkstatus(self.fsconn, chunk):
             status[f["fid"]] = f["status"]
     return status
Example #5
def extract_all_labels(filenames, out_filepath=DATA_FOLDER+'labels.p', chunk_size=2000):
    print "EXTRACTING ALL LABELS INTO {0}".format(out_filepath)
    all_labels = []
    label_dict = {}

    filenames_chunks = util.chunks(filenames, chunk_size)

    for i, chunk in enumerate(filenames_chunks):
        pool = Pool(processes=util.CPU_COUNT)
        chunk_labels = pool.map(extract_labels, chunk)
        pool.close()

        for filepath, labels in zip(chunk, chunk_labels):
            if labels is not None:
                file_id = util.filename_without_extension(filepath)
                label_dict[file_id] = labels
                all_labels += labels

        print i+1, '/', len(filenames_chunks)

    #Write labels to file
    with open(out_filepath,'w') as f:
        pickle.dump(label_dict, f)

    print '\nLabels:'
    print len(set(all_labels))
    print Counter(all_labels)
Example #6
    def predict(self, data, modes):
        """predict whether a list of position follows atrain route by detecting
        the nearest train stops. Input is the pandas data frame of
        measurements and an array of current mode predictions.  Returns
        an array of predicted modes of the same size as the input data
        frame has rows.

        """
        # extract lat/lon from data frame
        lat = data['WLATITUDE'].values
        lon = data['WLONGITUDE'].values

        # chunk is a tuple (start_idx, end_idx, mode)
        for start_idx, end_idx, _ in ifilter(lambda chunk: chunk[2] in [MODE_CAR, MODE_BUS, MODE_TRAIN],
                                             chunks(modes, include_values=True)):
            # test for distance first
            lat_seg = lat[start_idx:end_idx]
            lon_seg = lon[start_idx:end_idx]
            valid_lat_seg = lat_seg[np.where(np.invert(np.isnan(lat_seg)))[0]]
            valid_lon_seg = lon_seg[np.where(np.invert(np.isnan(lon_seg)))[0]]

            if len(valid_lon_seg) == 0:
                continue
            # TODO: parameters have to be tuned carefully
            is_train = predict_mode_by_location(valid_lat_seg,
                                                valid_lon_seg,
                                                self.train_location_tree,
                                                self.train_location_dict,
                                                self.train_route_dict,
                                                dist_thre = 400,
                                                dist_pass_thres = 7, 
                                                num_stops_thre = 3,
                                                dist_pass_thres_perc = 0.7)

            #check entry point distance
            entry_pt_near = -1
            exit_pt_near = -1

            if start_idx-1>=0:
                if not np.isnan(lat[start_idx-1]):
                    nearest_station = find_nearest_station(lat[start_idx-1], lon[start_idx-1], self.train_location_tree, self.dist_thres_entry_exit)
                    if len(nearest_station)!=0:
                        entry_pt_near = 1
                    else:
                        entry_pt_near = 0

            if end_idx < len(modes):
                if not np.isnan(lat[end_idx]):
                    nearest_station = find_nearest_station(lat[end_idx],lon[end_idx],
                                                           self.train_location_tree,
                                                           self.dist_thres_entry_exit)
                    if len(nearest_station)!=0:
                        exit_pt_near = 1
                    else:
                        exit_pt_near = 0
            if is_train or entry_pt_near + exit_pt_near == 2:
                modes[start_idx:end_idx] = MODE_TRAIN
            else:
                modes[start_idx:end_idx] = MODE_CAR
        return modes
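
The chunks used in this predict method clearly has a different contract from the simple size-based splitter sketched earlier on this page: per the comment at the call site, chunks(modes, include_values=True) yields (start_idx, end_idx, mode) runs of consecutive equal values. A hedged sketch of that run-segmentation variant, assuming only what the call site shows:

def chunks(values, include_values=False):
    # Yield (start, end) index pairs for runs of equal consecutive values;
    # with include_values=True, also include the run's value in the tuple.
    start = 0
    for i in range(1, len(values) + 1):
        if i == len(values) or values[i] != values[start]:
            yield (start, i, values[start]) if include_values else (start, i)
            start = i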
Example #7
def collect_tweets_by_ids(tweet_ids_config_filepath, output_folder, config):

    apikeys = list(config['apikeys'].values()).pop()

    tweet_ids_config = {}
    with open(os.path.abspath(tweet_ids_config_filepath), 'r') as tweet_ids_config_rf:
        tweet_ids_config = json.load(tweet_ids_config_rf)

    max_range = 100
    
    current_ix = tweet_ids_config['current_ix'] if ('current_ix' in tweet_ids_config) else 0
    total = len(tweet_ids_config['tweet_ids'][current_ix:])
    tweet_id_chunks = util.chunks(tweet_ids_config['tweet_ids'][current_ix:], max_range)

    for tweet_ids in tweet_id_chunks:
        try:
            twitterCrawler = TwitterCrawler(apikeys=apikeys, client_args=CLIENT_ARGS, output_folder=output_folder)
            twitterCrawler.lookup_tweets_by_ids(tweet_ids)
            current_ix += len(tweet_ids)

        except Exception as exc:
            logger.error(exc)
            logger.error(util.full_stack())
            # Ctrl+C is not handled here; on restart, processing resumes from the last
            # saved chunk, so a few duplicate tweets may be collected. That is acceptable.
            pass

        tweet_ids_config['current_ix'] = current_ix
        
        flash_cmd_config(tweet_ids_config, tweet_ids_config_filepath, output_folder)

        logger.info('COMPLETED -> (current_ix: [%d/%d])'%(current_ix, total))
        logger.info('PAUSE %ds to CONTINUE...'%WAIT_TIME)
        time.sleep(WAIT_TIME)
    else:
        logger.info('[tweets_by_ids] ALL COMPLETED')
Example #8
	def decode(self, server, block_header, target, job_id = None, extranonce2 = None):
		if block_header:
			job = Object()
	
			binary_data = block_header.decode('hex')
			data0 = np.zeros(64, np.uint32)
			data0 = np.insert(data0, [0] * 16, unpack('IIIIIIIIIIIIIIII', binary_data[:64]))
	
			job.target	  = np.array(unpack('IIIIIIII', target.decode('hex')), dtype=np.uint32)
			job.header	  = binary_data[:68]
			job.merkle_end  = np.uint32(unpack('I', binary_data[64:68])[0])
			job.time		= np.uint32(unpack('I', binary_data[68:72])[0])
			job.difficulty  = np.uint32(unpack('I', binary_data[72:76])[0])
			job.state	   = sha256(STATE, data0)
			job.f		   = np.zeros(8, np.uint32)
			job.state2	  = partial(job.state, job.merkle_end, job.time, job.difficulty, job.f)
			job.targetQ	 = 2**256 / int(''.join(list(chunks(target, 2))[::-1]), 16)
			job.job_id	  = job_id
			job.extranonce2 = extranonce2
			job.server	  = server
	
			calculateF(job.state, job.merkle_end, job.time, job.difficulty, job.f, job.state2)

			if job.difficulty != self.difficulty:
				self.set_difficulty(job.difficulty)
	
			return job
Example #9
    def decode(self, server, block_header, target, job_id = None, extranonce2 = None):
        if block_header:
            job = Object()

            binary_data = block_header.decode('hex')

            #data0 = list(unpack('<16I', binary_data[:64])) + ([0] * 48)

            job.headerX = binary_data[:76]
            job.dataX = unpack('<19I', job.headerX)
            job.target		= unpack('<8I', target.decode('hex'))
            job.header		= binary_data[:68]
            job.merkle_end	= uint32(unpack('<I', binary_data[64:68])[0])
            job.time		= uint32(unpack('<I', binary_data[68:72])[0])
            job.difficulty	= uint32(unpack('<I', binary_data[72:76])[0])
            # job.state		= sha256(STATE, data0)
            job.targetQ		= 2**256 / int(''.join(list(chunks(target, 2))[::-1]), 16)
            job.job_id		= job_id
            job.extranonce2	= extranonce2
            job.server		= server

            if job.difficulty != self.difficulty:
                self.set_difficulty(job.difficulty)

            return job
Example #10
def crack_ecb_oracle(oracle_fn, prefix_length=0):
    block_size = guess_block_size(oracle_fn)
    if not looks_like_ecb(oracle_fn(b"A" * 100), block_size):
        raise ValueError("oracle_fn does not appear to produce ECB mode output")
    result = bytearray()
    while True:
        short_block_length = (block_size - len(result) - 1 - prefix_length) % block_size
        short_input_block = b"A" * short_block_length
        block_index = (len(result) + prefix_length) // block_size
        block_to_look_for = chunks(oracle_fn(short_input_block))[block_index]
        for guess in all_bytes_by_frequency:
            test_input = short_input_block + result + bytes([guess])
            if chunks(oracle_fn(test_input))[block_index] == block_to_look_for:
                result.append(guess)
                break
        else:  # if no byte matches
            return pkcs7_unpad(result)
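
crack_ecb_oracle above is the classic byte-at-a-time ECB recovery: it pads the attacker-controlled input so each unknown byte falls at the end of a block, then brute-forces that byte by comparing ciphertext blocks. Purely as an illustration of how it might be exercised (the key, secret suffix, and ecb_oracle below are invented for this sketch; guess_block_size, looks_like_ecb, all_bytes_by_frequency, pkcs7_unpad, and chunks are assumed to come from the same codebase), using the cryptography package:

import os
from cryptography.hazmat.primitives import padding
from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes

_KEY = os.urandom(16)            # hypothetical fixed key
_SECRET = b"attack at dawn"      # hypothetical suffix the attacker wants to recover

def ecb_oracle(attacker_input: bytes) -> bytes:
    # PKCS#7-pad attacker input plus the secret suffix, then encrypt with AES-128-ECB.
    padder = padding.PKCS7(128).padder()
    data = padder.update(attacker_input + _SECRET) + padder.finalize()
    enc = Cipher(algorithms.AES(_KEY), modes.ECB()).encryptor()
    return enc.update(data) + enc.finalize()

recovered = crack_ecb_oracle(ecb_oracle)   # expected to equal _SECRET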
Example #11
 def add_text(self, text):
     if len(text) + len(self._lines[self.point[0]]) > self.draw_width:
         self.point_to_next_line()
     if len(text) > self.draw_width:
         lines_to_add = chunks(text, self.draw_width)
         lines_to_advance = len(lines_to_add)
         for line in lines_to_add:
             self._lines.append(line)
         self.adjust_point_by_lines(lines_to_advance)
     else:
         self._lines[self.point[0]] += text
     self.point_to_end_of_line()
Example #12
def cluster_to_kml(user, cluster, cluster_id):
    """
    Creates a single, or possibly multiple, KML files for a given cluster.
    A KML file is limited by MyMaps to having only 10 layers, so only 
    10 sections will be in a given KML file.

    Responsibility of the caller to check the existence and formatting of the cluster.
    """ 
    Sections = get_section_db()
    for i,chunk in enumerate(chunks(cluster,10)):    
        sections = map(lambda section_id: Sections.find_one({'_id':section_id}), chunk)
        sections_to_kml("%s_cluster_data_kml/CLUSTER_%s_%i" % (user, str(cluster_id), i), sections)
Example #13
 def cross_validation(self, fold, epoch):
     print 'doing cross validation...'
     splited_data = list(chunks(self.data, fold))
     hyper_test = defaultdict(int)
     for idx, (train, test) in enumerate(splited_data):
         for c in self.C:
             for rho_0 in self.RHO_0:
                 weight = self.train(train, rho_0, c, epoch=epoch)
                 precision = self.test(test, weight)
                 print 'done fold %i' % idx, ' on [rho_0: %s, c: %s]' \
                       % (rho_0, c)
                 hyper_test[(rho_0, c)] += precision
     return map(lambda (x, y): (x, y/fold), hyper_test.iteritems())
Example #14
	def start_producers(self, result_queue):
		jobs = Queue()
		n_workers = params.N_PRODUCERS
		batch_count = 0

		#Flag used for keeping values in queue in order
		last_queued_job = Value('i', -1)

		for job_index, batch in enumerate(util.chunks(self.X,self.batch_size)):
			batch_count += 1
			jobs.put( (job_index,batch) )

		# Define producer (putting items into queue)
		def produce(id):
			while True:
				job_index, task = jobs.get()

				if task is None:
					#print id, " fully done!"
					break

				result = self.gen(task)

				while(True):
					#My turn to add job done
					if last_queued_job.value == job_index-1:

						with last_queued_job.get_lock():
							result_queue.put(result)
							last_queued_job.value += 1
							#print id, " worker PUT", job_index
							break

		#Start workers
		for i in xrange(n_workers):

			if params.MULTIPROCESS:
				p = Process(target=produce, args=(i,))
			else:
				p = Thread(target=produce, args=(i,))

			p.daemon = True
			p.start()

		#Add poison pills to queue (to signal workers to stop)
		for i in xrange(n_workers):
			jobs.put((-1,None))


		return batch_count, jobs
Example #15
def profile(subset=1000, multi=True, n_threads = 4, batch_size=64, thread_pool=False):

    # Load a bunch of imagenames
    y = util.load_labels()
    y = y[:subset]
    keys = y.index.values

    #Create sublists (batches)
    batched_keys = util.chunks(keys, batch_size)

    if multi:
        augment_multithreaded(batched_keys, n_threads=n_threads, thread_pool=thread_pool)
    else:
        augment_singlethreaded(batched_keys)
Example #16
def threshold_optimization(p, y):
    print "Optimizing threshold"
    y_images = util.chunks(y, 384*512)

    def dice_objective(threshold):
        p_binary = np.where(p > threshold, 1,0)
        p_images_binary = util.chunks(p_binary, 384*512)

        mean, std, dices = dice(p_images_binary, y_images)
        return -mean

    x, v, message = scipy.optimize.fmin_l_bfgs_b(dice_objective, 0.5, approx_grad=True, bounds=[(0, 1)], epsilon=1e-03)
    print "Optimized, threshold {0}, ? {1}, termination because {2}".format(x,v,message)
    return x[0]
Example #17
	def refresh_job(self, j):
		j.extranonce2 = self.increment_nonce(j.extranonce2)
		coinbase = j.coinbase1 + self.extranonce + j.extranonce2 + j.coinbase2
		merkle_root = sha256(sha256(unhexlify(coinbase)).digest()).digest()

		for hash_ in j.merkle_branch:
			merkle_root = sha256(sha256(merkle_root + unhexlify(hash_)).digest()).digest()
		merkle_root_reversed = ''
		for word in chunks(merkle_root, 4):
			merkle_root_reversed += word[::-1]
		merkle_root = hexlify(merkle_root_reversed)

		j.block_header = ''.join([j.version, j.prevhash, merkle_root, j.ntime, j.nbits])
		j.time = time()
		return j
Example #18
    def call(self, orderlist):
        assert isinstance(orderlist, list)
        orders = {}
        MAXORDERS = 50
        for ol in util.chunks(orderlist, MAXORDERS):        
            # make BDAQ representation of orders from the orderlist passed in
            self.req.Orders.Order = self.makeorderlist(ol)
            apilog.info('calling BDAQ Api PlaceOrdersNoReceipt')
            result = self.client.service.PlaceOrdersNoReceipt(self.req)
            ors = apiparse.ParsePlaceOrdersNoReceipt(result, orderlist)
            orders.update(ors)

        # note: could put result.Timestamp in order object so that we
        # are saving the BDAQ time.
        return orders
Example #19
def status_iter(iterable, callback, chunksize=1, reportsize=10):
    itersize = len(iterable)
    starttime = time.time()
    for i, item in enumerate(util.chunks(iterable, chunksize), 1):
        callback(item)
        if i % reportsize == 0:
            done = i * chunksize
            nowtime = time.time()
            numblocks = itersize * 1.0 / (reportsize*chunksize)
            curblock = done / (reportsize*chunksize)
            position = curblock / numblocks
            duration = round(nowtime - starttime)
            durdelta = datetime.timedelta(seconds=duration)
            remaining = round((duration / position) - duration)
            remdelta = datetime.timedelta(seconds=remaining)
            lookuplog.info("Done %s/%s in %s; %s remaining", done, itersize, str(durdelta), str(remdelta))
    lookuplog.info("Finished")
Example #20
def nfold_cross_validate(data, n=4):
    data_chunks = chunks(data, len(data) / n)

    rmse_values = []
    for i in range(n):
        train_set = flatten(data_chunks[:i] + data_chunks[i + 1:])
        test_set = data_chunks[i]
        classif = nltk.MaxentClassifier.train(train_set)   
        
        test_fs, test_ratings = zip(*test_set)
        results = classif.batch_classify(test_fs)
        set_rmse = rmse(test_ratings, results)
        print 'RMSE: ', set_rmse

        rmse_values.append(set_rmse)
    
    print 'Average RMSE:', sum(rmse_values) / float(len(rmse_values))
Example #21
    def submit_events(self, events):
        headers = {"Content-Type": "application/json"}
        event_chunk_size = self.event_chunk_size

        for chunk in chunks(events, event_chunk_size):
            payload = {
                "apiKey": self.api_key,
                "events": {"api": chunk},
                "uuid": get_uuid(),
                "internalHostname": get_hostname(),
            }
            params = {}
            if self.api_key:
                params["api_key"] = self.api_key
            url = "%s/intake?%s" % (self.api_host, urlencode(params))

            self.submit_http(url, json.dumps(payload), headers)
Example #22
    def submit_events(self, events):
        headers = {'Content-Type':'application/json'}
        event_chunk_size = self.event_chunk_size

        for chunk in chunks(events, event_chunk_size):
            payload = {
                'apiKey': self.api_key,
                'events': {
                    'api': chunk
                },
                'uuid': get_uuid(),
                'internalHostname': get_hostname()
            }
            params = {}
            if self.api_key:
                params['api_key'] = self.api_key
            url = '%s/intake?%s' % (self.api_host, urlencode(params))

            self.submit_http(url, json.dumps(payload), headers)
Example #23
	def _start_producers(self, result_queue):
		jobs = Queue()
		n_workers = self.n_producers
		batch_count = 0

		# Flag used for keeping values in queue in order
		last_queued_job = Value('i', -1)

		chunks = util.chunks(self.X,self.batch_size)


		# Add jobs to queue
		for job_index, X_batch in enumerate(chunks):
			batch_count += 1
			jobs.put( (job_index,X_batch) )

		# Add poison pills to queue (to signal workers to stop)
		for i in xrange(n_workers):
			jobs.put((-1,None))

		# Define producer function
		produce = partial(_produce_helper,
			generator=self.generator,
			jobs=jobs,
			result_queue=result_queue,
			last_queued_job=last_queued_job,
			ordered=self.ordered)

		# Start worker processes or threads
		for i in xrange(n_workers):
			name = "ParallelBatchIterator worker {0}".format(i)

			if self.multiprocess:
				p = Process(target=produce, args=(i,), name=name)
			else:
				p = Thread(target=produce, args=(i,), name=name)

			# Make the process daemon, so the main process can die without these finishing
			#p.daemon = True
			p.start()

		return batch_count, jobs
Example #24
    def call(self, mids):
        """
        Return all selections for Market ids in mids, where mids is a
        list of market ids.
        """

        allselections = []
        # split up mids into groups of size MAXMIDS
        for (callnum, ids) in \
            enumerate(util.chunks(mids, ApiGetPrices.MAXMIDS)):
            self.req.MarketIds = ids
            if callnum > 0:
                # sleep for some time before calling Api again
                time.sleep(self.throttl)
                
            apilog.info('calling BDAQ Api GetPrices')        
            result = self.client.service.GetPrices(self.req)
            selections =  apiparse.ParseGetPrices(ids, result)
            allselections = allselections + selections

        return allselections
Example #25
def threshold_optimization_naive(p,y):
    print "Optimizing threshold"
    y_images = util.chunks(y, 384*512)

    candidates = np.arange(0.25, 0.75, 1.0 / 2500)  # float step; 1/2500 is 0 under Python 2 integer division

    def dice_objective(threshold):
        p_binary = np.where(p > threshold, 1,0)
        p_images_binary = util.chunks(p_binary, 384*512)

        mean, std, dices = dice(p_images_binary, y_images)
        return mean

    #score = map(dice_objective,tqdm(candidates))
    scores = []
    for t in tqdm(candidates):
        score = dice_objective(t)
        scores.append(score)
    print np.argmax(scores)
    threshold = candidates[np.argmax(scores)]
    print "Best threshold ", threshold
    return threshold
Example #26
def lookup():
    """ returns (done, remaining)"""
    songs = db.data.get_pending_songs()
    songcount = db.data.get_count_pending_songs()

    if not songs:
        return (0, 0)

    # We can use a with statement to ensure threads are cleaned up promptly
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        # Start the load operations and mark each future with its URL
        i = 0
        future_to_song = {}
        for songchunk in util.chunks(songs, 10):
            future_to_song[executor.submit(query, songchunk, i)] = songchunk
            i = 1 - i

        for future in concurrent.futures.as_completed(future_to_song):
            songchunk = future_to_song[future]
            # For each set of songs, get them from the response
            # for songs not in the response, add an empty response
            try:
                data = future.result()
            except Exception as exc:
                print('%r generated an exception: %s' % (songchunk, exc))
            else:
                gotsongs = set()
                waitings = set(songchunk)
                results = data["response"].get("songs", [])
                for s in results:
                    songid = s["id"]
                    gotsongs.add(songid)
                    response = {"response": {"songs": [s], "status": data["response"]["status"]}}
                    db.data.add_response_if_not_exists(echonest.SONG_PROFILE, songid, response)
                nosongs = waitings-gotsongs
                for s in list(nosongs):
                    db.data.add_response_if_not_exists(echonest.SONG_PROFILE, s, {})

    return (len(songs), songcount-len(songs))
Example #27
def filter_and_lemma(chunk_size=2000):
    files = glob.glob(INPUT_FOLDER+'*.frog.out')

    lemmatized = {}

    #Split all files in the list into chunks
    file_chunks = util.chunks(files, chunk_size)

    for i, chunk in enumerate(tqdm(file_chunks)):
        pool = Pool(processes=util.CPU_COUNT)
        filtered_lemmatized = pool.map(process, chunk)
        pool.close()

        for filename, value in zip(chunk, filtered_lemmatized):
            file_id = util.filename_without_extension(filename, '.frog.out')
            lemmatized[file_id] = value

    #Order by key
    ordered = OrderedDict(sorted(lemmatized.items()))

    with open(DATA_FOLDER+'processed.p','w') as f:
        pickle.dump(ordered,f)
    print "Done!"
Example #28
    def submit_events(self, events):
        headers = {'Content-Type':'application/json'}
        method = 'POST'

        events_len = len(events)
        event_chunk_size = self.event_chunk_size

        for chunk in chunks(events, event_chunk_size):
            payload = {
                'apiKey': self.api_key,
                'events': {
                    'api': chunk
                },
                'uuid': get_uuid(),
                'internalHostname': get_hostname()
            }
            params = {}
            if self.api_key:
                params['api_key'] = self.api_key
            url = '/intake?%s' % urlencode(params)

            status = None
            conn = self.http_conn_cls(self.api_host)
            try:
                start_time = time()
                conn.request(method, url, json.dumps(payload), headers)

                response = conn.getresponse()
                status = response.status
                response.close()
                duration = round((time() - start_time) * 1000.0, 4)
                log.debug("%s %s %s%s (%sms)" % (
                                status, method, self.api_host, url, duration))

            finally:
                conn.close()
Example #29
def getActivations(x_train, numActivationTrainingInstances, model, dnnModel,
                   y_train):
    util.thisLogger.logInfo(
        "------ start of activation data extraction for training data -------")
    startTime = datetime.datetime.now()

    # Only get activations from the instances that are correctly classified
    y_predict = np.argmax(dnnModel.predict(x_train), axis=1)

    # The DNN is trained to output 0 or 1 only.
    # get the original classes it was trained on and transform the outputs
    classes = util.getParameter('DataClasses')
    classes = np.asarray(classes.replace('[', '').replace(
        ']', '').split(',')).astype(int)
    util.thisLogger.logInfo('Data classes to be used: %s' % (classes))
    count = 0
    for c in classes:
        y_predict = np.where(y_predict == count, c, y_predict)
        count += 1

    incorrectPredictIndexes = []
    for i in range(0, len(y_predict) - 1):
        if (y_predict[i] != y_train[i]):
            incorrectPredictIndexes.append(i)

    x_train = np.delete(x_train, incorrectPredictIndexes, axis=0)
    y_train = np.delete(y_train, incorrectPredictIndexes, axis=0)
    y_predict = np.delete(y_predict, incorrectPredictIndexes, axis=0)

    # train in batches
    activationTrainingBatchSize = util.getParameter(
        'ActivationTrainingBatchSize')

    if numActivationTrainingInstances == -1:
        numActivationTrainingInstances = len(x_train)

    xData = x_train[:numActivationTrainingInstances, ]
    batchData = list(util.chunks(xData, activationTrainingBatchSize))

    activationData = []
    numBatches = len(batchData)
    batchActivationData = [[] for i in range(numBatches)]
    for batchIndex in range(numBatches):
        batch = batchData[batchIndex]
        util.thisLogger.logInfo("Training batch " + str(batchIndex + 1) +
                                " of " + str(len(batchData)) + " (" +
                                str(len(batch)) + " instances)")
        # Get activations and set up streams for the training data
        # get reduced activations for all training data in one go

        # Train in a loop
        util.thisLogger.logInfo(
            str(len(batch)) + " instances selected from training data")

        activations, numLayers = extract.getActivationData(model, batch)
        batchActivationData[batchIndex].append(activations)
        activationData.append(activations)

        util.thisLogger.logInfo(
            "Filter Layers: DNN has %s activation layers, getting activation data for %s instances."
            % (numLayers, len(batch)))

    endTime = datetime.datetime.now()
    util.thisLogger.logInfo('Total training time: ' + str(endTime - startTime))
    util.thisLogger.logInfo(
        "------- end of activation data extraction for training data --------")
    util.thisLogger.logInfo("")

    return numLayers, batchData, activationData, batchActivationData
Example #30
def NARR_to_EPIC(vals):
    lat,lon = vals
    # Output pandas frame into EPIC weather file
    out_fl   = constants.epic_dly+os.sep+str(lat)+'_'+str(lon)+'.txt'

    if not(os.path.isfile(out_fl)):
        logging.info(out_fl) 
        # List all years for which we will create EPIC file
        lst_yrs     = rrule(YEARLY, dtstart=constants.strt_date, until=constants.end_date)

        # Create pandas data frame, fill with 0.0s, for 1st year.
        epic_df = pandas.DataFrame(index=pandas.date_range(constants.strt_date,constants.end_date),\
                                   columns=[constants.vars_to_get.keys()])
        epic_out = open(out_fl,'w')

        # Loop across years
        for idx_yr in range(lst_yrs.count()):		
            cur_strt_date  = datetime.date(lst_yrs[idx_yr].year,1,1)
            cur_end_date   = datetime.date(lst_yrs[idx_yr].year,12,31)
            cur_date_range = pandas.date_range(cur_strt_date,cur_end_date)

            tmp_df         = pandas.DataFrame(index=cur_date_range,columns=[constants.vars_to_get.keys()])
            tmp_df.fillna(0.0,inplace=True)
            # Loop across variables
            for cur_var in constants.vars_to_get.keys():
                e_fl      = open(constants.data_dir + os.sep + 'Data' + os.sep + cur_var + os.sep + str(lst_yrs[idx_yr].year)+\
                                      os.sep + str(lat) + '_' + str(lon) + '.txt')
                epic_vars = filter(None,e_fl.readlines()[0].strip().split("'"))

                if cur_var == 'air.2m':
                    epic_min_tmp     = util.chunks(epic_vars,8,True)
                    epic_max_tmp     = util.chunks(epic_vars,8,False)

                    tmp_df[cur_var] = pandas.Series(epic_min_tmp,index=cur_date_range)
                    tmp_df[cur_var] = tmp_df[cur_var].map(lambda x:float(x)+constants.K_To_C)

                    tmp_df['tmax']  = pandas.Series(epic_max_tmp,index=cur_date_range)
                    tmp_df['tmax']  = tmp_df['tmax'].map(lambda x:float(x)+constants.K_To_C)
                    tmp_df['tmin']  = tmp_df['air.2m'] 
                else:
                    tmp_df[cur_var] = pandas.Series(epic_vars,index=cur_date_range)
                    tmp_df[cur_var] = tmp_df[cur_var].map(lambda x:float(x))
        
            # Get into right units
            tmp_df['wnd']      = pandas.Series(tmp_df['uwnd.10m'].astype(float)**2.0+\
                                                tmp_df['vwnd.10m'].astype(float)**2.0,index=tmp_df.index)
            tmp_df['wnd']      = tmp_df['wnd']**0.5
            tmp_df['rhum.2m']  = tmp_df['rhum.2m'].map(lambda x:float(x)/100.0)
            tmp_df['swr_diff'] = pandas.Series(tmp_df['dswrf']-tmp_df['uswrf.sfc'],index=tmp_df.index)
            tmp_df['srad']     = tmp_df['swr_diff'].map(lambda x:constants.WMsq_MjMsq*x)
            tmp_df['year']     = tmp_df.index.year
            tmp_df['month']    = tmp_df.index.month
            tmp_df['day']      = tmp_df.index.day
            epic_df            = epic_df.combine_first(tmp_df)
        # Output dataframe to text file with right formatting
        for index, row in epic_df.iterrows():
            epic_out.write(('%6d%4d%4d'+6*'%6.2f'+'\n') %
                        (row['year'],row['month'],row['day'],
                         row['srad'],row['tmax'],row['tmin'],
                         row['apcp'],row['rhum.2m'],row['wnd']))
        epic_out.close()
    else:
        logging.info('File exists: '+out_fl) 
Example #31
def main(args_list: List[str]) -> None:
    parser = argparse.ArgumentParser(description="Autoencoder for coq terms")
    add_std_args(parser)
    parser.add_argument("--gamma", default=.9, type=float)
    parser.add_argument("--epoch-step", default=5, type=int)
    parser.add_argument("--num-decoder-layers",
                        dest="num_decoder_layers",
                        default=3,
                        type=int)
    args = parser.parse_args(args_list)
    curtime = time.time()
    print("Loading data...", end="")
    sys.stdout.flush()
    dataset = list(
        itertools.islice(read_text_data(args.scrape_file), args.max_tuples))

    print(" {:.2f}s".format(time.time() - curtime))
    curtime = time.time()
    print("Extracting terms...", end="")
    sys.stdout.flush()
    term_strings = list(
        chain.from_iterable(
            [[hyp.split(":")[1].strip()
              for hyp in datum.context.focused_hyps] +
             [datum.context.focused_goal] for datum in dataset]))
    print(" {:.2f}s".format(time.time() - curtime))

    curtime = time.time()
    print("Building tokenizer...", end="")
    sys.stdout.flush()
    tokenizer = tk.make_keyword_tokenizer_topk(term_strings,
                                               tk.tokenizers[args.tokenizer],
                                               args.num_keywords, 2)
    print(" {:.2f}s".format(time.time() - curtime))
    curtime = time.time()
    print("Tokenizing {} strings...".format(len(term_strings)), end="")
    sys.stdout.flush()

    with multiprocessing.Pool(None) as pool:
        tokenized_data_chunks = pool.imap_unordered(
            functools.partial(use_tokenizer, tokenizer, args.max_length),
            chunks(term_strings, 32768))
        tokenized_data = list(chain.from_iterable(tokenized_data_chunks))

    print(" {:.2f}s".format(time.time() - curtime))
    checkpoints = train(tokenized_data, tokenizer.numTokens(), args.max_length,
                        args.hidden_size, args.learning_rate, args.epoch_step,
                        args.gamma, args.num_encoder_layers,
                        args.num_decoder_layers, args.num_epochs,
                        args.batch_size, args.print_every,
                        optimizers[args.optimizer])
    for epoch, (encoder_state, decoder_state,
                training_loss) in enumerate(checkpoints):
        state = {
            'epoch': epoch,
            'training-loss': training_loss,
            'tokenizer': tokenizer,
            'tokenizer-name': args.tokenizer,
            'optimizer': args.optimizer,
            'learning-rate': args.learning_rate,
            'encoder': encoder_state,
            'decoder': decoder_state,
            'num-encoder-layers': args.num_encoder_layers,
            'num-decoder-layers': args.num_decoder_layers,
            'max-length': args.max_length,
            'hidden-size': args.hidden_size,
            'num-keywords': args.num_keywords,
            'context-filter': args.context_filter,
        }
        with open(args.save_file, 'wb') as f:
            print("=> Saving checkpoint at epoch {}".format(epoch))
            torch.save(state, f)
    pass
Example #32
 def expect_layout(self, layout):
     compressed = lzma.compress(layout.encode("utf-8"))
     self.expect("FE01", struct.pack("<I", len(compressed)))
     for idx, chunk in enumerate(chunks(compressed, 32)):
         self.expect(struct.pack("<BBI", 0xFE, 0x02, idx), chunk)
Example #33
def NARR_to_EPIC(vals):
    lat, lon = vals
    # Output pandas frame into EPIC weather file
    out_fl = constants.epic_dly + os.sep + str(lat) + '_' + str(lon) + '.txt'

    if not (os.path.isfile(out_fl)):
        logging.info(out_fl)
        # List all years for which we will create EPIC file
        lst_yrs = rrule(YEARLY,
                        dtstart=constants.strt_date,
                        until=constants.end_date)

        # Create pandas data frame, fill with 0.0s, for 1st year.
        epic_df = pandas.DataFrame(index=pandas.date_range(constants.strt_date,constants.end_date),\
                                   columns=[constants.vars_to_get.keys()])
        epic_out = open(out_fl, 'w')

        # Loop across years
        for idx_yr in range(lst_yrs.count()):
            cur_strt_date = datetime.date(lst_yrs[idx_yr].year, 1, 1)
            cur_end_date = datetime.date(lst_yrs[idx_yr].year, 12, 31)
            cur_date_range = pandas.date_range(cur_strt_date, cur_end_date)

            tmp_df = pandas.DataFrame(index=cur_date_range,
                                      columns=[constants.vars_to_get.keys()])
            tmp_df.fillna(0.0, inplace=True)
            # Loop across variables
            for cur_var in constants.vars_to_get.keys():
                e_fl      = open(constants.data_dir + os.sep + 'Data' + os.sep + cur_var + os.sep + str(lst_yrs[idx_yr].year)+\
                                      os.sep + str(lat) + '_' + str(lon) + '.txt')
                epic_vars = filter(None,
                                   e_fl.readlines()[0].strip().split("'"))

                if cur_var == 'air.2m':
                    epic_min_tmp = util.chunks(epic_vars, 8, True)
                    epic_max_tmp = util.chunks(epic_vars, 8, False)

                    tmp_df[cur_var] = pandas.Series(epic_min_tmp,
                                                    index=cur_date_range)
                    tmp_df[cur_var] = tmp_df[cur_var].map(
                        lambda x: float(x) + constants.K_To_C)

                    tmp_df['tmax'] = pandas.Series(epic_max_tmp,
                                                   index=cur_date_range)
                    tmp_df['tmax'] = tmp_df['tmax'].map(
                        lambda x: float(x) + constants.K_To_C)
                    tmp_df['tmin'] = tmp_df['air.2m']
                else:
                    tmp_df[cur_var] = pandas.Series(epic_vars,
                                                    index=cur_date_range)
                    tmp_df[cur_var] = tmp_df[cur_var].map(lambda x: float(x))

            # Get into right units
            tmp_df['wnd']      = pandas.Series(tmp_df['uwnd.10m'].astype(float)**2.0+\
                                                tmp_df['vwnd.10m'].astype(float)**2.0,index=tmp_df.index)
            tmp_df['wnd'] = tmp_df['wnd']**0.5
            tmp_df['rhum.2m'] = tmp_df['rhum.2m'].map(
                lambda x: float(x) / 100.0)
            tmp_df['swr_diff'] = pandas.Series(tmp_df['dswrf'] -
                                               tmp_df['uswrf.sfc'],
                                               index=tmp_df.index)
            tmp_df['srad'] = tmp_df['swr_diff'].map(
                lambda x: constants.WMsq_MjMsq * x)
            tmp_df['year'] = tmp_df.index.year
            tmp_df['month'] = tmp_df.index.month
            tmp_df['day'] = tmp_df.index.day
            epic_df = epic_df.combine_first(tmp_df)
        # Output dataframe to text file with right formatting
        for index, row in epic_df.iterrows():
            epic_out.write(('%6d%4d%4d' + 6 * '%6.2f' + '\n') %
                           (row['year'], row['month'], row['day'], row['srad'],
                            row['tmax'], row['tmin'], row['apcp'],
                            row['rhum.2m'], row['wnd']))
        epic_out.close()
    else:
        logging.info('File exists: ' + out_fl)
Example #34
    def maximize(self):
        print 'mini-batch gd: examples = {}, batch size = {}'.format(len(self.train), self.batch_size)

        # these are for multithreading
        q_in = Queue()
        q_out = Queue()

        def worker():
            while True:
                ex = q_in.get()
                q_out.put(self.objective.gradient(self.params, ex))
                q_in.task_done()

        # launch workers
        for i in range(self.num_threads):
            t = threading.Thread(target=worker)
            t.daemon = True
            t.start()

        # no. of mini-batch steps taken
        self.steps = 0
        while True:
            # form fresh batches
            train_copy = list(self.train)
            random.shuffle(train_copy)
            batches = list(util.chunks(train_copy, self.batch_size))

            for batch in batches:
                grad = SparseVector()

                if self.num_threads == 1:
                    for ex in batch:
                        grad_ex = self.objective.gradient(self.params, ex)
                        grad += grad_ex
                else:
                    # WARNING: this is only safe if examples in the batch are mutually exclusive
                    for ex in batch:
                        q_in.put(ex)
                    q_in.join()
                    while not q_out.empty():
                        grad += q_out.get()

                for frozen in self.freeze_params:
                    grad.remove(frozen)

                # normalize by batch size
                grad *= 1.0 / len(batch)

                # add regularization gradient
                if self.l1_reg != 0.0 or self.l2_reg != 0.0:
                    reg_grad = self.reg_gradient(self.params, grad, self.approx_reg)
                    grad += reg_grad

                # record gradient norm, before gradient gets modified by various algorithms
                self.gnorm = grad.norm2()

                delta = grad

                # check if an AdaGrad controller is being used
                adagrad = next((controller for controller in self.controllers if isinstance(controller, AdaGrad)), None)
                if adagrad is None:
                    delta *= self.step_size
                    self.delta = delta
                else:
                    # this controller will modify self.delta
                    self.delta = delta
                    adagrad.control(self)

                # these controllers will modify self.delta, and maybe also self.halt
                for controller in self.controllers:
                    if isinstance(controller, AdaGrad):
                        continue
                    controller.control(self)

                # update params
                self.params += self.delta

                # check if a unit-normalization controller is being used
                unit_norm = next((controller for controller in self.controllers if isinstance(controller, UnitNorm)), None)
                if unit_norm is not None:
                    unit_norm.control(self)

                self.track()

                self.steps += 1

                if self.halt:
                    return self.params
Example #35
def plot_importances(article_sents,
                     importances,
                     abstracts_text,
                     save_location=None,
                     save_name=None):
    plt.ioff()
    sents_per_figure = 40
    max_importance = np.max(importances)
    chunked_sents = util.chunks(article_sents, sents_per_figure)
    chunked_importances = util.chunks(importances, sents_per_figure)

    for chunk_idx in range(len(chunked_sents)):
        my_article_sents = chunked_sents[chunk_idx]
        my_importances = chunked_importances[chunk_idx]

        if len(my_article_sents) < sents_per_figure:
            my_article_sents += [''
                                 ] * (sents_per_figure - len(my_article_sents))
            my_importances = np.concatenate([
                my_importances,
                np.zeros([sents_per_figure - len(my_importances)])
            ])

        y_pos = np.arange(len(my_article_sents))
        fig, ax1 = plt.subplots()
        fig.subplots_adjust(left=0.9, top=1.0, bottom=0.03, right=1.0)
        ax1.barh(y_pos,
                 my_importances,
                 align='center',
                 color='green',
                 ecolor='black')
        ax1.set_yticks(y_pos)
        ax1.set_yticklabels(my_article_sents)
        ax1.invert_yaxis()  # labels read top-to-bottom
        ax1.set_xlabel('Performance')
        ax1.set_title('How fast do you want to go today?')
        ax1.set_xlim(right=max_importance)

        fig.set_size_inches(18.5, 10.5)
        plt.savefig(
            os.path.join(save_location,
                         save_name + '_' + str(chunk_idx) + '.jpg'))
        plt.close(fig)

    plt.figure()
    fig_txt = tw.fill(tw.dedent(abstracts_text), width=80)
    plt.figtext(0.5,
                0.5,
                fig_txt,
                horizontalalignment='center',
                fontsize=9,
                multialignment='left',
                bbox=dict(boxstyle="round",
                          facecolor='#D8D8D8',
                          ec="0.5",
                          pad=0.5,
                          alpha=1),
                fontweight='bold')
    fig = plt.gcf()
    fig.set_size_inches(18.5, 10.5)
    plt.savefig(
        os.path.join(save_location,
                     save_name + '_' + str(chunk_idx + 1) + '.jpg'))
    plt.close(fig)
Example #36
        resvar = np.asarray([np.linalg.norm(r)**2 for r in R])
        losses.append(np.sum(resvar))
        D2 = np.diag(1 / resvar)
        precision2 = D2 @ (np.identity(n) - B)

        err = (precision2 - precision)
        loss2 = np.trace(err @ err.T)
        B = B - lr * G
        print(loss2)

    test_points = 10
    losses = np.asarray(losses)[:test_points]
    target_losses = [
        118., 41.150800000000004, 33.539355199999996, 29.747442032320002,
        27.450672271574934, 25.95846376879459, 24.917943341139274,
        24.139761502111114, 23.519544126307142, 22.998235729589265
    ]

    u.check_equal(losses[:test_points], target_losses[:test_points])
    print('mismatch is ', np.max(losses - target_losses))


if __name__ == '__main__':
    numbers = [(x + 1)**3 for x in range(16)]
    list(u.chunks(numbers, 4))
    X = np.array(list(u.chunks(numbers, 4)))

    X = np.asarray([[5, 1, 0, 4], [0, 4, 1, 2], [1, 0, 3, 3], [4, 2, 0, 4]])
    test_numpy(X)
Example #37
lastTimeStamp = None

lnameDict = lname(s.LDBPATH)
connections = {}
whoCache = {}
hostnames = []
hostnameToCluster = {}
for cluster in s.MACHINES['clusters']:
    if cluster not in whoCache:
        whoCache[cluster] = OrderedDict()
    for hostname in s.MACHINES['clusters'][cluster]['hostnames']:
        hostnames.append(hostname)
        hostnameToCluster[hostname] = cluster

hostnamesChunked = list(util.chunks(hostnames, len(hostnames)//s.THREADS))
threads = []
clients = []
thread_times = []

def sshAndGetWho(client, hostname):
    #s.log('sshing into %s', hostname)
    who = []
    try:
        client.connect(
            hostname,
            username=s.USERNAME,
            password=s.PASSWORD,
        )
        stdin, stdout, stderr = client.exec_command('w')
        # get rid of first two lines of w output
Example #38
def enqueue_jobs(cls,
         method,
         ids_q_or_list,
         queue_number,
         use_rq=True,
         append=False,
         chunk_size=25,
         shortcut_fn=None
    ):
    """
    Takes sqlalchemy query with IDs, runs fn on those repos.
    """

    shortcut_data = None
    if use_rq:
        if shortcut_fn:
            raise ValueError("you can't use RQ with a shortcut_fn")

    else:
        if shortcut_fn:
            shortcut_data_start = time()
            logger.info(u"Getting shortcut data...")
            shortcut_data = shortcut_fn()
            logger.info(u"Got shortcut data in {} seconds".format(
                elapsed(shortcut_data_start)
            ))

    chunk_size = int(chunk_size)


    start_time = time()
    new_loop_start_time = time()
    index = 0

    try:
        logger.info(u"running this query: \n{}\n".format(
            ids_q_or_list.statement.compile(dialect=postgresql.dialect())))
        row_list = ids_q_or_list.all()

    except AttributeError:
        logger.info(u"running this query: \n{}\n".format(ids_q_or_list))
        row_list = db.engine.execute(sql.text(ids_q_or_list)).fetchall()

    if row_list is None:
        logger.info(u"no IDs, all done.")
        return None

    logger.info(u"finished enqueue_jobs query in {} seconds".format(elapsed(start_time)))
    object_ids = [row[0] for row in row_list]

    # do this as late as possible so things can keep using queue
    if use_rq:
        if append:
            logger.info(u"not clearing queue.  queue currently has {} jobs".format(ti_queues[queue_number].count))
        else:
            empty_queue(queue_number)


    num_items = len(object_ids)
    logger.info(u"adding {} items to queue...".format(num_items))

    # iterate through chunks of IDs like [[id1, id2], [id3, id4], ...  ]
    object_ids_chunk = []

    for object_ids_chunk in chunks(object_ids, chunk_size):

        update_fn_args = [cls, method, object_ids_chunk]

        if use_rq:
            job = ti_queues[queue_number].enqueue_call(
                func=update_fn,
                args=update_fn_args,
                timeout=60 * 10,
                result_ttl=0  # number of seconds
            )
            job.meta["object_ids_chunk"] = object_ids_chunk
            job.save()
            # logger.info(u"saved job {}".format(job))
        else:
            update_fn_args.append(shortcut_data)
            update_fn(*update_fn_args, index=index)

        if True: # index % 10 == 0 and index != 0:
            num_jobs_remaining = num_items - (index * chunk_size)
            try:
                jobs_per_hour_this_chunk = chunk_size / float(elapsed(new_loop_start_time) / 3600)
                predicted_mins_to_finish = round(
                    (num_jobs_remaining / float(jobs_per_hour_this_chunk)) * 60,
                    1
                )
                logger.info(u"\n\nWe're doing {} jobs per hour. At this rate, done in {}min".format(
                    int(jobs_per_hour_this_chunk),
                    predicted_mins_to_finish
                ))
                logger.info(u"(finished chunk {} of {} chunks in {} seconds total, {} seconds this loop)\n".format(
                    index,
                    num_items/chunk_size,
                    elapsed(start_time),
                    elapsed(new_loop_start_time)
                ))
            except ZeroDivisionError:
                # logger.info(u"not printing status because divide by zero")
                logger.info(u"."),


            new_loop_start_time = time()
        index += 1
    logger.info(u"last chunk of ids: {}".format(list(object_ids_chunk)))

    db.session.remove()  # close connection nicely
    return True
Example #39
def correct_raw_data(raw_data_path,
                     channel,
                     subsample_factor=2,
                     log_s3_path=None,
                     background_correction=True):

    total_n_jobs = cpu_count()
    # overwrite existing raw data with corrected data
    outdir = raw_data_path

    # get list of all tiles to correct for a given channel
    all_files = np.sort(glob.glob(f'{raw_data_path}/*/*.tiff'))
    if background_correction:
        background_val = get_background_value(raw_data_path)
    total_files = len(all_files)

    bias_path = f'{outdir}/CHN0{channel}_bias.tiff'
    if os.path.exists(bias_path):
        bias = tf.imread(bias_path)

    else:
        # subsample tiles
        files_cb = all_files[::subsample_factor]
        num_files = len(files_cb)

        # compute running sums in parallel
        sums = Parallel(total_n_jobs, verbose=10)(
            delayed(sum_tiles)(f)
            for f in chunks(files_cb,
                            math.ceil(num_files // (total_n_jobs)) + 1))
        sums = [i[:, :, None] for i in sums]
        mean_tile = np.squeeze(np.sum(np.concatenate(sums, axis=2),
                                      axis=2)) / num_files
        if background_correction:
            # subtract background out from bias correction
            mean_tile -= background_val
        mean_tile = sitk.GetImageFromArray(mean_tile)

        # get the bias correction tile using N4ITK
        bias = sitk.GetArrayFromImage(get_bias_field(mean_tile, scale=1.0))

        # save bias tile to local directory
        tf.imsave(bias_path, bias.astype('float32'))

    # save bias tile to S3
    if log_s3_path:
        s3 = boto3.resource('s3')
        img = Image.fromarray(bias)
        fp = BytesIO()
        img.save(fp, format='TIFF')
        # reset pointer to beginning  of file
        fp.seek(0)
        log_s3_url = S3Url(log_s3_path.strip('/'))
        bias_path = f'{log_s3_url.key}/CHN0{channel}_bias.tiff'
        s3.Object(log_s3_url.bucket, bias_path).upload_fileobj(fp)

    # correct all the files and save them
    files_per_proc = math.ceil(total_files / total_n_jobs) + 1
    work = chunks(all_files, files_per_proc)
    with tqdm_joblib(tqdm(desc="Correcting tiles",
                          total=total_n_jobs)) as progress_bar:
        Parallel(n_jobs=total_n_jobs, verbose=10)(
            delayed(correct_tiles)(files, outdir, bias, background_val)
            for files in work)
Example #40
 def queue_work(self, work, miner=None):
     target = ''.join(
         list(chunks('%064x' % self.server_difficulty, 2))[::-1])
     self.switch.queue_work(self, work.block_header, target, work.job_id,
                            work.extranonce2, miner)
Example #41
 def subwindow_shape(self):
     return tuple((b-a for a, b in util.chunks(self.subwindow, 2)))
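
Here chunks is used with a size of 2 to pair up a flat subwindow specification. Assuming subwindow stores (start, stop) values per axis, a small illustration of what the generator expression computes:

subwindow = (10, 50, 20, 80)                        # hypothetical (start, stop) per axis
shape = tuple(b - a for a, b in chunks(subwindow, 2))
# shape == (40, 60): the extent along each axis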
Example #42
    def build_lines(self):

        self._lines = chunks(self._text, self.draw_width)
        self.scroll["maxCurrentLine"] = len(self._lines)
Example #43
def enqueue_jobs(cls,
                 method,
                 ids_q_or_list,
                 queue_number,
                 use_rq=True,
                 chunk_size=25,
                 shortcut_fn=None):
    """
    Takes sqlalchemy query with IDs, runs fn on those repos.
    """

    shortcut_data = None
    if use_rq:
        empty_queue(queue_number)
        if shortcut_fn:
            raise ValueError("you can't use RQ with a shortcut_fn")

    else:
        if shortcut_fn:
            shortcut_data_start = time()
            print "Getting shortcut data..."
            shortcut_data = shortcut_fn()
            print "Got shortcut data in {}sec".format(
                elapsed(shortcut_data_start))

    chunk_size = int(chunk_size)

    start_time = time()
    new_loop_start_time = time()
    index = 0

    print "running this query: \n{}\n".format(
        ids_q_or_list.statement.compile(dialect=postgresql.dialect()))
    row_list = ids_q_or_list.all()
    print "finished query in {}sec".format(elapsed(start_time))
    if row_list is None:
        print "no IDs, all done."
        return None

    object_ids = [row[0] for row in row_list]

    num_jobs = len(object_ids)
    print "adding {} jobs to queue...".format(num_jobs)

    # iterate through chunks of IDs like [[id1, id2], [id3, id4], ...  ]
    object_ids_chunk = []

    for object_ids_chunk in chunks(object_ids, chunk_size):

        update_fn_args = [cls, method, object_ids_chunk]

        if use_rq:
            job = ti_queues[queue_number].enqueue_call(
                func=update_fn,
                args=update_fn_args,
                timeout=60 * 10,
                result_ttl=0  # number of seconds
            )
            job.meta["object_ids_chunk"] = object_ids_chunk
            job.save()
            # print u"saved job {}".format(job)
        else:
            update_fn_args.append(shortcut_data)
            update_fn(*update_fn_args, index=index)

        if True:  # index % 10 == 0 and index != 0:
            num_jobs_remaining = num_jobs - (index * chunk_size)
            try:
                jobs_per_hour_this_chunk = chunk_size / float(
                    elapsed(new_loop_start_time) / 3600)
                predicted_mins_to_finish = round(
                    (num_jobs_remaining / float(jobs_per_hour_this_chunk)) *
                    60, 1)
                print "\n\nWe're doing {} jobs per hour. At this rate, done in {}min".format(
                    int(jobs_per_hour_this_chunk), predicted_mins_to_finish)
                print "(finished chunk {} of {} chunks in {}sec total, {}sec this loop)\n".format(
                    index, num_jobs / chunk_size, elapsed(start_time),
                    elapsed(new_loop_start_time))
            except ZeroDivisionError:
                # print u"not printing status because divide by zero"
                print ".",

            new_loop_start_time = time()
        index += 1
    print "last chunk of ids: {}".format(list(object_ids_chunk))

    db.session.remove()  # close connection nicely
    return True
Example #44
def enqueue_jobs(cls,
                 method,
                 ids_q_or_list,
                 queue_number,
                 use_rq="rq",
                 chunk_size=10,
                 shortcut_fn=None):
    """
    Takes sqlalchemy query with (login, repo_name) IDs, runs fn on those repos.
    """

    shortcut_data = None
    if use_rq == "rq":
        empty_queue(queue_number)
        if shortcut_fn:
            raise ValueError("you can't use RQ with a shortcut_fn")

    else:
        if shortcut_fn:
            shortcut_data_start = time()
            print "Getting shortcut data..."
            shortcut_data = shortcut_fn()
            print "Got shortcut data in {}sec".format(
                elapsed(shortcut_data_start))

    chunk_size = int(chunk_size)

    start_time = time()
    new_loop_start_time = time()
    index = 0

    print "running this query: \n{}\n".format(
        ids_q_or_list.statement.compile(dialect=postgresql.dialect()))
    row_list = ids_q_or_list.all()
    print "finished query in {}sec".format(elapsed(start_time))
    if row_list is None:
        print "no IDs, all done."
        return None

    object_ids = [row[0] for row in row_list]

    num_jobs = len(object_ids)
    print "adding {} jobs to queue...".format(num_jobs)

    # iterate through chunks of IDs like [[id1, id2], [id3, id4], ...  ]
    object_ids_chunk = []

    for object_ids_chunk in chunks(object_ids, chunk_size):

        update_fn_args = [cls, method, object_ids_chunk]

        if use_rq == "rq":
            job = ti_queues[queue_number].enqueue_call(
                func=update_fn,
                args=update_fn_args,
                timeout=60 * 10,
                result_ttl=0  # number of seconds
            )
            job.meta["object_ids_chunk"] = object_ids_chunk
            job.save()
        else:
            update_fn_args.append(shortcut_data)
            update_fn(*update_fn_args)

        if index % 1000 == 0 and index != 0:
            print "added {} jobs to queue in {}sec total, {}sec this loop".format(
                index, elapsed(start_time), elapsed(new_loop_start_time))

            new_loop_start_time = time()
        index += 1
    print "last object added to the queue was {}".format(
        list(object_ids_chunk))

    db.session.remove()  # close connection nicely
    return True
Example #45
def iterstories(stories, include_tasks=False):
    for s in stories:
        yield s
        if include_tasks:
            for t in chunks(s.tasks, 2):
                yield PivotalTaskPair(t)