def main():
    n_max = 10000  # n first bboxes to process
    n_frames_max = 3000
    mask_thresh = 35
    videos_dir = "data/stanford_drone/videos/"
    scene_names = ["bookstore", "coupa", "deathCircle", "gates", "hyang",
                   "little", "nexus", "quad"]
    # val_videos_paths = [os.path.join(videos_dir, scene, "video0") for scene in scene_names]
    # show_and_save_video("data/stanford_drone/videos/coupa/video2", n_max, n_frames_max, mask_thresh)
    # exit(1)

    # to show: [0!(hard dark low res), 1(car), 4], 5! (car), 8! (large shadow), 11(car),
    # 21!(good quality), 26(fence), 31(solid persons), 36 (good quality)
    # final to show: 1(car), 5! (car), 8! (large shadow), 11(car), 21!(good quality),
    # 26(fence), 31(solid persons), 36 (good quality)
    i_video = 0
    for scene in scene_names:
        scene_subdirs = os.listdir(os.path.join(videos_dir, scene))
        for vid_dir_name in scene_subdirs:
            tracker = SummaryTracker()
            vid_dir = os.path.join(videos_dir, scene, vid_dir_name)
            print("-----------------")
            print(i_video, vid_dir)  # e.g. "data/stanford_drone/videos/coupa/video2"
            store_as_dataset(vid_dir, n_max, n_frames_max, mask_thresh)
            i_video += 1
            tracker.print_diff()
def test_SelectMemoryLeak():
    N = 50000

    def table_set(t, state):
        fvs = swsscommon.FieldValuePairs([("status", state)])
        t.set("123", fvs)

    def generator_SelectMemoryLeak():
        app_db = swsscommon.DBConnector("APPL_DB", 0, True)
        t = swsscommon.Table(app_db, "TABLE")
        for i in range(int(N / 2)):
            table_set(t, "up")
            table_set(t, "down")

    tracker = SummaryTracker()
    appl_db = swsscommon.DBConnector("APPL_DB", 0, True)
    sel = swsscommon.Select()
    sst = swsscommon.SubscriberStateTable(appl_db, "TABLE")
    sel.addSelectable(sst)
    thr = Thread(target=generator_SelectMemoryLeak)
    thr.daemon = True
    thr.start()
    time.sleep(5)
    for _ in range(N):
        state, c = sel.select(1000)
    diff = tracker.diff()
    cases = []
    for name, count, _ in diff:
        if count >= N:
            cases.append("%s - %d objects for %d repeats" % (name, count, N))
    thr.join()
    assert not cases
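The assertion above relies on the row format of `SummaryTracker.diff()`: each row is a `[type_name, count, total_size]` triple summarizing objects created since the tracker's baseline. A minimal sketch of the same leak-detection pattern, using a plain Python list as a stand-in for the leak:

from pympler.tracker import SummaryTracker

tracker = SummaryTracker()
leaked = [object() for _ in range(100000)]  # stand-in for a leak

# Each diff row is [type_name, count, total_size]; flag types whose object
# count grew by at least as many objects as we allocated.
for name, count, size in tracker.diff():
    if count >= 100000:
        print("possible leak: %s (%d objects, %d bytes)" % (name, count, size))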
def get_memory_tracker():
    global _memory_tracker
    if _memory_tracker is None:
        from pympler.tracker import SummaryTracker
        _memory_tracker = SummaryTracker()
    return _memory_tracker
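A usage sketch for this lazy singleton; the module-level `_memory_tracker = None` initializer and the workload function are assumptions for illustration (the pattern defers the pympler import until tracking is actually requested):

_memory_tracker = None  # module-level cache assumed by get_memory_tracker()

def run_workload():
    return [str(i) for i in range(10000)]  # hypothetical workload

get_memory_tracker()               # first call creates the tracker (baseline)
data = run_workload()
get_memory_tracker().print_diff()  # summary of objects created since the baseline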
def test_reopt_sizing_pvwatts(solar_resource):
    round = 0
    tracker = SummaryTracker()
    while round < 25:  # multiple runs required to check for memory leaks
        round += 1
        sys = pv.default("PVWattsBatteryCommercial")
        sys.SolarResource.solar_resource_file = solar_resource
        batt = bt.from_existing(sys, "PVWattsBatteryCommercial")
        sys.SolarResource.solar_resource_data = dict({'lat': 3, 'lon': 3})
        batt.Battery.crit_load = [0] * 8760
        fin = ur.from_existing(sys, "PVWattsBatteryCommercial")
        post = sys.Reopt_size_battery_post()
        assert('Scenario' in post['reopt_post'])
        assert(post['reopt_post']['Scenario']['Site']['latitude'] == pytest.approx(3, 0.1))
    tracker_diff = tracker.diff()
    tracker.print_diff()
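The test above only prints the diff after the 25 runs; to make a leak actually fail the test, one could assert on `tracker.diff()` instead. A hedged sketch; the per-type growth threshold of 1000 objects is an arbitrary assumption, not part of the original test:

# Fail the test if any single type grew by more than a chosen threshold.
leaks = [(name, count, size)
         for name, count, size in tracker.diff()
         if count > 1000]
assert not leaks, "possible leaks after 25 runs: %s" % leaks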
def __init__(self, lock, queue, sniffer, analyzer, AerospikeClient):
    self.lock = lock
    self.queue = queue
    self.sniffer = sniffer
    self.analyzer = analyzer
    self.timestamp = ""
    self.logger = logging.getLogger(__name__)
    self.attack_time, self.attack_percent = ConfigurationReader.scriptExecutionInfo()
    self.first_value = 0
    self.attack_alert = 0
    self.Aero = AerospikeClient
    self.attack_alert_limit = ((self.attack_time * 10) / 100) * self.attack_percent
    self.custom_script_counter = ConfigurationReader.higherScriptExecution()
    self.script_execution_counter = self.custom_script_counter
    self.IsAttack = 0
    self.selftracker = SummaryTracker()
def start(self):
    if self.debugging_level >= 1:
        import cProfile
        import io
        import pstats
        pr = cProfile.Profile()
        pr.enable()
    if self.debugging_level >= 2:
        from pympler.tracker import SummaryTracker
        tracker = SummaryTracker()

    # context must be created here because this function may be executed in a
    # separate process/thread
    self.connect()
    heartbeat_ts = time.time()

    # GC causes unplanned stalls and disrupts precisely timed collection.
    # Disable it and do it manually before sleeping.
    gc.disable()
    try:
        logger.info('MA started')
        while not self.stopped:
            if self.collectors:
                ts = time.time()
                if ts - (self.last_collect_second +
                         self.collect_time_decimal) >= self.tick_len - 0.01:
                    # This must be updated *before* collecting to prevent the
                    # send time from slowly drifting away
                    self.last_collect_second = int(ts)
                    result = []
                    for c in self.collectors:
                        result.extend(c())
                    logger.info('Collected: ' + str(result))
                    self.timestamp_and_send_obj(result, ts)
                else:
                    pass
            else:
                self.last_collect_second = time.time()

            gc.collect()
            flush_log()

            # Print out memory usage every minute
            if self.debugging_level >= 2 and int(time.time()) % 60 == 0:
                print('Time: ' + time.asctime(time.localtime(time.time())))
                tracker.print_diff()

            # Calculate the precise time for next collection
            sleep_second = (self.last_collect_second + self.collect_time_decimal +
                            self.tick_len - time.time())
            sleep_second = max(sleep_second, 0)
            sleep_start_ts = time.time()
            p = dict(self.poller.poll(sleep_second * 1000))
            logger.debug('Slept {0} seconds'.format(time.time() - sleep_start_ts))
            if self.socket in p:
                req = pickle.loads(zlib.decompress(self.socket.recv()))
                assert req[0] == LustreCommon.protocol_ver
                heartbeat_ts = time.time()
                if isinstance(req[2], bytes):
                    # this is a command, not data
                    cmd = req[2]
                    if cmd == b'ACTION':
                        action = req[3]
                        if action == 0:
                            logger.info('Received action 0, ignored')
                        else:
                            logger.info('Performing action {action}'.format(action=action))
                            self.controller(req[3:])
                    elif cmd == b'HB':
                        logger.debug('Received heartbeat')
                    else:
                        logger.warning('Unknown command received: ' + cmd)
                else:
                    logger.error('Corrupted message received: ' + req)
            else:
                if heartbeat_ts and time.time() - heartbeat_ts > 5:
                    # reconnect
                    self.disconnect()
                    self.connect()
                    logger.warning('Connection timeout, reconnected')
                    heartbeat_ts = time.time()
        logger.info('MA stopped')
    finally:
        gc.enable()
        if self.debugging_level >= 1:
            pr.disable()
            s = io.StringIO()
            sortby = 'cumulative'
            ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
            ps.print_stats()
            print(s.getvalue())
import psutil
from pympler.tracker import SummaryTracker

_tr = SummaryTracker()
_print_mem_enabled = False
_mem_used = 0
_prev_mem_used = 0


def disable_print_mem():
    global _print_mem_enabled
    _print_mem_enabled = False


def enable_print_mem():
    global _print_mem_enabled
    _print_mem_enabled = True


def print_mem_usage():
    global _mem_used, _prev_mem_used, _print_mem_enabled
    if not _print_mem_enabled:
        return
    # _tr.print_diff()
    mem = psutil.virtual_memory()
    _prev_mem_used = _mem_used
    _mem_used = (mem.total - mem.available) / 1024 / 1024
    print(f"virt_mem >> used: {_mem_used:.0f}, prev: {_prev_mem_used:.0f}, "
          f"diff: {_mem_used - _prev_mem_used:.0f}")
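A usage sketch for the helpers above; the module name `memdebug` is an assumption for illustration. Note this module tracks whole-machine virtual memory via psutil, so the printed diff includes activity from other processes:

import memdebug  # hypothetical module name for the snippet above

memdebug.enable_print_mem()
buf = [bytearray(1024) for _ in range(10000)]  # allocate roughly 10 MB
memdebug.print_mem_usage()  # prints used/prev/diff in MB
memdebug.print_mem_usage()  # immediate second call: diff near 0
memdebug.disable_print_mem()
memdebug.print_mem_usage()  # disabled: prints nothing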
from pympler.tracker import SummaryTracker

tracker = SummaryTracker()
tracker.print_diff()
s = "str"
tracker.print_diff()
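The first `print_diff()` call mostly reports the tracker's own setup allocations and establishes a clean baseline; the second shows only what changed in between (here, little more than the one new `str`). The same idea with an allocation large enough to stand out in pympler's summary table (`types | # objects | total size`):

from pympler.tracker import SummaryTracker

tracker = SummaryTracker()
tracker.print_diff()  # absorbs tracker setup noise, establishes the baseline
big = [str(i) for i in range(100000)]
tracker.print_diff()  # the new list and str objects now dominate the table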
def add_single_experiment(directory, df_filepath, datasets):
    csv_filenames = [
        f for f in os.listdir(directory)
        if f[0:5] == "resul" and f[-4:] == ".csv"
    ]
    print(list(os.walk(directory)))
    weights_folder = [f for f in os.walk(directory)][0][1][0]
    for filename in csv_filenames:
        experiment = directory_to_experiment_info(directory)
        with open("./" + directory + "/" + filename, "r") as csvfile:
            logging.debug("./" + directory + "/" + filename)
            plots = csv.reader(csvfile, delimiter=",")
            headers = next(plots, None)
            for header in headers:
                experiment[header] = []
            for row in plots:
                for i, header in enumerate(headers):
                    experiment[header].append(float(row[i]))
        experiment["minimum_val_loss"] = min(experiment["val_loss"])
        experiment["minimum_loss"] = min(experiment["loss"])
        number = int(filename.split("-")[-1].split(".")[0])
        experiment["number"] = number
        experiment["repeat"] = math.floor(number / 4)
        if (number % 4) / 2 < 1:
            experiment["elastic_distortions"] = True
        else:
            experiment["elastic_distortions"] = False
        if (number % 4) % 2 != 0:
            experiment["separate_channel_ops"] = True
        else:
            experiment["separate_channel_ops"] = False
        print(experiment.keys())

        if not experiment_in_dataframe(df_filepath, experiment):
            weights_file = (directory + "/" + weights_folder + "/" +
                            "mask_rcnn_fk2018_best.h5")
            config = FK2018.FKConfig()
            print("got config")

            class InferenceConfig(config.__class__):
                # Run detection on one image at a time
                GPU_COUNT = 1
                IMAGES_PER_GPU = 1

            config = InferenceConfig()
            config.display()

            # Device to load the neural network on.
            # Useful if you're training a model on the same machine, in which
            # case use CPU and leave the GPU for training.
            # DEVICE = "/gpu:0"  # /cpu:0 or /gpu:0

            # Inspect the model in training or inference modes
            # values: 'inference' or 'training'
            # TODO: code for 'training' test mode not ready yet
            TEST_MODE = "inference"

            # Must call before using the dataset
            # with tf.device(DEVICE):
            tf_config = tf.ConfigProto()
            tf_config.gpu_options.allow_growth = True
            with tf.Session(config=tf_config).as_default():
                model = modellib.MaskRCNN(mode="inference",
                                          model_dir="./log",
                                          config=config)
                # Load weights
                logging.debug("Loading weights " + str(weights_file))
                model.load_weights(weights_file, by_name=True)
                image_ids = datasets[0].image_ids
                tracker = SummaryTracker()
                logging.debug("getting stats")
                logging.debug("tunasand stats")
                experiment["AP_list"], \
                    experiment["classless_AP_list"], \
                    experiment["precision_list"], \
                    experiment["classless_precision_list"], \
                    experiment["recall_list"], \
                    experiment["classless_recall_list"], \
                    experiment["predicted_class_list"], \
                    experiment["gt_class_list"], \
                    experiment["predicted_size_list"], \
                    experiment["gt_size_list"], \
                    experiment["overlaps"], \
                    experiment["classless_overlaps_list"], \
                    experiment["total_predicted_pixels"], \
                    experiment["total_groundtruth_pixels"], \
                    experiment["total_overlapping_pixels"] = \
                    compute_both_batch_aps(image_ids, datasets[0], model, config)
                # get_stats(weights_file, datasets[0])
                objgraph.show_most_common_types()
                roots = objgraph.get_leaking_objects()
                print(len(roots))
                tracker.print_diff()

                for i, dataset in enumerate(["AE_area1", "AE_area2", "AE_area3"]):
                    image_ids = datasets[i + 1].image_ids
                    logging.debug(f"aestats, {dataset}")
                    experiment[f"AP_list_{dataset}"], \
                        experiment[f"classless_AP_list_{dataset}"], \
                        experiment[f"precision_list_{dataset}"], \
                        experiment[f"classless_precision_list_{dataset}"], \
                        experiment[f"recall_list_{dataset}"], \
                        experiment[f"classless_recall_list_{dataset}"], \
                        experiment[f"predicted_class_list_{dataset}"], \
                        experiment[f"gt_class_list_{dataset}"], \
                        experiment[f"predicted_size_list_{dataset}"], \
                        experiment[f"gt_size_list_{dataset}"], \
                        experiment[f"overlaps_{dataset}"], \
                        experiment[f"classless_overlaps_list_{dataset}"], \
                        experiment[f"total_predicted_pixels_{dataset}"], \
                        experiment[f"total_groundtruth_pixels_{dataset}"], \
                        experiment[f"total_overlapping_pixels_{dataset}"] = \
                        compute_both_batch_aps(image_ids, datasets[i + 1], model, config)
                    # get_stats(weights_file, datasets[i+1])
                    objgraph.show_growth()
                    roots = objgraph.get_leaking_objects()
                    print(len(roots))
                    tracker.print_diff()

            update_dataframe(df_filepath, experiment)
        else:
            print("already in dataframe, skipping " + filename)
import sys, os
import time
from pympler.tracker import SummaryTracker
import testsuites
import flaminglog.logrule as flaminglog

# runs a memory profiler
tracker = SummaryTracker()
tracker.print_diff()
for i in range(0, 100):
    # runs all tests except cleanup
    testsuites.run_full_suite(0)
    if i % 10 == 0:
        tracker.print_diff()
# sleep to allow any garbage collection to fully finish
time.sleep(10)
tracker.print_diff()
print("If the '# objects' and the 'total size' are both low numbers then"
      " there is no memory leak")
class GitHubLoader(Cmd):
    """
    GitHub SSH keys loader
    """
    prompt = '$> '
    LINK_FACTOR = 70

    USER_URL = 'https://api.github.com/users/%s'
    USERS_URL = 'https://api.github.com/users?since=%s'
    KEYS_URL = 'https://api.github.com/users/%s/keys'
    KEYS_ID_URL = 'https://api.github.com/user/%s/keys'

    def __init__(self, attempts=5, threads=1, state=None, state_file=None,
                 config_file=None, audit_file=None, max_mem=None,
                 users_only=False, merge=False, update_keys=False,
                 *args, **kwargs):
        Cmd.__init__(self, *args, **kwargs)
        self.t = Terminal()
        self.trace_logger = Tracelogger(logger=logger)

        self.attempts = int(attempts)
        self.total = None
        self.terminate = False
        self.since_id = 0
        self.last_users_count = None

        self.user_lock = Lock()
        self.processed_user_set = set()
        self.processed_user_set_lock = Lock()

        self.max_mem = max_mem
        self.merge = merge
        self.update_keys = update_keys
        self.users_only = users_only
        self.users_per_page = 30
        self.users_bulk_load_pages = 500
        self.user_load_bulk = 5000
        self.user_refill_lock = Lock()

        self.state = state
        self.state_file_path = state_file
        self.rate_limit_reset = None
        self.rate_limit_remaining = None

        self.config = None
        self.config_file = config_file

        self.audit_file = audit_file
        self.audit_records_buffered = []
        self.audit_lock = Lock()

        self.stop_event = threading.Event()
        self.threads = int(threads)
        self.link_queue = Queue.PriorityQueue()  # Store links to download here
        self.worker_threads = []

        self.state_thread = None
        self.state_thread_lock = Lock()

        self.resources_list = []
        self.resources_queue = Queue.PriorityQueue()
        self.local_data = threading.local()

        self.new_users_events = EvtDequeue()
        self.new_keys_events = EvtDequeue()

        self.db_config = None
        self.engine = None
        self.session = None
        self.mem_tracker = None

    def signal_handler(self, signal, frame):
        """
        Signal handler - terminate gracefully
        :param signal:
        :param frame:
        :return:
        """
        logger.info('CTRL+C pressed')
        self.trigger_stop()

    def trigger_stop(self):
        """
        Sets terminal conditions to true
        :return:
        """
        self.terminate = True
        self.stop_event.set()

    def trigger_quit(self):
        """
        Terminal condition & file change
        :return:
        """
        self.trigger_stop()
        utils.try_touch('.github-quit')

    #
    # CMD handlers
    #

    def do_quit(self, arg):
        self.trigger_quit()
        logger.info('Waiting for thread termination')
        time.sleep(1)
        logger.info('Quitting')
        return Cmd.do_quit(self, arg)

    def do_reset(self, line):
        print('\033c')

    def do_gc(self, line):
        gc.collect()

    def do_mem_top(self, line):
        print(mem_top.mem_top())

    def do_mem_track_init(self, line):
        self.mem_tracker = SummaryTracker()

    def do_mem_track_diff(self, line):
        self.mem_tracker.print_diff()

    def do_mem_track_deinit(self, line):
        self.mem_tracker = None

    def do_mem(self, line):
        print('Memory usage: %s kB' %
              resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

    def do_state(self, line):
        js = self.state_gen()
        if line is None or len(line) == 0:
            del js['link_queue']
            del js['resource_stats']
        elif line == '1':
            del js['link_queue']
        print(json.dumps(js, indent=2))

    def do_deq_enable(self, line):
        self.new_keys_events.disabled = False
        self.new_users_events.disabled = False

    def do_deq_disable(self, line):
        self.new_keys_events.disabled = True
        self.new_users_events.disabled = True

    #
    # Init
    #

    def init_config(self):
        """
        Loads config & state files
        :return:
        """
        if self.state_file_path is not None and os.path.exists(self.state_file_path):
            with open(self.state_file_path, 'r') as fh:
                self.state = json.load(fh, object_pairs_hook=OrderedDict)
                logger.info('State loaded: %s' % os.path.abspath(self.state_file_path))

        with open(self.config_file, 'r') as fh:
            self.config = json.load(fh, object_pairs_hook=OrderedDict)
            logger.info('Config loaded: %s' % os.path.abspath(self.config_file))

        if 'since_id' in self.config:
            self.since_id = self.config['since_id']

        # Process resources - randomized
        if 'res' in self.config:
            res_tmp = self.config['res']
            random.shuffle(res_tmp)
            for idx, res in enumerate(res_tmp):
                r = AccessResource(usr=res['usr'], token=res['token'], idx=idx)
                self.resources_list.append(r)
                self.resources_queue.put(r)
                logger.info('Resource %02d loaded: %s' % (idx, r.usr))
        else:
            # unauth
            r = AccessResource(usr=None, token=None)
            self.resources_list.append(r)
            self.resources_queue.put(r)

    def init_db(self):
        """
        Initializes database engine & session.
        Has to be done on main thread.
        :return:
        """
        self.db_config = databaseutils.process_db_config(self.config['db'])

        from sqlalchemy import create_engine
        from sqlalchemy.orm import sessionmaker, scoped_session
        self.engine = create_engine(self.db_config.constr, pool_recycle=3600)
        self.session = scoped_session(sessionmaker(bind=self.engine))

        # Make sure tables are created
        DB_Base.metadata.create_all(self.engine)

    def init_workers(self):
        """
        Initialize worker threads
        :return:
        """
        logger.info('Starting %d working threads' % self.threads)
        for idx in range(self.threads):
            t = threading.Thread(target=self.work_thread_main, args=(idx,))
            self.worker_threads.append(t)

        # Kick-off all threads
        for t in self.worker_threads:
            t.start()

        logger.info('Worker threads started')

    def cli(self):
        """
        CLI thread
        :return:
        """
        logger.info('CLI thread started')
        self.cmdloop()
        logger.info('Terminating CLI thread')

    #
    # Operation
    #

    def work(self):
        """
        Main thread work method
        :return:
        """
        # Interrupt signals
        signal.signal(signal.SIGINT, self.signal_handler)

        self.init_config()
        self.init_db()

        # Resume last state
        self.state_resume()

        # Monitor threads.
        self.state_thread = threading.Thread(target=self.state_main, args=())
        self.state_thread.start()

        # If there is no link to process - create from since.
        if self.link_queue.qsize() == 0:
            self.kickoff_links()

        # Worker threads
        self.init_workers()

        logger.info('Main thread started %s %s %s' %
                    (os.getpid(), os.getppid(), threading.current_thread()))

        # CLI thread
        cli_thread = threading.Thread(target=self.cli, args=())
        cli_thread.setDaemon(True)
        cli_thread.start()

        # Join on workers
        self.after_loop()
        logger.info('Terminating main thread')
        return None

    def after_loop(self, wait_for_state=True):
        """
        After work loop finishes
        :return:
        """
        logger.info('Waiting termination of slave threads')

        # Wait here for termination of all workers and monitors.
        try:
            for t in self.worker_threads:
                t.join()
            if wait_for_state:
                self.state_thread.join()
        except:
            logger.error('Exception during thread join')
            logger.error(traceback.format_exc())

        logger.info('All threads terminated, last state save')
        self.state_save()

    def work_thread_main(self, idx):
        """
        Worker thread main loop
        :return:
        """
        self.local_data.idx = idx
        logger.info('Working thread %d started' % idx)

        while not self.terminate and not self.stop_event.is_set():
            self.interruptible_sleep_delta(0.1)

            # Get credential to process link with
            resource = self.resource_allocate()
            if resource is None:
                continue

            # We have resource, now get the job
            job = None
            try:
                job = self.link_queue.get(True, timeout=1.0)
            except Queue.Empty:
                self.resource_return(resource)
                continue

            # If job last fail is too recent - put again back to queue
            if time.time() - job.last_fail < 3.0:
                # re-insert to the back of the queue for later processing
                self.link_queue.put(job)
                self.resource_return(resource)
                continue

            # Job processing starts here - fetch data page with the resource.
            js_data = None
            try:
                self.local_data.job = job
                self.local_data.resource = resource
                js_data, headers, raw_response = self.load_page_local()
            except RateLimitHit as e:
                logger.error('[%d] Rate limit hit: %s, failcnt: %d, res: %s, exception: %s'
                             % (idx, job.url, job.fail_cnt, resource.usr, e))
                continue
            except Exception as e:
                logger.error('[%d] Exception in processing job: %s, failcnt: %d, res: %s, exception: %s'
                             % (idx, job.url, job.fail_cnt, resource.usr, e))
                self.on_job_failed(job)
                continue
            finally:
                self.resource_return(resource)
                self.local_data.resource = None
                self.local_data.last_usr = resource.usr
                self.local_data.last_remaining = resource.remaining

            # Process downloaded data here.
            try:
                if js_data is None:
                    self.audit_log('404', job.url, jtype=job.type, job=job)
                    self.flush_audit()
                    continue

                if job.type == DownloadJob.TYPE_USERS:
                    self.process_users_data(job, js_data, headers, raw_response)
                else:
                    self.process_keys_data(job, js_data, headers, raw_response)
            except Exception as e:
                logger.error('[%d] Unexpected exception, processing type %s, link %s: cnt: %d, res: %s, %s'
                             % (idx, job.type, job.url, job.fail_cnt, resource.usr, e))
                self.trace_logger.log(e)
                self.on_job_failed(job)
            finally:
                self.local_data.resource = None
                self.local_data.job = None
                self.local_data.last_usr = None
                self.local_data.last_remaining = None
                resource = None
                job = None
                headers = None
                raw_response = None

        logger.info('Terminating worker thread %d' % idx)

    def on_job_failed(self, job):
        """
        If job failed, this teaches it how to behave
        :param job:
        :return:
        """
        job.fail_cnt += 1
        job.last_fail = time.time()

        # if failed too many times - log and discard.
        if job.fail_cnt > 35:
            logger.warning('Job failed too many times %s' % job.url)
            self.audit_log('too-many-fails', job.url, jtype=job.type, job=job)
            self.flush_audit()
        else:
            # re-insert to the queue for later processing
            self.link_queue.put(job)

    def load_page_local(self):
        """
        Loads page stored in thread local
        :return:
        """
        auth = None
        resource = self.local_data.resource
        if resource.usr is not None:
            auth = HTTPBasicAuth(resource.usr, resource.token)

        job = self.local_data.job

        res = requests.get(job.url, timeout=10, auth=auth)
        headers = res.headers

        resource.reset_time = utils.try_float(headers.get('X-RateLimit-Reset'))
        resource.remaining = utils.try_int(headers.get('X-RateLimit-Remaining'))
        resource.last_used = time.time()
        resource.used_cnt += 1

        if res.status_code == 403 and resource.remaining is not None and resource.remaining < 10:
            resource.fail_cnt += 1
            raise RateLimitHit

        if res.status_code == 404:
            resource.fail_cnt += 1
            logger.warning('URL not found: %s' % job.url)
            return None, None, None

        if res.status_code // 100 != 2:
            resource.fail_cnt += 1
            res.raise_for_status()

        data = res.content
        if data is None:
            resource.fail_cnt += 1
            raise Exception('Empty response')

        js = json.loads(data, object_pairs_hook=OrderedDict)
        return js, headers, res

    def resource_allocate(self, blocking=True, timeout=1.0):
        """
        Takes resource from the pool.
        If the resource has low remaining credit, thread is suspended to re-charge.
        :return: resource or None if not available in the time
        """
        try:
            resource = self.resources_queue.get(True, timeout=1.0)
            if resource.remaining is not None and resource.remaining <= self.threads + 2:
                sleep_sec = (resource.reset_time - time.time()
                             if resource.reset_time is not None else 15 * 60)
                sleep_sec += 120  # extra 2 minutes to avoid problems with resources

                logger.info('Rate limit exceeded on resource %s, remaining: %s, '
                            'sleeping till: %s, it is %d seconds, %d minutes'
                            % (resource.usr, resource.remaining, resource.reset_time,
                               sleep_sec, sleep_sec / 60.0))
                self.sleep_interruptible(time.time() + sleep_sec)
                logger.info('Resource sleep finished %s' % resource.usr)

                # Reset estimations, needs to be refreshed
                resource.remaining = None
                resource.reset_time = None

            return resource
        except Queue.Empty:
            return None

    def resource_return(self, res):
        """
        Returns resource to the pool
        :param res:
        :return:
        """
        self.resources_queue.put(res)

    def sleep_interruptible(self, until_time):
        """
        Interruptible sleep - sleep until given time.
        :param until_time:
        :return:
        """
        while time.time() <= until_time:
            time.sleep(1.0)
            if self.terminate or self.stop_event.is_set():
                return

    def interruptible_sleep_delta(self, sleep_time):
        """
        Sleeps the current thread for given amount of seconds,
        stop event terminates the sleep - to exit the thread.
        :param sleep_time:
        :return:
        """
        if sleep_time is None:
            return

        sleep_time = float(sleep_time)
        if sleep_time == 0:
            return

        sleep_start = time.time()
        while not self.stop_event.is_set() and not self.terminate:
            time.sleep(0.1)
            if time.time() - sleep_start >= sleep_time:
                return

    #
    # Parser and processing logic
    #

    def kickoff_links(self):
        """
        Kick off the scraping by adding initial links to the queue
        :return:
        """
        if self.update_keys:
            self.fill_user_key_links()
        else:
            job = DownloadJob(url=self.USERS_URL % self.since_id,
                              jtype=DownloadJob.TYPE_USERS)
            self.link_queue.put(job)
            logger.info('Kickoff link added: %s' % job.url)

    def process_users_data(self, job, js, headers, raw_response):
        """
        Process user data - produce keys links + next user link
        :param job:
        :param js:
        :param headers:
        :param raw_response:
        :return:
        """
        max_id = 0
        github_users = []
        cur_time = int(time.time())
        for user in js:
            if 'id' not in user:
                logger.error('Field ID not found in user')
                continue

            github_user = GitHubUser(user_id=int(user['id']),
                                     user_name=user['login'],
                                     user_type=user['type'],
                                     user_url=user['url'])
            github_users.append(github_user)
            if github_user.user_id > max_id:
                max_id = github_user.user_id

            if self.users_only:
                continue

            key_url = '%s/keys' % github_user.user_url
            new_job = DownloadJob(url=key_url, jtype=DownloadJob.TYPE_KEYS,
                                  user=github_user,
                                  priority=random.randint(0, 1000),
                                  time_added=cur_time)
            self.link_queue.put(new_job)

        # Link with the maximal user id
        users_url = self.USERS_URL % max_id
        new_job = DownloadJob(url=users_url, jtype=DownloadJob.TYPE_USERS,
                              time_added=cur_time)

        # Optimizing the position of this link in the link queue
        queue_size = self.link_queue.qsize()
        queue_size_max = self.LINK_FACTOR * self.threads
        fill_up_ratio = queue_size / float(queue_size_max)

        # Key jobs are uniformly distributed on priorities 0...1000.
        # To increase queue size pick priority closer to 1000, to decrease, closer to 0
        priority = random.randint(0, 500)
        if queue_size < queue_size_max:
            priority = int((1 - fill_up_ratio) * 5000) + 500
        if queue_size > 3 * queue_size_max:
            priority = 0

        new_job.priority = priority
        lucky_one = False
        with self.user_lock:
            if self.since_id < max_id:
                self.since_id = max_id
                self.link_queue.put(new_job)
                lucky_one = True

                # Bulk user optimisation - add more users at once, multithreading
                if self.users_only:
                    self.bulk_user_only_load(max_id=max_id, cur_time=cur_time,
                                             priority=priority)

        logger.info('[%02d, usr=%20s, remaining=%5s] Processed users link %s, '
                    'Next since: %3s. ResQSize: %4d, LQSize: %4d, fill-up: %0.4f, '
                    'priority: %4s, ram: %s kB, new=%s, New users: [%s]'
                    % (self.local_data.idx, self.local_data.last_usr,
                       self.local_data.last_remaining, len(github_users) + 1, max_id,
                       self.resources_queue.qsize(), queue_size, fill_up_ratio,
                       priority, resource.getrusage(resource.RUSAGE_SELF).ru_maxrss,
                       lucky_one,
                       ', '.join([str(x.user_name) for x in github_users])))

        # Store all users.
        self.store_users_list(github_users)

    def bulk_user_only_load(self, max_id, cur_time, priority):
        """
        Fills the queue with multiple user links
        :param max_id:
        :param cur_time:
        :param priority:
        :return:
        """
        new_jobs = []
        for page_idx in range(1, self.users_bulk_load_pages + 1):
            users_url = self.USERS_URL % (max_id + self.users_per_page * page_idx)
            new_job = DownloadJob(url=users_url, jtype=DownloadJob.TYPE_USERS,
                                  time_added=cur_time, priority=priority - page_idx)
            new_jobs.append(new_job)

        self.since_id = max_id + self.users_per_page * self.users_bulk_load_pages
        logger.info('[%02d] Bulk load, new max=%s' % (self.local_data.idx, self.since_id))

        for job in new_jobs:
            self.link_queue.put(job)

    def store_users_list(self, users):
        """
        Stores all users in the list
        :param users:
        :return:
        """
        # Handling gaps in the user ID space. With user-only optimization it
        # causes overlaps.
        reduced_by = 0
        with self.processed_user_set_lock:
            ids = [user.user_id for user in users]
            ids_ok = []
            for id in ids:
                if id in self.processed_user_set:
                    reduced_by += 1
                    continue
                self.processed_user_set.add(id)
                ids_ok.append(id)
            users = [user for user in users if user.user_id in ids_ok]

        # Bulk user load
        s = self.session()
        id_list = sorted([user.user_id for user in users])
        db_users = s.query(GitHubUserDb).filter(GitHubUserDb.id.in_(id_list)).all()
        db_user_map = {user.id: user for user in db_users}

        for user in users:
            self.new_users_events.insert()

            # Store user to the DB
            try:
                db_user = utils.defvalkey(db_user_map, key=user.user_id)
                self.store_user(user, s, db_user=db_user, db_user_loaded=True)
            except Exception as e:
                logger.warning('[%02d] Exception in storing user %s'
                               % (self.local_data.idx, e))
                self.trace_logger.log(e)
                logger.info('[%02d] idlist: %s' % (self.local_data.idx, id_list))
                self.trigger_quit()
                break

        try:
            s.commit()
            # logger.info('[%02d] Commited, reduced by: %s' % (self.local_data.idx, reduced_by))
        except Exception as e:
            logger.warning('[%02d] Exception in storing bulk users' % self.local_data.idx)
            logger.warning(traceback.format_exc())
            logger.info('[%02d] idlist: %s' % (self.local_data.idx, id_list))
            self.trigger_quit()
        finally:
            utils.silent_close(s)

    def process_keys_data(self, job, js, headers, raw_response):
        """
        Processing key loaded data
        :param job:
        :param js:
        :param headers:
        :param raw_response:
        :return:
        """
        js_empty = js is None or len(js) == 0

        # Expect failures, commit everything before
        if self.merge and not js_empty:
            try:
                s = self.session()
                s.commit()
            except Exception as e:
                logger.warning('Could not pre-commit: %s' % e)

        # Store each key.
        key_ids = []
        for key in js:
            s = None
            self.new_keys_events.insert()
            try:
                s = self.session()
                self.store_key(job.user, key, s)
                key_ids.append(int(key['id']))
                s.commit()

                self.assoc_key(job.user.user_id, key['id'], s)
                s.commit()

                s.flush()        # writes changes to DB
                s.expunge_all()  # removes objects from session
            except Exception as e:
                logger.warning('Exception in storing key %s' % e)
                self.trace_logger.log(e)
            finally:
                utils.silent_close(s)
                s = None

        # Deassoc lost keys
        try:
            s = self.session()
            self.deassoc_lost_keys(job.user.user_id, key_ids, s)
            s.commit()
        except Exception as e:
            logger.warning('Exception in deassoc for users %s : %s'
                           % (job.user.user_id, e))
            self.trace_logger.log(e)
        finally:
            utils.silent_close(s)
            s = None

        self.on_keys_processed()

    def on_keys_processed(self):
        """
        Event called when keys have been processed.
        Currently used to re-fill the link queue in the update key scenario.
        :return:
        """
        if not self.update_keys:
            return

        with self.user_refill_lock:
            qsize = self.link_queue.qsize()
            if qsize < 30:
                self.fill_user_key_links()

    def store_user(self, user, s, db_user=None, db_user_loaded=False):
        """
        Stores username to the database.
        :param user:
        :return:
        """
        type_id = 0
        if user.user_type == 'User':
            type_id = 1
        elif user.user_type == 'Organization':
            type_id = 2

        try:
            if not db_user_loaded:
                db_user = s.query(GitHubUserDb)\
                    .filter(GitHubUserDb.id == user.user_id)\
                    .one_or_none()

            if db_user is not None:
                db_user.date_last_check = salch.func.now()
                db_user.usr_type = type_id
                s.merge(db_user)
                return 0
        except Exception as e:
            self.trace_logger.log(e)
            logger.warning('User query problem: %s' % e)

        # Store a new user here
        try:
            db_user = GitHubUserDb()
            db_user.id = user.user_id
            db_user.username = user.user_name
            db_user.usr_type = type_id
            s.add(db_user)
            return 0
        except Exception as e:
            self.trace_logger.log(e)
            logger.warning('[%02d] Exception during user store: %s'
                           % (self.local_data.idx, e))
            if db_user_loaded:
                raise
            return 1

    def load_existing_key(self, key, s):
        """
        Loads existing key if exists
        :param key:
        :param s:
        :return:
        """
        key_id = int(key['id'])
        return s.query(GitHubKey).filter(GitHubKey.id == key_id).one_or_none()

    def store_key(self, user, key, s):
        """
        Stores user key to the database.
        :param user:
        :param key:
        :param s: current DB session
        :return:
        """
        # Loading phase
        existing_key = None
        try:
            if self.merge or self.update_keys:
                existing_key = self.load_existing_key(key, s)
        except Exception as e:
            logger.warning('Exception: %s' % e)

        # Storing phase
        try:
            if existing_key is not None:
                existing_key.date_last_check = salch.func.now()
                s.merge(existing_key)
                return existing_key

            key_id = int(key['id'])
            key_raw = key['key']
            key_type, key_val = [utils.strip(x) for x in key_raw.split(' ', 1)]

            db_key = GitHubKey()
            db_key.id = key_id
            db_key.key_id = key_id
            db_key.key_type = key_type
            db_key.key_user_found = user.user_name
            db_key.key_user_id_found = user.user_id
            db_key.text_raw = key_raw

            if key_type == 'ssh-rsa':
                try:
                    key_obj = utils.load_ssh_pubkey(key_raw)
                    if isinstance(key_obj, RSAPublicKey):
                        db_key.key_size = key_obj.key_size
                        numbers = key_obj.public_numbers()
                        db_key.key_modulus_hex = '%x' % numbers.n
                        db_key.key_exponent = numbers.e
                except Exception as e:
                    logger.info('Exception during processing the key[%s]: %s'
                                % (key_type, e))

            s.add(db_key)
            return db_key
        except Exception as e:
            utils.silent_rollback(s)
            logger.warning('Exception during key store: %s' % e)
            return 1

    def assoc_key(self, user_id, key_id, s):
        """
        Association user <-> key
        :param user_id:
        :param key_id:
        :param s:
        :return:
        """
        try:
            uassoc = None
            if self.merge:
                uassoc = s.query(GitHubUserKeys)\
                    .filter(GitHubUserKeys.user_id == user_id)\
                    .filter(GitHubUserKeys.key_id == key_id)\
                    .one_or_none()

                if uassoc is not None:
                    uassoc.lost_at = None
                    return

            if uassoc is None:
                uassoc = GitHubUserKeys()
                uassoc.user_id = user_id
                uassoc.key_id = key_id
                uassoc.fount_at = salch.func.now()
                uassoc.lost_at = None
                s.add(uassoc)
            return 0
        except Exception as e:
            utils.silent_rollback(s)
            logger.warning('Exception during key assoc: %s' % e)
            return 1

    def deassoc_lost_keys(self, user_id, key_id, s):
        """
        Mark all lost keys as lost for the given user (not in key_id)
        :param user_id:
        :param key_id:
        :param s:
        :return:
        """
        try:
            q = s.query(GitHubUserKeys)\
                .filter(GitHubUserKeys.user_id == user_id)

            if len(key_id) > 0:
                q = q.filter(GitHubUserKeys.key_id.notin_(key_id))

            q.update({GitHubUserKeys.lost_at: salch.func.now()},
                     synchronize_session='fetch')
            return 0
        except Exception as e:
            utils.silent_rollback(s)
            logger.warning('Exception during key deassoc: %s' % e)
            return 1

    def flush_state(self):
        """
        Flushes state/config to the state file
        :return:
        """
        self.state['since_id'] = self.since_id
        self.state['rate_limit_remaining'] = self.rate_limit_remaining
        self.state['rate_limit_reset'] = self.rate_limit_reset
        utils.flush_json(self.state, self.state_file_path)

    #
    # DB
    #

    def fill_user_key_links(self):
        """
        Loads next X users from the database, advances since_id
        :return:
        """
        # self.since_id
        s = self.session()
        try:
            db_users = s.query(GitHubUserDb)\
                .filter(GitHubUserDb.id > self.since_id)\
                .order_by(GitHubUserDb.id)\
                .limit(self.user_load_bulk)\
                .all()

            for user in db_users:
                key_url = self.KEYS_ID_URL % user.id
                github_user = GitHubUser(user_id=user.id,
                                         user_name=user.username,
                                         user_type=user.usr_type,
                                         user_url=self.USER_URL % user.username)

                new_job = DownloadJob(url=key_url, jtype=DownloadJob.TYPE_KEYS,
                                      user=github_user,
                                      priority=random.randint(0, 1000),
                                      time_added=time.time())
                self.link_queue.put(new_job)

                if user.id > self.since_id:
                    self.since_id = user.id
        except Exception as e:
            logger.warning('Exception in loading users: %s' % e)
            utils.silent_rollback(s)
        finally:
            utils.silent_close(s)

    #
    # Auditing - errors, problems for further analysis
    #

    def audit_log(self, evt=None, link=None, jtype=None, job=None):
        """
        Appends audit log to the buffer. Lock protected.
        :param evt:
        :param link:
        :return:
        """
        log = collections.OrderedDict()
        log['time'] = time.time()
        log['evt'] = evt
        log['jtype'] = jtype
        log['link'] = link

        if job is not None and isinstance(job, DownloadJob):
            log['job'] = job.to_json()

        with self.audit_lock:
            self.audit_records_buffered.append(log)

    def flush_audit(self):
        """
        Flushes audit logs to the JSON append-only file.
        Routine protected by the lock (no new audit record can be inserted
        while holding the lock).
        :return:
        """
        if self.audit_file is None:
            self.audit_records_buffered = []
            return

        self.audit_lock.acquire()
        try:
            if len(self.audit_records_buffered) == 0:
                return
            with open(self.audit_file, 'a') as fa:
                for x in self.audit_records_buffered:
                    fa.write(json.dumps(x) + "\n")
            self.audit_records_buffered = []
        except Exception as e:
            logger.error('Exception in audit log dump %s' % e)
        finally:
            self.audit_lock.release()

    #
    # State save / resume
    #

    def state_main(self):
        """
        State thread - periodical dump of the queues.
        :return:
        """
        logger.info('State thread started %s %s %s' %
                    (os.getpid(), os.getppid(), threading.current_thread()))
        try:
            while not self.stop_event.is_set() and not self.terminate:
                # Dump stats each x seconds.
                # Sleep is here because of dumping the state for the last time
                # just before the program quits.
                self.interruptible_sleep_delta(2)
                self.state_save()

                # Check memory conditions
                self.state_ram_check()
        except Exception as e:
            self.trace_logger.log(e)
            logger.error('Exception in state: %s' % e)
        finally:
            pass

        logger.info('State loop terminated')

    def state_ram_check(self):
        """
        Checks memory terminating conditions
        :return:
        """
        if self.max_mem is None:
            return

        cur_ram = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        if cur_ram <= self.max_mem:
            return

        logger.warning('Maximum memory threshold reached: %s kB = %s MB, threshold = %s kB'
                       % (cur_ram, cur_ram / 1024.0, self.max_mem))
        self.trigger_stop()

    def state_gen(self):
        """
        Dumps state
        :return:
        """
        try:
            js_q = collections.OrderedDict()
            js_q['gen'] = time.time()
            js_q['link_size'] = self.link_queue.qsize()
            js_q['since_id'] = self.since_id
            js_q['memory'] = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss

            # Dequeues
            self.new_users_events.maintain()
            self.new_keys_events.maintain()

            users_in_5min = self.new_users_events.under_limit(5 * 60)
            keys_in_5min = self.new_keys_events.under_limit(5 * 60)

            js_q['users_dequeue_size'] = self.new_users_events.len()
            js_q['keys_dequeue_size'] = self.new_keys_events.len()
            js_q['users_5min'] = users_in_5min
            js_q['keys_5min'] = keys_in_5min
            js_q['users_1min'] = users_in_5min / 5.0
            js_q['keys_1min'] = keys_in_5min / 5.0

            # link queue structure
            qdata = list(self.link_queue.queue)
            qdata.sort(cmp=DownloadJob.cmp)
            js_q['link_structure'] = ''.join([
                '.' if x.type == DownloadJob.TYPE_KEYS else 'U' for x in qdata
            ])

            # Stats.
            js_q['resource_stats'] = [x.to_json() for x in list(self.resources_list)]

            # Finally - the queue
            js_q['link_queue'] = [x.to_json() for x in qdata]
            return js_q
        except Exception as e:
            self.trace_logger.log(e)
            logger.error('Exception in state: %s', e)

    def state_save(self):
        """
        Saves the state
        :return:
        """
        try:
            js_q = self.state_gen()
            utils.flush_json(js_q, self.state_file_path)
        except Exception as e:
            self.trace_logger.log(e)
            logger.error('Exception in state: %s', e)

    def state_resume(self):
        """
        Attempts to resume the queues from the monitoring files
        :return:
        """
        try:
            if self.state is None:
                return

            if 'since_id' in self.state:
                self.since_id = self.state['since_id']

            if 'link_queue' in self.state:
                for rec in self.state['link_queue']:
                    job = DownloadJob.from_json(rec)
                    self.link_queue.put(job)
                logger.info('Link queue resumed, entries: %d'
                            % len(self.state['link_queue']))
        except Exception as e:
            self.trace_logger.log(e)
            logger.warning('Exception in resuming the state %s' % e)
            logger.error('State resume failed, exiting')
            sys.exit(1)
def test_functionality():
    n_tests_passed = 0
    round = 0
    while round < 50:  # multiple runs required to check for memory leaks
        if round == 0:
            tracker = SummaryTracker()
        round += 1

        a = GenericSystem.new()
        b = a.Plant

        # Test setting values with correct types
        a.Plant.derate = 1
        assert (a.Plant.derate == 1)
        assert (b.derate == 1)
        n_tests_passed += 1

        b.energy_output_array = (1, 2)
        assert (a.Plant.energy_output_array == (1, 2))
        assert (b.energy_output_array == (1, 2))
        n_tests_passed += 1

        # Test type checks with errors
        try:
            c = GenericSystem.new()
            c.Plant.energy_output_array = 1
        except:
            n_tests_passed += 1
        try:
            c = GenericSystem.new()
            c.Plant.energy_output_array = (1, "2")
        except:
            n_tests_passed += 1

        # Test assigning from dictionary
        PlantDict = {'derate': 10, 'energy_output_array': (10, 20)}
        b.assign(PlantDict)
        assert (b.derate == 10)
        assert (b.energy_output_array == (10, 20))
        n_tests_passed += 1

        PlantDict = {'derate': 1, 'energy_output_array': (2, 2)}
        a.Plant.assign(PlantDict)
        assert (a.Plant.derate == 1 and a.Plant.energy_output_array == (2, 2))
        n_tests_passed += 1

        PlantDict = {'derate': 10, 'energy_output_array': (10, 20)}
        try:
            c = GenericSystem.new()
            PlantDict['energy_output_array'] = ()
            c.Plant.assign(PlantDict)
        except:
            n_tests_passed += 1
        try:
            c = GenericSystem.new()
            PlantDict['energy_output_array'] = ((12, 20), (1, 1))
            c.Plant.assign(PlantDict)
        except:
            n_tests_passed += 1
        try:
            c = GenericSystem.new()
            PlantDict['derate'] = "derate"
            PlantDict['energy_output_array'] = (1, 2)
            c.Plant.assign(PlantDict)
        except:
            n_tests_passed += 1

        # exporting to dictionary
        ValDict = b.export()
        assert (ValDict['derate'] == 1 and ValDict['energy_output_array'] == (2, 2))
        n_tests_passed += 1

        # Test shared module (AdjustmentFactors)
        d = a.AdjustmentFactors
        d.constant = 1
        assert (d.constant == 1)
        n_tests_passed += 1

        d.hourly = (1, 2)
        assert (d.hourly == (1, 2))
        n_tests_passed += 1

        d.periods = ((1, 2), (3, 4))
        assert (d.periods == ((1, 2), (3, 4)))
        n_tests_passed += 1

        try:
            d.periods = ((1, 2))
        except:
            n_tests_passed += 1

        ValDict = d.export()
        assert (ValDict['constant'] == 1 and ValDict['hourly'] == (1, 2)
                and ValDict['periods'] == ((1, 2), (3, 4)))
        n_tests_passed += 1

        ValDict = {
            'constant': 10,
            "hourly": (10, 20),
            "periods": ((10, 20), (30, 40))
        }
        d.assign(ValDict)
        assert (ValDict['constant'] == 10 and ValDict['hourly'] == (10, 20)
                and ValDict['periods'] == ((10, 20), (30, 40)))
        n_tests_passed += 1

        # Test nested dictionary assignment and export
        TechDict = {
            'Plant': {
                'derate': 100,
                'energy_output_array': (100, 200)
            },
            'AdjustmentFactors': {
                'constant': 100,
                "hourly": (100, 200),
                "periods": ((100, 200), (300, 400))
            }
        }
        a.assign(TechDict)
        ValDict = a.Plant.export()
        assert (ValDict['derate'] == 100
                and ValDict['energy_output_array'] == (100, 200))
        n_tests_passed += 1

        ValDict = a.AdjustmentFactors.export()
        assert (ValDict['constant'] == 100 and ValDict['hourly'] == (100, 200)
                and ValDict['periods'] == ((100, 200), (300, 400)))
        n_tests_passed += 1

        # Test reading from PySSC
        data = ssc.data_create()
        ssc.data_set_number(data, b'derate', 1000)
        ssc.data_set_array(data, b'energy_output_array', [1000, 2000])
        a = GenericSystem.wrap(data)
        assert (a.Plant.derate == 1000)
        assert (a.Plant.energy_output_array == (1000, 2000))

        # Test strings and tables with error cases
        import PySAM.Pvwattsv5 as Pvwattsv5
        a = Pvwattsv5.new()
        a.SolarResource.solar_resource_file = "file"
        assert (a.SolarResource.solar_resource_file == "file")
        n_tests_passed += 1

        assert (a.SolarResource.export()['solar_resource_file'] == 'file')
        n_tests_passed += 1

        c = Pvwattsv5.new()
        datDict = {
            'num': 1,
            'arr': (1, 2),
            'str': 'str',
            'mat': ((1, 2), (3, 4)),
            'table': {'yo': 0}
        }
        c.SolarResource.solar_resource_data = datDict
        DataDict = c.SolarResource.solar_resource_data
        assert (DataDict['num'] == 1 and DataDict['arr'] == (1, 2))
        assert (DataDict['mat'] == ((1.0, 2.0), (3.0, 4.0)))
        assert (DataDict['str'] == 'str')
        assert (DataDict['table'] == {'yo': 0})
        n_tests_passed += 1

        try:
            c = Pvwattsv5.new()
            c.SolarResource.solar_resource_file = 100
        except:
            n_tests_passed += 1
        try:
            c = Pvwattsv5.new()
            c.SolarResource.solar_resource_data = {
                'num': 1,
                'arr': (1, "2"),
                'mat': ((1, 2), (3, 4)),
                'str': 'str',
                'table': {'yo': 0}
            }
        except:
            n_tests_passed += 1
        try:
            c = Pvwattsv5.new()
            c.SolarResource.solar_resource_data = {
                'num': 1,
                'arr': (1, 2),
                'mat': (("1", 2), (3, 4)),
                'str': 'str',
                'table': {'yo': 0}
            }
        except:
            n_tests_passed += 1

        a.SolarResource.solar_resource_data = {
            'num': 1,
            'arr': (1, 2),
            'mat': ((1, 2), (3, 4)),
            'str': 'str',
            'table': {}
        }
        assert (a.SolarResource.solar_resource_data['table'] == {})
        n_tests_passed += 1

        # Test conversion between technology attributes and nested dictionary
        genDict = a.export()
        assert (genDict['SolarResource']['solar_resource_data']['str'] == 'str')
        n_tests_passed += 1

        a = Pvwattsv5.new()
        assert (a.export()['SolarResource'] == {})
        a.assign(genDict)
        assert (a.export() == genDict)
        n_tests_passed += 1

        # Test loading from serialized dict
        a = GenericSystem.default("GenericSystemNone")

        # Test `value` function
        a.value("derate", 1)
        assert (a.value("derate") == 1)
        a.value("energy_output_array", (0, 1, 2))
        assert (a.value("energy_output_array")[0] == 0)
        assert (a.value("energy_output_array")[1] == 1)
        assert (a.value("energy_output_array")[2] == 2)

        if round == 3:
            tracker.print_diff()

    tracker.print_diff()
def test_functionality():
    n_tests_passed = 0
    round = 0
    while round < 50:  # multiple runs required to check for memory leaks
        if round == 0:
            tracker = SummaryTracker()
        print("\n===Starting Round", round, "===\n\n")
        round += 1

        a = GenericSystem.new()
        b = a.Plant

        # Test setting values with correct types
        a.Plant.derate = 1
        assert (a.Plant.derate == 1)
        assert (b.derate == 1)
        print("Passed test", 0)
        n_tests_passed += 1

        b.energy_output_array = (1, 2)
        assert (a.Plant.energy_output_array == (1, 2))
        assert (b.energy_output_array == (1, 2))
        print("Passed test", 1)
        n_tests_passed += 1

        # Test type checks with errors
        try:
            c = GenericSystem.new()
            c.Plant.energy_output_array = 1
            print("FAIL 2: exception is expected")
        except:
            print("Passed test", 2)
            n_tests_passed += 1
        try:
            c = GenericSystem.new()
            c.Plant.energy_output_array = (1, "2")
            print("FAIL 3: exception is expected")
        except:
            print("Passed test", 3)
            n_tests_passed += 1

        # Test assigning from dictionary
        PlantDict = {'derate': 10, 'energy_output_array': (10, 20)}
        b.assign(PlantDict)
        assert (b.derate == 10)
        assert (b.energy_output_array == (10, 20))
        print("Passed test", 4)
        n_tests_passed += 1

        PlantDict = {'derate': 1, 'energy_output_array': (2, 2)}
        a.Plant.assign(PlantDict)
        assert (a.Plant.derate == 1 and a.Plant.energy_output_array == (2, 2))
        print("Passed test", 5)
        n_tests_passed += 1

        PlantDict = {'derate': 10, 'energy_output_array': (10, 20)}
        try:
            c = GenericSystem.new()
            PlantDict['energy_output_array'] = ()
            c.Plant.assign(PlantDict)
            print("FAIL 1: exception is expected")
        except:
            print("Error caught", 1)
            n_tests_passed += 1
        try:
            c = GenericSystem.new()
            PlantDict['energy_output_array'] = ((12, 20), (1, 1))
            c.Plant.assign(PlantDict)
            print("FAIL 2: exception is expected")
        except:
            print("Error caught", 2)
            n_tests_passed += 1
        try:
            c = GenericSystem.new()
            PlantDict['derate'] = "derate"
            PlantDict['energy_output_array'] = (1, 2)
            c.Plant.assign(PlantDict)
            print("FAIL 3: exception is expected")
        except:
            print("Error caught", 3)
            n_tests_passed += 1

        # exporting to dictionary
        ValDict = b.export()
        assert (ValDict['derate'] == 1 and ValDict['energy_output_array'] == (2, 2))
        print("Passed test", 6)
        n_tests_passed += 1

        # Test shared module (AdjustmentFactors)
        d = a.AdjustmentFactors
        d.constant = 1
        assert (d.constant == 1)
        print("Passed test", 7)
        n_tests_passed += 1

        d.hourly = (1, 2)
        assert (d.hourly == (1, 2))
        print("Passed test", 8)
        n_tests_passed += 1

        d.periods = ((1, 2), (3, 4))
        assert (d.periods == ((1, 2), (3, 4)))
        print("Passed test", 9)
        n_tests_passed += 1

        try:
            d.periods = ((1, 2))
            print("FAIL 4: exception is expected")
        except:
            print("Error caught", 4)
            n_tests_passed += 1

        ValDict = d.export()
        assert (ValDict['constant'] == 1 and ValDict['hourly'] == (1, 2)
                and ValDict['periods'] == ((1, 2), (3, 4)))
        print("Passed test", 10)
        n_tests_passed += 1

        ValDict = {
            'constant': 10,
            "hourly": (10, 20),
            "periods": ((10, 20), (30, 40))
        }
        d.assign(ValDict)
        assert (ValDict['constant'] == 10 and ValDict['hourly'] == (10, 20)
                and ValDict['periods'] == ((10, 20), (30, 40)))
        print("Passed test", 11)
        n_tests_passed += 1

        # Test nested dictionary assignment and export
        TechDict = {
            'Plant': {
                'derate': 100,
                'energy_output_array': (100, 200)
            },
            'AdjustmentFactors': {
                'constant': 100,
                "hourly": (100, 200),
                "periods": ((100, 200), (300, 400))
            }
        }
        a.assign(TechDict)
        ValDict = a.Plant.export()
        assert (ValDict['derate'] == 100
                and ValDict['energy_output_array'] == (100, 200))
        print("Passed test", 12)
        n_tests_passed += 1

        ValDict = a.AdjustmentFactors.export()
        assert (ValDict['constant'] == 100 and ValDict['hourly'] == (100, 200)
                and ValDict['periods'] == ((100, 200), (300, 400)))
        print("Passed test", 13)
        n_tests_passed += 1

        # Test reading from PySSC
        data = ssc.data_create()
        ssc.data_set_number(data, b'derate', 1000)
        ssc.data_set_array(data, b'energy_output_array', [1000, 2000])
        a = GenericSystem.wrap(data)
        assert (a.Plant.derate == 1000)
        assert (a.Plant.energy_output_array == (1000, 2000))

        # Test strings and tables with error cases
        import PySAM.Pvwattsv5 as Pvwattsv5
        a = Pvwattsv5.new()
        a.LocationAndResource.solar_resource_file = "file"
        assert (a.LocationAndResource.solar_resource_file == "file")
        print("Passed test", 14)
        n_tests_passed += 1

        assert (a.LocationAndResource.export()['solar_resource_file'] == 'file')
        print("Passed test", 15)
        n_tests_passed += 1

        c = Pvwattsv5.new()
        datDict = {
            'num': 1,
            'arr': (1, 2),
            'str': 'str',
            'mat': ((1, 2), (3, 4)),
            'table': {'yo': 0}
        }
        c.LocationAndResource.solar_resource_data = datDict
        DataDict = c.LocationAndResource.solar_resource_data
        assert (DataDict['num'] == 1 and DataDict['arr'] == (1, 2))
        assert (DataDict['mat'] == ((1.0, 2.0), (3.0, 4.0)))
        assert (DataDict['str'] == 'str')
        assert (DataDict['table'] == {'yo': 0})
        print("Passed test", 16)
        n_tests_passed += 1

        try:
            c = Pvwattsv5.new()
            c.LocationAndResource.solar_resource_file = 100
            print("FAIL 5: exception is expected")
        except:
            print("Error caught", 5)
            n_tests_passed += 1
        try:
            c = Pvwattsv5.new()
            c.LocationAndResource.solar_resource_data = {
                'num': 1,
                'arr': (1, "2"),
                'mat': ((1, 2), (3, 4)),
                'str': 'str',
                'table': {'yo': 0}
            }
            print("FAIL 6: exception is expected")
        except:
            print("Error caught", 6)
            n_tests_passed += 1
        try:
            c = Pvwattsv5.new()
            c.LocationAndResource.solar_resource_data = {
                'num': 1,
                'arr': (1, 2),
                'mat': (("1", 2), (3, 4)),
                'str': 'str',
                'table': {'yo': 0}
            }
            print("FAIL 7: exception is expected")
        except:
            print("Error caught", 7)
            n_tests_passed += 1

        a.LocationAndResource.solar_resource_data = {
            'num': 1,
            'arr': (1, 2),
            'mat': ((1, 2), (3, 4)),
            'str': 'str',
            'table': {}
        }
        assert (a.LocationAndResource.solar_resource_data['table'] == {})
        print("Passed test", 17)
        n_tests_passed += 1

        # Test conversion between technology attributes and nested dictionary
        genDict = a.export()
        assert (genDict['LocationAndResource']['solar_resource_data']['str'] == 'str')
        print("Passed test", 18)
        n_tests_passed += 1

        a = Pvwattsv5.new()
        assert (a.export()['LocationAndResource'] == {})
        a.assign(genDict)
        assert (a.export() == genDict)
        print("Passed test", 19)
        n_tests_passed += 1

        # Test loading from serialized dict
        a = GenericSystem.default("GenericSystemNone")
        print(a.export())

        if round == 3:
            tracker.print_diff()

        # execution
        # a.execute(1)

    tracker.print_diff()
def eRisk_testT2_runs_2(writingDir, DecisionDir):
    PATH = Path(writingDir)
    CLAS_PATH = 'data/nlp_clas/eRisk_dep/eRisk_dep_clas'
    CLASSES = ['neg', 'pos']
    lm_id = 'eRisk_dep4_FT'
    clas_id = 'eRisk_dep4_2_BEST'
    LM_PATH = "data/nlp_clas/eRisk_dep/eRisk_dep_lm"

    val_texts, subjIDs, chunkIDs, wrIDs = get_texts_final(PATH)

    df_val_ALL = pd.DataFrame({
        'text': val_texts,
        'chunk_No': list(map(int, chunkIDs)),
        'wr_No': list(map(int, wrIDs)),
        'subj_ID': subjIDs
    })
    df_val_ALL = df_val_ALL.sort_values(by=['subj_ID', 'chunk_No', 'wr_No'],
                                        ascending=[True, True, True])

    subjIDs_unq = set(subjIDs)
    df_val_ALL_proc = df_val_ALL[0:0]
    for sId in subjIDs_unq:
        # print(sId)
        df_subj = df_val_ALL.loc[df_val_ALL['subj_ID'] == sId]
        df_subj_proc = ProcessAcc(df_subj, 5)
        df_val_ALL_proc = df_val_ALL_proc.append(df_subj_proc)

    print('Finish [ProcessAcc].....' + str(max(map(int, wrIDs))))

    val_texts = [it[0] for it in df_val_ALL_proc.loc[:, ['text']].values.tolist()]
    #val_labels = [it[0] for it in df_val_ALL_proc.loc[:, ['labels']].values.tolist()]
    subjIDs = [it[0] for it in df_val_ALL_proc.loc[:, ['subj_ID']].values.tolist()]
    chunkIDs = [it[0] for it in df_val_ALL_proc.loc[:, ['chunk_No']].values.tolist()]
    wrIDs = [it[0] for it in df_val_ALL_proc.loc[:, ['wr_No']].values.tolist()]

    #np.save(CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'test'+'/'+'val_texts.npy', val_texts)
    ##np.save(CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'dummy'+'/'+'val_labels.npy', val_labels)
    #np.save(CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'test'+'/'+'subjIDs.npy', subjIDs)
    #np.save(CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'test'+'/'+'chunkIDs.npy', chunkIDs)
    #np.save(CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'test'+'/'+'wrIDs.npy', wrIDs)
    #
    #val_texts = np.load(CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'test'+'/'+'val_texts.npy')
    ##val_labels = np.load(CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'dummy'+'/'+'val_labels.npy')
    #subjIDs = np.load(CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'test'+'/'+'subjIDs.npy')
    #chunkIDs = np.load(CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'test'+'/'+'chunkIDs.npy')
    #wrIDs = np.load(CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'test'+'/'+'wrIDs.npy')

    col_names = ['labels', 'text']
    df_val = pd.DataFrame({0: np.zeros(len(val_texts)), 1: val_texts})  # columns=col_names)
    #df_val = pd.DataFrame({'text':val_texts, 'labels':np.zeros(len(val_texts))}, columns=col_names)
    #writeToCSV_oneFiled(df_val, CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'test'+'/test_Real1_wr.csv')
    #
    ##chunksize = 24000
    #chunksize = 24000
    #
    ##df_val1 = pd.read_csv(CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'test'+'/'+'test_Real1_wr.csv', header=None, chunksize=chunksize, engine='python')#,names=(range(227)))
    #tok_val, val_labels = get_all_eRisk(df_val1, 1)

    gc.collect()
    tracker = SummaryTracker()
    tracker.print_diff()

    tok_val, val_labels = get_texts_eRisk(df_val, 1)

    tracker = SummaryTracker()
    tracker.print_diff()

    #np.save(CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'test'+'/'+'tok_val.npy', tok_val)
    #np.save(CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'dummy'+'/'+'lbl_val.npy', val_labels)
    #tok_val = np.load(CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'test'+'/'+'tok_val.npy')

    itos = pickle.load(open(LM_PATH + '/' + 'tmp' + '/' + 'itos.pkl', 'rb'))
    stoi = collections.defaultdict(lambda: 0, {v: k for k, v in enumerate(itos)})
    len(itos)

    val_clas = np.array([[stoi[o] for o in p] for p in tok_val])
    np.save(CLAS_PATH + '/' + 'tmp' + '/' + 'tests' + '/' + 'testT2' + '/' + 'val_ids.npy',
            val_clas)
    #create_bw_data_test(CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'testT2'+'/', LM_PATH+'/'+'tmp'+'/'+'itos.pkl')

    print('Start running the models .... ')

    prediction, samplIdx, pos_prob = eval_clas_eRisk_final_3(str(CLAS_PATH), 2,
                                                             lm_id=lm_id,
                                                             clas_id=clas_id,
                                                             attention=False)
    #prediction_bw, samplIdx_bw, pos_prob_bw = eval_clas_eRisk_final_3(str(CLAS_PATH), 2, lm_id=lm_id, clas_id=clas_id, attention=False, backwards=True)
    #prediction = [1 if (it_fw+it_bw)/2 > 0.5 else 0 for it_fw, it_bw in zip(prediction_fw, prediction_bw)]
    #samplIdx = samplIdx_fw
    #pos_prob = [(it_fw+it_bw)/2 for it_fw, it_bw in zip(pos_prob_fw, pos_prob_bw)]

    tracker = SummaryTracker()
    tracker.print_diff()

    #np.save(str(CLAS_PATH)+'/'+'tmp/tests/test'+'/'+'prediction_eRisk_dep_wr_ens.npy', prediction)
    #np.save(str(CLAS_PATH)+'/'+'tmp/tests/test'+'/'+'samplIdx_eRisk_dep_wr_ens.npy', samplIdx)
    #np.save(str(CLAS_PATH)+'/'+'tmp/tests/test'+'/'+'pos_prob_eRisk_dep_wr_ens.npy', pos_prob)
    #
    #prediction = np.load(str(CLAS_PATH)+'/'+'tmp/tests/test'+'/'+'prediction_eRisk_dep_wr_ens.npy')
    #samplIdx = np.load(str(CLAS_PATH)+'/'+'tmp/tests/test'+'/'+'samplIdx_eRisk_dep_wr_ens.npy')
    #pos_prob = np.load(str(CLAS_PATH)+'/'+'tmp/tests/test'+'/'+'pos_prob_eRisk_dep_wr_ens.npy')

    subjIDs = [subjIDs[i] for i in samplIdx]
    #val_labels = [val_labels[i] for i in samplIdx]
    chunkIDs = [chunkIDs[i] for i in samplIdx]
    wrIDs = [wrIDs[i] for i in samplIdx]

    uniq_subID = list(set(subjIDs))
    #results_pred = [[ None ] * 10] * len(uniq_subID)
    results_pred = [[0] * 10 for _ in range(len(uniq_subID))]
    results_pred_thr = [[0] * 10 for _ in range(len(uniq_subID))]
    totalWrs = [[0] * 10 for _ in range(len(uniq_subID))]
    #results_golden = [ None ] * len(uniq_subID)
    pos_prob_wr = [[] for _ in range(len(uniq_subID))]

    #checkIntegrity(subjIDs, val_labels)

    th = 0.9
    for sid, chNum, wr, pred, pos_p in zip(subjIDs, chunkIDs, wrIDs, prediction, pos_prob):
        idx = uniq_subID.index(sid)
        pos_prob_wr[idx].append(pos_p)
        results_pred[idx][int(chNum) - 1] = results_pred[idx][int(chNum) - 1] + pred
        if pos_p >= th:
            results_pred_thr[idx][int(chNum) - 1] = results_pred_thr[idx][int(chNum) - 1] + 1
        totalWrs[idx][int(chNum) - 1] = max(totalWrs[idx][int(chNum) - 1], int(wr))

    #res = [[res/total for res,total in zip(results_pred[i], totalWrs[i])] for i in range(len(results_pred))]
    #print("F1: " + str(f1_erisk(val_labels, prediction)))

    pos_p_avg = [sum(pb) / len(pb) for pb in pos_prob_wr]

    #for thr in np.arange(0.0, 1.0, 0.025):
    #    print(str(thr) + ":\t")
    #    prd = [1 if(it >= thr) else 0 for it in pos_p_avg]
    #    #print("F1: " + str(f1_erisk(prd, results_golden)))
    #    f1_erisk(prd, results_golden)

    pred_count = [sum(rb) for rb in results_pred_thr]

    createDecisionFiles(uniq_subID, pred_count, pos_prob_wr, DecisionDir,
                        max(map(int, wrIDs)))
    gc.collect()
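Note that this function instantiates a fresh `SummaryTracker` before each report, so every `print_diff()` measures against a baseline taken moments earlier rather than the start of the run. If the goal is to see cumulative growth across the tokenization and evaluation phases, one tracker reused at each checkpoint would show that. A minimal sketch; the helper names are taken from the snippet above:

from pympler.tracker import SummaryTracker

tracker = SummaryTracker()  # single baseline taken once

tok_val, val_labels = get_texts_eRisk(df_val, 1)   # phase 1: tokenization
tracker.print_diff()        # growth during tokenization

prediction, samplIdx, pos_prob = eval_clas_eRisk_final_3(
    str(CLAS_PATH), 2, lm_id=lm_id, clas_id=clas_id, attention=False)
tracker.print_diff()        # growth during evaluation, relative to previous call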
def _start_session(self):
    if self._debugging_level >= 1:
        import cProfile
        import io
        import pstats
        pr = cProfile.Profile()
        pr.enable()
    if self._debugging_level >= 2:
        from pympler.tracker import SummaryTracker
        tracker = SummaryTracker()

    # GC can cause unplanned stalls and disrupt precisely timed collection.
    # Disable it and run it manually before sleeping.
    gc.disable()
    if self._tick_len == 0:
        self._force_collect = True
    try:
        self._logger.info('Client node {node_name} started'.format(
            node_name=self._node_name))
        while not self._stopped:
            if self._status == ClientStatus.OFFLINE:
                self._status = ClientStatus.HANDSHAKE1_AUTHENTICATING
                # Handshake
                self.timestamp_and_send_list([
                    ProtocolCode.KEY, self._api_secret_key,
                    self._cluster_name, self._node_name
                ])
                # Reset the timeout counter after sending out a command
                self._last_received_ts = monotonic_time()
                self._current_error_msg = 'Failed to connect to the gateway.'
                self._logger.info(
                    'Client node {node_name} initiated handshaking. Step 1: authenticating...'
                    .format(node_name=self._node_name))

            if self._status == ClientStatus.ALL_OK:
                if self._getters is not None and len(self._getters) > 0:
                    # We need to collect at a fixed time within each second
                    # (self._collect_time_decimal) by the wall clock (so that the
                    # wall-clock time will be saved to the database for other
                    # analyses), so we don't use the monotonic clock (we tried using
                    # monotonic time with some complex calculation in the past but
                    # it was too buggy).
                    ts = time.time()
                    # Time backflow detection: because we are not using monotonic
                    # time, if the user turns back the wall time, the program might
                    # not do a collection for a long time.
                    if self._last_collect_time - ts > self._tick_len > 0:
                        self._logger.info(
                            'Time was turned backwards. Doing a collection right now.')
                        self._last_collect_time = -1
                        self._update_next_collect_time()
                    # When self._tick_len > 0, we do a collect step at
                    # self._next_collect_time. When self._tick_len == 0, we wait
                    # until force_collect.
                    if (self._tick_len > 0 and ts >= self._next_collect_time) or \
                            (self._tick_len == 0 and self._force_collect):
                        self._logger.debug('Time is reached to do collection. Starting...')
                        # requested_collect_time will be sent to getters to indicate
                        # the time of the data we want to collect. We get it before
                        # updating next_collect_time.
                        requested_collect_time = self._next_collect_time
                        # last_collect_time and next_collect_time must be updated
                        # *before* collecting to prevent skipping a collection if
                        # the collection takes longer than a tick_len.
                        if self._tick_len > 0:
                            self._last_collect_time = ts
                            self._update_next_collect_time()
                        else:
                            # When tick_len == 0, we have to use an increasing
                            # counter instead of the real collect time to prevent
                            # collisions.
                            self._last_collect_time += 1
                        self._force_collect = False
                        pi_data = []
                        for g in self._getters:
                            self._logger.debug(
                                "Starting to collect data from getter '{getter_name}'"
                                .format(getter_name=g.name))
                            d = g.collect(self._tick_len, requested_collect_time)
                            if d is None or len(d) == 0:
                                self._logger.warning(
                                    "Client node {node_name} getter '{getter_name}' did not return"
                                    " any data".format(node_name=self._node_name,
                                                       getter_name=g.name))
                            else:
                                self._logger.debug(
                                    "Collected data from getter '{getter_name}': {data}"
                                    .format(getter_name=g.name, data=str(d)))
                                pi_data.extend(d)
                        if len(pi_data) == 0:
                            self._logger.info(
                                'Client node {node_name} all getters return no data. Skipped sending.'
                                .format(node_name=self._node_name))
                        else:
                            self._logger.debug(
                                'Client node {node_name} collected from all getters: {pi_data}'
                                .format(node_name=self._node_name, pi_data=str(pi_data)))
                            if self._tuning_goal_calculator is None:
                                tuning_goal_payload = []
                                self._logger.debug(
                                    'Client node {node_name} has no tuning goal'
                                    .format(node_name=self._node_name))
                            else:
                                tuning_goal = self._tuning_goal_calculator.get_tuning_goal(pi_data)
                                assert -1 <= tuning_goal <= 1
                                tuning_goal_payload = [tuning_goal]
                                self._logger.debug(
                                    'Client node {node_name} collected tuning goal: {tuning_goal}'
                                    .format(node_name=self._node_name,
                                            tuning_goal=str(tuning_goal)))
                            # First element of the outgoing list is tuning_goal
                            self.timestamp_and_send_list(
                                [ProtocolCode.PI, tuning_goal_payload + pi_data],
                                ts=self._last_collect_time)
                            # We don't wait for 'OK' to save time
                    else:
                        self._logger.debug('Collection time is not reached yet')
                else:
                    self._logger.debug('No getter is set. Skipped collecting.')
                    self._last_collect_time = time.time()

            gc.collect()
            flush_log()
            # Print out memory usage every minute
            if self._debugging_level >= 2 and int(time.time()) % 60 == 0:
                print('Time: ' + time.asctime(time.localtime(time.time())))
                tracker.print_diff()

            if self._status == ClientStatus.ALL_OK and self._getters is not None \
                    and len(self._getters) > 0 and self._tick_len > 0:
                # Calculate the precise time for the next collection
                sleep_second = max(self._next_collect_time - time.time(), 0)
            else:
                sleep_second = 1
            # We have to process all messages in the queue before doing the next
            # collection; otherwise, a lengthy collection above could prevent the
            # messages in msg_queue from being processed.
            self._process_all_messages(sleep_second)

        if self.notify_gateway_on_stop:
            self.timestamp_and_send_list([ProtocolCode.CLIENT_STOP])
        self._logger.info('Client node {node_name} stopped'.format(
            node_name=self._node_name))
    finally:
        self._status = ClientStatus.OFFLINE
        gc.enable()
        if self._debugging_level >= 1:
            pr.disable()
            s = io.StringIO()
            sortby = 'cumulative'
            ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
            ps.print_stats()
            print(s.getvalue())
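# --- The guard above, `int(time.time()) % 60 == 0`, only fires when an iteration
# happens to land exactly on a minute boundary, so a diff can be skipped entirely
# or printed several times within the same second. A sketch of an interval-based
# alternative; the names `last_diff_ts` and `maybe_print_diff` are hypothetical,
# not part of the original client:
import time
from pympler.tracker import SummaryTracker

tracker = SummaryTracker()
last_diff_ts = time.monotonic()

def maybe_print_diff(interval=60.0):
    """Print a memory diff at most once per `interval` seconds."""
    global last_diff_ts
    now = time.monotonic()
    if now - last_diff_ts >= interval:
        tracker.print_diff()
        last_diff_ts = now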
class GitHubLoader(Cmd): """ GitHub SSH keys loader """ prompt = '$> ' LINK_FACTOR = 70 USERS_URL = 'https://api.github.com/users?since=%s' KEYS_URL = 'https://api.github.com/users/%s/keys' USER_DETAIL_URL = 'https://api.github.com/users/%s' USER_REPOS_URL = 'https://api.github.com/users/%s/repos' USER_ORGS_URL = 'https://api.github.com/users/%s/orgs' ORG_REPOS_URL = 'https://api.github.com/orgs/%s/repos' ORG_REPO_COLAB_URL = 'https://api.github.com/repos/%s/collaborators' ORG_REPO_ASSIGNEES_URL = 'https://api.github.com/repos/%s/assignees' def __init__(self, attempts=5, threads=1, state=None, state_file=None, config_file=None, audit_file=None, max_mem=None, *args, **kwargs): Cmd.__init__(self, *args, **kwargs) self.t = Terminal() self.args = None self.attempts = int(attempts) self.total = None self.terminate = False self.last_users_count = None self.user_lock = Lock() self.processed_user_set = set() self.processed_user_set_lock = Lock() self.orgs_loaded_set = set() self.orgs_loaded_lock = Lock() self.max_mem = max_mem self.state = state self.state_file_path = state_file self.rate_limit_reset = None self.rate_limit_remaining = None self.config = None self.config_file = config_file self.audit_file = audit_file self.audit_records_buffered = [] self.audit_lock = Lock() self.stop_event = threading.Event() self.threads = int(threads) self.link_queue = Queue.PriorityQueue() # Store links to download here self.worker_threads = [] self.state_thread = None self.state_thread_lock = Lock() self.resources_list = [] self.resources_queue = Queue.PriorityQueue() self.local_data = threading.local() self.new_users_events = EvtDequeue() self.new_keys_events = EvtDequeue() self.db_config = None self.engine = None self.session = None self.mem_tracker = None def signal_handler(self, signal, frame): """ Signal handler - terminate gracefully :param signal: :param frame: :return: """ logger.info('CTRL+C pressed') self.trigger_stop() def trigger_stop(self): """ Sets terminal conditions to true :return: """ self.terminate = True self.stop_event.set() def trigger_quit(self): """ Terminal condition & file change :return: """ self.trigger_stop() utils.try_touch('.github-quit') def do_quit(self, arg): self.trigger_quit() logger.info('Waiting for thread termination') time.sleep(1) logger.info('Quitting') return Cmd.do_quit(self, arg) def do_reset(self, line): print('\033c') def do_gc(self, line): gc.collect() def do_mem_top(self, line): print(mem_top.mem_top()) def do_mem_track_init(self, line): self.mem_tracker = SummaryTracker() def do_mem_track_diff(self, line): print(self.mem_tracker.print_diff()) def do_mem_track_deinit(self, line): self.mem_tracker = None def do_mem(self, line): print('Memory usage: %s kB' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) def do_state(self, line): js = self.state_gen() if line is None or len(line) == 0: del js['link_queue'] del js['resource_stats'] elif line == '1': del js['link_queue'] print(json.dumps(js, indent=2, cls=utils.AutoJSONEncoder)) def do_deq_enable(self, line): self.new_keys_events.disabled = False self.new_users_events.disabled = False def do_deq_disable(self, line): self.new_keys_events.disabled = True self.new_users_events.disabled = True def init_config(self): """ Loads config & state files :return: """ if self.state_file_path is not None and os.path.exists( self.state_file_path): with open(self.state_file_path, 'r') as fh: self.state = json.load(fh, object_pairs_hook=OrderedDict) logger.info('State loaded: %s' % os.path.abspath(self.state_file_path)) with 
open(self.config_file, 'r') as fh: self.config = json.load(fh, object_pairs_hook=OrderedDict) logger.info('Config loaded: %s' % os.path.abspath(self.config_file)) # Process resources - randomized if 'res' in self.config: res_tmp = self.config['res'] random.shuffle(res_tmp) for idx, res in enumerate(res_tmp): r = AccessResource(usr=res['usr'], token=res['token'], idx=idx) self.resources_list.append(r) self.resources_queue.put(r) logger.info('Resource %02d loaded: %s' % (idx, r.usr)) else: # unauth r = AccessResource(usr=None, token=None) self.resources_list.append(r) self.resources_queue.put(r) def init_db(self): """ Initializes database engine & session. Has to be done on main thread. :return: """ self.db_config = databaseutils.process_db_config(self.config['db']) from sqlalchemy import create_engine from sqlalchemy.orm import sessionmaker, scoped_session self.engine = create_engine(self.db_config.constr, pool_recycle=3600) self.session = scoped_session(sessionmaker(bind=self.engine)) # Make sure tables are created DB_Base.metadata.create_all(self.engine) def init_workers(self): """ Initialize worker threads :return: """ logger.info('Starting %d working threads' % self.threads) for idx in range(self.threads): t = threading.Thread(target=self.work_thread_main, args=(idx, )) self.worker_threads.append(t) # Kick-off all threads for t in self.worker_threads: t.start() logger.info('Worker threads started') def cli(self): """ CLI thread :return: """ logger.info('CLI thread started') self.cmdloop() logger.info('Terminating CLI thread') def _init_queue(self): """ Initializes link queue :return: """ sess = self.session() logger.debug('Loading users...') users_cnt = 0 all_keys = sess.query(GitHubKey).filter( GitHubKey.is_interesting == 1).all() logger.debug('All users loaded') for rec in all_keys: users_cnt += 1 job = DownloadJob(url=self.USER_DETAIL_URL % rec.key_user_found, jtype=DownloadJob.TYPE_USER, meta={ 'user': rec.key_user_found, 'user_id': rec.key_user_id_found }) self.link_queue.put(job) job = DownloadJob(url=self.USER_ORGS_URL % rec.key_user_found, jtype=DownloadJob.TYPE_ORG, meta={ 'user': rec.key_user_found, 'user_id': rec.key_user_id_found }) self.link_queue.put(job) job = DownloadJob(url=self.USER_REPOS_URL % rec.key_user_found, jtype=DownloadJob.TYPE_REPOS_USER, meta={ 'user': rec.key_user_found, 'user_id': rec.key_user_id_found }) self.link_queue.put(job) logger.info('Queue initialized, users cnt: %s' % users_cnt) utils.silent_close(sess) def _init_queue_assignee(self): """ Init queue with assignees repos :return: """ sess = self.session() logger.debug('Loading repos...') users_cnt = 0 repos = sess.query(GitHubRepo).filter( GitHubRepo.user_repo == 0).filter( GitHubRepo.repo_stars >= 100).all() logger.debug('Repos loaded: %s' % len(repos)) for rec in repos: users_cnt += 1 job = DownloadJob(url=self.ORG_REPO_ASSIGNEES_URL % rec.repo_name, jtype=DownloadJob.TYPE_REPO_ASSIGNEE, meta={ 'user': rec.owner_login, 'user_id': rec.owner_id, 'repo': rec.repo_name, 'page': 1 }) self.link_queue.put(job) def work(self): """ Main thread work method :return: """ # Interrupt signals signal.signal(signal.SIGINT, self.signal_handler) self.init_config() self.init_db() # Resume last state self.state_resume() # Monitor threads. self.state_thread = threading.Thread(target=self.state_main, args=()) self.state_thread.start() # If there is no link to process - create from since. 
if self.link_queue.qsize() == 0: if self.args.assign_only: self._init_queue_assignee() else: self._init_queue() # Worker threads self.init_workers() logger.info('Main thread started %s %s %s' % (os.getpid(), os.getppid(), threading.current_thread())) # CLI thread cli_thread = threading.Thread(target=self.cli, args=()) cli_thread.setDaemon(True) cli_thread.start() # Join on workers self.after_loop() logger.info('Terminating main thread') return None def after_loop(self, wait_for_state=True): """ After work loop finishes :return: """ logger.info('Waiting termination of slave threads') # Wait here for termination of all workers and monitors. try: for t in self.worker_threads: t.join() if wait_for_state: self.state_thread.join() except: logger.error('Exception during thread join') logger.error(traceback.format_exc()) logger.info('All threads terminates, last state save') self.state_save() # # General link processing & queue management # def work_thread_main(self, idx): """ Worker thread main loop :return: """ self.local_data.idx = idx logger.info('Working thread %d started' % idx) while not self.terminate and not self.stop_event.is_set(): self.interruptible_sleep_delta(0.1) # Get credential to process link with resource = self.resource_allocate() if resource is None: continue # We have resource, now get the job job = None try: job = self.link_queue.get(True, timeout=1.0) except Queue.Empty: self.resource_return(resource) continue # If job last fail is too recent - put again back to queue if time.time() - job.last_fail < 3.0: self.link_queue.put( job ) # re-insert to the back of the queue for later processing self.resource_return(resource) continue # Job processing starts here - fetch data page with the resource. js_data = None try: self.local_data.job = job self.local_data.resource = resource js_data, headers, raw_response = self.load_page_local() except RateLimitHit as e: logger.error( '[%d] Rate limit hit: %s, failcnt: %d, res: %s, exception: %s' % (idx, job.url, job.fail_cnt, resource.usr, e)) continue except Exception as e: logger.error( '[%d] Exception in processing job: %s, failcnt: %d, res: %s, exception: %s' % (idx, job.url, job.fail_cnt, resource.usr, e)) self.on_job_failed(job) continue finally: self.resource_return(resource) self.local_data.resource = None self.local_data.last_usr = resource.usr self.local_data.last_remaining = resource.remaining # Process downloaded data here. try: if js_data is None: self.audit_log('404', job.url, jtype=job.type, job=job) self.flush_audit() continue self.process_downloaded(job, js_data, headers, raw_response) except Exception as e: logger.error( '[%d] Unexpected exception, processing type %s, link %s: cnt: %d, res: %s, %s' % (idx, job.type, job.url, job.fail_cnt, resource.usr, e)) traceback.print_exc() self.on_job_failed(job) finally: self.local_data.resource = None self.local_data.job = None self.local_data.last_usr = None self.local_data.last_remaining = None resource = None job = None headers = None raw_response = None pass logger.info('Terminating worker thread %d' % idx) def on_job_failed(self, job): """ If job failed, this teaches it how to behave :param job: :return: """ job.fail_cnt += 1 job.last_fail = time.time() # if failed too many times - log and discard. 
if job.fail_cnt > 35: logger.warning('Job failed too many times %s' % job.url) self.audit_log('too-many-fails', job.url, jtype=job.type, job=job) self.flush_audit() else: self.link_queue.put( job) # re-insert to the queue for later processing def load_page_local(self): """ Loads page stored in thread local :return: """ auth = None resource = self.local_data.resource if resource.usr is not None: auth = HTTPBasicAuth(resource.usr, resource.token) job = self.local_data.job res = requests.get(job.url, timeout=10, auth=auth) headers = res.headers resource.reset_time = utils.try_float(headers.get('X-RateLimit-Reset')) resource.remaining = utils.try_int( headers.get('X-RateLimit-Remaining')) resource.last_used = time.time() resource.used_cnt += 1 if res.status_code == 403 and resource.remaining is not None and resource.remaining < 10: resource.fail_cnt += 1 raise RateLimitHit if res.status_code == 404: resource.fail_cnt += 1 logger.warning('URL not found: %s' % job.url) return None, None, None if res.status_code // 100 != 2: resource.fail_cnt += 1 res.raise_for_status() data = res.content if data is None: resource.fail_cnt += 1 raise Exception('Empty response') js = json.loads(data, object_pairs_hook=OrderedDict) return js, headers, res # # Crawl specific methods # def process_downloaded(self, job, js_data, headers, raw_response): """ Process downloaded page :param self: :param job: :param js_data: :param headers: :param raw_response: :return: """ if job.type == DownloadJob.TYPE_USER: self.process_user(job, js_data, headers, raw_response) elif job.type == DownloadJob.TYPE_ORG: self.process_org(job, js_data, headers, raw_response) elif job.type == DownloadJob.TYPE_REPOS_USER: self.process_repo(job, js_data, headers, raw_response, True) elif job.type == DownloadJob.TYPE_REPOS_ORG: self.process_repo(job, js_data, headers, raw_response, False) elif job.type == DownloadJob.TYPE_REPO_COLAB: self.process_colab(job, js_data, headers, raw_response) elif job.type == DownloadJob.TYPE_REPO_ASSIGNEE: self.process_assignee(job, js_data, headers, raw_response) else: logger.error('Unrecognized type %s' % job.type) def process_user(self, job, js, headers, raw_response): """ Process user detail data :param job: :param js: :param headers: :param raw_response: :return: """ if 'id' not in js: logger.error('Field ID not found in user') return s = self.session() try: user_id = int(js['id']) dbu = s.query(GitHubUserDetails).filter( GitHubUserDetails.id == user_id).one_or_none() is_new = False if dbu is None: is_new = True dbu = GitHubUserDetails() dbu.id = user_id dbu.date_last_check = salch.func.now() dbu.username = js['login'] dbu.name = utils.utf8ize(utils.defvalkey(js, 'name')) dbu.company = utils.utf8ize(utils.defvalkey(js, 'company')) dbu.blog = utils.defvalkey(js, 'blog') dbu.email = utils.defvalkey(js, 'email') dbu.bio = utils.utf8ize(utils.defvalkey(js, 'bio')) dbu.usr_type = utils.defvalkey(js, 'type') dbu.public_repos = js['public_repos'] dbu.public_gists = js['public_gists'] dbu.followers = js['followers'] dbu.following = js['following'] dbu.created_at = utils.dt_norm( utils.try_parse_timestamp(utils.defvalkey(js, 'created_at'))) dbu.updated_at = utils.dt_norm( utils.try_parse_timestamp(utils.defvalkey(js, 'updated_at'))) if is_new: s.add(dbu) else: s.merge(dbu) s.commit() s.flush() s.expunge_all() except Exception as e: logger.error('Exception storing user details: %s: %s' % (js['id'], e)) logger.debug(traceback.format_exc()) finally: utils.silent_close(s) def process_org(self, job, js, headers, 
raw_response): """ Process user -> orgs data :param job: :param js: :param headers: :param raw_response: :return: """ new_orgs = [] for org in js: if 'id' not in org: logger.error('Field ID not found in orgs') continue s = self.session() try: org_id = int(org['id']) # delete first - avoid excs s.query(GitHubUserOrgs)\ .filter(GitHubUserOrgs.org_id == org_id)\ .filter(GitHubUserOrgs.username == job.meta['user'])\ .delete() dbu = GitHubUserOrgs() dbu.username = job.meta['user'] dbu.org_id = org['id'] dbu.org_name = org['login'] dbu.org_desc = utils.utf8ize(org['description']) new_orgs.append(org['login']) s.add(dbu) s.commit() s.flush() s.expunge_all() except Exception as e: logger.error('Exception storing user->org details: %s: %s' % (org['id'], e)) logger.debug(traceback.format_exc()) finally: utils.silent_close(s) if len(js) == 0: return # Load next page cur_page = utils.defvalkey(job.meta, 'page', 1) new_url = (self.USER_ORGS_URL % job.meta['user']) + ('?page=%s' % (cur_page + 1)) new_meta = dict(job.meta) new_meta['page'] = cur_page + 1 job = DownloadJob(url=new_url, jtype=DownloadJob.TYPE_ORG, meta=new_meta) self.link_queue.put(job) # Load repositories for new organisations not_loaded_orgs = None with self.orgs_loaded_lock: new_orgs_set = set(new_orgs) not_loaded_orgs = new_orgs_set - self.orgs_loaded_set for x in new_orgs: self.orgs_loaded_set.add(x) for x in not_loaded_orgs: new_meta = dict(job.meta) new_meta['page'] = 1 new_meta['org'] = x job = DownloadJob(url=self.ORG_REPOS_URL % x, jtype=DownloadJob.TYPE_REPOS_ORG, meta=new_meta) self.link_queue.put(job) def process_repo(self, job, js, headers, raw_response, from_user): """ Process repo list page :param job: :param js: :param headers: :param raw_response: :param from_user: :return: """ for repo in js: if 'id' not in repo: logger.error('Field ID not found in repos') continue s = self.session() try: repo_id = int(repo['id']) dbe = s.query(GitHubRepo).filter( GitHubRepo.id == repo_id).one_or_none() dbu = GitHubRepo() dbu.id = repo_id dbu.user_repo = from_user if from_user: dbu.username = job.meta['user'] else: dbu.org_name = job.meta['org'] if 'owner' in repo: dbu.owner_id = repo['owner']['id'] dbu.owner_login = repo['owner']['login'] dbu.repo_name = repo['full_name'] dbu.repo_stars = repo['stargazers_count'] dbu.repo_forks = repo['forks'] dbu.repo_watchers = repo['watchers'] dbu.repo_is_fork = repo['fork'] dbu.repo_size = repo['size'] dbu.repo_homepage = utils.defvalkey(repo, 'homepage') dbu.repo_language = utils.defvalkey(repo, 'language') dbu.created_at = utils.dt_norm( utils.try_parse_timestamp( utils.defvalkey(repo, 'created_at'))) dbu.updated_at = utils.dt_norm( utils.try_parse_timestamp( utils.defvalkey(repo, 'updated_at'))) dbu.pushed_at = utils.dt_norm( utils.try_parse_timestamp( utils.defvalkey(repo, 'pushed_at'))) dbu.repo_description = utils.utf8ize(repo['description']) dbu.repo_stargazers_url = repo['stargazers_url'] dbu.repo_forks_url = repo['forks_url'] if not from_user and repo['stargazers_count'] > 100: new_meta = dict(job.meta) new_meta['page'] = 1 new_meta['repo'] = repo['full_name'] new_meta['owner'] = repo['owner']['login'] # Colab fetch - skip, no auth job = DownloadJob(url=self.ORG_REPO_COLAB_URL % (repo['full_name']), jtype=DownloadJob.TYPE_REPO_COLAB, meta=new_meta) # Asignee fetch job = DownloadJob(url=self.ORG_REPO_ASSIGNEES_URL % (repo['full_name']), jtype=DownloadJob.TYPE_REPO_ASSIGNEE, meta=dict(new_meta)) self.link_queue.put(job) # DB save if dbe is None: s.add(dbu) else: if dbe.username != 
dbu.username: logger.warning('Username does not match for %s %s %s' % (repo_id, dbe.username, dbu.username)) if dbe.org_name != dbu.org_name: logger.warning('org_name does not match for %s %s %s' % (repo_id, dbe.org_name, dbu.org_name)) if dbe.owner_login != dbu.owner_login: logger.warning( 'owner_login does not match for %s %s %s' % (repo_id, dbe.owner_login, dbu.owner_login)) s.commit() s.flush() s.expunge_all() except Exception as e: logger.error( 'Exception storing repo details: %s:%s meta: %s, url: %s, exc: %s' % (repo['id'], repo['full_name'], json.dumps( job.meta), job.url, e)) logger.debug(traceback.format_exc()) finally: utils.silent_close(s) if len(js) == 0: return # Load next page cur_page = utils.defvalkey(job.meta, 'page', 1) new_meta = dict(job.meta) new_meta['page'] = cur_page + 1 if from_user: new_url = (self.USER_REPOS_URL % job.meta['user']) + ('?page=%s' % (cur_page + 1)) job = DownloadJob(url=new_url, jtype=DownloadJob.TYPE_REPOS_USER, meta=new_meta) else: new_url = (self.ORG_REPOS_URL % job.meta['org']) + ('?page=%s' % (cur_page + 1)) job = DownloadJob(url=new_url, jtype=DownloadJob.TYPE_REPOS_ORG, meta=new_meta) self.link_queue.put(job) def process_colab(self, job, js, headers, raw_response): """ Process colaborators for org owned repos :param job: :param js: :param headers: :param raw_response: :return: """ for colab in js: if 'id' not in colab: logger.error('Field ID not found in colab') continue s = self.session() try: # delete first - avoid excs s.query(GitHubRepoColab)\ .filter(GitHubRepoColab.user_name == colab['login'])\ .filter(GitHubRepoColab.repo_name == job.meta['repo'])\ .delete() dbu = GitHubRepoColab() dbu.repo_name = job.meta['repo'] dbu.user_name = colab['login'] dbu.can_pull = colab['permissions']['pull'] dbu.can_push = colab['permissions']['push'] dbu.can_admin = colab['permissions']['admin'] s.add(dbu) s.commit() s.flush() s.expunge_all() except Exception as e: logger.error('Exception storing colab details: %s:%s: %s' % (colab['login'], job.meta['repo'], e)) logger.debug(traceback.format_exc()) finally: utils.silent_close(s) if len(js) == 0: return # Load next page cur_page = utils.defvalkey(job.meta, 'page', 1) new_url = (self.ORG_REPO_COLAB_URL % (job.meta['repo'])) + ('?page=%s' % (cur_page + 1)) new_meta = dict(job.meta) new_meta['page'] = cur_page + 1 job = DownloadJob(url=new_url, jtype=DownloadJob.TYPE_REPO_COLAB, meta=new_meta) self.link_queue.put(job) def process_assignee(self, job, js, headers, raw_response): """ Process assignees for org owned repos :param job: :param js: :param headers: :param raw_response: :return: """ for assignee in js: if 'id' not in assignee: logger.error('Field ID not found in assignees') continue s = self.session() try: # delete first - avoid excs s.query(GitHubRepoAssignee)\ .filter(GitHubRepoAssignee.user_name == assignee['login'])\ .filter(GitHubRepoAssignee.repo_name == job.meta['repo'])\ .delete() dbu = GitHubRepoAssignee() dbu.repo_name = job.meta['repo'] dbu.user_name = assignee['login'] s.add(dbu) s.commit() s.flush() s.expunge_all() except Exception as e: logger.error('Exception storing cassignee details: %s:%s: %s' % (assignee['login'], job.meta['repo'], e)) logger.debug(traceback.format_exc()) finally: utils.silent_close(s) if len(js) == 0: return # Load next page cur_page = utils.defvalkey(job.meta, 'page', 1) new_url = (self.ORG_REPO_ASSIGNEES_URL % (job.meta['repo'])) + ('?page=%s' % (cur_page + 1)) new_meta = dict(job.meta) new_meta['page'] = cur_page + 1 job = DownloadJob(url=new_url, 
jtype=DownloadJob.TYPE_REPO_ASSIGNEE, meta=new_meta) self.link_queue.put(job) # # Resource management # def resource_allocate(self, blocking=True, timeout=1.0): """ Takes resource from the pool. If the resource has low remaining credit, thread is suspended to re-charge. :return: resource or None if not available in the time """ try: resource = self.resources_queue.get(True, timeout=1.0) if resource.remaining is not None and resource.remaining <= self.threads + 2: sleep_sec = resource.reset_time - time.time( ) if resource.reset_time is not None else 15 * 60 sleep_sec += 120 # extra 2 minutes to avoid problems with resources logger.info( 'Rate limit exceeded on resource %s, remaining: %s, sleeping till: %s, it is %d seconds, ' '%d minutes' % (resource.usr, resource.remaining, resource.reset_time, sleep_sec, sleep_sec / 60.0)) self.sleep_interruptible(time.time() + sleep_sec) logger.info('Resource sleep finished %s' % resource.usr) # Reset estimations, needs to be refreshed resource.remaining = None resource.reset_time = None return resource except Queue.Empty: return None def resource_return(self, res): """ Returns resource to the pool :param res: :return: """ self.resources_queue.put(res) def sleep_interruptible(self, until_time): """ Interruptible sleep - sleep until given time. :param until_time: :return: """ while time.time() <= until_time: time.sleep(1.0) if self.terminate or self.stop_event.is_set(): return def interruptible_sleep_delta(self, sleep_time): """ Sleeps the current thread for given amount of seconds, stop event terminates the sleep - to exit the thread. :param sleep_time: :return: """ if sleep_time is None: return sleep_time = float(sleep_time) if sleep_time == 0: return sleep_start = time.time() while not self.stop_event.is_set() and not self.terminate: time.sleep(0.1) if time.time() - sleep_start >= sleep_time: return # # State # def flush_state(self): """ Flushes state/config to the state file :return: """ self.state['rate_limit_remaining'] = self.rate_limit_remaining self.state['rate_limit_reset'] = self.rate_limit_reset utils.flush_json(self.state, self.state_file_path) # # Auditing - errors, problems for further analysis # def audit_log(self, evt=None, link=None, jtype=None, job=None): """ Appends audit log to the buffer. Lock protected. :param evt: :param link: :return: """ log = collections.OrderedDict() log['time'] = time.time() log['evt'] = evt log['jtype'] = jtype log['link'] = link if job is not None and isinstance(job, DownloadJob): log['job'] = job.to_json() with self.audit_lock: self.audit_records_buffered.append(log) def flush_audit(self): """ Flushes audit logs to the JSON append only file. Routine protected by the lock (no new audit record can be inserted while holding the lock) :return: """ if self.audit_file is None: self.audit_records_buffered = [] return self.audit_lock.acquire() try: if len(self.audit_records_buffered) == 0: return with open(self.audit_file, 'a') as fa: for x in self.audit_records_buffered: fa.write(json.dumps(x, cls=utils.AutoJSONEncoder) + "\n") self.audit_records_buffered = [] except Exception as e: logger.error('Exception in audit log dump %s' % e) finally: self.audit_lock.release() # # State save / resume # def state_main(self): """ State thread - periodical dump of the queues. 
:return: """ logger.info('State thread started %s %s %s' % (os.getpid(), os.getppid(), threading.current_thread())) try: while not self.stop_event.is_set() and not self.terminate: # Dump stats each x seconds # Sleep is here because of dumping the state for the last time just before program quits. self.interruptible_sleep_delta(2) self.state_save() # Check memory conditions self.state_ram_check() except Exception as e: traceback.print_exc() logger.error('Exception in state: %s' % e) finally: pass logger.info('State loop terminated') def state_ram_check(self): """ Checks memory terminating conditions :return: """ if self.max_mem is None: return cur_ram = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss if cur_ram <= self.max_mem: return logger.warning( 'Maximum memory threshold reached: %s kB = %s MB, threshold = %s kB' % (cur_ram, cur_ram / 1024.0, self.max_mem)) self.trigger_stop() def state_gen(self): """ Dumps state :return: """ def link_char(job): if job.type == DownloadJob.TYPE_USER: return 'U' elif job.type == DownloadJob.TYPE_ORG: return 'o' elif job.type == DownloadJob.TYPE_REPOS_USER: return '.' elif job.type == DownloadJob.TYPE_REPOS_ORG: return ',' elif job.type == DownloadJob.TYPE_REPO_COLAB: return 'c' elif job.type == DownloadJob.TYPE_REPO_ASSIGNEE: return '-' else: return '!' try: js_q = collections.OrderedDict() js_q['gen'] = time.time() js_q['link_size'] = self.link_queue.qsize() js_q['memory'] = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss # Dequeues self.new_users_events.maintain() self.new_keys_events.maintain() users_in_5min = self.new_users_events.under_limit(5 * 60) keys_in_5min = self.new_keys_events.under_limit(5 * 60) js_q['users_dequeue_size'] = self.new_users_events.len() js_q['keys_dequeue_size'] = self.new_keys_events.len() js_q['users_5min'] = users_in_5min js_q['keys_5min'] = keys_in_5min js_q['users_1min'] = users_in_5min / 5.0 js_q['keys_1min'] = keys_in_5min / 5.0 # link queue structure qdata = list(self.link_queue.queue) qdata.sort(cmp=DownloadJob.cmp) js_q['link_structure'] = ''.join([link_char(x) for x in qdata]) # Stats. 
js_q['resource_stats'] = [ x.to_json() for x in list(self.resources_list) ] # Finally - the queue js_q['link_queue'] = [x.to_json() for x in qdata] return js_q except Exception as e: traceback.print_exc() logger.error('Exception in state: %s', e) def state_save(self): """ saves the state :return: """ try: js_q = self.state_gen() utils.flush_json(js_q, self.state_file_path) except Exception as e: traceback.print_exc() logger.error('Exception in state: %s', e) def state_resume(self): """ Attempts to resume the queues from the monitoring files :return: """ try: if self.state is None: return if 'link_queue' in self.state: for rec in self.state['link_queue']: job = DownloadJob.from_json(rec) self.link_queue.put(job) logger.info('Link queue resumed, entries: %d' % len(self.state['link_queue'])) except Exception as e: traceback.print_exc() logger.warning('Exception in resuming the state %s' % e) logger.error('State resume failed, exiting') sys.exit(1) def main(self): """ Main entry point :return: """ args_src = sys.argv parser = argparse.ArgumentParser( description='Downloads GitHub User info') parser.add_argument('-c', dest='config', default=None, help='JSON config file') parser.add_argument('-s', dest='status', default=None, help='JSON status file') parser.add_argument('-t', dest='threads', default=1, type=int, help='Number of download threads to use') parser.add_argument( '--max-mem', dest='max_mem', default=None, type=int, help='Maximal memory threshold in kB when program terminates itself' ) parser.add_argument('--assign-only', dest='assign_only', default=False, action='store_const', const=True, help='fetch assignees') args = self.args = parser.parse_args(args=args_src[1:]) self.threads = args.threads self.max_mem = args.max_mem self.config_file = args.config self.audit_file = os.path.join(os.getcwd(), 'audit.json') self.state_file_path = args.status if args.status is not None else os.path.join( os.getcwd(), 'state.json') if os.path.exists(self.state_file_path): utils.file_backup(self.state_file_path, backup_dir='.') if os.path.exists('.github-quit'): os.remove('.github-quit') sys.argv = [args_src[0]] logger.info('GitHub loader started, args: %s' % args) self.work() sys.argv = args_src
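# --- A minimal sketch of the on-demand tracker commands the loader's shell exposes
# (do_mem_track_init / do_mem_track_diff / do_mem_track_deinit). One caveat from the
# class above: SummaryTracker.print_diff() writes to stdout and returns None, so
# `print(self.mem_tracker.print_diff())` also emits a stray "None" line; calling it
# directly avoids that. The Cmd subclass here is a hypothetical stand-in built on
# the stdlib cmd module, not the GitHubLoader itself.
from cmd import Cmd
from pympler.tracker import SummaryTracker

class MemShell(Cmd):
    prompt = '$> '

    def __init__(self):
        Cmd.__init__(self)
        self.mem_tracker = None

    def do_mem_track_init(self, line):
        self.mem_tracker = SummaryTracker()  # take the baseline snapshot

    def do_mem_track_diff(self, line):
        if self.mem_tracker is not None:
            self.mem_tracker.print_diff()    # prints the diff itself; no print() wrapper

    def do_mem_track_deinit(self, line):
        self.mem_tracker = None              # drop the tracker and its snapshot

    def do_quit(self, line):
        return True

if __name__ == '__main__':
    MemShell().cmdloop()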