Example #1
def main():
    n_max = 10000  # first n bboxes to process
    n_frames_max = 3000
    mask_thresh = 35

    videos_dir = "data/stanford_drone/videos/"

    scene_names = ["bookstore", "coupa", "deathCircle", "gates", "hyang", "little", "nexus", "quad"]
    # val_videos_paths = [os.path.join(videos_dir, scene, "video0") for scene in scene_names]
    # show_and_save_video("data/stanford_drone/videos/coupa/video2", n_max, n_frames_max, mask_thresh)
    # exit(1)

    # to show [0!(hard dark low res), 1(car),4], 5! (car), 8! (large shadow), 11(car), 21!(good quality),26(fence), 31(solid persons), 36 (good quality),
    # final to show 1(car), 5! (car), 8! (large shadow), 11(car), 21!(good quality),26(fence), 31(solid persons), 36 (good quality)
    i_video = 0
    for scene in scene_names:
        scene_subdirs = os.listdir(os.path.join(videos_dir, scene))
        for vid_dir_name in scene_subdirs:
            tracker = SummaryTracker()

            vid_dir = os.path.join(videos_dir, scene, vid_dir_name)
            print("-----------------")
            print(i_video, vid_dir)
            # "data/stanford_drone/videos/coupa/video2"
            store_as_dataset(vid_dir, n_max, n_frames_max, mask_thresh)
            i_video += 1

            tracker.print_diff()
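
The pattern above (construct a fresh SummaryTracker right before the suspect step, call print_diff() right after it) is the usual way to localize memory growth to a single unit of work. A minimal, self-contained sketch of the same idea; process_item is a hypothetical stand-in for store_as_dataset:

from pympler.tracker import SummaryTracker

def process_item(n):
    # hypothetical stand-in for the real per-video processing
    return sum(x * x for x in range(n))

for n in (1000, 2000, 3000):
    tracker = SummaryTracker()   # baseline summary is taken at construction
    process_item(n)
    tracker.print_diff()         # reports only objects still alive after the call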
Example #2
def test_SelectMemoryLeak():
    N = 50000

    def table_set(t, state):
        fvs = swsscommon.FieldValuePairs([("status", state)])
        t.set("123", fvs)

    def generator_SelectMemoryLeak():
        app_db = swsscommon.DBConnector("APPL_DB", 0, True)
        t = swsscommon.Table(app_db, "TABLE")
        for i in range(int(N / 2)):
            table_set(t, "up")
            table_set(t, "down")

    tracker = SummaryTracker()
    appl_db = swsscommon.DBConnector("APPL_DB", 0, True)
    sel = swsscommon.Select()
    sst = swsscommon.SubscriberStateTable(appl_db, "TABLE")
    sel.addSelectable(sst)
    thr = Thread(target=generator_SelectMemoryLeak)
    thr.daemon = True
    thr.start()
    time.sleep(5)
    for _ in range(N):
        state, c = sel.select(1000)
    diff = tracker.diff()
    cases = []
    for name, count, _ in diff:
        if count >= N:
            cases.append("%s - %d objects for %d repeats" % (name, count, N))
    thr.join()
    assert not cases
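
swsscommon is SONiC-specific, but the shape of the check above does not depend on it: tracker.diff() returns rows of [type_name, count, total_size], and any type whose count grows on the order of the number of iterations gets flagged. A minimal sketch with a deliberate leak (the retained global list is the leak):

from pympler.tracker import SummaryTracker

N = 10000
_retained = []          # deliberate leak: everything appended here stays alive

def leaky_step():
    _retained.append({"status": "up"})

tracker = SummaryTracker()
for _ in range(N):
    leaky_step()

# each diff row is [type_name, object_count, total_size_in_bytes]
offenders = ["%s - %d objects for %d repeats" % (name, count, N)
             for name, count, _ in tracker.diff()
             if count >= N]
assert offenders, "the deliberate dict leak should have been reported"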
Example #3
def get_memory_tracker():
    global _memory_tracker
    if _memory_tracker is None:
        from pympler.tracker import SummaryTracker

        _memory_tracker = SummaryTracker()
    return _memory_tracker
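
Example #3 defers both the pympler import and the baseline snapshot until the first call, then reuses a single module-level tracker. A minimal usage sketch, assuming the module also defines a _memory_tracker = None default (not shown above); the helper is repeated here only so the snippet runs on its own:

_memory_tracker = None  # assumed module-level default

def get_memory_tracker():
    global _memory_tracker
    if _memory_tracker is None:
        from pympler.tracker import SummaryTracker
        _memory_tracker = SummaryTracker()
    return _memory_tracker

get_memory_tracker()                          # first call records the baseline
retained = [object() for _ in range(10000)]   # allocation kept alive on purpose
get_memory_tracker().print_diff()             # growth since the baseline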
Example #4
def test_reopt_sizing_pvwatts(solar_resource):
    round = 0

    tracker = SummaryTracker()
    while round < 25:   # multiple runs required to check for memory leaks
        round += 1

        sys = pv.default("PVWattsBatteryCommercial")
        sys.SolarResource.solar_resource_file = solar_resource
        batt = bt.from_existing(sys, "PVWattsBatteryCommercial")
        sys.SolarResource.solar_resource_data = dict({'lat': 3, 'lon': 3})
        batt.Battery.crit_load = [0] * 8760
        fin = ur.from_existing(sys, "PVWattsBatteryCommercial")

        post = sys.Reopt_size_battery_post()

    assert('Scenario' in post['reopt_post'])
    assert(post['reopt_post']['Scenario']['Site']['latitude'] == pytest.approx(3, 0.1))
    tracker_diff = tracker.diff()
    tracker.print_diff()
Example #5
    def __init__(self, lock, queue, sniffer, analyzer, AerospikeClient):
        self.lock = lock
        self.queue = queue
        self.sniffer = sniffer
        self.analyzer = analyzer
        self.timestamp = ""
        self.logger = logging.getLogger(__name__)
        self.attack_time, self.attack_percent = ConfigurationReader.scriptExecutionInfo()
        self.first_value = 0
        self.attack_alert = 0
        self.Aero = AerospikeClient
        self.attack_alert_limit = ((self.attack_time * 10) / 100) * self.attack_percent
        self.custom_script_counter = ConfigurationReader.higherScriptExecution()
        self.script_execution_counter = self.custom_script_counter
        self.IsAttack = 0
        self.selftracker = SummaryTracker()
Example #6
    def start(self):
        if self.debugging_level >= 1:
            import cProfile
            import io
            import pstats
            pr = cProfile.Profile()
            pr.enable()
        if self.debugging_level >= 2:
            from pympler.tracker import SummaryTracker
            tracker = SummaryTracker()

        # context must be created here because this function may be executed in a separate
        # process/thread
        self.connect()
        heartbeat_ts = time.time()

        # Automatic GC causes unplanned stalls and disrupts the precisely timed collection.
        # Disable it and collect manually before sleeping.
        gc.disable()
        try:
            logger.info('MA started')
            while not self.stopped:
                if self.collectors:
                    ts = time.time()
                    if ts - (
                            self.last_collect_second +
                            self.collect_time_decimal) >= self.tick_len - 0.01:
                        # This must be updated *before* collecting to prevent the send time from
                        # slowly drifting away
                        self.last_collect_second = int(ts)
                        result = []
                        for c in self.collectors:
                            result.extend(c())
                        logger.info('Collected: ' + str(result))
                        self.timestamp_and_send_obj(result, ts)
                    else:
                        pass
                else:
                    self.last_collect_second = time.time()

                gc.collect()
                flush_log()

                # Print out memory usage every minute
                if self.debugging_level >= 2 and int(time.time()) % 60 == 0:
                    print('Time: ' + time.asctime(time.localtime(time.time())))
                    tracker.print_diff()

                # Calculate the precise time for next collection
                sleep_second = (self.last_collect_second + self.collect_time_decimal +
                                self.tick_len - time.time())
                sleep_second = max(sleep_second, 0)

                sleep_start_ts = time.time()
                p = dict(self.poller.poll(sleep_second * 1000))
                logger.debug('Slept {0} seconds'.format(time.time() -
                                                        sleep_start_ts))
                if self.socket in p:
                    req = pickle.loads(zlib.decompress(self.socket.recv()))
                    assert req[0] == LustreCommon.protocol_ver
                    heartbeat_ts = time.time()
                    if isinstance(req[2], bytes):
                        # this is a command, not data
                        cmd = req[2]
                        if cmd == b'ACTION':
                            action = req[3]
                            if action == 0:
                                logger.info('Received action 0, ignored')
                            else:
                                logger.info(
                                    'Performing action {action}'.format(
                                        action=action))
                                self.controller(req[3:])
                        elif cmd == b'HB':
                            logger.debug('Received heartbeat')
                        else:
                            logger.warning('Unknown command received: ' + str(cmd))
                    else:
                        logger.error('Corrupted message received: ' + repr(req))
                else:
                    if heartbeat_ts and time.time() - heartbeat_ts > 5:
                        # reconnect
                        self.disconnect()
                        self.connect()

                        logger.warning('Connection timeout, reconnected')
                        heartbeat_ts = time.time()

            logger.info('MA stopped')
        finally:
            gc.enable()

            if self.debugging_level >= 1:
                pr.disable()
                s = io.StringIO()
                sortby = 'cumulative'
                ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
                ps.print_stats()
                print(s.getvalue())
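
Besides the optional SummaryTracker, the loop above disables automatic garbage collection and collects at a chosen point so that GC pauses never land inside the timed collection window. A minimal sketch of that pattern, separated from the messaging code:

import gc
import time

gc.disable()                  # no surprise GC pauses inside the timed section
try:
    for _ in range(10):
        start = time.time()
        data = [i * i for i in range(100000)]   # stand-in for the timed collectors
        elapsed = time.time() - start
        del data
        gc.collect()          # pay the collection cost here, outside the measurement
        time.sleep(max(0.0, 0.1 - elapsed))
finally:
    gc.enable()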
Example #7
import psutil
from pympler.tracker import SummaryTracker

_tr = SummaryTracker()
_print_mem_enabled = False
_mem_used = 0
_prev_mem_used = 0


def disable_print_mem():
    global _print_mem_enabled
    _print_mem_enabled = False


def enable_print_mem():
    global _print_mem_enabled
    _print_mem_enabled = True


def print_mem_usage():
    global _mem_used, _prev_mem_used, _print_mem_enabled

    if not _print_mem_enabled:
        return

    # _tr.print_diff()
    mem = psutil.virtual_memory()
    _prev_mem_used = _mem_used
    _mem_used = (mem.total - mem.available) / 1024 / 1024
    print(f"virt_mem >> used: {_mem_used:.0f}, prev: {_prev_mem_used:.0f}, "
          f"diff: {_mem_used - _prev_mem_used:.0f}")
Example #8
from pympler.tracker import SummaryTracker

tracker = SummaryTracker()
tracker.print_diff()
s = "str"
tracker.print_diff()
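
The first print_diff() establishes the reference point (and absorbs the bookkeeping objects pympler itself creates), so the second call reports only what appeared afterwards, such as the newly created str. SummaryTracker is a thin wrapper over pympler's muppy and summary modules; the same check written against those lower-level APIs is useful when the raw snapshots need to be kept around:

from pympler import muppy, summary

before = summary.summarize(muppy.get_objects())   # snapshot before the allocation
s = "str"
after = summary.summarize(muppy.get_objects())    # snapshot after the allocation
summary.print_(summary.get_diff(before, after))   # prints the delta, like print_diff()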
Example #9
def add_single_experiment(directory, df_filepath, datasets):
    csv_filenames = [
        f for f in os.listdir(directory)
        if f[0:5] == "resul" and f[-4:] == ".csv"
    ]
    print(list(os.walk(directory)))
    weights_folder = [f for f in os.walk(directory)][0][1][0]
    for filename in csv_filenames:
        experiment = directory_to_experiment_info(directory)
        with open("./" + directory + "/" + filename, "r") as csvfile:
            logging.debug("./" + directory + "/" + filename)
            plots = csv.reader(csvfile, delimiter=",")
            headers = next(plots, None)
            for header in headers:
                experiment[header] = []
            for row in plots:
                for i, header in enumerate(headers):
                    experiment[header].append(float(row[i]))

        experiment["minimum_val_loss"] = min(experiment["val_loss"])
        experiment["minimum_loss"] = min(experiment["loss"])
        number = int(filename.split("-")[-1].split(".")[0])
        experiment["number"] = number
        experiment["repeat"] = math.floor(number / 4)
        if (number % 4) / 2 < 1:
            experiment["elastic_distortions"] = True
        else:
            experiment["elastic_distortions"] = False
        if (number % 4) % 2 != 0:
            experiment["separate_channel_ops"] = True
        else:
            experiment["separate_channel_ops"] = False
        print(experiment.keys())

        if not experiment_in_dataframe(df_filepath, experiment):
            weights_file = (directory + "/" + weights_folder + "/" +
                            "mask_rcnn_fk2018_best.h5")
            config = FK2018.FKConfig()
            print("got config")

            class InferenceConfig(config.__class__):
                # Run detection on one image at a time
                GPU_COUNT = 1
                IMAGES_PER_GPU = 1

            config = InferenceConfig()
            config.display()
            # Device to load the neural network on.
            # Useful if you're training a model on the same
            # machine, in which case use CPU and leave the
            # GPU for training.
            # DEVICE = "/gpu:0"  # /cpu:0 or /gpu:0

            # Inspect the model in training or inference modes
            # values: 'inference' or 'training'
            # TODO: code for 'training' test mode not ready yet
            TEST_MODE = "inference"

            # Must call before using the dataset

            # with tf.device(DEVICE):
            tf_config = tf.ConfigProto()
            tf_config.gpu_options.allow_growth = True
            with tf.Session(config=tf_config).as_default():
                model = modellib.MaskRCNN(mode="inference",
                                          model_dir="./log",
                                          config=config)

                # Load weights
                logging.debug("Loading weights " + str(weights_file))
                model.load_weights(weights_file, by_name=True)
                image_ids = datasets[0].image_ids

                tracker = SummaryTracker()
                logging.debug("getting stats")
                logging.debug("tunasand stats")

                experiment["AP_list"], \
                experiment["classless_AP_list"], \
                experiment["precision_list"], \
                experiment["classless_precision_list"], \
                experiment["recall_list"], \
                experiment["classless_recall_list"], \
                experiment["predicted_class_list"], \
                experiment["gt_class_list"], \
                experiment["predicted_size_list"], \
                experiment["gt_size_list"], \
                experiment["overlaps"], \
                experiment["classless_overlaps_list"], \
                experiment["total_predicted_pixels"], \
                experiment["total_groundtruth_pixels"], \
                experiment["total_overlapping_pixels"] = compute_both_batch_aps(image_ids, datasets[0], model, config) #get_stats(weights_file, datasets[0])

                objgraph.show_most_common_types()
                roots = objgraph.get_leaking_objects()
                print(len(roots))
                tracker.print_diff()

                for i, dataset in enumerate(
                    ["AE_area1", "AE_area2", "AE_area3"]):
                    image_ids = datasets[i + 1].image_ids
                    logging.debug(f"aestats, {dataset}")
                    experiment[f"AP_list_{dataset}"], \
                    experiment[f"classless_AP_list_{dataset}"], \
                    experiment[f"precision_list_{dataset}"], \
                    experiment[f"classless_precision_list_{dataset}"], \
                    experiment[f"recall_list_{dataset}"], \
                    experiment[f"classless_recall_list_{dataset}"], \
                    experiment[f"predicted_class_list_{dataset}"], \
                    experiment[f"gt_class_list_{dataset}"], \
                    experiment[f"predicted_size_list_{dataset}"], \
                    experiment[f"gt_size_list_{dataset}"], \
                    experiment[f"overlaps_{dataset}"], \
                    experiment[f"classless_overlaps_list_{dataset}"], \
                    experiment[f"total_predicted_pixels_{dataset}"], \
                    experiment[f"total_groundtruth_pixels_{dataset}"], \
                    experiment[f"total_overlapping_pixels_{dataset}"]  = compute_both_batch_aps(image_ids, datasets[i+1], model, config) #get_stats(weights_file, datasets[i+1])
                    objgraph.show_growth()
                    roots = objgraph.get_leaking_objects()
                    print(len(roots))
                    tracker.print_diff()

            update_dataframe(df_filepath, experiment)
        else:
            print("already in dataframe, skipping " + filename)
Example #10
import sys, os
import time
from pympler.tracker import SummaryTracker
import testsuites
import flaminglog.logrule as flaminglog

#runs a memory profiler
tracker = SummaryTracker()

tracker.print_diff()
for i in range(100):
    # runs all tests except cleanup
    testsuites.run_full_suite(0)
    if i % 10 == 0:
        tracker.print_diff()

#sleep to allow any garbage collection to fully finish
time.sleep(10)

tracker.print_diff()
print("If the '# objects' and the 'total size' are both low numbers then"
        " there is no memory leak")


Example #11
class GitHubLoader(Cmd):
    """
    GitHub SSH keys loader
    """
    prompt = '$> '

    LINK_FACTOR = 70
    USER_URL = 'https://api.github.com/users/%s'
    USERS_URL = 'https://api.github.com/users?since=%s'
    KEYS_URL = 'https://api.github.com/users/%s/keys'
    KEYS_ID_URL = 'https://api.github.com/user/%s/keys'

    def __init__(self,
                 attempts=5,
                 threads=1,
                 state=None,
                 state_file=None,
                 config_file=None,
                 audit_file=None,
                 max_mem=None,
                 users_only=False,
                 merge=False,
                 update_keys=False,
                 *args,
                 **kwargs):

        Cmd.__init__(self, *args, **kwargs)
        self.t = Terminal()
        self.trace_logger = Tracelogger(logger=logger)

        self.attempts = int(attempts)
        self.total = None
        self.terminate = False
        self.since_id = 0
        self.last_users_count = None
        self.user_lock = Lock()
        self.processed_user_set = set()
        self.processed_user_set_lock = Lock()

        self.max_mem = max_mem
        self.merge = merge
        self.update_keys = update_keys
        self.users_only = users_only
        self.users_per_page = 30
        self.users_bulk_load_pages = 500
        self.user_load_bulk = 5000
        self.user_refill_lock = Lock()
        self.state = state
        self.state_file_path = state_file
        self.rate_limit_reset = None
        self.rate_limit_remaining = None

        self.config = None
        self.config_file = config_file

        self.audit_file = audit_file
        self.audit_records_buffered = []
        self.audit_lock = Lock()

        self.stop_event = threading.Event()
        self.threads = int(threads)
        self.link_queue = Queue.PriorityQueue()  # Store links to download here
        self.worker_threads = []

        self.state_thread = None
        self.state_thread_lock = Lock()

        self.resources_list = []
        self.resources_queue = Queue.PriorityQueue()
        self.local_data = threading.local()

        self.new_users_events = EvtDequeue()
        self.new_keys_events = EvtDequeue()

        self.db_config = None
        self.engine = None
        self.session = None

        self.mem_tracker = None

    def signal_handler(self, signal, frame):
        """
        Signal handler - terminate gracefully
        :param signal:
        :param frame:
        :return:
        """
        logger.info('CTRL+C pressed')
        self.trigger_stop()

    def trigger_stop(self):
        """
        Sets termination conditions to true
        :return:
        """
        self.terminate = True
        self.stop_event.set()

    def trigger_quit(self):
        """
        Terminal condition & file change
        :return:
        """
        self.trigger_stop()
        utils.try_touch('.github-quit')

    #
    # CMD handlers
    #

    def do_quit(self, arg):
        self.trigger_quit()
        logger.info('Waiting for thread termination')

        time.sleep(1)
        logger.info('Quitting')
        return Cmd.do_quit(self, arg)

    def do_reset(self, line):
        print('\033c')

    def do_gc(self, line):
        gc.collect()

    def do_mem_top(self, line):
        print(mem_top.mem_top())

    def do_mem_track_init(self, line):
        self.mem_tracker = SummaryTracker()

    def do_mem_track_diff(self, line):
        self.mem_tracker.print_diff()

    def do_mem_track_deinit(self, line):
        self.mem_tracker = None

    def do_mem(self, line):
        print('Memory usage: %s kB' %
              resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

    def do_state(self, line):
        js = self.state_gen()

        if line is None or len(line) == 0:
            del js['link_queue']
            del js['resource_stats']
        elif line == '1':
            del js['link_queue']

        print(json.dumps(js, indent=2))

    def do_deq_enable(self, line):
        self.new_keys_events.disabled = False
        self.new_users_events.disabled = False

    def do_deq_disable(self, line):
        self.new_keys_events.disabled = True
        self.new_users_events.disabled = True

    #
    # Init
    #

    def init_config(self):
        """
        Loads config & state files
        :return:
        """
        if self.state_file_path is not None and os.path.exists(
                self.state_file_path):
            with open(self.state_file_path, 'r') as fh:
                self.state = json.load(fh, object_pairs_hook=OrderedDict)
                logger.info('State loaded: %s' %
                            os.path.abspath(self.state_file_path))

        with open(self.config_file, 'r') as fh:
            self.config = json.load(fh, object_pairs_hook=OrderedDict)
            logger.info('Config loaded: %s' %
                        os.path.abspath(self.config_file))

            if 'since_id' in self.config:
                self.since_id = self.config['since_id']

            # Process resources - randomized
            if 'res' in self.config:
                res_tmp = self.config['res']
                random.shuffle(res_tmp)
                for idx, res in enumerate(res_tmp):
                    r = AccessResource(usr=res['usr'],
                                       token=res['token'],
                                       idx=idx)
                    self.resources_list.append(r)
                    self.resources_queue.put(r)
                    logger.info('Resource %02d loaded: %s' % (idx, r.usr))
            else:
                # unauth
                r = AccessResource(usr=None, token=None)
                self.resources_list.append(r)
                self.resources_queue.put(r)

    def init_db(self):
        """
        Initializes database engine & session.
        Has to be done on main thread.
        :return:
        """
        self.db_config = databaseutils.process_db_config(self.config['db'])

        from sqlalchemy import create_engine
        from sqlalchemy.orm import sessionmaker, scoped_session
        self.engine = create_engine(self.db_config.constr, pool_recycle=3600)
        self.session = scoped_session(sessionmaker(bind=self.engine))

        # Make sure tables are created
        DB_Base.metadata.create_all(self.engine)

    def init_workers(self):
        """
        Initialize worker threads
        :return:
        """
        logger.info('Starting %d working threads' % self.threads)
        for idx in range(self.threads):
            t = threading.Thread(target=self.work_thread_main, args=(idx, ))
            self.worker_threads.append(t)

        # Kick-off all threads
        for t in self.worker_threads:
            t.start()

        logger.info('Worker threads started')

    def cli(self):
        """
        CLI thread
        :return:
        """
        logger.info('CLI thread started')
        self.cmdloop()
        logger.info('Terminating CLI thread')

    #
    # Operation
    #

    def work(self):
        """
        Main thread work method
        :return:
        """
        # Interrupt signals
        signal.signal(signal.SIGINT, self.signal_handler)

        self.init_config()
        self.init_db()

        # Resume last state
        self.state_resume()

        # Monitor threads.
        self.state_thread = threading.Thread(target=self.state_main, args=())
        self.state_thread.start()

        # If there is no link to process - create from since.
        if self.link_queue.qsize() == 0:
            self.kickoff_links()

        # Worker threads
        self.init_workers()

        logger.info('Main thread started %s %s %s' %
                    (os.getpid(), os.getppid(), threading.current_thread()))

        # CLI thread
        cli_thread = threading.Thread(target=self.cli, args=())
        cli_thread.setDaemon(True)
        cli_thread.start()

        # Join on workers
        self.after_loop()
        logger.info('Terminating main thread')
        return None

    def after_loop(self, wait_for_state=True):
        """
        After work loop finishes
        :return:
        """
        logger.info('Waiting termination of slave threads')

        # Wait here for termination of all workers and monitors.
        try:
            for t in self.worker_threads:
                t.join()

            if wait_for_state:
                self.state_thread.join()
        except:
            logger.error('Exception during thread join')
            logger.error(traceback.format_exc())

        logger.info('All threads terminates, last state save')
        self.state_save()

    def work_thread_main(self, idx):
        """
        Worker thread main loop
        :return:
        """
        self.local_data.idx = idx
        logger.info('Working thread %d started' % idx)

        while not self.terminate and not self.stop_event.is_set():
            self.interruptible_sleep_delta(0.1)

            # Get credential to process link with
            resource = self.resource_allocate()
            if resource is None:
                continue

            # We have resource, now get the job
            job = None
            try:
                job = self.link_queue.get(True, timeout=1.0)
            except Queue.Empty:
                self.resource_return(resource)
                continue

            # If job last fail is too recent - put again back to queue
            if time.time() - job.last_fail < 3.0:
                self.link_queue.put(
                    job
                )  # re-insert to the back of the queue for later processing
                self.resource_return(resource)
                continue

            # Job processing starts here - fetch data page with the resource.
            js_data = None
            try:
                self.local_data.job = job
                self.local_data.resource = resource
                js_data, headers, raw_response = self.load_page_local()

            except RateLimitHit as e:
                logger.error(
                    '[%d] Rate limit hit: %s, failcnt: %d, res: %s, exception: %s'
                    % (idx, job.url, job.fail_cnt, resource.usr, e))
                continue

            except Exception as e:
                logger.error(
                    '[%d] Exception in processing job: %s, failcnt: %d, res: %s, exception: %s'
                    % (idx, job.url, job.fail_cnt, resource.usr, e))

                self.on_job_failed(job)
                continue

            finally:
                self.resource_return(resource)
                self.local_data.resource = None
                self.local_data.last_usr = resource.usr
                self.local_data.last_remaining = resource.remaining

            # Process downloaded data here.
            try:
                if js_data is None:
                    self.audit_log('404', job.url, jtype=job.type, job=job)
                    self.flush_audit()
                    continue

                if job.type == DownloadJob.TYPE_USERS:
                    self.process_users_data(job, js_data, headers,
                                            raw_response)
                else:
                    self.process_keys_data(job, js_data, headers, raw_response)

            except Exception as e:
                logger.error(
                    '[%d] Unexpected exception, processing type %s, link %s: cnt: %d, res: %s, %s'
                    % (idx, job.type, job.url, job.fail_cnt, resource.usr, e))

                self.trace_logger.log(e)
                self.on_job_failed(job)
            finally:
                self.local_data.resource = None
                self.local_data.job = None
                self.local_data.last_usr = None
                self.local_data.last_remaining = None
                resource = None
                job = None
                headers = None
                raw_response = None

        pass
        logger.info('Terminating worker thread %d' % idx)

    def on_job_failed(self, job):
        """
        If job failed, this teaches it how to behave
        :param job:
        :return:
        """
        job.fail_cnt += 1
        job.last_fail = time.time()

        # if failed too many times - log and discard.
        if job.fail_cnt > 35:
            logger.warning('Job failed too many times %s' % job.url)
            self.audit_log('too-many-fails', job.url, jtype=job.type, job=job)
            self.flush_audit()
        else:
            self.link_queue.put(
                job)  # re-insert to the queue for later processing

    def load_page_local(self):
        """
        Loads page stored in thread local
        :return:
        """

        auth = None
        resource = self.local_data.resource
        if resource.usr is not None:
            auth = HTTPBasicAuth(resource.usr, resource.token)

        job = self.local_data.job

        res = requests.get(job.url, timeout=10, auth=auth)
        headers = res.headers

        resource.reset_time = utils.try_float(headers.get('X-RateLimit-Reset'))
        resource.remaining = utils.try_int(
            headers.get('X-RateLimit-Remaining'))
        resource.last_used = time.time()
        resource.used_cnt += 1

        if res.status_code == 403 and resource.remaining is not None and resource.remaining < 10:
            resource.fail_cnt += 1
            raise RateLimitHit

        if res.status_code == 404:
            resource.fail_cnt += 1
            logger.warning('URL not found: %s' % job.url)
            return None, None, None

        if res.status_code // 100 != 2:
            resource.fail_cnt += 1
            res.raise_for_status()

        data = res.content
        if data is None:
            resource.fail_cnt += 1
            raise Exception('Empty response')

        js = json.loads(data, object_pairs_hook=OrderedDict)
        return js, headers, res

    def resource_allocate(self, blocking=True, timeout=1.0):
        """
        Takes resource from the pool.
        If the resource has low remaining credit, thread is suspended to re-charge.
        :return: resource or None if not available in the time
        """
        try:
            resource = self.resources_queue.get(True, timeout=1.0)
            if resource.remaining is not None and resource.remaining <= self.threads + 2:
                sleep_sec = resource.reset_time - time.time(
                ) if resource.reset_time is not None else 15 * 60
                sleep_sec += 120  # extra 2 minutes to avoid problems with resources

                logger.info(
                    'Rate limit exceeded on resource %s, remaining: %s, sleeping till: %s, it is %d seconds, '
                    '%d minutes' %
                    (resource.usr, resource.remaining, resource.reset_time,
                     sleep_sec, sleep_sec / 60.0))
                self.sleep_interruptible(time.time() + sleep_sec)
                logger.info('Resource sleep finished %s' % resource.usr)

                # Reset estimations, needs to be refreshed
                resource.remaining = None
                resource.reset_time = None

            return resource

        except Queue.Empty:
            return None

    def resource_return(self, res):
        """
        Returns resource to the pool
        :param res:
        :return:
        """
        self.resources_queue.put(res)

    def sleep_interruptible(self, until_time):
        """
        Interruptible sleep - sleep until given time.
        :param until_time:
        :return:
        """
        while time.time() <= until_time:
            time.sleep(1.0)
            if self.terminate or self.stop_event.is_set():
                return

    def interruptible_sleep_delta(self, sleep_time):
        """
        Sleeps the current thread for given amount of seconds, stop event terminates the sleep - to exit the thread.
        :param sleep_time:
        :return:
        """
        if sleep_time is None:
            return

        sleep_time = float(sleep_time)

        if sleep_time == 0:
            return

        sleep_start = time.time()
        while not self.stop_event.is_set() and not self.terminate:
            time.sleep(0.1)
            if time.time() - sleep_start >= sleep_time:
                return

    #
    # Parser and processing logic
    #

    def kickoff_links(self):
        """
        Kick off the scraping by adding initial links to the queue
        :return:
        """
        if self.update_keys:
            self.fill_user_key_links()

        else:
            job = DownloadJob(url=self.USERS_URL % self.since_id,
                              jtype=DownloadJob.TYPE_USERS)
            self.link_queue.put(job)
            logger.info('Kickoff link added: %s' % job.url)

    def process_users_data(self, job, js, headers, raw_response):
        """
        Process user data - produce keys links + next user link
        :param job:
        :param js:
        :param headers:
        :param raw_response:
        :return:
        """
        max_id = 0
        github_users = []
        cur_time = int(time.time())
        for user in js:
            if 'id' not in user:
                logger.error('Field ID not found in user')
                continue

            github_user = GitHubUser(user_id=int(user['id']),
                                     user_name=user['login'],
                                     user_type=user['type'],
                                     user_url=user['url'])
            github_users.append(github_user)

            if github_user.user_id > max_id:
                max_id = github_user.user_id

            if self.users_only:
                continue

            key_url = '%s/keys' % github_user.user_url
            new_job = DownloadJob(url=key_url,
                                  jtype=DownloadJob.TYPE_KEYS,
                                  user=github_user,
                                  priority=random.randint(0, 1000),
                                  time_added=cur_time)
            self.link_queue.put(new_job)

        # Link with the maximal user id
        users_url = self.USERS_URL % max_id
        new_job = DownloadJob(url=users_url,
                              jtype=DownloadJob.TYPE_USERS,
                              time_added=cur_time)

        # Optimizing the position of this link in the link queue
        queue_size = self.link_queue.qsize()
        queue_size_max = self.LINK_FACTOR * self.threads
        fill_up_ratio = queue_size / float(queue_size_max)

        # Key jobs are uniformly distributed on priorities 0...1000.
        # To increase the queue size pick a priority closer to 1000; to decrease it, closer to 0
        priority = random.randint(0, 500)
        if queue_size < queue_size_max:
            priority = int((1 - fill_up_ratio) * 5000) + 500
        if queue_size > 3 * queue_size_max:
            priority = 0

        new_job.priority = priority
        lucky_one = False
        with self.user_lock:
            if self.since_id < max_id:
                self.since_id = max_id
                self.link_queue.put(new_job)
                lucky_one = True

                # Bulk user optimisation - add more users at once, multithreading
                if self.users_only:
                    self.bulk_user_only_load(max_id=max_id,
                                             cur_time=cur_time,
                                             priority=priority)

        logger.info(
            '[%02d, usr=%20s, remaining=%5s] Processed users link %s, Next since: %3s. ResQSize: %4d, '
            'LQSize: %4d, fill-up: %0.4f, priority: %4s, ram: %s kB, new=%s, New users: [%s]'
            % (self.local_data.idx, self.local_data.last_usr,
               self.local_data.last_remaining, len(github_users) + 1, max_id,
               self.resources_queue.qsize(), queue_size, fill_up_ratio,
               priority, resource.getrusage(resource.RUSAGE_SELF).ru_maxrss,
               lucky_one, ', '.join([str(x.user_name) for x in github_users])))

        # Store all users.
        self.store_users_list(github_users)

    def bulk_user_only_load(self, max_id, cur_time, priority):
        """
        Fills the queue with multiple user links
        :param max_id:
        :param cur_time:
        :param priority:
        :return:
        """
        new_jobs = []
        for page_idx in range(1, self.users_bulk_load_pages + 1):
            users_url = self.USERS_URL % (max_id +
                                          self.users_per_page * page_idx)
            new_job = DownloadJob(url=users_url,
                                  jtype=DownloadJob.TYPE_USERS,
                                  time_added=cur_time,
                                  priority=priority - page_idx)
            new_jobs.append(new_job)

        self.since_id = max_id + self.users_per_page * self.users_bulk_load_pages
        logger.info('[%02d] Bulk load, new max=%s' %
                    (self.local_data.idx, self.since_id))

        for job in new_jobs:
            self.link_queue.put(job)

    def store_users_list(self, users):
        """
        Stores all users in the list
        :param users
        :return:
        """
        # Handling gaps in the user space ID. With user-only optimization it causes
        # overlaps.
        reduced_by = 0
        with self.processed_user_set_lock:
            ids = [user.user_id for user in users]
            ids_ok = []
            for id in ids:
                if id in self.processed_user_set:
                    reduced_by += 1
                    continue
                self.processed_user_set.add(id)
                ids_ok.append(id)
            users = [user for user in users if user.user_id in ids_ok]

        # Bulk user load
        s = self.session()
        id_list = sorted([user.user_id for user in users])
        db_users = s.query(GitHubUserDb).filter(
            GitHubUserDb.id.in_(id_list)).all()
        db_user_map = {user.id: user for user in db_users}

        for user in users:
            self.new_users_events.insert()

            # Store user to the DB
            try:
                db_user = utils.defvalkey(db_user_map, key=user.user_id)
                self.store_user(user, s, db_user=db_user, db_user_loaded=True)

            except Exception as e:
                logger.warning('[%02d] Exception in storing user %s' %
                               (self.local_data.idx, e))
                self.trace_logger.log(e)
                logger.info('[%02d] idlist: %s' %
                            (self.local_data.idx, id_list))
                self.trigger_quit()
                break

        try:
            s.commit()
            # logger.info('[%02d] Commited, reduced by: %s' % (self.local_data.idx, reduced_by))
        except Exception as e:
            logger.warning('[%02d] Exception in storing bulk users' %
                           self.local_data.idx)
            logger.warning(traceback.format_exc())
            logger.info('[%02d] idlist: %s' % (self.local_data.idx, id_list))
            self.trigger_quit()
        finally:
            utils.silent_close(s)

    def process_keys_data(self, job, js, headers, raw_response):
        """
        Processing key loaded data
        :param job:
        :param js:
        :param headers:
        :param raw_response:
        :return:
        """
        js_empty = js is None or len(js) == 0

        # Expect failures, commit everything before
        if self.merge and not js_empty:
            try:
                s = self.session()
                s.commit()
            except Exception as e:
                logger.warning('Could not pre-commit: %s' % e)

        # Store each key.
        key_ids = []
        for key in js:
            s = None
            self.new_keys_events.insert()

            try:
                s = self.session()
                self.store_key(job.user, key, s)
                key_ids.append(int(key['id']))
                s.commit()

                self.assoc_key(job.user.user_id, key['id'], s)
                s.commit()

                s.flush()  # writes changes to DB
                s.expunge_all()  # removes objects from session

            except Exception as e:
                logger.warning('Exception in storing key %s' % e)
                self.trace_logger.log(e)

            finally:
                utils.silent_close(s)
                s = None

        # Deassoc lost keys
        try:
            s = self.session()
            self.deassoc_lost_keys(job.user.user_id, key_ids, s)
            s.commit()

        except Exception as e:
            logger.warning('Exception in deassoc for users %s : %s' %
                           (job.user.user_id, e))
            self.trace_logger.log(e)

        finally:
            utils.silent_close(s)
            s = None

        self.on_keys_processed()

    def on_keys_processed(self):
        """
        Event called when keys have been processed.
        Currently used to re-fill the link queue in the update key scenario.
        :return:
        """
        if not self.update_keys:
            return

        with self.user_refill_lock:
            qsize = self.link_queue.qsize()
            if qsize < 30:
                self.fill_user_key_links()

    def store_user(self, user, s, db_user=None, db_user_loaded=False):
        """
        Stores username to the database.
        :param user:
        :return:
        """
        type_id = 0
        if user.user_type == 'User':
            type_id = 1
        elif user.user_type == 'Organization':
            type_id = 2

        try:
            if not db_user_loaded:
                db_user = s.query(GitHubUserDb).filter(
                    GitHubUserDb.id == user.user_id).one_or_none()
            if db_user is not None:
                db_user.date_last_check = salch.func.now()
                db_user.usr_type = type_id
                s.merge(db_user)
                return 0

        except Exception as e:
            self.trace_logger.log(e)
            logger.warning('User query problem: %s' % e)

        # Store a new user here
        try:
            db_user = GitHubUserDb()
            db_user.id = user.user_id
            db_user.username = user.user_name
            db_user.usr_type = type_id
            s.add(db_user)
            return 0

        except Exception as e:
            self.trace_logger.log(e)
            logger.warning('[%02d] Exception during user store: %s' %
                           (self.local_data.idx, e))
            if db_user_loaded:
                raise
            return 1

    def load_existing_key(self, key, s):
        """
        Loads existing key if exists
        :param key:
        :param s:
        :return:
        """
        key_id = int(key['id'])
        return s.query(GitHubKey).filter(GitHubKey.id == key_id).one_or_none()

    def store_key(self, user, key, s):
        """
        Stores user key to the database.
        :param user:
        :param key:
        :param s: current DB session
        :return:
        """

        # Loading phase
        existing_key = None
        try:
            if self.merge or self.update_keys:
                existing_key = self.load_existing_key(key, s)

        except Exception as e:
            logger.warning('Exception: %s' % e)

        # Storing phase
        try:
            if existing_key is not None:
                existing_key.date_last_check = salch.func.now()
                s.merge(existing_key)
                return existing_key

            key_id = int(key['id'])
            key_raw = key['key']

            key_type, key_val = [utils.strip(x) for x in key_raw.split(' ', 1)]

            db_key = GitHubKey()
            db_key.id = key_id
            db_key.key_id = key_id
            db_key.key_type = key_type
            db_key.key_user_found = user.user_name
            db_key.key_user_id_found = user.user_id
            db_key.text_raw = key_raw

            if key_type == 'ssh-rsa':
                try:
                    key_obj = utils.load_ssh_pubkey(key_raw)
                    if isinstance(key_obj, RSAPublicKey):
                        db_key.key_size = key_obj.key_size
                        numbers = key_obj.public_numbers()
                        db_key.key_modulus_hex = '%x' % numbers.n
                        db_key.key_exponent = numbers.e
                except Exception as e:
                    logger.info('Exception during processing the key[%s]: %s' %
                                (key_type, e))

            s.add(db_key)
            return db_key

        except Exception as e:
            utils.silent_rollback(s)
            logger.warning('Exception during key store: %s' % e)
            return 1

    def assoc_key(self, user_id, key_id, s):
        """
        Association user <-> key
        :param user_id:
        :param key_id:
        :param s:
        :return:
        """
        try:
            uassoc = None
            if self.merge:
                uassoc = s.query(GitHubUserKeys)\
                    .filter(GitHubUserKeys.user_id == user_id)\
                    .filter(GitHubUserKeys.key_id == key_id)\
                    .one_or_none()

            if uassoc is not None:
                uassoc.lost_at = None
                return

            if uassoc is None:
                uassoc = GitHubUserKeys()
                uassoc.user_id = user_id
                uassoc.key_id = key_id
                uassoc.fount_at = salch.func.now()
                uassoc.lost_at = None
                s.add(uassoc)

            return 0

        except Exception as e:
            utils.silent_rollback(s)
            logger.warning('Exception during key assoc: %s' % e)
            return 1

    def deassoc_lost_keys(self, user_id, key_id, s):
        """
        Mark all lost keys as lost for the given user (not in key_id)
        :param user_id:
        :param key_id:
        :param s:
        :return:
        """
        try:
            q = s.query(GitHubUserKeys)\
                .filter(GitHubUserKeys.user_id == user_id)

            if len(key_id) > 0:
                q = q.filter(GitHubUserKeys.key_id.notin_(key_id))

            q.update({GitHubUserKeys.lost_at: salch.func.now()},
                     synchronize_session='fetch')
            return 0

        except Exception as e:
            utils.silent_rollback(s)
            logger.warning('Exception during key deassoc: %s' % e)
            return 1

    def flush_state(self):
        """
        Flushes state/config to the state file
        :return:
        """
        self.state['since_id'] = self.since_id
        self.state['rate_limit_remaining'] = self.rate_limit_remaining
        self.state['rate_limit_reset'] = self.rate_limit_reset
        utils.flush_json(self.state, self.state_file_path)

    #
    # DB
    #

    def fill_user_key_links(self):
        """
        Loads next X users from the database, advances since_id
        :return:
        """
        # self.since_id
        s = self.session()
        try:
            db_users = s.query(GitHubUserDb)\
                .filter(GitHubUserDb.id > self.since_id)\
                .order_by(GitHubUserDb.id)\
                .limit(self.user_load_bulk)\
                .all()

            for user in db_users:
                key_url = self.KEYS_ID_URL % user.id
                github_user = GitHubUser(user_id=user.id,
                                         user_name=user.username,
                                         user_type=user.usr_type,
                                         user_url=self.USER_URL %
                                         user.username)
                new_job = DownloadJob(url=key_url,
                                      jtype=DownloadJob.TYPE_KEYS,
                                      user=github_user,
                                      priority=random.randint(0, 1000),
                                      time_added=time.time())
                self.link_queue.put(new_job)

                if user.id > self.since_id:
                    self.since_id = user.id

        except Exception as e:
            logger.warning('Exception in loading users: %s' % e)
            utils.silent_rollback(s)

        finally:
            utils.silent_close(s)

    #
    # Auditing - errors, problems for further analysis
    #

    def audit_log(self, evt=None, link=None, jtype=None, job=None):
        """
        Appends audit log to the buffer. Lock protected.
        :param evt:
        :param link:
        :return:
        """
        log = collections.OrderedDict()
        log['time'] = time.time()
        log['evt'] = evt
        log['jtype'] = jtype
        log['link'] = link

        if job is not None and isinstance(job, DownloadJob):
            log['job'] = job.to_json()

        with self.audit_lock:
            self.audit_records_buffered.append(log)

    def flush_audit(self):
        """
        Flushes audit logs to the JSON append only file.
        Routine protected by the lock (no new audit record can be inserted while holding the lock)
        :return:
        """
        if self.audit_file is None:
            self.audit_records_buffered = []
            return

        self.audit_lock.acquire()
        try:
            if len(self.audit_records_buffered) == 0:
                return
            with open(self.audit_file, 'a') as fa:
                for x in self.audit_records_buffered:
                    fa.write(json.dumps(x) + "\n")
            self.audit_records_buffered = []
        except Exception as e:
            logger.error('Exception in audit log dump %s' % e)
        finally:
            self.audit_lock.release()

    #
    # State save / resume
    #

    def state_main(self):
        """
        State thread - periodical dump of the queues.
        :return:
        """
        logger.info('State thread started %s %s %s' %
                    (os.getpid(), os.getppid(), threading.current_thread()))
        try:
            while not self.stop_event.is_set() and not self.terminate:
                # Dump stats each x seconds
                # Sleep is here because of dumping the state for the last time just before program quits.
                self.interruptible_sleep_delta(2)
                self.state_save()

                # Check memory conditions
                self.state_ram_check()

        except Exception as e:
            self.trace_logger.log(e)
            logger.error('Exception in state: %s' % e)

        finally:
            pass

        logger.info('State loop terminated')

    def state_ram_check(self):
        """
        Checks memory terminating conditions
        :return:
        """

        if self.max_mem is None:
            return

        cur_ram = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        if cur_ram <= self.max_mem:
            return

        logger.warning(
            'Maximum memory threshold reached: %s kB = %s MB, threshold = %s kB'
            % (cur_ram, cur_ram / 1024.0, self.max_mem))
        self.trigger_stop()

    def state_gen(self):
        """
        Dumps state
        :return:
        """
        try:
            js_q = collections.OrderedDict()
            js_q['gen'] = time.time()
            js_q['link_size'] = self.link_queue.qsize()
            js_q['since_id'] = self.since_id
            js_q['memory'] = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss

            # Dequeues
            self.new_users_events.maintain()
            self.new_keys_events.maintain()

            users_in_5min = self.new_users_events.under_limit(5 * 60)
            keys_in_5min = self.new_keys_events.under_limit(5 * 60)

            js_q['users_dequeue_size'] = self.new_users_events.len()
            js_q['keys_dequeue_size'] = self.new_keys_events.len()
            js_q['users_5min'] = users_in_5min
            js_q['keys_5min'] = keys_in_5min
            js_q['users_1min'] = users_in_5min / 5.0
            js_q['keys_1min'] = keys_in_5min / 5.0

            # link queue structure
            qdata = list(self.link_queue.queue)
            qdata.sort(cmp=DownloadJob.cmp)
            js_q['link_structure'] = ''.join([
                '.' if x.type == DownloadJob.TYPE_KEYS else 'U' for x in qdata
            ])

            # Stats.
            js_q['resource_stats'] = [
                x.to_json() for x in list(self.resources_list)
            ]

            # Finally - the queue
            js_q['link_queue'] = [x.to_json() for x in qdata]
            return js_q

        except Exception as e:
            self.trace_logger.log(e)
            logger.error('Exception in state: %s', e)

    def state_save(self):
        """
        saves the state
        :return:
        """
        try:
            js_q = self.state_gen()
            utils.flush_json(js_q, self.state_file_path)

        except Exception as e:
            self.trace_logger.log(e)
            logger.error('Exception in state: %s', e)

    def state_resume(self):
        """
        Attempts to resume the queues from the monitoring files
        :return:
        """
        try:
            if self.state is None:
                return

            if 'since_id' in self.state:
                self.since_id = self.state['since_id']

            if 'link_queue' in self.state:
                for rec in self.state['link_queue']:
                    job = DownloadJob.from_json(rec)
                    self.link_queue.put(job)
                logger.info('Link queue resumed, entries: %d' %
                            len(self.state['link_queue']))

        except Exception as e:
            self.trace_logger.log(e)
            logger.warning('Exception in resuming the state %s' % e)
            logger.error('State resume failed, exiting')
            sys.exit(1)
Example #12
def test_functionality():
    n_tests_passed = 0
    round = 0

    while round < 50:  # multiple runs required to check for memory leaks
        if round == 0:
            tracker = SummaryTracker()

        round += 1

        a = GenericSystem.new()
        b = a.Plant

        # Test setting values with correct types

        a.Plant.derate = 1
        assert (a.Plant.derate == 1)
        assert (b.derate == 1)

        n_tests_passed += 1

        b.energy_output_array = (1, 2)
        assert (a.Plant.energy_output_array == (1, 2))
        assert (b.energy_output_array == (1, 2))
        n_tests_passed += 1

        # Test type checks with errors

        try:
            c = GenericSystem.new()
            c.Plant.energy_output_array = 1
        except:
            n_tests_passed += 1

        try:
            c = GenericSystem.new()
            c.Plant.energy_output_array = (1, "2")
        except:
            n_tests_passed += 1

        # Test assigning from dictionary

        PlantDict = {'derate': 10, 'energy_output_array': (10, 20)}

        b.assign(PlantDict)
        assert (b.derate == 10)
        assert (b.energy_output_array == (10, 20))
        n_tests_passed += 1

        PlantDict = {'derate': 1, 'energy_output_array': (2, 2)}
        a.Plant.assign(PlantDict)
        assert (a.Plant.derate == 1 and a.Plant.energy_output_array == (2, 2))
        n_tests_passed += 1

        PlantDict = {'derate': 10, 'energy_output_array': (10, 20)}

        try:
            c = GenericSystem.new()
            PlantDict['energy_output_array'] = ()
            c.Plant.assign(PlantDict)
        except:
            n_tests_passed += 1

        try:
            c = GenericSystem.new()
            PlantDict['energy_output_array'] = ((12, 20), (1, 1))
            c.Plant.assign(PlantDict)
        except:
            n_tests_passed += 1

        try:
            c = GenericSystem.new()
            PlantDict['derate'] = "derate"
            PlantDict['energy_output_array'] = (1, 2)
            c.Plant.assign(PlantDict)
        except:
            n_tests_passed += 1

        # exporting to dictionary

        ValDict = b.export()
        assert (ValDict['derate'] == 1
                and ValDict['energy_output_array'] == (2, 2))
        n_tests_passed += 1

        # Test shared module (AdjustmentFactors)
        d = a.AdjustmentFactors

        d.constant = 1
        assert (d.constant == 1)
        n_tests_passed += 1

        d.hourly = (1, 2)
        assert (d.hourly == (1, 2))
        n_tests_passed += 1

        d.periods = ((1, 2), (3, 4))
        assert (d.periods == ((1, 2), (3, 4)))
        n_tests_passed += 1

        try:
            d.periods = ((1, 2))
        except:
            n_tests_passed += 1

        ValDict = d.export()
        assert (ValDict['constant'] == 1 and ValDict['hourly'] == (1, 2)
                and ValDict['periods'] == ((1, 2), (3, 4)))
        n_tests_passed += 1

        ValDict = {
            'constant': 10,
            "hourly": (10, 20),
            "periods": ((10, 20), (30, 40))
        }
        d.assign(ValDict)
        assert (ValDict['constant'] == 10 and ValDict['hourly'] == (10, 20)
                and ValDict['periods'] == ((10, 20), (30, 40)))
        n_tests_passed += 1

        # Test nested dictionary assignment and export

        TechDict = {
            'Plant': {
                'derate': 100,
                'energy_output_array': (100, 200)
            },
            'AdjustmentFactors': {
                'constant': 100,
                "hourly": (100, 200),
                "periods": ((100, 200), (300, 400))
            }
        }
        a.assign(TechDict)
        ValDict = a.Plant.export()
        assert (ValDict['derate'] == 100
                and ValDict['energy_output_array'] == (100, 200))
        n_tests_passed += 1

        ValDict = a.AdjustmentFactors.export()
        assert (ValDict['constant'] == 100 and ValDict['hourly'] == (100, 200)
                and ValDict['periods'] == ((100, 200), (300, 400)))
        n_tests_passed += 1

        # Test reading from PySSC

        data = ssc.data_create()
        ssc.data_set_number(data, b'derate', 1000)
        ssc.data_set_array(data, b'energy_output_array', [1000, 2000])
        a = GenericSystem.wrap(data)
        assert (a.Plant.derate == 1000)
        assert (a.Plant.energy_output_array == (1000, 2000))

        # Test strings and tables with error cases
        import PySAM.Pvwattsv5 as Pvwattsv5
        a = Pvwattsv5.new()
        a.SolarResource.solar_resource_file = "file"
        assert (a.SolarResource.solar_resource_file == "file")
        n_tests_passed += 1

        assert (a.SolarResource.export()['solar_resource_file'] == 'file')
        n_tests_passed += 1

        c = Pvwattsv5.new()
        datDict = {
            'num': 1,
            'arr': (1, 2),
            'str': 'str',
            'mat': ((1, 2), (3, 4)),
            'table': {
                'yo': 0
            }
        }
        c.SolarResource.solar_resource_data = datDict
        DataDict = c.SolarResource.solar_resource_data
        assert (DataDict['num'] == 1 and DataDict['arr'] == (1, 2))
        assert (DataDict['mat'] == ((1.0, 2.0), (3.0, 4.0)))
        assert (DataDict['str'] == 'str')
        assert (DataDict['table'] == {'yo': 0})
        n_tests_passed += 1

        try:
            c = Pvwattsv5.new()
            c.SolarResource.solar_resource_file = 100
        except:
            n_tests_passed += 1

        try:
            c = Pvwattsv5.new()
            c.SolarResource.solar_resource_data = {
                'num': 1,
                'arr': (1, "2"),
                'mat': ((1, 2), (3, 4)),
                'str': 'str',
                'table': {
                    'yo': 0
                }
            }
        except:
            n_tests_passed += 1

        try:
            c = Pvwattsv5.new()
            c.SolarResource.solar_resource_data = {
                'num': 1,
                'arr': (1, 2),
                'mat': (("1", 2), (3, 4)),
                'str': 'str',
                'table': {
                    'yo': 0
                }
            }
        except:
            n_tests_passed += 1

        a.SolarResource.solar_resource_data = {
            'num': 1,
            'arr': (1, 2),
            'mat': ((1, 2), (3, 4)),
            'str': 'str',
            'table': {}
        }
        assert (a.SolarResource.solar_resource_data['table'] == {})
        n_tests_passed += 1

        # Test conversion between technology attributes and nested dictionary

        genDict = a.export()
        assert (
            genDict['SolarResource']['solar_resource_data']['str'] == 'str')
        n_tests_passed += 1

        a = Pvwattsv5.new()
        assert (a.export()['SolarResource'] == {})
        a.assign(genDict)
        assert (a.export() == genDict)
        n_tests_passed += 1

        # Test loading from serialized dict
        a = GenericSystem.default("GenericSystemNone")

        # Test `value` function
        a.value("derate", 1)
        assert (a.value("derate") == 1)

        a.value("energy_output_array", (0, 1, 2))
        assert (a.value("energy_output_array")[0] == 0)
        assert (a.value("energy_output_array")[1] == 1)
        assert (a.value("energy_output_array")[2] == 2)

        if round == 3:
            tracker.print_diff()

    tracker.print_diff()
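The loop above is the core leak-check idiom in these tests: run the same workload many times and watch the pympler summary diff. A minimal sketch of that pattern, assuming only that pympler is installed (run_once is a placeholder for the loop body):

from pympler.tracker import SummaryTracker


def check_for_leaks(run_once, rounds=50):
    tracker = SummaryTracker()
    for i in range(rounds):
        run_once()
        if i == 3:
            # Early diff lets import-time and cache allocations settle first.
            tracker.print_diff()
    # After many rounds, a steadily growing diff points to a leak.
    tracker.print_diff()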
Example #13
0
def test_functionality():
    n_tests_passed = 0
    round = 0

    while round < 50:  # multiple runs required to check for memory leaks
        if round == 0:
            tracker = SummaryTracker()

        print("\n===Starting Round", round, "===\n\n")
        round += 1

        a = GenericSystem.new()
        b = a.Plant

        # Test setting values with correct types

        a.Plant.derate = 1
        assert (a.Plant.derate == 1)
        assert (b.derate == 1)

        print("Passed test", 0)
        n_tests_passed += 1

        b.energy_output_array = (1, 2)
        assert (a.Plant.energy_output_array == (1, 2))
        assert (b.energy_output_array == (1, 2))
        print("Passed test", 1)
        n_tests_passed += 1

        # Test type checks with errors

        try:
            c = GenericSystem.new()
            c.Plant.energy_output_array = 1
            print("FAIL 2: exception is expected")
        except:
            print("Passed test", 2)
            n_tests_passed += 1

        try:
            c = GenericSystem.new()
            c.Plant.energy_output_array = (1, "2")
            print("FAIL 3: exception is expected")
        except:
            print("Passed test", 3)
            n_tests_passed += 1

        # Test assigning from dictionary

        PlantDict = {'derate': 10, 'energy_output_array': (10, 20)}

        b.assign(PlantDict)
        assert (b.derate == 10)
        assert (b.energy_output_array == (10, 20))
        print("Passed test", 4)
        n_tests_passed += 1

        PlantDict = {'derate': 1, 'energy_output_array': (2, 2)}
        a.Plant.assign(PlantDict)
        assert (a.Plant.derate == 1 and a.Plant.energy_output_array == (2, 2))
        print("Passed test", 5)
        n_tests_passed += 1

        PlantDict = {'derate': 10, 'energy_output_array': (10, 20)}

        try:
            c = GenericSystem.new()
            PlantDict['energy_output_array'] = ()
            c.Plant.assign(PlantDict)
            print("FAIL 1: exception is expected")
        except:
            print("Error caught", 1)
            n_tests_passed += 1

        try:
            c = GenericSystem.new()
            PlantDict['energy_output_array'] = ((12, 20), (1, 1))
            c.Plant.assign(PlantDict)
            print("FAIL 2: exception is expected")
        except:
            print("Error caught", 2)
            n_tests_passed += 1

        try:
            c = GenericSystem.new()
            PlantDict['derate'] = "derate"
            PlantDict['energy_output_array'] = (1, 2)
            c.Plant.assign(PlantDict)
            print("FAIL 3: exception is expected")
        except:
            print("Error caught", 3)
            n_tests_passed += 1

        # exporting to dictionary

        ValDict = b.export()
        assert (ValDict['derate'] == 1
                and ValDict['energy_output_array'] == (2, 2))
        print("Passed test", 6)
        n_tests_passed += 1

        # Test shared module (AdjustmentFactors)
        d = a.AdjustmentFactors

        d.constant = 1
        assert (d.constant == 1)
        print("Passed test", 7)
        n_tests_passed += 1

        d.hourly = (1, 2)
        assert (d.hourly == (1, 2))
        print("Passed test", 8)
        n_tests_passed += 1

        d.periods = ((1, 2), (3, 4))
        assert (d.periods == ((1, 2), (3, 4)))
        print("Passed test", 9)
        n_tests_passed += 1

        try:
            d.periods = ((1, 2))
            print("FAIL 4: exception is expected")
        except:
            print("Error caught", 4)
            n_tests_passed += 1

        ValDict = d.export()
        assert (ValDict['constant'] == 1 and ValDict['hourly'] == (1, 2)
                and ValDict['periods'] == ((1, 2), (3, 4)))
        print("Passed test", 10)
        n_tests_passed += 1

        ValDict = {
            'constant': 10,
            "hourly": (10, 20),
            "periods": ((10, 20), (30, 40))
        }
        d.assign(ValDict)
        assert (ValDict['constant'] == 10 and ValDict['hourly'] == (10, 20)
                and ValDict['periods'] == ((10, 20), (30, 40)))
        print("Passed test", 11)
        n_tests_passed += 1

        # Test nested dictionary assignment and export

        TechDict = {
            'Plant': {
                'derate': 100,
                'energy_output_array': (100, 200)
            },
            'AdjustmentFactors': {
                'constant': 100,
                "hourly": (100, 200),
                "periods": ((100, 200), (300, 400))
            }
        }
        a.assign(TechDict)
        ValDict = a.Plant.export()
        assert (ValDict['derate'] == 100
                and ValDict['energy_output_array'] == (100, 200))
        print("Passed test", 12)
        n_tests_passed += 1

        ValDict = a.AdjustmentFactors.export()
        assert (ValDict['constant'] == 100 and ValDict['hourly'] == (100, 200)
                and ValDict['periods'] == ((100, 200), (300, 400)))
        print("Passed test", 13)
        n_tests_passed += 1

        # Test reading from PySSC

        data = ssc.data_create()
        ssc.data_set_number(data, b'derate', 1000)
        ssc.data_set_array(data, b'energy_output_array', [1000, 2000])
        a = GenericSystem.wrap(data)
        assert (a.Plant.derate == 1000)
        assert (a.Plant.energy_output_array == (1000, 2000))

        # Test strings and tables with error cases
        import PySAM.Pvwattsv5 as Pvwattsv5
        a = Pvwattsv5.new()
        a.LocationAndResource.solar_resource_file = "file"
        assert (a.LocationAndResource.solar_resource_file == "file")
        print("Passed test", 14)
        n_tests_passed += 1

        assert (
            a.LocationAndResource.export()['solar_resource_file'] == 'file')
        print("Passed test", 15)
        n_tests_passed += 1

        c = Pvwattsv5.new()
        datDict = {
            'num': 1,
            'arr': (1, 2),
            'str': 'str',
            'mat': ((1, 2), (3, 4)),
            'table': {
                'yo': 0
            }
        }
        c.LocationAndResource.solar_resource_data = datDict
        DataDict = c.LocationAndResource.solar_resource_data
        assert (DataDict['num'] == 1 and DataDict['arr'] == (1, 2))
        assert (DataDict['mat'] == ((1.0, 2.0), (3.0, 4.0)))
        assert (DataDict['str'] == 'str')
        assert (DataDict['table'] == {'yo': 0})
        print("Passed test", 16)
        n_tests_passed += 1

        try:
            c = Pvwattsv5.new()
            c.LocationAndResource.solar_resource_file = 100
            print("FAIL 5: exception is expected")
        except:
            print("Error caught", 5)
            n_tests_passed += 1

        try:
            c = Pvwattsv5.new()
            c.LocationAndResource.solar_resource_data = {
                'num': 1,
                'arr': (1, "2"),
                'mat': ((1, 2), (3, 4)),
                'str': 'str',
                'table': {
                    'yo': 0
                }
            }
            print("FAIL 6: exception is expected")
        except:
            print("Error caught", 6)
            n_tests_passed += 1

        try:
            c = Pvwattsv5.new()
            c.LocationAndResource.solar_resource_data = {
                'num': 1,
                'arr': (1, 2),
                'mat': (("1", 2), (3, 4)),
                'str': 'str',
                'table': {
                    'yo': 0
                }
            }
            print("FAIL 7: exception is expected")
        except:
            print("Error caught", 7)
            n_tests_passed += 1

        a.LocationAndResource.solar_resource_data = {
            'num': 1,
            'arr': (1, 2),
            'mat': ((1, 2), (3, 4)),
            'str': 'str',
            'table': {}
        }
        assert (a.LocationAndResource.solar_resource_data['table'] == {})
        print("Passed test", 17)
        n_tests_passed += 1

        # Test conversion between technology attributes and nested dictionary

        genDict = a.export()
        assert (genDict['LocationAndResource']['solar_resource_data']['str'] ==
                'str')
        print("Passed test", 18)
        n_tests_passed += 1

        a = Pvwattsv5.new()
        assert (a.export()['LocationAndResource'] == {})
        a.assign(genDict)
        assert (a.export() == genDict)
        print("Passed test", 19)
        n_tests_passed += 1

        # Test loading from serialized dict
        a = GenericSystem.default("GenericSystemNone")
        print(a.export())

        if round == 3:
            tracker.print_diff()

        # execution
        # a.execute(1)

    tracker.print_diff()
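Both tests above exercise the same nested assign()/export() round trip. A minimal sketch of that idiom, assuming PySAM is installed and using only calls that appear in the examples:

import PySAM.GenericSystem as GenericSystem

src = GenericSystem.new()
src.Plant.derate = 100
src.Plant.energy_output_array = (100, 200)

snapshot = src.export()    # nested dict keyed by variable group
dst = GenericSystem.new()
dst.assign(snapshot)       # restore every group from the dict
assert dst.export() == snapshot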
Example #14
0
def eRisk_testT2_runs_2(writingDir, DecisionDir):
    PATH = Path(writingDir)
    CLAS_PATH = 'data/nlp_clas/eRisk_dep/eRisk_dep_clas'

    CLASSES = ['neg', 'pos']

    lm_id = 'eRisk_dep4_FT'
    clas_id = 'eRisk_dep4_2_BEST'

    LM_PATH = "data/nlp_clas/eRisk_dep/eRisk_dep_lm"

    val_texts, subjIDs, chunkIDs, wrIDs = get_texts_final(PATH)

    df_val_ALL = pd.DataFrame({
        'text': val_texts,
        'chunk_No': list(map(int, chunkIDs)),
        'wr_No': list(map(int, wrIDs)),
        'subj_ID': subjIDs
    })
    df_val_ALL = df_val_ALL.sort_values(by=['subj_ID', 'chunk_No', 'wr_No'],
                                        ascending=[True, True, True])
    subjIDs_unq = set(subjIDs)

    df_val_ALL_proc = df_val_ALL[0:0]
    for sId in subjIDs_unq:
        #print(sId)
        df_subj = df_val_ALL.loc[df_val_ALL['subj_ID'] == sId]
        df_subj_proc = ProcessAcc(df_subj, 5)
        df_val_ALL_proc = df_val_ALL_proc.append(df_subj_proc)

    print('Finish [ProcessAcc].....' + str(max(map(int, wrIDs))))

    val_texts = [
        it[0] for it in df_val_ALL_proc.loc[:, ['text']].values.tolist()
    ]
    #val_labels = [it[0] for it in df_val_ALL_proc.loc[:, ['labels']].values.tolist()]
    subjIDs = [
        it[0] for it in df_val_ALL_proc.loc[:, ['subj_ID']].values.tolist()
    ]
    chunkIDs = [
        it[0] for it in df_val_ALL_proc.loc[:, ['chunk_No']].values.tolist()
    ]
    wrIDs = [it[0] for it in df_val_ALL_proc.loc[:, ['wr_No']].values.tolist()]

    #np.save(CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'test'+'/'+'val_texts.npy', val_texts)
    ##np.save(CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'dummy'+'/'+'val_labels.npy', val_labels)
    #np.save(CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'test'+'/'+'subjIDs.npy', subjIDs)
    #np.save(CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'test'+'/'+'chunkIDs.npy', chunkIDs)
    #np.save(CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'test'+'/'+'wrIDs.npy', wrIDs)
    #
    #val_texts = np.load(CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'test'+'/'+'val_texts.npy')
    ##val_labels = np.load(CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'dummy'+'/'+'val_labels.npy')
    #subjIDs = np.load(CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'test'+'/'+'subjIDs.npy')
    #chunkIDs = np.load(CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'test'+'/'+'chunkIDs.npy')
    #wrIDs = np.load(CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'test'+'/'+'wrIDs.npy')
    #
    col_names = ['labels', 'text']

    df_val = pd.DataFrame({
        0: np.zeros(len(val_texts)),
        1: val_texts
    })  # columns=col_names)
    #df_val = pd.DataFrame({'text':val_texts, 'labels':np.zeros(len(val_texts))}, columns=col_names)
    #writeToCSV_oneFiled(df_val, CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'test'+'/test_Real1_wr.csv')
    #
    #
    ##chunksize = 24000
    #chunksize = 24000
    #
    #
    ##df_val1 = pd.read_csv(CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'test'+'/'+'test_Real1_wr.csv', header=None, chunksize=chunksize, engine='python')#,names=(range(227)))

    #tok_val, val_labels = get_all_eRisk(df_val1, 1)

    gc.collect()

    tracker = SummaryTracker()
    tracker.print_diff()

    tok_val, val_labels = get_texts_eRisk(df_val, 1)

    tracker = SummaryTracker()
    tracker.print_diff()

    #np.save(CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'test'+'/'+'tok_val.npy', tok_val)

    #np.save(CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'dummy'+'/'+'lbl_val.npy', val_labels)

    #tok_val = np.load(CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'test'+'/'+'tok_val.npy')
    itos = pickle.load(open(LM_PATH + '/' + 'tmp' + '/' + 'itos.pkl', 'rb'))
    stoi = collections.defaultdict(lambda: 0,
                                   {v: k
                                    for k, v in enumerate(itos)})
    len(itos)

    val_clas = np.array([[stoi[o] for o in p] for p in tok_val])

    np.save(
        CLAS_PATH + '/' + 'tmp' + '/' + 'tests' + '/' + 'testT2' + '/' +
        'val_ids.npy', val_clas)

    #create_bw_data_test(CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'testT2'+'/', LM_PATH+'/'+'tmp'+'/'+'itos.pkl')

    print('Start running the models .... ')

    prediction, samplIdx, pos_prob = eval_clas_eRisk_final_3(str(CLAS_PATH),
                                                             2,
                                                             lm_id=lm_id,
                                                             clas_id=clas_id,
                                                             attention=False)
    #prediction_bw, samplIdx_bw, pos_prob_bw = eval_clas_eRisk_final_3(str(CLAS_PATH), 2, lm_id=lm_id, clas_id=clas_id, attention=False, backwards=True)

    #prediction = [1 if (it_fw+it_bw)/2 > 0.5 else 0 for it_fw, it_bw in zip(prediction_fw, prediction_bw)]
    #samplIdx = samplIdx_fw
    #pos_prob = [(it_fw+it_bw)/2 for it_fw, it_bw in zip(pos_prob_fw, pos_prob_bw)]

    tracker = SummaryTracker()
    tracker.print_diff()

    #np.save(str(CLAS_PATH)+'/'+'tmp/tests/test'+'/'+'prediction_eRisk_dep_wr_ens.npy', prediction)
    #np.save(str(CLAS_PATH)+'/'+'tmp/tests/test'+'/'+'samplIdx_eRisk_dep_wr_ens.npy', samplIdx)
    #np.save(str(CLAS_PATH)+'/'+'tmp/tests/test'+'/'+'pos_prob_eRisk_dep_wr_ens.npy', pos_prob)
    #
    #prediction = np.load(str(CLAS_PATH)+'/'+'tmp/tests/test'+'/'+'prediction_eRisk_dep_wr_ens.npy')
    #samplIdx = np.load(str(CLAS_PATH)+'/'+'tmp/tests/test'+'/'+'samplIdx_eRisk_dep_wr_ens.npy')
    #pos_prob = np.load(str(CLAS_PATH)+'/'+'tmp/tests/test'+'/'+'pos_prob_eRisk_dep_wr_ens.npy')
    #

    subjIDs = [subjIDs[i] for i in samplIdx]
    #val_labels = [val_labels[i] for i in samplIdx]
    chunkIDs = [chunkIDs[i] for i in samplIdx]
    wrIDs = [wrIDs[i] for i in samplIdx]

    uniq_subID = list(set(subjIDs))
    #results_pred = [[ None ] * 10] * len(uniq_subID)
    results_pred = [[0] * 10 for _ in range(len(uniq_subID))]
    results_pred_thr = [[0] * 10 for _ in range(len(uniq_subID))]
    totalWrs = [[0] * 10 for _ in range(len(uniq_subID))]
    #results_golden = [ None ] * len(uniq_subID)

    pos_prob_wr = [[] for _ in range(len(uniq_subID))]

    #checkIntegrity(subjIDs,val_labels)
    th = 0.9

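    # Aggregate per subject: sum predictions per chunk, count writings whose
    # positive probability clears the threshold, and keep the highest writing index seen.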
    for sid, chNum, wr, pred, pos_p in zip(subjIDs, chunkIDs, wrIDs,
                                           prediction, pos_prob):
        idx = uniq_subID.index(sid)
        pos_prob_wr[idx].append(pos_p)
        results_pred[idx][int(chNum) -
                          1] = results_pred[idx][int(chNum) - 1] + pred

        if (pos_p >= th):
            results_pred_thr[idx][int(chNum) -
                                  1] = results_pred_thr[idx][int(chNum) -
                                                             1] + 1

        totalWrs[idx][int(chNum) - 1] = max(totalWrs[idx][int(chNum) - 1],
                                            int(wr))

    #res = [[res/total for res,total in zip(results_pred[i], totalWrs[i])] for i in range(len(results_pred))]

    #print("F1: " + str(f1_erisk(val_labels, prediction)))

    pos_p_avg = [sum(pb) / len(pb) for pb in pos_prob_wr]

    #for thr in np.arange(0.0, 1.0, 0.025):
    #    print(str(thr) + ":\t")
    #    prd = [1 if(it >= thr) else 0 for it in pos_p_avg]
    #    #print("F1: " + str(f1_erisk(prd, results_golden)))
    #    f1_erisk(prd, results_golden)
    #

    pred_count = [sum(rb) for rb in results_pred_thr]

    createDecisionFiles(uniq_subID, pred_count, pos_prob_wr, DecisionDir,
                        max(map(int, wrIDs)))
    gc.collect()
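This function brackets the expensive tokenization and evaluation calls with fresh SummaryTracker instances. A minimal sketch of that bracketing pattern, with fn standing in for any of those calls:

import gc

from pympler.tracker import SummaryTracker


def run_and_report(fn, *args, **kwargs):
    # Collect garbage first so the diff reflects only what fn allocated
    # and kept alive.
    gc.collect()
    tracker = SummaryTracker()
    result = fn(*args, **kwargs)
    gc.collect()
    tracker.print_diff()
    return result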
Example #15
0
    def _start_session(self):
        if self._debugging_level >= 1:
            import cProfile
            import io
            import pstats
            pr = cProfile.Profile()
            pr.enable()
        if self._debugging_level >= 2:
            from pympler.tracker import SummaryTracker
            tracker = SummaryTracker()

        # GC could cause unplanned stall and disrupts precisely timed collection.
        # Disable it and do it manually before sleeping.
        gc.disable()
        if self._tick_len == 0:
            self._force_collect = True
        try:
            self._logger.info('Client node {node_name} started'.format(
                node_name=self._node_name))
            while not self._stopped:
                if self._status == ClientStatus.OFFLINE:
                    self._status = ClientStatus.HANDSHAKE1_AUTHENTICATING
                    # Handshake
                    self.timestamp_and_send_list([
                        ProtocolCode.KEY, self._api_secret_key,
                        self._cluster_name, self._node_name
                    ])
                    # Reset the timeout counter after sending out a command
                    self._last_received_ts = monotonic_time()
                    self._current_error_msg = 'Failed to connect to the gateway.'
                    self._logger.info(
                        'Client node {node_name} initiated handshaking. Step 1: authenticating...'
                        .format(node_name=self._node_name))

                if self._status == ClientStatus.ALL_OK:
                    if self._getters is not None and len(self._getters) > 0:
                        # We need to collect at a fix time in a second (self._collect_time_decimal) by
                        # the wall clock (so that the wall clock time will be saved to the database for
                        # other analyses), so we don't use monotonic clock (we tried using monotonic
                        # time with some complex calculation in the past but it was too buggy).
                        ts = time.time()
                        # Time backflow detection: because we are not using monotonic time, if the user
                        # turns back the wall time, the program might not do collection in a long time.
                        if self._last_collect_time - ts > self._tick_len > 0:
                            self._logger.info(
                                'Time was turned backwards. Doing a collection right now.'
                            )
                            self._last_collect_time = -1
                            self._update_next_collect_time()
                        # When self._tick_len > 0, we do a collect step at self._next_collect_time.
                        # When self._tick_len == 0, we wait until force_collect.
                        if (self._tick_len > 0 and ts >= self._next_collect_time) or \
                           (self._tick_len == 0 and self._force_collect):
                            self._logger.debug(
                                'Time is reached to do collection. Starting...'
                            )
                            # requested_collect_time will be sent to getters to indicate the time of
                            # data we want to collect. We get it before updating the next_collect_time.
                            requested_collect_time = self._next_collect_time
                            # last_collect_time and next_collect_time must be updated *before* collecting
                            # to prevent skipping of a collection if the collection is longer than a tick_len
                            if self._tick_len > 0:
                                self._last_collect_time = ts
                                self._update_next_collect_time()
                            else:
                                # When tick_len == 0, we have to use an increasing counter instead of
                                # the real collect time to prevent collision.
                                self._last_collect_time += 1
                                self._force_collect = False
                            pi_data = []
                            for g in self._getters:
                                self._logger.debug(
                                    "Starting to collect data from getter '{getter_name}'"
                                    .format(getter_name=g.name))
                                d = g.collect(self._tick_len,
                                              requested_collect_time)
                                if d is None or len(d) == 0:
                                    self._logger.warning(
                                        "Client node {node_name} getter '{getter_name}' did not return"
                                        " any data".format(
                                            node_name=self._node_name,
                                            getter_name=g.name))
                                else:
                                    self._logger.debug(
                                        "Collected data from getter '{getter_name}': {data}"
                                        .format(getter_name=g.name,
                                                data=str(d)))
                                    pi_data.extend(d)
                            if len(pi_data) == 0:
                                self._logger.info(
                                    'Client node {node_name} all getters return no data. Skipped sending.'
                                    .format(node_name=self._node_name))
                            else:
                                self._logger.debug(
                                    'Client node {node_name} collected from all getters: {pi_data}'
                                    .format(node_name=self._node_name,
                                            pi_data=str(pi_data)))

                                if self._tuning_goal_calculator is None:
                                    tuning_goal_payload = []
                                    self._logger.debug(
                                        'Client node {node_name} has no tuning goal'
                                        .format(node_name=self._node_name))
                                else:
                                    tuning_goal = self._tuning_goal_calculator.get_tuning_goal(
                                        pi_data)
                                    assert -1 <= tuning_goal <= 1
                                    tuning_goal_payload = [tuning_goal]
                                    self._logger.debug(
                                        'Client node {node_name} collected tuning goal: {tuning_goal}'
                                        .format(node_name=self._node_name,
                                                tuning_goal=str(tuning_goal)))

                                # First element of the outgoing list is tuning_goal
                                self.timestamp_and_send_list(
                                    [
                                        ProtocolCode.PI,
                                        tuning_goal_payload + pi_data
                                    ],
                                    ts=self._last_collect_time)
                                # We don't wait for 'OK' to save time
                        else:
                            self._logger.debug(
                                'Collection time is not reached yet')
                            pass
                    else:
                        self._logger.debug(
                            'No getter is set. Skipped collecting.')
                        self._last_collect_time = time.time()

                gc.collect()
                flush_log()

                # Print out memory usage every minute
                if self._debugging_level >= 2 and int(time.time()) % 60 == 0:
                    print('Time: ' + time.asctime(time.localtime(time.time())))
                    tracker.print_diff()

                if self._status == ClientStatus.ALL_OK and self._getters is not None and len(self._getters) > 0 and \
                        self._tick_len > 0:
                    # Calculate the precise time for next collection
                    sleep_second = max(self._next_collect_time - time.time(),
                                       0)
                else:
                    sleep_second = 1

                # We have to process all messages in the queue before doing next collection,
                # otherwise, a lengthy collection above could prevent the messages in msg_queue from
                # being processed.
                self._process_all_messages(sleep_second)

            if self.notify_gateway_on_stop:
                self.timestamp_and_send_list([ProtocolCode.CLIENT_STOP])
            self._logger.info('Client node {node_name} stopped'.format(
                node_name=self._node_name))
        finally:
            self._status = ClientStatus.OFFLINE
            gc.enable()

            if self._debugging_level >= 1:
                pr.disable()
                s = io.StringIO()
                sortby = 'cumulative'
                ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
                ps.print_stats()
                print(s.getvalue())
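A minimal sketch of the level-gated instrumentation used in _start_session above (cProfile from debugging level 1, pympler from level 2), assuming only the standard library and pympler:

import cProfile
import io
import pstats

from pympler.tracker import SummaryTracker


def run_with_debugging(body, debugging_level=0):
    pr = tracker = None
    if debugging_level >= 1:
        pr = cProfile.Profile()
        pr.enable()
    if debugging_level >= 2:
        tracker = SummaryTracker()
    try:
        body()
    finally:
        if tracker is not None:
            tracker.print_diff()
        if pr is not None:
            pr.disable()
            s = io.StringIO()
            pstats.Stats(pr, stream=s).sort_stats('cumulative').print_stats()
            print(s.getvalue())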
Example #16
0
 def do_mem_track_init(self, line):
     self.mem_tracker = SummaryTracker()
Example #17
0
class GitHubLoader(Cmd):
    """
    GitHub SSH keys loader
    """
    prompt = '$> '

    LINK_FACTOR = 70
    USERS_URL = 'https://api.github.com/users?since=%s'
    KEYS_URL = 'https://api.github.com/users/%s/keys'

    USER_DETAIL_URL = 'https://api.github.com/users/%s'
    USER_REPOS_URL = 'https://api.github.com/users/%s/repos'
    USER_ORGS_URL = 'https://api.github.com/users/%s/orgs'
    ORG_REPOS_URL = 'https://api.github.com/orgs/%s/repos'
    ORG_REPO_COLAB_URL = 'https://api.github.com/repos/%s/collaborators'
    ORG_REPO_ASSIGNEES_URL = 'https://api.github.com/repos/%s/assignees'

    def __init__(self,
                 attempts=5,
                 threads=1,
                 state=None,
                 state_file=None,
                 config_file=None,
                 audit_file=None,
                 max_mem=None,
                 *args,
                 **kwargs):

        Cmd.__init__(self, *args, **kwargs)
        self.t = Terminal()

        self.args = None
        self.attempts = int(attempts)
        self.total = None
        self.terminate = False

        self.last_users_count = None
        self.user_lock = Lock()
        self.processed_user_set = set()
        self.processed_user_set_lock = Lock()
        self.orgs_loaded_set = set()
        self.orgs_loaded_lock = Lock()

        self.max_mem = max_mem
        self.state = state
        self.state_file_path = state_file
        self.rate_limit_reset = None
        self.rate_limit_remaining = None

        self.config = None
        self.config_file = config_file

        self.audit_file = audit_file
        self.audit_records_buffered = []
        self.audit_lock = Lock()

        self.stop_event = threading.Event()
        self.threads = int(threads)
        self.link_queue = Queue.PriorityQueue()  # Store links to download here
        self.worker_threads = []

        self.state_thread = None
        self.state_thread_lock = Lock()

        self.resources_list = []
        self.resources_queue = Queue.PriorityQueue()
        self.local_data = threading.local()

        self.new_users_events = EvtDequeue()
        self.new_keys_events = EvtDequeue()

        self.db_config = None
        self.engine = None
        self.session = None

        self.mem_tracker = None

    def signal_handler(self, signal, frame):
        """
        Signal handler - terminate gracefully
        :param signal:
        :param frame:
        :return:
        """
        logger.info('CTRL+C pressed')
        self.trigger_stop()

    def trigger_stop(self):
        """
        Sets terminal conditions to true
        :return:
        """
        self.terminate = True
        self.stop_event.set()

    def trigger_quit(self):
        """
        Terminal condition & file change
        :return:
        """
        self.trigger_stop()
        utils.try_touch('.github-quit')

    def do_quit(self, arg):
        self.trigger_quit()
        logger.info('Waiting for thread termination')

        time.sleep(1)
        logger.info('Quitting')
        return Cmd.do_quit(self, arg)

    def do_reset(self, line):
        print('\033c')

    def do_gc(self, line):
        gc.collect()

    def do_mem_top(self, line):
        print(mem_top.mem_top())

    def do_mem_track_init(self, line):
        self.mem_tracker = SummaryTracker()

    def do_mem_track_diff(self, line):
        print(self.mem_tracker.print_diff())

    def do_mem_track_deinit(self, line):
        self.mem_tracker = None

    def do_mem(self, line):
        print('Memory usage: %s kB' %
              resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

    def do_state(self, line):
        js = self.state_gen()

        if line is None or len(line) == 0:
            del js['link_queue']
            del js['resource_stats']
        elif line == '1':
            del js['link_queue']

        print(json.dumps(js, indent=2, cls=utils.AutoJSONEncoder))

    def do_deq_enable(self, line):
        self.new_keys_events.disabled = False
        self.new_users_events.disabled = False

    def do_deq_disable(self, line):
        self.new_keys_events.disabled = True
        self.new_users_events.disabled = True

    def init_config(self):
        """
        Loads config & state files
        :return:
        """
        if self.state_file_path is not None and os.path.exists(
                self.state_file_path):
            with open(self.state_file_path, 'r') as fh:
                self.state = json.load(fh, object_pairs_hook=OrderedDict)
                logger.info('State loaded: %s' %
                            os.path.abspath(self.state_file_path))

        with open(self.config_file, 'r') as fh:
            self.config = json.load(fh, object_pairs_hook=OrderedDict)
            logger.info('Config loaded: %s' %
                        os.path.abspath(self.config_file))

            # Process resources - randomized
            if 'res' in self.config:
                res_tmp = self.config['res']
                random.shuffle(res_tmp)
                for idx, res in enumerate(res_tmp):
                    r = AccessResource(usr=res['usr'],
                                       token=res['token'],
                                       idx=idx)
                    self.resources_list.append(r)
                    self.resources_queue.put(r)
                    logger.info('Resource %02d loaded: %s' % (idx, r.usr))
            else:
                # unauth
                r = AccessResource(usr=None, token=None)
                self.resources_list.append(r)
                self.resources_queue.put(r)

    def init_db(self):
        """
        Initializes database engine & session.
        Has to be done on main thread.
        :return:
        """
        self.db_config = databaseutils.process_db_config(self.config['db'])

        from sqlalchemy import create_engine
        from sqlalchemy.orm import sessionmaker, scoped_session
        self.engine = create_engine(self.db_config.constr, pool_recycle=3600)
        self.session = scoped_session(sessionmaker(bind=self.engine))

        # Make sure tables are created
        DB_Base.metadata.create_all(self.engine)

    def init_workers(self):
        """
        Initialize worker threads
        :return:
        """
        logger.info('Starting %d working threads' % self.threads)
        for idx in range(self.threads):
            t = threading.Thread(target=self.work_thread_main, args=(idx, ))
            self.worker_threads.append(t)

        # Kick-off all threads
        for t in self.worker_threads:
            t.start()

        logger.info('Worker threads started')

    def cli(self):
        """
        CLI thread
        :return:
        """
        logger.info('CLI thread started')
        self.cmdloop()
        logger.info('Terminating CLI thread')

    def _init_queue(self):
        """
        Initializes link queue
        :return: 
        """
        sess = self.session()
        logger.debug('Loading users...')

        users_cnt = 0
        all_keys = sess.query(GitHubKey).filter(
            GitHubKey.is_interesting == 1).all()
        logger.debug('All users loaded')

        for rec in all_keys:
            users_cnt += 1

            job = DownloadJob(url=self.USER_DETAIL_URL % rec.key_user_found,
                              jtype=DownloadJob.TYPE_USER,
                              meta={
                                  'user': rec.key_user_found,
                                  'user_id': rec.key_user_id_found
                              })
            self.link_queue.put(job)

            job = DownloadJob(url=self.USER_ORGS_URL % rec.key_user_found,
                              jtype=DownloadJob.TYPE_ORG,
                              meta={
                                  'user': rec.key_user_found,
                                  'user_id': rec.key_user_id_found
                              })
            self.link_queue.put(job)

            job = DownloadJob(url=self.USER_REPOS_URL % rec.key_user_found,
                              jtype=DownloadJob.TYPE_REPOS_USER,
                              meta={
                                  'user': rec.key_user_found,
                                  'user_id': rec.key_user_id_found
                              })
            self.link_queue.put(job)

        logger.info('Queue initialized, users cnt: %s' % users_cnt)
        utils.silent_close(sess)

    def _init_queue_assignee(self):
        """
        Init queue with assignees repos
        :return: 
        """
        sess = self.session()
        logger.debug('Loading repos...')

        users_cnt = 0
        repos = sess.query(GitHubRepo).filter(
            GitHubRepo.user_repo == 0).filter(
                GitHubRepo.repo_stars >= 100).all()
        logger.debug('Repos loaded: %s' % len(repos))

        for rec in repos:
            users_cnt += 1

            job = DownloadJob(url=self.ORG_REPO_ASSIGNEES_URL % rec.repo_name,
                              jtype=DownloadJob.TYPE_REPO_ASSIGNEE,
                              meta={
                                  'user': rec.owner_login,
                                  'user_id': rec.owner_id,
                                  'repo': rec.repo_name,
                                  'page': 1
                              })
            self.link_queue.put(job)

    def work(self):
        """
        Main thread work method
        :return:
        """
        # Interrupt signals
        signal.signal(signal.SIGINT, self.signal_handler)

        self.init_config()
        self.init_db()

        # Resume last state
        self.state_resume()

        # Monitor threads.
        self.state_thread = threading.Thread(target=self.state_main, args=())
        self.state_thread.start()

        # If there is no link to process - create from since.
        if self.link_queue.qsize() == 0:
            if self.args.assign_only:
                self._init_queue_assignee()
            else:
                self._init_queue()

        # Worker threads
        self.init_workers()

        logger.info('Main thread started %s %s %s' %
                    (os.getpid(), os.getppid(), threading.current_thread()))

        # CLI thread
        cli_thread = threading.Thread(target=self.cli, args=())
        cli_thread.setDaemon(True)
        cli_thread.start()

        # Join on workers
        self.after_loop()
        logger.info('Terminating main thread')
        return None

    def after_loop(self, wait_for_state=True):
        """
        After work loop finishes
        :return:
        """
        logger.info('Waiting termination of slave threads')

        # Wait here for termination of all workers and monitors.
        try:
            for t in self.worker_threads:
                t.join()

            if wait_for_state:
                self.state_thread.join()
        except:
            logger.error('Exception during thread join')
            logger.error(traceback.format_exc())

        logger.info('All threads terminated, saving last state')
        self.state_save()

    #
    # General link processing & queue management
    #

    def work_thread_main(self, idx):
        """
        Worker thread main loop
        :return:
        """
        self.local_data.idx = idx
        logger.info('Working thread %d started' % idx)

        while not self.terminate and not self.stop_event.is_set():
            self.interruptible_sleep_delta(0.1)

            # Get credential to process link with
            resource = self.resource_allocate()
            if resource is None:
                continue

            # We have resource, now get the job
            job = None
            try:
                job = self.link_queue.get(True, timeout=1.0)
            except Queue.Empty:
                self.resource_return(resource)
                continue

            # If job last fail is too recent - put again back to queue
            if time.time() - job.last_fail < 3.0:
                self.link_queue.put(
                    job
                )  # re-insert to the back of the queue for later processing
                self.resource_return(resource)
                continue

            # Job processing starts here - fetch data page with the resource.
            js_data = None
            try:
                self.local_data.job = job
                self.local_data.resource = resource
                js_data, headers, raw_response = self.load_page_local()

            except RateLimitHit as e:
                logger.error(
                    '[%d] Rate limit hit: %s, failcnt: %d, res: %s, exception: %s'
                    % (idx, job.url, job.fail_cnt, resource.usr, e))
                continue

            except Exception as e:
                logger.error(
                    '[%d] Exception in processing job: %s, failcnt: %d, res: %s, exception: %s'
                    % (idx, job.url, job.fail_cnt, resource.usr, e))

                self.on_job_failed(job)
                continue

            finally:
                self.resource_return(resource)
                self.local_data.resource = None
                self.local_data.last_usr = resource.usr
                self.local_data.last_remaining = resource.remaining

            # Process downloaded data here.
            try:
                if js_data is None:
                    self.audit_log('404', job.url, jtype=job.type, job=job)
                    self.flush_audit()
                    continue

                self.process_downloaded(job, js_data, headers, raw_response)

            except Exception as e:
                logger.error(
                    '[%d] Unexpected exception, processing type %s, link %s: cnt: %d, res: %s, %s'
                    % (idx, job.type, job.url, job.fail_cnt, resource.usr, e))

                traceback.print_exc()
                self.on_job_failed(job)
            finally:
                self.local_data.resource = None
                self.local_data.job = None
                self.local_data.last_usr = None
                self.local_data.last_remaining = None
                resource = None
                job = None
                headers = None
                raw_response = None

        pass
        logger.info('Terminating worker thread %d' % idx)

    def on_job_failed(self, job):
        """
        If job failed, this teaches it how to behave
        :param job:
        :return:
        """
        job.fail_cnt += 1
        job.last_fail = time.time()

        # if failed too many times - log and discard.
        if job.fail_cnt > 35:
            logger.warning('Job failed too many times %s' % job.url)
            self.audit_log('too-many-fails', job.url, jtype=job.type, job=job)
            self.flush_audit()
        else:
            self.link_queue.put(
                job)  # re-insert to the queue for later processing

    def load_page_local(self):
        """
        Loads page stored in thread local
        :return:
        """

        auth = None
        resource = self.local_data.resource
        if resource.usr is not None:
            auth = HTTPBasicAuth(resource.usr, resource.token)

        job = self.local_data.job

        res = requests.get(job.url, timeout=10, auth=auth)
        headers = res.headers

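        # Record the GitHub X-RateLimit-* headers on the resource so the
        # scheduler knows how much quota is left and when it resets.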
        resource.reset_time = utils.try_float(headers.get('X-RateLimit-Reset'))
        resource.remaining = utils.try_int(
            headers.get('X-RateLimit-Remaining'))
        resource.last_used = time.time()
        resource.used_cnt += 1

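        # A 403 together with (almost) exhausted quota means this credential hit the rate limit.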
        if res.status_code == 403 and resource.remaining is not None and resource.remaining < 10:
            resource.fail_cnt += 1
            raise RateLimitHit

        if res.status_code == 404:
            resource.fail_cnt += 1
            logger.warning('URL not found: %s' % job.url)
            return None, None, None

        if res.status_code // 100 != 2:
            resource.fail_cnt += 1
            res.raise_for_status()

        data = res.content
        if data is None:
            resource.fail_cnt += 1
            raise Exception('Empty response')

        js = json.loads(data, object_pairs_hook=OrderedDict)
        return js, headers, res

    #
    # Crawl specific methods
    #

    def process_downloaded(self, job, js_data, headers, raw_response):
        """
        Process downloaded page
        :param self: 
        :param job: 
        :param js_data: 
        :param headers: 
        :param raw_response: 
        :return: 
        """
        if job.type == DownloadJob.TYPE_USER:
            self.process_user(job, js_data, headers, raw_response)
        elif job.type == DownloadJob.TYPE_ORG:
            self.process_org(job, js_data, headers, raw_response)
        elif job.type == DownloadJob.TYPE_REPOS_USER:
            self.process_repo(job, js_data, headers, raw_response, True)
        elif job.type == DownloadJob.TYPE_REPOS_ORG:
            self.process_repo(job, js_data, headers, raw_response, False)
        elif job.type == DownloadJob.TYPE_REPO_COLAB:
            self.process_colab(job, js_data, headers, raw_response)
        elif job.type == DownloadJob.TYPE_REPO_ASSIGNEE:
            self.process_assignee(job, js_data, headers, raw_response)
        else:
            logger.error('Unrecognized type %s' % job.type)

    def process_user(self, job, js, headers, raw_response):
        """
        Process user detail data
        :param job:
        :param js:
        :param headers:
        :param raw_response:
        :return:
        """
        if 'id' not in js:
            logger.error('Field ID not found in user')
            return

        s = self.session()
        try:
            user_id = int(js['id'])
            dbu = s.query(GitHubUserDetails).filter(
                GitHubUserDetails.id == user_id).one_or_none()
            is_new = False

            if dbu is None:
                is_new = True
                dbu = GitHubUserDetails()
                dbu.id = user_id

            dbu.date_last_check = salch.func.now()
            dbu.username = js['login']
            dbu.name = utils.utf8ize(utils.defvalkey(js, 'name'))

            dbu.company = utils.utf8ize(utils.defvalkey(js, 'company'))
            dbu.blog = utils.defvalkey(js, 'blog')
            dbu.email = utils.defvalkey(js, 'email')
            dbu.bio = utils.utf8ize(utils.defvalkey(js, 'bio'))
            dbu.usr_type = utils.defvalkey(js, 'type')

            dbu.public_repos = js['public_repos']
            dbu.public_gists = js['public_gists']
            dbu.followers = js['followers']
            dbu.following = js['following']

            dbu.created_at = utils.dt_norm(
                utils.try_parse_timestamp(utils.defvalkey(js, 'created_at')))
            dbu.updated_at = utils.dt_norm(
                utils.try_parse_timestamp(utils.defvalkey(js, 'updated_at')))

            if is_new:
                s.add(dbu)
            else:
                s.merge(dbu)
            s.commit()
            s.flush()
            s.expunge_all()

        except Exception as e:
            logger.error('Exception storing user details: %s: %s' %
                         (js['id'], e))
            logger.debug(traceback.format_exc())

        finally:
            utils.silent_close(s)

    def process_org(self, job, js, headers, raw_response):
        """
        Process user -> orgs data
        :param job:
        :param js:
        :param headers:
        :param raw_response:
        :return:
        """
        new_orgs = []
        for org in js:
            if 'id' not in org:
                logger.error('Field ID not found in orgs')
                continue

            s = self.session()
            try:
                org_id = int(org['id'])

                # delete first - avoid excs
                s.query(GitHubUserOrgs)\
                    .filter(GitHubUserOrgs.org_id == org_id)\
                    .filter(GitHubUserOrgs.username == job.meta['user'])\
                    .delete()

                dbu = GitHubUserOrgs()
                dbu.username = job.meta['user']
                dbu.org_id = org['id']
                dbu.org_name = org['login']
                dbu.org_desc = utils.utf8ize(org['description'])
                new_orgs.append(org['login'])

                s.add(dbu)

                s.commit()
                s.flush()
                s.expunge_all()

            except Exception as e:
                logger.error('Exception storing user->org details: %s: %s' %
                             (org['id'], e))
                logger.debug(traceback.format_exc())

            finally:
                utils.silent_close(s)

        if len(js) == 0:
            return

        # Load next page
        cur_page = utils.defvalkey(job.meta, 'page', 1)
        new_url = (self.USER_ORGS_URL % job.meta['user']) + ('?page=%s' %
                                                             (cur_page + 1))
        new_meta = dict(job.meta)
        new_meta['page'] = cur_page + 1

        job = DownloadJob(url=new_url,
                          jtype=DownloadJob.TYPE_ORG,
                          meta=new_meta)
        self.link_queue.put(job)

        # Load repositories for new organisations
        not_loaded_orgs = None
        with self.orgs_loaded_lock:
            new_orgs_set = set(new_orgs)
            not_loaded_orgs = new_orgs_set - self.orgs_loaded_set
            for x in new_orgs:
                self.orgs_loaded_set.add(x)

        for x in not_loaded_orgs:
            new_meta = dict(job.meta)
            new_meta['page'] = 1
            new_meta['org'] = x
            job = DownloadJob(url=self.ORG_REPOS_URL % x,
                              jtype=DownloadJob.TYPE_REPOS_ORG,
                              meta=new_meta)
            self.link_queue.put(job)

    def process_repo(self, job, js, headers, raw_response, from_user):
        """
        Process repo list page
        :param job: 
        :param js: 
        :param headers: 
        :param raw_response: 
        :param from_user: 
        :return: 
        """
        for repo in js:
            if 'id' not in repo:
                logger.error('Field ID not found in repos')
                continue

            s = self.session()
            try:
                repo_id = int(repo['id'])
                dbe = s.query(GitHubRepo).filter(
                    GitHubRepo.id == repo_id).one_or_none()

                dbu = GitHubRepo()
                dbu.id = repo_id
                dbu.user_repo = from_user
                if from_user:
                    dbu.username = job.meta['user']
                else:
                    dbu.org_name = job.meta['org']

                if 'owner' in repo:
                    dbu.owner_id = repo['owner']['id']
                    dbu.owner_login = repo['owner']['login']

                dbu.repo_name = repo['full_name']
                dbu.repo_stars = repo['stargazers_count']
                dbu.repo_forks = repo['forks']
                dbu.repo_watchers = repo['watchers']
                dbu.repo_is_fork = repo['fork']
                dbu.repo_size = repo['size']
                dbu.repo_homepage = utils.defvalkey(repo, 'homepage')
                dbu.repo_language = utils.defvalkey(repo, 'language')
                dbu.created_at = utils.dt_norm(
                    utils.try_parse_timestamp(
                        utils.defvalkey(repo, 'created_at')))
                dbu.updated_at = utils.dt_norm(
                    utils.try_parse_timestamp(
                        utils.defvalkey(repo, 'updated_at')))
                dbu.pushed_at = utils.dt_norm(
                    utils.try_parse_timestamp(
                        utils.defvalkey(repo, 'pushed_at')))

                dbu.repo_description = utils.utf8ize(repo['description'])

                dbu.repo_stargazers_url = repo['stargazers_url']
                dbu.repo_forks_url = repo['forks_url']

                if not from_user and repo['stargazers_count'] > 100:
                    new_meta = dict(job.meta)
                    new_meta['page'] = 1
                    new_meta['repo'] = repo['full_name']
                    new_meta['owner'] = repo['owner']['login']

                    # Colab fetch - skip, no auth
                    job = DownloadJob(url=self.ORG_REPO_COLAB_URL %
                                      (repo['full_name']),
                                      jtype=DownloadJob.TYPE_REPO_COLAB,
                                      meta=new_meta)

                    # Asignee fetch
                    job = DownloadJob(url=self.ORG_REPO_ASSIGNEES_URL %
                                      (repo['full_name']),
                                      jtype=DownloadJob.TYPE_REPO_ASSIGNEE,
                                      meta=dict(new_meta))

                    self.link_queue.put(job)

                # DB save
                if dbe is None:
                    s.add(dbu)

                else:
                    if dbe.username != dbu.username:
                        logger.warning('Username does not match for %s %s %s' %
                                       (repo_id, dbe.username, dbu.username))
                    if dbe.org_name != dbu.org_name:
                        logger.warning('org_name does not match for %s %s %s' %
                                       (repo_id, dbe.org_name, dbu.org_name))
                    if dbe.owner_login != dbu.owner_login:
                        logger.warning(
                            'owner_login does not match for %s %s %s' %
                            (repo_id, dbe.owner_login, dbu.owner_login))

                s.commit()
                s.flush()
                s.expunge_all()

            except Exception as e:
                logger.error(
                    'Exception storing repo details: %s:%s meta: %s, url: %s, exc: %s'
                    % (repo['id'], repo['full_name'], json.dumps(
                        job.meta), job.url, e))
                logger.debug(traceback.format_exc())

            finally:
                utils.silent_close(s)

        if len(js) == 0:
            return

        # Load next page
        cur_page = utils.defvalkey(job.meta, 'page', 1)
        new_meta = dict(job.meta)
        new_meta['page'] = cur_page + 1

        if from_user:
            new_url = (self.USER_REPOS_URL %
                       job.meta['user']) + ('?page=%s' % (cur_page + 1))
            job = DownloadJob(url=new_url,
                              jtype=DownloadJob.TYPE_REPOS_USER,
                              meta=new_meta)
        else:
            new_url = (self.ORG_REPOS_URL % job.meta['org']) + ('?page=%s' %
                                                                (cur_page + 1))
            job = DownloadJob(url=new_url,
                              jtype=DownloadJob.TYPE_REPOS_ORG,
                              meta=new_meta)

        self.link_queue.put(job)

    def process_colab(self, job, js, headers, raw_response):
        """
        Process collaborators for org-owned repos
        :param job:
        :param js:
        :param headers:
        :param raw_response:
        :return:
        """
        for colab in js:
            if 'id' not in colab:
                logger.error('Field ID not found in colab')
                continue

            s = self.session()
            try:
                # delete first - avoid exceptions on duplicate insert
                s.query(GitHubRepoColab)\
                    .filter(GitHubRepoColab.user_name == colab['login'])\
                    .filter(GitHubRepoColab.repo_name == job.meta['repo'])\
                    .delete()

                dbu = GitHubRepoColab()
                dbu.repo_name = job.meta['repo']
                dbu.user_name = colab['login']
                dbu.can_pull = colab['permissions']['pull']
                dbu.can_push = colab['permissions']['push']
                dbu.can_admin = colab['permissions']['admin']

                s.add(dbu)
                s.commit()
                s.flush()
                s.expunge_all()

            except Exception as e:
                logger.error('Exception storing colab details: %s:%s: %s' %
                             (colab['login'], job.meta['repo'], e))
                logger.debug(traceback.format_exc())

            finally:
                utils.silent_close(s)

        if len(js) == 0:
            return

        # Load next page
        cur_page = utils.defvalkey(job.meta, 'page', 1)
        new_url = (self.ORG_REPO_COLAB_URL %
                   (job.meta['repo'])) + ('?page=%s' % (cur_page + 1))
        new_meta = dict(job.meta)
        new_meta['page'] = cur_page + 1

        job = DownloadJob(url=new_url,
                          jtype=DownloadJob.TYPE_REPO_COLAB,
                          meta=new_meta)
        self.link_queue.put(job)

    def process_assignee(self, job, js, headers, raw_response):
        """
        Process assignees for org-owned repos
        :param job:
        :param js:
        :param headers:
        :param raw_response:
        :return:
        """
        for assignee in js:
            if 'id' not in assignee:
                logger.error('Field ID not found in assignees')
                continue

            s = self.session()
            try:
                # delete first - avoid exceptions on duplicate insert
                s.query(GitHubRepoAssignee)\
                    .filter(GitHubRepoAssignee.user_name == assignee['login'])\
                    .filter(GitHubRepoAssignee.repo_name == job.meta['repo'])\
                    .delete()

                dbu = GitHubRepoAssignee()
                dbu.repo_name = job.meta['repo']
                dbu.user_name = assignee['login']

                s.add(dbu)
                s.commit()
                s.flush()
                s.expunge_all()

            except Exception as e:
                logger.error('Exception storing assignee details: %s:%s: %s' %
                             (assignee['login'], job.meta['repo'], e))
                logger.debug(traceback.format_exc())

            finally:
                utils.silent_close(s)

        if len(js) == 0:
            return

        # Load next page
        cur_page = utils.defvalkey(job.meta, 'page', 1)
        new_url = (self.ORG_REPO_ASSIGNEES_URL %
                   (job.meta['repo'])) + ('?page=%s' % (cur_page + 1))
        new_meta = dict(job.meta)
        new_meta['page'] = cur_page + 1

        job = DownloadJob(url=new_url,
                          jtype=DownloadJob.TYPE_REPO_ASSIGNEE,
                          meta=new_meta)
        self.link_queue.put(job)
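
    # The "load next page" logic above is repeated verbatim in the repo,
    # collaborator and assignee handlers. A minimal consolidation sketch
    # follows; the helper name _enqueue_next_page is not part of the original
    # code and is shown only to illustrate how the duplication could be
    # factored out.
    def _enqueue_next_page(self, job, base_url, jtype):
        """
        Sketch: builds the follow-up pagination job for the given (already
        substituted) base URL and job type and puts it on the link queue.
        """
        cur_page = utils.defvalkey(job.meta, 'page', 1)
        new_meta = dict(job.meta)
        new_meta['page'] = cur_page + 1

        new_url = base_url + ('?page=%s' % (cur_page + 1))
        next_job = DownloadJob(url=new_url, jtype=jtype, meta=new_meta)
        self.link_queue.put(next_job)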

    #
    # Resource management
    #

    def resource_allocate(self, blocking=True, timeout=1.0):
        """
        Takes a resource from the pool.
        If the resource has low remaining credit, the thread is suspended so the credit can recharge.
        :param blocking: block while waiting for a resource
        :param timeout: how long to wait for a resource, in seconds
        :return: resource or None if none became available in time
        """
        try:
            resource = self.resources_queue.get(blocking, timeout=timeout)
            if resource.remaining is not None and resource.remaining <= self.threads + 2:
                sleep_sec = (resource.reset_time - time.time()
                             if resource.reset_time is not None else 15 * 60)
                sleep_sec += 120  # extra 2 minutes to avoid problems with resources

                logger.info(
                    'Rate limit exceeded on resource %s, remaining: %s, '
                    'sleeping until %s (%d seconds, %d minutes)' %
                    (resource.usr, resource.remaining, resource.reset_time,
                     sleep_sec, sleep_sec / 60.0))
                self.sleep_interruptible(time.time() + sleep_sec)
                logger.info('Resource sleep finished %s' % resource.usr)

                # Reset estimations, needs to be refreshed
                resource.remaining = None
                resource.reset_time = None

            return resource

        except Queue.Empty:
            return None

    def resource_return(self, res):
        """
        Returns resource to the pool
        :param res:
        :return:
        """
        self.resources_queue.put(res)
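
    # Illustrative usage sketch (not part of the original loader): the intended
    # allocate / use / return discipline around a single request. The method
    # name _fetch_with_resource and the perform_request callable are
    # hypothetical placeholders; only resource_allocate/resource_return come
    # from this class.
    def _fetch_with_resource(self, url, perform_request):
        """
        Sketch: takes a token from the pool, performs one request with it and
        always hands the token back, even if the request raises.
        """
        res = self.resource_allocate()
        if res is None:
            return None  # nothing became available within the timeout window
        try:
            return perform_request(res, url)
        finally:
            self.resource_return(res)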

    def sleep_interruptible(self, until_time):
        """
        Interruptible sleep - sleep until given time.
        :param until_time:
        :return:
        """
        while time.time() <= until_time:
            time.sleep(1.0)
            if self.terminate or self.stop_event.is_set():
                return

    def interruptible_sleep_delta(self, sleep_time):
        """
        Sleeps the current thread for the given number of seconds; the stop event interrupts the sleep so the thread can exit.
        :param sleep_time:
        :return:
        """
        if sleep_time is None:
            return

        sleep_time = float(sleep_time)

        if sleep_time == 0:
            return

        sleep_start = time.time()
        while not self.stop_event.is_set() and not self.terminate:
            time.sleep(0.1)
            if time.time() - sleep_start >= sleep_time:
                return

    #
    # State
    #

    def flush_state(self):
        """
        Flushes state/config to the state file
        :return:
        """
        self.state['rate_limit_remaining'] = self.rate_limit_remaining
        self.state['rate_limit_reset'] = self.rate_limit_reset
        utils.flush_json(self.state, self.state_file_path)

    #
    # Auditing - errors, problems for further analysis
    #

    def audit_log(self, evt=None, link=None, jtype=None, job=None):
        """
        Appends an audit record to the buffer. Lock protected.
        :param evt:
        :param link:
        :param jtype:
        :param job:
        :return:
        """
        log = collections.OrderedDict()
        log['time'] = time.time()
        log['evt'] = evt
        log['jtype'] = jtype
        log['link'] = link

        if job is not None and isinstance(job, DownloadJob):
            log['job'] = job.to_json()

        with self.audit_lock:
            self.audit_records_buffered.append(log)

    def flush_audit(self):
        """
        Flushes audit logs to the append-only JSON file.
        The routine is lock-protected (no new audit record can be added while the flush holds the lock)
        :return:
        """
        if self.audit_file is None:
            self.audit_records_buffered = []
            return

        self.audit_lock.acquire()
        try:
            if len(self.audit_records_buffered) == 0:
                return
            with open(self.audit_file, 'a') as fa:
                for x in self.audit_records_buffered:
                    fa.write(json.dumps(x, cls=utils.AutoJSONEncoder) + "\n")
            self.audit_records_buffered = []
        except Exception as e:
            logger.error('Exception in audit log dump %s' % e)
        finally:
            self.audit_lock.release()
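
    # The audit file written above is an append-only JSON-lines log (one JSON
    # record per line). The reader below is a sketch; load_audit_records is a
    # hypothetical helper, not part of the original code.
    @staticmethod
    def load_audit_records(path):
        """
        Sketch: reads the append-only audit log back into a list of dicts,
        skipping blank or unparseable lines.
        """
        records = []
        with open(path, 'r') as fh:
            for line in fh:
                line = line.strip()
                if not line:
                    continue
                try:
                    records.append(json.loads(line))
                except ValueError:
                    continue
        return records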

    #
    # State save / resume
    #

    def state_main(self):
        """
        State thread - periodic dump of the queues.
        :return:
        """
        logger.info('State thread started %s %s %s' %
                    (os.getpid(), os.getppid(), threading.current_thread()))
        try:
            while not self.stop_event.is_set() and not self.terminate:
                # Dump stats every few seconds.
                # The sleep comes first so the state is dumped one last time just before the program quits.
                self.interruptible_sleep_delta(2)
                self.state_save()

                # Check memory conditions
                self.state_ram_check()

        except Exception as e:
            traceback.print_exc()
            logger.error('Exception in state: %s' % e)

        finally:
            pass

        logger.info('State loop terminated')

    def state_ram_check(self):
        """
        Checks the memory termination condition and stops the loader if it is exceeded
        :return:
        """

        if self.max_mem is None:
            return

        cur_ram = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        if cur_ram <= self.max_mem:
            return

        logger.warning(
            'Maximum memory threshold reached: %s kB = %s MB, threshold = %s kB'
            % (cur_ram, cur_ram / 1024.0, self.max_mem))
        self.trigger_stop()

    def state_gen(self):
        """
        Builds the state snapshot (dumped to the state file by state_save)
        :return:
        """
        def link_char(job):
            if job.type == DownloadJob.TYPE_USER:
                return 'U'
            elif job.type == DownloadJob.TYPE_ORG:
                return 'o'
            elif job.type == DownloadJob.TYPE_REPOS_USER:
                return '.'
            elif job.type == DownloadJob.TYPE_REPOS_ORG:
                return ','
            elif job.type == DownloadJob.TYPE_REPO_COLAB:
                return 'c'
            elif job.type == DownloadJob.TYPE_REPO_ASSIGNEE:
                return '-'
            else:
                return '!'

        try:
            js_q = collections.OrderedDict()
            js_q['gen'] = time.time()
            js_q['link_size'] = self.link_queue.qsize()
            js_q['memory'] = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss

            # Dequeues
            self.new_users_events.maintain()
            self.new_keys_events.maintain()

            users_in_5min = self.new_users_events.under_limit(5 * 60)
            keys_in_5min = self.new_keys_events.under_limit(5 * 60)

            js_q['users_dequeue_size'] = self.new_users_events.len()
            js_q['keys_dequeue_size'] = self.new_keys_events.len()
            js_q['users_5min'] = users_in_5min
            js_q['keys_5min'] = keys_in_5min
            js_q['users_1min'] = users_in_5min / 5.0
            js_q['keys_1min'] = keys_in_5min / 5.0

            # link queue structure
            qdata = list(self.link_queue.queue)
            qdata.sort(cmp=DownloadJob.cmp)
            js_q['link_structure'] = ''.join([link_char(x) for x in qdata])

            # Stats.
            js_q['resource_stats'] = [
                x.to_json() for x in list(self.resources_list)
            ]

            # Finally - the queue
            js_q['link_queue'] = [x.to_json() for x in qdata]
            return js_q

        except Exception as e:
            traceback.print_exc()
            logger.error('Exception in state: %s', e)
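
    # For reference, the snapshot built above (and flushed to the state file by
    # state_save below) carries roughly these keys: 'gen', 'link_size',
    # 'memory', 'users_dequeue_size', 'keys_dequeue_size', 'users_5min',
    # 'keys_5min', 'users_1min', 'keys_1min', 'link_structure',
    # 'resource_stats' and 'link_queue'; state_resume only reads 'link_queue'.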

    def state_save(self):
        """
        Saves the state snapshot to the state file
        :return:
        """
        try:
            js_q = self.state_gen()
            utils.flush_json(js_q, self.state_file_path)

        except Exception as e:
            traceback.print_exc()
            logger.error('Exception in state: %s', e)

    def state_resume(self):
        """
        Attempts to resume the queues from the monitoring files
        :return:
        """
        try:
            if self.state is None:
                return

            if 'link_queue' in self.state:
                for rec in self.state['link_queue']:
                    job = DownloadJob.from_json(rec)
                    self.link_queue.put(job)
                logger.info('Link queue resumed, entries: %d' %
                            len(self.state['link_queue']))

        except Exception as e:
            traceback.print_exc()
            logger.warning('Exception in resuming the state %s' % e)
            logger.error('State resume failed, exiting')
            sys.exit(1)

    def main(self):
        """
        Main entry point
        :return: 
        """
        args_src = sys.argv
        parser = argparse.ArgumentParser(
            description='Downloads GitHub User info')
        parser.add_argument('-c',
                            dest='config',
                            default=None,
                            help='JSON config file')
        parser.add_argument('-s',
                            dest='status',
                            default=None,
                            help='JSON status file')
        parser.add_argument('-t',
                            dest='threads',
                            default=1,
                            type=int,
                            help='Number of download threads to use')
        parser.add_argument(
            '--max-mem',
            dest='max_mem',
            default=None,
            type=int,
            help='Maximum memory threshold in kB; the program terminates itself when it is exceeded'
        )
        parser.add_argument('--assign-only',
                            dest='assign_only',
                            default=False,
                            action='store_const',
                            const=True,
                            help='Fetch assignees only')

        args = self.args = parser.parse_args(args=args_src[1:])

        self.threads = args.threads
        self.max_mem = args.max_mem

        self.config_file = args.config
        self.audit_file = os.path.join(os.getcwd(), 'audit.json')
        self.state_file_path = args.status if args.status is not None else os.path.join(
            os.getcwd(), 'state.json')

        if os.path.exists(self.state_file_path):
            utils.file_backup(self.state_file_path, backup_dir='.')

        if os.path.exists('.github-quit'):
            os.remove('.github-quit')

        sys.argv = [args_src[0]]
        logger.info('GitHub loader started, args: %s' % args)

        self.work()
        sys.argv = args_src
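
    # Illustrative invocation (assuming the surrounding module is run as a
    # script through an entry point that instantiates this loader and calls
    # main(); the file name below is an assumption):
    #
    #     python github_loader.py -c config.json -s state.json -t 4 --max-mem 4000000
    #
    # The flags map one-to-one onto the argparse options defined above.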