def main():
    n_max = 10000  # n first bboxes to process
    n_frames_max = 3000
    mask_thresh = 35
    videos_dir = "data/stanford_drone/videos/"
    scene_names = ["bookstore", "coupa", "deathCircle", "gates", "hyang",
                   "little", "nexus", "quad"]
    # val_videos_paths = [os.path.join(videos_dir, scene, "video0") for scene in scene_names]
    # show_and_save_video("data/stanford_drone/videos/coupa/video2", n_max, n_frames_max, mask_thresh)
    # exit(1)

    # to show: [0!(hard dark low res), 1(car), 4], 5! (car), 8! (large shadow), 11(car),
    # 21!(good quality), 26(fence), 31(solid persons), 36 (good quality)
    # final to show: 1(car), 5! (car), 8! (large shadow), 11(car), 21!(good quality),
    # 26(fence), 31(solid persons), 36 (good quality)
    i_video = 0
    for scene in scene_names:
        scene_subdirs = os.listdir(os.path.join(videos_dir, scene))
        for vid_dir_name in scene_subdirs:
            tracker = SummaryTracker()
            vid_dir = os.path.join(videos_dir, scene, vid_dir_name)
            print("-----------------")
            print(i_video, vid_dir)  # e.g. "data/stanford_drone/videos/coupa/video2"
            store_as_dataset(vid_dir, n_max, n_frames_max, mask_thresh)
            i_video += 1
            tracker.print_diff()
def test_SelectMemoryLeak():
    N = 50000

    def table_set(t, state):
        fvs = swsscommon.FieldValuePairs([("status", state)])
        t.set("123", fvs)

    def generator_SelectMemoryLeak():
        app_db = swsscommon.DBConnector("APPL_DB", 0, True)
        t = swsscommon.Table(app_db, "TABLE")
        for i in range(int(N / 2)):
            table_set(t, "up")
            table_set(t, "down")

    tracker = SummaryTracker()
    appl_db = swsscommon.DBConnector("APPL_DB", 0, True)
    sel = swsscommon.Select()
    sst = swsscommon.SubscriberStateTable(appl_db, "TABLE")
    sel.addSelectable(sst)
    thr = Thread(target=generator_SelectMemoryLeak)
    thr.daemon = True
    thr.start()
    time.sleep(5)
    for _ in range(N):
        state, c = sel.select(1000)
    diff = tracker.diff()
    cases = []
    for name, count, _ in diff:
        if count >= N:
            cases.append("%s - %d objects for %d repeats" % (name, count, N))
    thr.join()
    assert not cases
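The assertion above relies on the row format of `SummaryTracker.diff()`: each row is a `[type_name, count, total_size]` triple summarizing objects created since the tracker's baseline. A minimal sketch of the same leak-detection pattern, using a plain Python list as a stand-in for the leak:

from pympler.tracker import SummaryTracker

tracker = SummaryTracker()
leaked = [object() for _ in range(100000)]  # stand-in for a leak

# Each diff row is [type_name, count, total_size]; flag types whose object
# count grew by at least as many objects as we allocated.
for name, count, size in tracker.diff():
    if count >= 100000:
        print("possible leak: %s (%d objects, %d bytes)" % (name, count, size))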
def get_memory_tracker():
    global _memory_tracker
    if _memory_tracker is None:
        from pympler.tracker import SummaryTracker
        _memory_tracker = SummaryTracker()
    return _memory_tracker
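A usage sketch for this lazy singleton; the module-level `_memory_tracker = None` initializer and the workload function are assumptions for illustration (the pattern defers the pympler import until tracking is actually requested):

_memory_tracker = None  # module-level cache assumed by get_memory_tracker()

def run_workload():
    return [str(i) for i in range(10000)]  # hypothetical workload

get_memory_tracker()               # first call creates the tracker (baseline)
data = run_workload()
get_memory_tracker().print_diff()  # summary of objects created since the baseline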
def test_reopt_sizing_pvwatts(solar_resource):
    round = 0
    tracker = SummaryTracker()
    while round < 25:  # multiple runs required to check for memory leaks
        round += 1
        sys = pv.default("PVWattsBatteryCommercial")
        sys.SolarResource.solar_resource_file = solar_resource
        batt = bt.from_existing(sys, "PVWattsBatteryCommercial")
        sys.SolarResource.solar_resource_data = dict({'lat': 3, 'lon': 3})
        batt.Battery.crit_load = [0] * 8760
        fin = ur.from_existing(sys, "PVWattsBatteryCommercial")
        post = sys.Reopt_size_battery_post()
        assert('Scenario' in post['reopt_post'])
        assert(post['reopt_post']['Scenario']['Site']['latitude'] == pytest.approx(3, 0.1))
    tracker_diff = tracker.diff()
    tracker.print_diff()
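The test above only prints the diff after the 25 runs; to make a leak actually fail the test, one could assert on `tracker.diff()` instead. A hedged sketch; the per-type growth threshold of 1000 objects is an arbitrary assumption, not part of the original test:

# Fail the test if any single type grew by more than a chosen threshold.
leaks = [(name, count, size)
         for name, count, size in tracker.diff()
         if count > 1000]
assert not leaks, "possible leaks after 25 runs: %s" % leaks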
def __init__(self, lock, queue, sniffer, analyzer, AerospikeClient):
    self.lock = lock
    self.queue = queue
    self.sniffer = sniffer
    self.analyzer = analyzer
    self.timestamp = ""
    self.logger = logging.getLogger(__name__)
    self.attack_time, self.attack_percent = ConfigurationReader.scriptExecutionInfo()
    self.first_value = 0
    self.attack_alert = 0
    self.Aero = AerospikeClient
    self.attack_alert_limit = ((self.attack_time * 10) / 100) * self.attack_percent
    self.custom_script_counter = ConfigurationReader.higherScriptExecution()
    self.script_execution_counter = self.custom_script_counter
    self.IsAttack = 0
    self.selftracker = SummaryTracker()
def start(self):
    if self.debugging_level >= 1:
        import cProfile
        import io
        import pstats
        pr = cProfile.Profile()
        pr.enable()
    if self.debugging_level >= 2:
        from pympler.tracker import SummaryTracker
        tracker = SummaryTracker()

    # context must be created here because this function may be executed in a
    # separate process/thread
    self.connect()
    heartbeat_ts = time.time()

    # GC causes unplanned stalls and disrupts precisely timed collection.
    # Disable it and do it manually before sleeping.
    gc.disable()
    try:
        logger.info('MA started')
        while not self.stopped:
            if self.collectors:
                ts = time.time()
                if ts - (self.last_collect_second +
                         self.collect_time_decimal) >= self.tick_len - 0.01:
                    # This must be updated *before* collecting to prevent the
                    # send time from slowly drifting away
                    self.last_collect_second = int(ts)
                    result = []
                    for c in self.collectors:
                        result.extend(c())
                    logger.info('Collected: ' + str(result))
                    self.timestamp_and_send_obj(result, ts)
                else:
                    pass
            else:
                self.last_collect_second = time.time()

            gc.collect()
            flush_log()

            # Print out memory usage every minute
            if self.debugging_level >= 2 and int(time.time()) % 60 == 0:
                print('Time: ' + time.asctime(time.localtime(time.time())))
                tracker.print_diff()

            # Calculate the precise time for next collection
            sleep_second = (self.last_collect_second + self.collect_time_decimal +
                            self.tick_len - time.time())
            sleep_second = max(sleep_second, 0)
            sleep_start_ts = time.time()
            p = dict(self.poller.poll(sleep_second * 1000))
            logger.debug('Slept {0} seconds'.format(time.time() - sleep_start_ts))
            if self.socket in p:
                req = pickle.loads(zlib.decompress(self.socket.recv()))
                assert req[0] == LustreCommon.protocol_ver
                heartbeat_ts = time.time()
                if isinstance(req[2], bytes):
                    # this is a command, not data
                    cmd = req[2]
                    if cmd == b'ACTION':
                        action = req[3]
                        if action == 0:
                            logger.info('Received action 0, ignored')
                        else:
                            logger.info('Performing action {action}'.format(action=action))
                            self.controller(req[3:])
                    elif cmd == b'HB':
                        logger.debug('Received heartbeat')
                    else:
                        logger.warning('Unknown command received: ' + cmd)
                else:
                    logger.error('Corrupted message received: ' + req)
            else:
                if heartbeat_ts and time.time() - heartbeat_ts > 5:
                    # reconnect
                    self.disconnect()
                    self.connect()
                    logger.warning('Connection timeout, reconnected')
                    heartbeat_ts = time.time()
        logger.info('MA stopped')
    finally:
        gc.enable()
        if self.debugging_level >= 1:
            pr.disable()
            s = io.StringIO()
            sortby = 'cumulative'
            ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
            ps.print_stats()
            print(s.getvalue())
import psutil
from pympler.tracker import SummaryTracker

_tr = SummaryTracker()
_print_mem_enabled = False
_mem_used = 0
_prev_mem_used = 0


def disable_print_mem():
    global _print_mem_enabled
    _print_mem_enabled = False


def enable_print_mem():
    global _print_mem_enabled
    _print_mem_enabled = True


def print_mem_usage():
    global _mem_used, _prev_mem_used, _print_mem_enabled
    if not _print_mem_enabled:
        return
    # _tr.print_diff()
    mem = psutil.virtual_memory()
    _prev_mem_used = _mem_used
    _mem_used = (mem.total - mem.available) / 1024 / 1024
    print(f"virt_mem >> used: {_mem_used:.0f}, prev: {_prev_mem_used:.0f}, "
          f"diff: {_mem_used - _prev_mem_used:.0f}")
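A usage sketch for the helpers above; the module name `memdebug` is an assumption for illustration. Note this module tracks whole-machine virtual memory via psutil, so the printed diff includes activity from other processes:

import memdebug  # hypothetical module name for the snippet above

memdebug.enable_print_mem()
buf = [bytearray(1024) for _ in range(10000)]  # allocate roughly 10 MB
memdebug.print_mem_usage()  # prints used/prev/diff in MB
memdebug.print_mem_usage()  # immediate second call: diff near 0
memdebug.disable_print_mem()
memdebug.print_mem_usage()  # disabled: prints nothing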
from pympler.tracker import SummaryTracker

tracker = SummaryTracker()
tracker.print_diff()
s = "str"
tracker.print_diff()
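The first `print_diff()` call mostly reports the tracker's own setup allocations and establishes a clean baseline; the second shows only what changed in between (here, little more than the one new `str`). The same idea with an allocation large enough to stand out in pympler's summary table (`types | # objects | total size`):

from pympler.tracker import SummaryTracker

tracker = SummaryTracker()
tracker.print_diff()  # absorbs tracker setup noise, establishes the baseline
big = [str(i) for i in range(100000)]
tracker.print_diff()  # the new list and str objects now dominate the table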
def add_single_experiment(directory, df_filepath, datasets):
    csv_filenames = [
        f for f in os.listdir(directory)
        if f[0:5] == "resul" and f[-4:] == ".csv"
    ]
    print(list(os.walk(directory)))
    weights_folder = [f for f in os.walk(directory)][0][1][0]
    for filename in csv_filenames:
        experiment = directory_to_experiment_info(directory)
        with open("./" + directory + "/" + filename, "r") as csvfile:
            logging.debug("./" + directory + "/" + filename)
            plots = csv.reader(csvfile, delimiter=",")
            headers = next(plots, None)
            for header in headers:
                experiment[header] = []
            for row in plots:
                for i, header in enumerate(headers):
                    experiment[header].append(float(row[i]))
        experiment["minimum_val_loss"] = min(experiment["val_loss"])
        experiment["minimum_loss"] = min(experiment["loss"])
        number = int(filename.split("-")[-1].split(".")[0])
        experiment["number"] = number
        experiment["repeat"] = math.floor(number / 4)
        if (number % 4) / 2 < 1:
            experiment["elastic_distortions"] = True
        else:
            experiment["elastic_distortions"] = False
        if (number % 4) % 2 != 0:
            experiment["separate_channel_ops"] = True
        else:
            experiment["separate_channel_ops"] = False
        print(experiment.keys())

        if not experiment_in_dataframe(df_filepath, experiment):
            weights_file = (directory + "/" + weights_folder + "/" +
                            "mask_rcnn_fk2018_best.h5")
            config = FK2018.FKConfig()
            print("got config")

            class InferenceConfig(config.__class__):
                # Run detection on one image at a time
                GPU_COUNT = 1
                IMAGES_PER_GPU = 1

            config = InferenceConfig()
            config.display()

            # Device to load the neural network on.
            # Useful if you're training a model on the same machine, in which
            # case use CPU and leave the GPU for training.
            # DEVICE = "/gpu:0"  # /cpu:0 or /gpu:0

            # Inspect the model in training or inference modes
            # values: 'inference' or 'training'
            # TODO: code for 'training' test mode not ready yet
            TEST_MODE = "inference"

            # Must call before using the dataset
            # with tf.device(DEVICE):
            tf_config = tf.ConfigProto()
            tf_config.gpu_options.allow_growth = True
            with tf.Session(config=tf_config).as_default():
                model = modellib.MaskRCNN(mode="inference",
                                          model_dir="./log",
                                          config=config)
                # Load weights
                logging.debug("Loading weights " + str(weights_file))
                model.load_weights(weights_file, by_name=True)
                image_ids = datasets[0].image_ids
                tracker = SummaryTracker()
                logging.debug("getting stats")
                logging.debug("tunasand stats")
                experiment["AP_list"], \
                    experiment["classless_AP_list"], \
                    experiment["precision_list"], \
                    experiment["classless_precision_list"], \
                    experiment["recall_list"], \
                    experiment["classless_recall_list"], \
                    experiment["predicted_class_list"], \
                    experiment["gt_class_list"], \
                    experiment["predicted_size_list"], \
                    experiment["gt_size_list"], \
                    experiment["overlaps"], \
                    experiment["classless_overlaps_list"], \
                    experiment["total_predicted_pixels"], \
                    experiment["total_groundtruth_pixels"], \
                    experiment["total_overlapping_pixels"] = \
                    compute_both_batch_aps(image_ids, datasets[0], model, config)
                # get_stats(weights_file, datasets[0])
                objgraph.show_most_common_types()
                roots = objgraph.get_leaking_objects()
                print(len(roots))
                tracker.print_diff()

                for i, dataset in enumerate(["AE_area1", "AE_area2", "AE_area3"]):
                    image_ids = datasets[i + 1].image_ids
                    logging.debug(f"aestats, {dataset}")
                    experiment[f"AP_list_{dataset}"], \
                        experiment[f"classless_AP_list_{dataset}"], \
                        experiment[f"precision_list_{dataset}"], \
                        experiment[f"classless_precision_list_{dataset}"], \
                        experiment[f"recall_list_{dataset}"], \
                        experiment[f"classless_recall_list_{dataset}"], \
                        experiment[f"predicted_class_list_{dataset}"], \
                        experiment[f"gt_class_list_{dataset}"], \
                        experiment[f"predicted_size_list_{dataset}"], \
                        experiment[f"gt_size_list_{dataset}"], \
                        experiment[f"overlaps_{dataset}"], \
                        experiment[f"classless_overlaps_list_{dataset}"], \
                        experiment[f"total_predicted_pixels_{dataset}"], \
                        experiment[f"total_groundtruth_pixels_{dataset}"], \
                        experiment[f"total_overlapping_pixels_{dataset}"] = \
                        compute_both_batch_aps(image_ids, datasets[i + 1], model, config)
                    # get_stats(weights_file, datasets[i+1])
                    objgraph.show_growth()
                    roots = objgraph.get_leaking_objects()
                    print(len(roots))
                    tracker.print_diff()

            update_dataframe(df_filepath, experiment)
        else:
            print("already in dataframe, skipping " + filename)
import sys, os
import time
from pympler.tracker import SummaryTracker
import testsuites
import flaminglog.logrule as flaminglog

# runs a memory profiler
tracker = SummaryTracker()
tracker.print_diff()
for i in range(0, 100):
    # runs all tests except cleanup
    testsuites.run_full_suite(0)
    if i % 10 == 0:
        tracker.print_diff()
# sleep to allow any garbage collection to fully finish
time.sleep(10)
tracker.print_diff()
print("If the '# objects' and the 'total size' are both low numbers then"
      " there is no memory leak")
class GitHubLoader(Cmd):
    """
    GitHub SSH keys loader
    """
    prompt = '$> '
    LINK_FACTOR = 70

    USER_URL = 'https://api.github.com/users/%s'
    USERS_URL = 'https://api.github.com/users?since=%s'
    KEYS_URL = 'https://api.github.com/users/%s/keys'
    KEYS_ID_URL = 'https://api.github.com/user/%s/keys'

    def __init__(self, attempts=5, threads=1, state=None, state_file=None,
                 config_file=None, audit_file=None, max_mem=None,
                 users_only=False, merge=False, update_keys=False,
                 *args, **kwargs):
        Cmd.__init__(self, *args, **kwargs)
        self.t = Terminal()
        self.trace_logger = Tracelogger(logger=logger)

        self.attempts = int(attempts)
        self.total = None
        self.terminate = False
        self.since_id = 0
        self.last_users_count = None

        self.user_lock = Lock()
        self.processed_user_set = set()
        self.processed_user_set_lock = Lock()

        self.max_mem = max_mem
        self.merge = merge
        self.update_keys = update_keys
        self.users_only = users_only
        self.users_per_page = 30
        self.users_bulk_load_pages = 500
        self.user_load_bulk = 5000
        self.user_refill_lock = Lock()

        self.state = state
        self.state_file_path = state_file
        self.rate_limit_reset = None
        self.rate_limit_remaining = None

        self.config = None
        self.config_file = config_file

        self.audit_file = audit_file
        self.audit_records_buffered = []
        self.audit_lock = Lock()

        self.stop_event = threading.Event()
        self.threads = int(threads)
        self.link_queue = Queue.PriorityQueue()  # Store links to download here
        self.worker_threads = []

        self.state_thread = None
        self.state_thread_lock = Lock()

        self.resources_list = []
        self.resources_queue = Queue.PriorityQueue()
        self.local_data = threading.local()

        self.new_users_events = EvtDequeue()
        self.new_keys_events = EvtDequeue()

        self.db_config = None
        self.engine = None
        self.session = None
        self.mem_tracker = None

    def signal_handler(self, signal, frame):
        """
        Signal handler - terminate gracefully
        :param signal:
        :param frame:
        :return:
        """
        logger.info('CTRL+C pressed')
        self.trigger_stop()

    def trigger_stop(self):
        """
        Sets terminal conditions to true
        :return:
        """
        self.terminate = True
        self.stop_event.set()

    def trigger_quit(self):
        """
        Terminal condition & file change
        :return:
        """
        self.trigger_stop()
        utils.try_touch('.github-quit')

    #
    # CMD handlers
    #

    def do_quit(self, arg):
        self.trigger_quit()
        logger.info('Waiting for thread termination')
        time.sleep(1)
        logger.info('Quitting')
        return Cmd.do_quit(self, arg)

    def do_reset(self, line):
        print('\033c')

    def do_gc(self, line):
        gc.collect()

    def do_mem_top(self, line):
        print(mem_top.mem_top())

    def do_mem_track_init(self, line):
        self.mem_tracker = SummaryTracker()

    def do_mem_track_diff(self, line):
        self.mem_tracker.print_diff()

    def do_mem_track_deinit(self, line):
        self.mem_tracker = None

    def do_mem(self, line):
        print('Memory usage: %s kB' %
              resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

    def do_state(self, line):
        js = self.state_gen()
        if line is None or len(line) == 0:
            del js['link_queue']
            del js['resource_stats']
        elif line == '1':
            del js['link_queue']
        print(json.dumps(js, indent=2))

    def do_deq_enable(self, line):
        self.new_keys_events.disabled = False
        self.new_users_events.disabled = False

    def do_deq_disable(self, line):
        self.new_keys_events.disabled = True
        self.new_users_events.disabled = True

    #
    # Init
    #

    def init_config(self):
        """
        Loads config & state files
        :return:
        """
        if self.state_file_path is not None and os.path.exists(self.state_file_path):
            with open(self.state_file_path, 'r') as fh:
                self.state = json.load(fh, object_pairs_hook=OrderedDict)
                logger.info('State loaded: %s' % os.path.abspath(self.state_file_path))

        with open(self.config_file, 'r') as fh:
            self.config = json.load(fh, object_pairs_hook=OrderedDict)
            logger.info('Config loaded: %s' % os.path.abspath(self.config_file))

        if 'since_id' in self.config:
            self.since_id = self.config['since_id']

        # Process resources - randomized
        if 'res' in self.config:
            res_tmp = self.config['res']
            random.shuffle(res_tmp)
            for idx, res in enumerate(res_tmp):
                r = AccessResource(usr=res['usr'], token=res['token'], idx=idx)
                self.resources_list.append(r)
                self.resources_queue.put(r)
                logger.info('Resource %02d loaded: %s' % (idx, r.usr))
        else:
            # unauth
            r = AccessResource(usr=None, token=None)
            self.resources_list.append(r)
            self.resources_queue.put(r)

    def init_db(self):
        """
        Initializes database engine & session.
        Has to be done on main thread.
        :return:
        """
        self.db_config = databaseutils.process_db_config(self.config['db'])

        from sqlalchemy import create_engine
        from sqlalchemy.orm import sessionmaker, scoped_session
        self.engine = create_engine(self.db_config.constr, pool_recycle=3600)
        self.session = scoped_session(sessionmaker(bind=self.engine))

        # Make sure tables are created
        DB_Base.metadata.create_all(self.engine)

    def init_workers(self):
        """
        Initialize worker threads
        :return:
        """
        logger.info('Starting %d working threads' % self.threads)
        for idx in range(self.threads):
            t = threading.Thread(target=self.work_thread_main, args=(idx,))
            self.worker_threads.append(t)

        # Kick-off all threads
        for t in self.worker_threads:
            t.start()

        logger.info('Worker threads started')

    def cli(self):
        """
        CLI thread
        :return:
        """
        logger.info('CLI thread started')
        self.cmdloop()
        logger.info('Terminating CLI thread')

    #
    # Operation
    #

    def work(self):
        """
        Main thread work method
        :return:
        """
        # Interrupt signals
        signal.signal(signal.SIGINT, self.signal_handler)

        self.init_config()
        self.init_db()

        # Resume last state
        self.state_resume()

        # Monitor threads.
        self.state_thread = threading.Thread(target=self.state_main, args=())
        self.state_thread.start()

        # If there is no link to process - create from since.
        if self.link_queue.qsize() == 0:
            self.kickoff_links()

        # Worker threads
        self.init_workers()

        logger.info('Main thread started %s %s %s' %
                    (os.getpid(), os.getppid(), threading.current_thread()))

        # CLI thread
        cli_thread = threading.Thread(target=self.cli, args=())
        cli_thread.setDaemon(True)
        cli_thread.start()

        # Join on workers
        self.after_loop()
        logger.info('Terminating main thread')
        return None

    def after_loop(self, wait_for_state=True):
        """
        After work loop finishes
        :return:
        """
        logger.info('Waiting termination of slave threads')

        # Wait here for termination of all workers and monitors.
        try:
            for t in self.worker_threads:
                t.join()
            if wait_for_state:
                self.state_thread.join()
        except:
            logger.error('Exception during thread join')
            logger.error(traceback.format_exc())

        logger.info('All threads terminated, last state save')
        self.state_save()

    def work_thread_main(self, idx):
        """
        Worker thread main loop
        :return:
        """
        self.local_data.idx = idx
        logger.info('Working thread %d started' % idx)

        while not self.terminate and not self.stop_event.is_set():
            self.interruptible_sleep_delta(0.1)

            # Get credential to process link with
            resource = self.resource_allocate()
            if resource is None:
                continue

            # We have resource, now get the job
            job = None
            try:
                job = self.link_queue.get(True, timeout=1.0)
            except Queue.Empty:
                self.resource_return(resource)
                continue

            # If job last fail is too recent - put again back to queue
            if time.time() - job.last_fail < 3.0:
                # re-insert to the back of the queue for later processing
                self.link_queue.put(job)
                self.resource_return(resource)
                continue

            # Job processing starts here - fetch data page with the resource.
            js_data = None
            try:
                self.local_data.job = job
                self.local_data.resource = resource
                js_data, headers, raw_response = self.load_page_local()
            except RateLimitHit as e:
                logger.error('[%d] Rate limit hit: %s, failcnt: %d, res: %s, exception: %s'
                             % (idx, job.url, job.fail_cnt, resource.usr, e))
                continue
            except Exception as e:
                logger.error('[%d] Exception in processing job: %s, failcnt: %d, res: %s, exception: %s'
                             % (idx, job.url, job.fail_cnt, resource.usr, e))
                self.on_job_failed(job)
                continue
            finally:
                self.resource_return(resource)
                self.local_data.resource = None
                self.local_data.last_usr = resource.usr
                self.local_data.last_remaining = resource.remaining

            # Process downloaded data here.
            try:
                if js_data is None:
                    self.audit_log('404', job.url, jtype=job.type, job=job)
                    self.flush_audit()
                    continue

                if job.type == DownloadJob.TYPE_USERS:
                    self.process_users_data(job, js_data, headers, raw_response)
                else:
                    self.process_keys_data(job, js_data, headers, raw_response)
            except Exception as e:
                logger.error('[%d] Unexpected exception, processing type %s, link %s: cnt: %d, res: %s, %s'
                             % (idx, job.type, job.url, job.fail_cnt, resource.usr, e))
                self.trace_logger.log(e)
                self.on_job_failed(job)
            finally:
                self.local_data.resource = None
                self.local_data.job = None
                self.local_data.last_usr = None
                self.local_data.last_remaining = None
                resource = None
                job = None
                headers = None
                raw_response = None

        logger.info('Terminating worker thread %d' % idx)

    def on_job_failed(self, job):
        """
        If job failed, this teaches it how to behave
        :param job:
        :return:
        """
        job.fail_cnt += 1
        job.last_fail = time.time()

        # if failed too many times - log and discard.
        if job.fail_cnt > 35:
            logger.warning('Job failed too many times %s' % job.url)
            self.audit_log('too-many-fails', job.url, jtype=job.type, job=job)
            self.flush_audit()
        else:
            # re-insert to the queue for later processing
            self.link_queue.put(job)

    def load_page_local(self):
        """
        Loads page stored in thread local
        :return:
        """
        auth = None
        resource = self.local_data.resource
        if resource.usr is not None:
            auth = HTTPBasicAuth(resource.usr, resource.token)

        job = self.local_data.job

        res = requests.get(job.url, timeout=10, auth=auth)
        headers = res.headers

        resource.reset_time = utils.try_float(headers.get('X-RateLimit-Reset'))
        resource.remaining = utils.try_int(headers.get('X-RateLimit-Remaining'))
        resource.last_used = time.time()
        resource.used_cnt += 1

        if res.status_code == 403 and resource.remaining is not None and resource.remaining < 10:
            resource.fail_cnt += 1
            raise RateLimitHit

        if res.status_code == 404:
            resource.fail_cnt += 1
            logger.warning('URL not found: %s' % job.url)
            return None, None, None

        if res.status_code // 100 != 2:
            resource.fail_cnt += 1
            res.raise_for_status()

        data = res.content
        if data is None:
            resource.fail_cnt += 1
            raise Exception('Empty response')

        js = json.loads(data, object_pairs_hook=OrderedDict)
        return js, headers, res

    def resource_allocate(self, blocking=True, timeout=1.0):
        """
        Takes resource from the pool.
        If the resource has low remaining credit, thread is suspended to re-charge.
        :return: resource or None if not available in the time
        """
        try:
            resource = self.resources_queue.get(True, timeout=1.0)
            if resource.remaining is not None and resource.remaining <= self.threads + 2:
                sleep_sec = (resource.reset_time - time.time()
                             if resource.reset_time is not None else 15 * 60)
                sleep_sec += 120  # extra 2 minutes to avoid problems with resources

                logger.info('Rate limit exceeded on resource %s, remaining: %s, '
                            'sleeping till: %s, it is %d seconds, %d minutes'
                            % (resource.usr, resource.remaining, resource.reset_time,
                               sleep_sec, sleep_sec / 60.0))
                self.sleep_interruptible(time.time() + sleep_sec)
                logger.info('Resource sleep finished %s' % resource.usr)

                # Reset estimations, needs to be refreshed
                resource.remaining = None
                resource.reset_time = None

            return resource
        except Queue.Empty:
            return None

    def resource_return(self, res):
        """
        Returns resource to the pool
        :param res:
        :return:
        """
        self.resources_queue.put(res)

    def sleep_interruptible(self, until_time):
        """
        Interruptible sleep - sleep until given time.
        :param until_time:
        :return:
        """
        while time.time() <= until_time:
            time.sleep(1.0)
            if self.terminate or self.stop_event.is_set():
                return

    def interruptible_sleep_delta(self, sleep_time):
        """
        Sleeps the current thread for given amount of seconds,
        stop event terminates the sleep - to exit the thread.
        :param sleep_time:
        :return:
        """
        if sleep_time is None:
            return

        sleep_time = float(sleep_time)
        if sleep_time == 0:
            return

        sleep_start = time.time()
        while not self.stop_event.is_set() and not self.terminate:
            time.sleep(0.1)
            if time.time() - sleep_start >= sleep_time:
                return

    #
    # Parser and processing logic
    #

    def kickoff_links(self):
        """
        Kick off the scraping by adding initial links to the queue
        :return:
        """
        if self.update_keys:
            self.fill_user_key_links()
        else:
            job = DownloadJob(url=self.USERS_URL % self.since_id,
                              jtype=DownloadJob.TYPE_USERS)
            self.link_queue.put(job)
            logger.info('Kickoff link added: %s' % job.url)

    def process_users_data(self, job, js, headers, raw_response):
        """
        Process user data - produce keys links + next user link
        :param job:
        :param js:
        :param headers:
        :param raw_response:
        :return:
        """
        max_id = 0
        github_users = []
        cur_time = int(time.time())
        for user in js:
            if 'id' not in user:
                logger.error('Field ID not found in user')
                continue

            github_user = GitHubUser(user_id=int(user['id']),
                                     user_name=user['login'],
                                     user_type=user['type'],
                                     user_url=user['url'])
            github_users.append(github_user)
            if github_user.user_id > max_id:
                max_id = github_user.user_id

            if self.users_only:
                continue

            key_url = '%s/keys' % github_user.user_url
            new_job = DownloadJob(url=key_url, jtype=DownloadJob.TYPE_KEYS,
                                  user=github_user,
                                  priority=random.randint(0, 1000),
                                  time_added=cur_time)
            self.link_queue.put(new_job)

        # Link with the maximal user id
        users_url = self.USERS_URL % max_id
        new_job = DownloadJob(url=users_url, jtype=DownloadJob.TYPE_USERS,
                              time_added=cur_time)

        # Optimizing the position of this link in the link queue
        queue_size = self.link_queue.qsize()
        queue_size_max = self.LINK_FACTOR * self.threads
        fill_up_ratio = queue_size / float(queue_size_max)

        # Key jobs are uniformly distributed on priorities 0...1000.
        # To increase queue size pick priority closer to 1000, to decrease, closer to 0
        priority = random.randint(0, 500)
        if queue_size < queue_size_max:
            priority = int((1 - fill_up_ratio) * 5000) + 500
        if queue_size > 3 * queue_size_max:
            priority = 0

        new_job.priority = priority
        lucky_one = False
        with self.user_lock:
            if self.since_id < max_id:
                self.since_id = max_id
                self.link_queue.put(new_job)
                lucky_one = True

                # Bulk user optimisation - add more users at once, multithreading
                if self.users_only:
                    self.bulk_user_only_load(max_id=max_id, cur_time=cur_time,
                                             priority=priority)

        logger.info('[%02d, usr=%20s, remaining=%5s] Processed users link %s, '
                    'Next since: %3s. ResQSize: %4d, LQSize: %4d, fill-up: %0.4f, '
                    'priority: %4s, ram: %s kB, new=%s, New users: [%s]'
                    % (self.local_data.idx, self.local_data.last_usr,
                       self.local_data.last_remaining, len(github_users) + 1, max_id,
                       self.resources_queue.qsize(), queue_size, fill_up_ratio,
                       priority, resource.getrusage(resource.RUSAGE_SELF).ru_maxrss,
                       lucky_one,
                       ', '.join([str(x.user_name) for x in github_users])))

        # Store all users.
        self.store_users_list(github_users)

    def bulk_user_only_load(self, max_id, cur_time, priority):
        """
        Fills the queue with multiple user links
        :param max_id:
        :param cur_time:
        :param priority:
        :return:
        """
        new_jobs = []
        for page_idx in range(1, self.users_bulk_load_pages + 1):
            users_url = self.USERS_URL % (max_id + self.users_per_page * page_idx)
            new_job = DownloadJob(url=users_url, jtype=DownloadJob.TYPE_USERS,
                                  time_added=cur_time, priority=priority - page_idx)
            new_jobs.append(new_job)

        self.since_id = max_id + self.users_per_page * self.users_bulk_load_pages
        logger.info('[%02d] Bulk load, new max=%s' % (self.local_data.idx, self.since_id))

        for job in new_jobs:
            self.link_queue.put(job)

    def store_users_list(self, users):
        """
        Stores all users in the list
        :param users:
        :return:
        """
        # Handling gaps in the user ID space. With user-only optimization it
        # causes overlaps.
        reduced_by = 0
        with self.processed_user_set_lock:
            ids = [user.user_id for user in users]
            ids_ok = []
            for id in ids:
                if id in self.processed_user_set:
                    reduced_by += 1
                    continue
                self.processed_user_set.add(id)
                ids_ok.append(id)
            users = [user for user in users if user.user_id in ids_ok]

        # Bulk user load
        s = self.session()
        id_list = sorted([user.user_id for user in users])
        db_users = s.query(GitHubUserDb).filter(GitHubUserDb.id.in_(id_list)).all()
        db_user_map = {user.id: user for user in db_users}

        for user in users:
            self.new_users_events.insert()

            # Store user to the DB
            try:
                db_user = utils.defvalkey(db_user_map, key=user.user_id)
                self.store_user(user, s, db_user=db_user, db_user_loaded=True)
            except Exception as e:
                logger.warning('[%02d] Exception in storing user %s'
                               % (self.local_data.idx, e))
                self.trace_logger.log(e)
                logger.info('[%02d] idlist: %s' % (self.local_data.idx, id_list))
                self.trigger_quit()
                break

        try:
            s.commit()
            # logger.info('[%02d] Commited, reduced by: %s' % (self.local_data.idx, reduced_by))
        except Exception as e:
            logger.warning('[%02d] Exception in storing bulk users' % self.local_data.idx)
            logger.warning(traceback.format_exc())
            logger.info('[%02d] idlist: %s' % (self.local_data.idx, id_list))
            self.trigger_quit()
        finally:
            utils.silent_close(s)

    def process_keys_data(self, job, js, headers, raw_response):
        """
        Processing key loaded data
        :param job:
        :param js:
        :param headers:
        :param raw_response:
        :return:
        """
        js_empty = js is None or len(js) == 0

        # Expect failures, commit everything before
        if self.merge and not js_empty:
            try:
                s = self.session()
                s.commit()
            except Exception as e:
                logger.warning('Could not pre-commit: %s' % e)

        # Store each key.
        key_ids = []
        for key in js:
            s = None
            self.new_keys_events.insert()
            try:
                s = self.session()
                self.store_key(job.user, key, s)
                key_ids.append(int(key['id']))
                s.commit()

                self.assoc_key(job.user.user_id, key['id'], s)
                s.commit()

                s.flush()        # writes changes to DB
                s.expunge_all()  # removes objects from session
            except Exception as e:
                logger.warning('Exception in storing key %s' % e)
                self.trace_logger.log(e)
            finally:
                utils.silent_close(s)
                s = None

        # Deassoc lost keys
        try:
            s = self.session()
            self.deassoc_lost_keys(job.user.user_id, key_ids, s)
            s.commit()
        except Exception as e:
            logger.warning('Exception in deassoc for users %s : %s'
                           % (job.user.user_id, e))
            self.trace_logger.log(e)
        finally:
            utils.silent_close(s)
            s = None

        self.on_keys_processed()

    def on_keys_processed(self):
        """
        Event called when keys have been processed.
        Currently used to re-fill the link queue in the update key scenario.
        :return:
        """
        if not self.update_keys:
            return

        with self.user_refill_lock:
            qsize = self.link_queue.qsize()
            if qsize < 30:
                self.fill_user_key_links()

    def store_user(self, user, s, db_user=None, db_user_loaded=False):
        """
        Stores username to the database.
        :param user:
        :return:
        """
        type_id = 0
        if user.user_type == 'User':
            type_id = 1
        elif user.user_type == 'Organization':
            type_id = 2

        try:
            if not db_user_loaded:
                db_user = s.query(GitHubUserDb)\
                    .filter(GitHubUserDb.id == user.user_id)\
                    .one_or_none()

            if db_user is not None:
                db_user.date_last_check = salch.func.now()
                db_user.usr_type = type_id
                s.merge(db_user)
                return 0
        except Exception as e:
            self.trace_logger.log(e)
            logger.warning('User query problem: %s' % e)

        # Store a new user here
        try:
            db_user = GitHubUserDb()
            db_user.id = user.user_id
            db_user.username = user.user_name
            db_user.usr_type = type_id
            s.add(db_user)
            return 0
        except Exception as e:
            self.trace_logger.log(e)
            logger.warning('[%02d] Exception during user store: %s'
                           % (self.local_data.idx, e))
            if db_user_loaded:
                raise
            return 1

    def load_existing_key(self, key, s):
        """
        Loads existing key if exists
        :param key:
        :param s:
        :return:
        """
        key_id = int(key['id'])
        return s.query(GitHubKey).filter(GitHubKey.id == key_id).one_or_none()

    def store_key(self, user, key, s):
        """
        Stores user key to the database.
        :param user:
        :param key:
        :param s: current DB session
        :return:
        """
        # Loading phase
        existing_key = None
        try:
            if self.merge or self.update_keys:
                existing_key = self.load_existing_key(key, s)
        except Exception as e:
            logger.warning('Exception: %s' % e)

        # Storing phase
        try:
            if existing_key is not None:
                existing_key.date_last_check = salch.func.now()
                s.merge(existing_key)
                return existing_key

            key_id = int(key['id'])
            key_raw = key['key']
            key_type, key_val = [utils.strip(x) for x in key_raw.split(' ', 1)]

            db_key = GitHubKey()
            db_key.id = key_id
            db_key.key_id = key_id
            db_key.key_type = key_type
            db_key.key_user_found = user.user_name
            db_key.key_user_id_found = user.user_id
            db_key.text_raw = key_raw

            if key_type == 'ssh-rsa':
                try:
                    key_obj = utils.load_ssh_pubkey(key_raw)
                    if isinstance(key_obj, RSAPublicKey):
                        db_key.key_size = key_obj.key_size
                        numbers = key_obj.public_numbers()
                        db_key.key_modulus_hex = '%x' % numbers.n
                        db_key.key_exponent = numbers.e
                except Exception as e:
                    logger.info('Exception during processing the key[%s]: %s'
                                % (key_type, e))

            s.add(db_key)
            return db_key
        except Exception as e:
            utils.silent_rollback(s)
            logger.warning('Exception during key store: %s' % e)
            return 1

    def assoc_key(self, user_id, key_id, s):
        """
        Association user <-> key
        :param user_id:
        :param key_id:
        :param s:
        :return:
        """
        try:
            uassoc = None
            if self.merge:
                uassoc = s.query(GitHubUserKeys)\
                    .filter(GitHubUserKeys.user_id == user_id)\
                    .filter(GitHubUserKeys.key_id == key_id)\
                    .one_or_none()

                if uassoc is not None:
                    uassoc.lost_at = None
                    return

            if uassoc is None:
                uassoc = GitHubUserKeys()
                uassoc.user_id = user_id
                uassoc.key_id = key_id
                uassoc.fount_at = salch.func.now()
                uassoc.lost_at = None
                s.add(uassoc)
            return 0
        except Exception as e:
            utils.silent_rollback(s)
            logger.warning('Exception during key assoc: %s' % e)
            return 1

    def deassoc_lost_keys(self, user_id, key_id, s):
        """
        Mark all lost keys as lost for the given user (not in key_id)
        :param user_id:
        :param key_id:
        :param s:
        :return:
        """
        try:
            q = s.query(GitHubUserKeys)\
                .filter(GitHubUserKeys.user_id == user_id)

            if len(key_id) > 0:
                q = q.filter(GitHubUserKeys.key_id.notin_(key_id))

            q.update({GitHubUserKeys.lost_at: salch.func.now()},
                     synchronize_session='fetch')
            return 0
        except Exception as e:
            utils.silent_rollback(s)
            logger.warning('Exception during key deassoc: %s' % e)
            return 1

    def flush_state(self):
        """
        Flushes state/config to the state file
        :return:
        """
        self.state['since_id'] = self.since_id
        self.state['rate_limit_remaining'] = self.rate_limit_remaining
        self.state['rate_limit_reset'] = self.rate_limit_reset
        utils.flush_json(self.state, self.state_file_path)

    #
    # DB
    #

    def fill_user_key_links(self):
        """
        Loads next X users from the database, advances since_id
        :return:
        """
        # self.since_id
        s = self.session()
        try:
            db_users = s.query(GitHubUserDb)\
                .filter(GitHubUserDb.id > self.since_id)\
                .order_by(GitHubUserDb.id)\
                .limit(self.user_load_bulk)\
                .all()

            for user in db_users:
                key_url = self.KEYS_ID_URL % user.id
                github_user = GitHubUser(user_id=user.id,
                                         user_name=user.username,
                                         user_type=user.usr_type,
                                         user_url=self.USER_URL % user.username)

                new_job = DownloadJob(url=key_url, jtype=DownloadJob.TYPE_KEYS,
                                      user=github_user,
                                      priority=random.randint(0, 1000),
                                      time_added=time.time())
                self.link_queue.put(new_job)

                if user.id > self.since_id:
                    self.since_id = user.id
        except Exception as e:
            logger.warning('Exception in loading users: %s' % e)
            utils.silent_rollback(s)
        finally:
            utils.silent_close(s)

    #
    # Auditing - errors, problems for further analysis
    #

    def audit_log(self, evt=None, link=None, jtype=None, job=None):
        """
        Appends audit log to the buffer. Lock protected.
        :param evt:
        :param link:
        :return:
        """
        log = collections.OrderedDict()
        log['time'] = time.time()
        log['evt'] = evt
        log['jtype'] = jtype
        log['link'] = link

        if job is not None and isinstance(job, DownloadJob):
            log['job'] = job.to_json()

        with self.audit_lock:
            self.audit_records_buffered.append(log)

    def flush_audit(self):
        """
        Flushes audit logs to the JSON append-only file.
        Routine protected by the lock (no new audit record can be inserted
        while holding the lock).
        :return:
        """
        if self.audit_file is None:
            self.audit_records_buffered = []
            return

        self.audit_lock.acquire()
        try:
            if len(self.audit_records_buffered) == 0:
                return
            with open(self.audit_file, 'a') as fa:
                for x in self.audit_records_buffered:
                    fa.write(json.dumps(x) + "\n")
            self.audit_records_buffered = []
        except Exception as e:
            logger.error('Exception in audit log dump %s' % e)
        finally:
            self.audit_lock.release()

    #
    # State save / resume
    #

    def state_main(self):
        """
        State thread - periodical dump of the queues.
        :return:
        """
        logger.info('State thread started %s %s %s' %
                    (os.getpid(), os.getppid(), threading.current_thread()))
        try:
            while not self.stop_event.is_set() and not self.terminate:
                # Dump stats each x seconds.
                # Sleep is here because of dumping the state for the last time
                # just before the program quits.
                self.interruptible_sleep_delta(2)
                self.state_save()

                # Check memory conditions
                self.state_ram_check()
        except Exception as e:
            self.trace_logger.log(e)
            logger.error('Exception in state: %s' % e)
        finally:
            pass

        logger.info('State loop terminated')

    def state_ram_check(self):
        """
        Checks memory terminating conditions
        :return:
        """
        if self.max_mem is None:
            return

        cur_ram = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        if cur_ram <= self.max_mem:
            return

        logger.warning('Maximum memory threshold reached: %s kB = %s MB, threshold = %s kB'
                       % (cur_ram, cur_ram / 1024.0, self.max_mem))
        self.trigger_stop()

    def state_gen(self):
        """
        Dumps state
        :return:
        """
        try:
            js_q = collections.OrderedDict()
            js_q['gen'] = time.time()
            js_q['link_size'] = self.link_queue.qsize()
            js_q['since_id'] = self.since_id
            js_q['memory'] = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss

            # Dequeues
            self.new_users_events.maintain()
            self.new_keys_events.maintain()

            users_in_5min = self.new_users_events.under_limit(5 * 60)
            keys_in_5min = self.new_keys_events.under_limit(5 * 60)

            js_q['users_dequeue_size'] = self.new_users_events.len()
            js_q['keys_dequeue_size'] = self.new_keys_events.len()
            js_q['users_5min'] = users_in_5min
            js_q['keys_5min'] = keys_in_5min
            js_q['users_1min'] = users_in_5min / 5.0
            js_q['keys_1min'] = keys_in_5min / 5.0

            # link queue structure
            qdata = list(self.link_queue.queue)
            qdata.sort(cmp=DownloadJob.cmp)
            js_q['link_structure'] = ''.join([
                '.' if x.type == DownloadJob.TYPE_KEYS else 'U' for x in qdata
            ])

            # Stats.
            js_q['resource_stats'] = [x.to_json() for x in list(self.resources_list)]

            # Finally - the queue
            js_q['link_queue'] = [x.to_json() for x in qdata]
            return js_q
        except Exception as e:
            self.trace_logger.log(e)
            logger.error('Exception in state: %s', e)

    def state_save(self):
        """
        Saves the state
        :return:
        """
        try:
            js_q = self.state_gen()
            utils.flush_json(js_q, self.state_file_path)
        except Exception as e:
            self.trace_logger.log(e)
            logger.error('Exception in state: %s', e)

    def state_resume(self):
        """
        Attempts to resume the queues from the monitoring files
        :return:
        """
        try:
            if self.state is None:
                return

            if 'since_id' in self.state:
                self.since_id = self.state['since_id']

            if 'link_queue' in self.state:
                for rec in self.state['link_queue']:
                    job = DownloadJob.from_json(rec)
                    self.link_queue.put(job)
                logger.info('Link queue resumed, entries: %d'
                            % len(self.state['link_queue']))
        except Exception as e:
            self.trace_logger.log(e)
            logger.warning('Exception in resuming the state %s' % e)
            logger.error('State resume failed, exiting')
            sys.exit(1)
def test_functionality():
    n_tests_passed = 0
    round = 0
    while round < 50:  # multiple runs required to check for memory leaks
        if round == 0:
            tracker = SummaryTracker()
        round += 1

        a = GenericSystem.new()
        b = a.Plant

        # Test setting values with correct types
        a.Plant.derate = 1
        assert (a.Plant.derate == 1)
        assert (b.derate == 1)
        n_tests_passed += 1

        b.energy_output_array = (1, 2)
        assert (a.Plant.energy_output_array == (1, 2))
        assert (b.energy_output_array == (1, 2))
        n_tests_passed += 1

        # Test type checks with errors
        try:
            c = GenericSystem.new()
            c.Plant.energy_output_array = 1
        except:
            n_tests_passed += 1
        try:
            c = GenericSystem.new()
            c.Plant.energy_output_array = (1, "2")
        except:
            n_tests_passed += 1

        # Test assigning from dictionary
        PlantDict = {'derate': 10, 'energy_output_array': (10, 20)}
        b.assign(PlantDict)
        assert (b.derate == 10)
        assert (b.energy_output_array == (10, 20))
        n_tests_passed += 1

        PlantDict = {'derate': 1, 'energy_output_array': (2, 2)}
        a.Plant.assign(PlantDict)
        assert (a.Plant.derate == 1 and a.Plant.energy_output_array == (2, 2))
        n_tests_passed += 1

        PlantDict = {'derate': 10, 'energy_output_array': (10, 20)}
        try:
            c = GenericSystem.new()
            PlantDict['energy_output_array'] = ()
            c.Plant.assign(PlantDict)
        except:
            n_tests_passed += 1
        try:
            c = GenericSystem.new()
            PlantDict['energy_output_array'] = ((12, 20), (1, 1))
            c.Plant.assign(PlantDict)
        except:
            n_tests_passed += 1
        try:
            c = GenericSystem.new()
            PlantDict['derate'] = "derate"
            PlantDict['energy_output_array'] = (1, 2)
            c.Plant.assign(PlantDict)
        except:
            n_tests_passed += 1

        # exporting to dictionary
        ValDict = b.export()
        assert (ValDict['derate'] == 1 and ValDict['energy_output_array'] == (2, 2))
        n_tests_passed += 1

        # Test shared module (AdjustmentFactors)
        d = a.AdjustmentFactors
        d.constant = 1
        assert (d.constant == 1)
        n_tests_passed += 1

        d.hourly = (1, 2)
        assert (d.hourly == (1, 2))
        n_tests_passed += 1

        d.periods = ((1, 2), (3, 4))
        assert (d.periods == ((1, 2), (3, 4)))
        n_tests_passed += 1

        try:
            d.periods = ((1, 2))
        except:
            n_tests_passed += 1

        ValDict = d.export()
        assert (ValDict['constant'] == 1 and ValDict['hourly'] == (1, 2)
                and ValDict['periods'] == ((1, 2), (3, 4)))
        n_tests_passed += 1

        ValDict = {
            'constant': 10,
            "hourly": (10, 20),
            "periods": ((10, 20), (30, 40))
        }
        d.assign(ValDict)
        assert (ValDict['constant'] == 10 and ValDict['hourly'] == (10, 20)
                and ValDict['periods'] == ((10, 20), (30, 40)))
        n_tests_passed += 1

        # Test nested dictionary assignment and export
        TechDict = {
            'Plant': {
                'derate': 100,
                'energy_output_array': (100, 200)
            },
            'AdjustmentFactors': {
                'constant': 100,
                "hourly": (100, 200),
                "periods": ((100, 200), (300, 400))
            }
        }
        a.assign(TechDict)
        ValDict = a.Plant.export()
        assert (ValDict['derate'] == 100
                and ValDict['energy_output_array'] == (100, 200))
        n_tests_passed += 1

        ValDict = a.AdjustmentFactors.export()
        assert (ValDict['constant'] == 100 and ValDict['hourly'] == (100, 200)
                and ValDict['periods'] == ((100, 200), (300, 400)))
        n_tests_passed += 1

        # Test reading from PySSC
        data = ssc.data_create()
        ssc.data_set_number(data, b'derate', 1000)
        ssc.data_set_array(data, b'energy_output_array', [1000, 2000])
        a = GenericSystem.wrap(data)
        assert (a.Plant.derate == 1000)
        assert (a.Plant.energy_output_array == (1000, 2000))

        # Test strings and tables with error cases
        import PySAM.Pvwattsv5 as Pvwattsv5
        a = Pvwattsv5.new()
        a.SolarResource.solar_resource_file = "file"
        assert (a.SolarResource.solar_resource_file == "file")
        n_tests_passed += 1

        assert (a.SolarResource.export()['solar_resource_file'] == 'file')
        n_tests_passed += 1

        c = Pvwattsv5.new()
        datDict = {
            'num': 1,
            'arr': (1, 2),
            'str': 'str',
            'mat': ((1, 2), (3, 4)),
            'table': {'yo': 0}
        }
        c.SolarResource.solar_resource_data = datDict
        DataDict = c.SolarResource.solar_resource_data
        assert (DataDict['num'] == 1 and DataDict['arr'] == (1, 2))
        assert (DataDict['mat'] == ((1.0, 2.0), (3.0, 4.0)))
        assert (DataDict['str'] == 'str')
        assert (DataDict['table'] == {'yo': 0})
        n_tests_passed += 1

        try:
            c = Pvwattsv5.new()
            c.SolarResource.solar_resource_file = 100
        except:
            n_tests_passed += 1
        try:
            c = Pvwattsv5.new()
            c.SolarResource.solar_resource_data = {
                'num': 1,
                'arr': (1, "2"),
                'mat': ((1, 2), (3, 4)),
                'str': 'str',
                'table': {'yo': 0}
            }
        except:
            n_tests_passed += 1
        try:
            c = Pvwattsv5.new()
            c.SolarResource.solar_resource_data = {
                'num': 1,
                'arr': (1, 2),
                'mat': (("1", 2), (3, 4)),
                'str': 'str',
                'table': {'yo': 0}
            }
        except:
            n_tests_passed += 1

        a.SolarResource.solar_resource_data = {
            'num': 1,
            'arr': (1, 2),
            'mat': ((1, 2), (3, 4)),
            'str': 'str',
            'table': {}
        }
        assert (a.SolarResource.solar_resource_data['table'] == {})
        n_tests_passed += 1

        # Test conversion between technology attributes and nested dictionary
        genDict = a.export()
        assert (genDict['SolarResource']['solar_resource_data']['str'] == 'str')
        n_tests_passed += 1

        a = Pvwattsv5.new()
        assert (a.export()['SolarResource'] == {})
        a.assign(genDict)
        assert (a.export() == genDict)
        n_tests_passed += 1

        # Test loading from serialized dict
        a = GenericSystem.default("GenericSystemNone")

        # Test `value` function
        a.value("derate", 1)
        assert (a.value("derate") == 1)
        a.value("energy_output_array", (0, 1, 2))
        assert (a.value("energy_output_array")[0] == 0)
        assert (a.value("energy_output_array")[1] == 1)
        assert (a.value("energy_output_array")[2] == 2)

        if round == 3:
            tracker.print_diff()

    tracker.print_diff()
def test_functionality():
    n_tests_passed = 0
    round = 0
    while round < 50:  # multiple runs required to check for memory leaks
        if round == 0:
            tracker = SummaryTracker()
        print("\n===Starting Round", round, "===\n\n")
        round += 1

        a = GenericSystem.new()
        b = a.Plant

        # Test setting values with correct types
        a.Plant.derate = 1
        assert (a.Plant.derate == 1)
        assert (b.derate == 1)
        print("Passed test", 0)
        n_tests_passed += 1

        b.energy_output_array = (1, 2)
        assert (a.Plant.energy_output_array == (1, 2))
        assert (b.energy_output_array == (1, 2))
        print("Passed test", 1)
        n_tests_passed += 1

        # Test type checks with errors
        try:
            c = GenericSystem.new()
            c.Plant.energy_output_array = 1
            print("FAIL 2: exception is expected")
        except:
            print("Passed test", 2)
            n_tests_passed += 1
        try:
            c = GenericSystem.new()
            c.Plant.energy_output_array = (1, "2")
            print("FAIL 3: exception is expected")
        except:
            print("Passed test", 3)
            n_tests_passed += 1

        # Test assigning from dictionary
        PlantDict = {'derate': 10, 'energy_output_array': (10, 20)}
        b.assign(PlantDict)
        assert (b.derate == 10)
        assert (b.energy_output_array == (10, 20))
        print("Passed test", 4)
        n_tests_passed += 1

        PlantDict = {'derate': 1, 'energy_output_array': (2, 2)}
        a.Plant.assign(PlantDict)
        assert (a.Plant.derate == 1 and a.Plant.energy_output_array == (2, 2))
        print("Passed test", 5)
        n_tests_passed += 1

        PlantDict = {'derate': 10, 'energy_output_array': (10, 20)}
        try:
            c = GenericSystem.new()
            PlantDict['energy_output_array'] = ()
            c.Plant.assign(PlantDict)
            print("FAIL 1: exception is expected")
        except:
            print("Error caught", 1)
            n_tests_passed += 1
        try:
            c = GenericSystem.new()
            PlantDict['energy_output_array'] = ((12, 20), (1, 1))
            c.Plant.assign(PlantDict)
            print("FAIL 2: exception is expected")
        except:
            print("Error caught", 2)
            n_tests_passed += 1
        try:
            c = GenericSystem.new()
            PlantDict['derate'] = "derate"
            PlantDict['energy_output_array'] = (1, 2)
            c.Plant.assign(PlantDict)
            print("FAIL 3: exception is expected")
        except:
            print("Error caught", 3)
            n_tests_passed += 1

        # exporting to dictionary
        ValDict = b.export()
        assert (ValDict['derate'] == 1 and ValDict['energy_output_array'] == (2, 2))
        print("Passed test", 6)
        n_tests_passed += 1

        # Test shared module (AdjustmentFactors)
        d = a.AdjustmentFactors
        d.constant = 1
        assert (d.constant == 1)
        print("Passed test", 7)
        n_tests_passed += 1

        d.hourly = (1, 2)
        assert (d.hourly == (1, 2))
        print("Passed test", 8)
        n_tests_passed += 1

        d.periods = ((1, 2), (3, 4))
        assert (d.periods == ((1, 2), (3, 4)))
        print("Passed test", 9)
        n_tests_passed += 1

        try:
            d.periods = ((1, 2))
            print("FAIL 4: exception is expected")
        except:
            print("Error caught", 4)
            n_tests_passed += 1

        ValDict = d.export()
        assert (ValDict['constant'] == 1 and ValDict['hourly'] == (1, 2)
                and ValDict['periods'] == ((1, 2), (3, 4)))
        print("Passed test", 10)
        n_tests_passed += 1

        ValDict = {
            'constant': 10,
            "hourly": (10, 20),
            "periods": ((10, 20), (30, 40))
        }
        d.assign(ValDict)
        assert (ValDict['constant'] == 10 and ValDict['hourly'] == (10, 20)
                and ValDict['periods'] == ((10, 20), (30, 40)))
        print("Passed test", 11)
        n_tests_passed += 1

        # Test nested dictionary assignment and export
        TechDict = {
            'Plant': {
                'derate': 100,
                'energy_output_array': (100, 200)
            },
            'AdjustmentFactors': {
                'constant': 100,
                "hourly": (100, 200),
                "periods": ((100, 200), (300, 400))
            }
        }
        a.assign(TechDict)
        ValDict = a.Plant.export()
        assert (ValDict['derate'] == 100
                and ValDict['energy_output_array'] == (100, 200))
        print("Passed test", 12)
        n_tests_passed += 1

        ValDict = a.AdjustmentFactors.export()
        assert (ValDict['constant'] == 100 and ValDict['hourly'] == (100, 200)
                and ValDict['periods'] == ((100, 200), (300, 400)))
        print("Passed test", 13)
        n_tests_passed += 1

        # Test reading from PySSC
        data = ssc.data_create()
        ssc.data_set_number(data, b'derate', 1000)
        ssc.data_set_array(data, b'energy_output_array', [1000, 2000])
        a = GenericSystem.wrap(data)
        assert (a.Plant.derate == 1000)
        assert (a.Plant.energy_output_array == (1000, 2000))

        # Test strings and tables with error cases
        import PySAM.Pvwattsv5 as Pvwattsv5
        a = Pvwattsv5.new()
        a.LocationAndResource.solar_resource_file = "file"
        assert (a.LocationAndResource.solar_resource_file == "file")
        print("Passed test", 14)
        n_tests_passed += 1

        assert (a.LocationAndResource.export()['solar_resource_file'] == 'file')
        print("Passed test", 15)
        n_tests_passed += 1

        c = Pvwattsv5.new()
        datDict = {
            'num': 1,
            'arr': (1, 2),
            'str': 'str',
            'mat': ((1, 2), (3, 4)),
            'table': {'yo': 0}
        }
        c.LocationAndResource.solar_resource_data = datDict
        DataDict = c.LocationAndResource.solar_resource_data
        assert (DataDict['num'] == 1 and DataDict['arr'] == (1, 2))
        assert (DataDict['mat'] == ((1.0, 2.0), (3.0, 4.0)))
        assert (DataDict['str'] == 'str')
        assert (DataDict['table'] == {'yo': 0})
        print("Passed test", 16)
        n_tests_passed += 1

        try:
            c = Pvwattsv5.new()
            c.LocationAndResource.solar_resource_file = 100
            print("FAIL 5: exception is expected")
        except:
            print("Error caught", 5)
            n_tests_passed += 1
        try:
            c = Pvwattsv5.new()
            c.LocationAndResource.solar_resource_data = {
                'num': 1,
                'arr': (1, "2"),
                'mat': ((1, 2), (3, 4)),
                'str': 'str',
                'table': {'yo': 0}
            }
            print("FAIL 6: exception is expected")
        except:
            print("Error caught", 6)
            n_tests_passed += 1
        try:
            c = Pvwattsv5.new()
            c.LocationAndResource.solar_resource_data = {
                'num': 1,
                'arr': (1, 2),
                'mat': (("1", 2), (3, 4)),
                'str': 'str',
                'table': {'yo': 0}
            }
            print("FAIL 7: exception is expected")
        except:
            print("Error caught", 7)
            n_tests_passed += 1

        a.LocationAndResource.solar_resource_data = {
            'num': 1,
            'arr': (1, 2),
            'mat': ((1, 2), (3, 4)),
            'str': 'str',
            'table': {}
        }
        assert (a.LocationAndResource.solar_resource_data['table'] == {})
        print("Passed test", 17)
        n_tests_passed += 1

        # Test conversion between technology attributes and nested dictionary
        genDict = a.export()
        assert (genDict['LocationAndResource']['solar_resource_data']['str'] == 'str')
        print("Passed test", 18)
        n_tests_passed += 1

        a = Pvwattsv5.new()
        assert (a.export()['LocationAndResource'] == {})
        a.assign(genDict)
        assert (a.export() == genDict)
        print("Passed test", 19)
        n_tests_passed += 1

        # Test loading from serialized dict
        a = GenericSystem.default("GenericSystemNone")
        print(a.export())

        if round == 3:
            tracker.print_diff()

        # execution
        # a.execute(1)

    tracker.print_diff()
def eRisk_testT2_runs_2(writingDir, DecisionDir):
    PATH = Path(writingDir)
    CLAS_PATH = 'data/nlp_clas/eRisk_dep/eRisk_dep_clas'
    CLASSES = ['neg', 'pos']
    lm_id = 'eRisk_dep4_FT'
    clas_id = 'eRisk_dep4_2_BEST'
    LM_PATH = "data/nlp_clas/eRisk_dep/eRisk_dep_lm"

    val_texts, subjIDs, chunkIDs, wrIDs = get_texts_final(PATH)

    df_val_ALL = pd.DataFrame({
        'text': val_texts,
        'chunk_No': list(map(int, chunkIDs)),
        'wr_No': list(map(int, wrIDs)),
        'subj_ID': subjIDs
    })
    df_val_ALL = df_val_ALL.sort_values(by=['subj_ID', 'chunk_No', 'wr_No'],
                                        ascending=[True, True, True])

    subjIDs_unq = set(subjIDs)
    df_val_ALL_proc = df_val_ALL[0:0]
    for sId in subjIDs_unq:
        # print(sId)
        df_subj = df_val_ALL.loc[df_val_ALL['subj_ID'] == sId]
        df_subj_proc = ProcessAcc(df_subj, 5)
        df_val_ALL_proc = df_val_ALL_proc.append(df_subj_proc)

    print('Finish [ProcessAcc].....' + str(max(map(int, wrIDs))))

    val_texts = [it[0] for it in df_val_ALL_proc.loc[:, ['text']].values.tolist()]
    #val_labels = [it[0] for it in df_val_ALL_proc.loc[:, ['labels']].values.tolist()]
    subjIDs = [it[0] for it in df_val_ALL_proc.loc[:, ['subj_ID']].values.tolist()]
    chunkIDs = [it[0] for it in df_val_ALL_proc.loc[:, ['chunk_No']].values.tolist()]
    wrIDs = [it[0] for it in df_val_ALL_proc.loc[:, ['wr_No']].values.tolist()]

    #np.save(CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'test'+'/'+'val_texts.npy', val_texts)
    ##np.save(CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'dummy'+'/'+'val_labels.npy', val_labels)
    #np.save(CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'test'+'/'+'subjIDs.npy', subjIDs)
    #np.save(CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'test'+'/'+'chunkIDs.npy', chunkIDs)
    #np.save(CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'test'+'/'+'wrIDs.npy', wrIDs)
    #
    #val_texts = np.load(CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'test'+'/'+'val_texts.npy')
    ##val_labels = np.load(CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'dummy'+'/'+'val_labels.npy')
    #subjIDs = np.load(CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'test'+'/'+'subjIDs.npy')
    #chunkIDs = np.load(CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'test'+'/'+'chunkIDs.npy')
    #wrIDs = np.load(CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'test'+'/'+'wrIDs.npy')

    col_names = ['labels', 'text']
    df_val = pd.DataFrame({0: np.zeros(len(val_texts)), 1: val_texts})  # columns=col_names)
    #df_val = pd.DataFrame({'text':val_texts, 'labels':np.zeros(len(val_texts))}, columns=col_names)
    #writeToCSV_oneFiled(df_val, CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'test'+'/test_Real1_wr.csv')
    #
    ##chunksize = 24000
    #chunksize = 24000
    #
    ##df_val1 = pd.read_csv(CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'test'+'/'+'test_Real1_wr.csv', header=None, chunksize=chunksize, engine='python')#,names=(range(227)))
    #tok_val, val_labels = get_all_eRisk(df_val1, 1)

    gc.collect()
    tracker = SummaryTracker()
    tracker.print_diff()

    tok_val, val_labels = get_texts_eRisk(df_val, 1)

    tracker = SummaryTracker()
    tracker.print_diff()

    #np.save(CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'test'+'/'+'tok_val.npy', tok_val)
    #np.save(CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'dummy'+'/'+'lbl_val.npy', val_labels)
    #tok_val = np.load(CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'test'+'/'+'tok_val.npy')

    itos = pickle.load(open(LM_PATH + '/' + 'tmp' + '/' + 'itos.pkl', 'rb'))
    stoi = collections.defaultdict(lambda: 0, {v: k for k, v in enumerate(itos)})
    len(itos)

    val_clas = np.array([[stoi[o] for o in p] for p in tok_val])
    np.save(CLAS_PATH + '/' + 'tmp' + '/' + 'tests' + '/' + 'testT2' + '/' + 'val_ids.npy',
            val_clas)
    #create_bw_data_test(CLAS_PATH+'/'+'tmp'+'/'+'tests'+'/'+'testT2'+'/', LM_PATH+'/'+'tmp'+'/'+'itos.pkl')

    print('Start running the models .... ')

    prediction, samplIdx, pos_prob = eval_clas_eRisk_final_3(str(CLAS_PATH), 2,
                                                             lm_id=lm_id,
                                                             clas_id=clas_id,
                                                             attention=False)
    #prediction_bw, samplIdx_bw, pos_prob_bw = eval_clas_eRisk_final_3(str(CLAS_PATH), 2, lm_id=lm_id, clas_id=clas_id, attention=False, backwards=True)
    #prediction = [1 if (it_fw+it_bw)/2 > 0.5 else 0 for it_fw, it_bw in zip(prediction_fw, prediction_bw)]
    #samplIdx = samplIdx_fw
    #pos_prob = [(it_fw+it_bw)/2 for it_fw, it_bw in zip(pos_prob_fw, pos_prob_bw)]

    tracker = SummaryTracker()
    tracker.print_diff()

    #np.save(str(CLAS_PATH)+'/'+'tmp/tests/test'+'/'+'prediction_eRisk_dep_wr_ens.npy', prediction)
    #np.save(str(CLAS_PATH)+'/'+'tmp/tests/test'+'/'+'samplIdx_eRisk_dep_wr_ens.npy', samplIdx)
    #np.save(str(CLAS_PATH)+'/'+'tmp/tests/test'+'/'+'pos_prob_eRisk_dep_wr_ens.npy', pos_prob)
    #
    #prediction = np.load(str(CLAS_PATH)+'/'+'tmp/tests/test'+'/'+'prediction_eRisk_dep_wr_ens.npy')
    #samplIdx = np.load(str(CLAS_PATH)+'/'+'tmp/tests/test'+'/'+'samplIdx_eRisk_dep_wr_ens.npy')
    #pos_prob = np.load(str(CLAS_PATH)+'/'+'tmp/tests/test'+'/'+'pos_prob_eRisk_dep_wr_ens.npy')

    subjIDs = [subjIDs[i] for i in samplIdx]
    #val_labels = [val_labels[i] for i in samplIdx]
    chunkIDs = [chunkIDs[i] for i in samplIdx]
    wrIDs = [wrIDs[i] for i in samplIdx]

    uniq_subID = list(set(subjIDs))
    #results_pred = [[ None ] * 10] * len(uniq_subID)
    results_pred = [[0] * 10 for _ in range(len(uniq_subID))]
    results_pred_thr = [[0] * 10 for _ in range(len(uniq_subID))]
    totalWrs = [[0] * 10 for _ in range(len(uniq_subID))]
    #results_golden = [ None ] * len(uniq_subID)
    pos_prob_wr = [[] for _ in range(len(uniq_subID))]

    #checkIntegrity(subjIDs, val_labels)

    th = 0.9
    for sid, chNum, wr, pred, pos_p in zip(subjIDs, chunkIDs, wrIDs, prediction, pos_prob):
        idx = uniq_subID.index(sid)
        pos_prob_wr[idx].append(pos_p)
        results_pred[idx][int(chNum) - 1] = results_pred[idx][int(chNum) - 1] + pred
        if pos_p >= th:
            results_pred_thr[idx][int(chNum) - 1] = results_pred_thr[idx][int(chNum) - 1] + 1
        totalWrs[idx][int(chNum) - 1] = max(totalWrs[idx][int(chNum) - 1], int(wr))

    #res = [[res/total for res,total in zip(results_pred[i], totalWrs[i])] for i in range(len(results_pred))]
    #print("F1: " + str(f1_erisk(val_labels, prediction)))

    pos_p_avg = [sum(pb) / len(pb) for pb in pos_prob_wr]

    #for thr in np.arange(0.0, 1.0, 0.025):
    #    print(str(thr) + ":\t")
    #    prd = [1 if(it >= thr) else 0 for it in pos_p_avg]
    #    #print("F1: " + str(f1_erisk(prd, results_golden)))
    #    f1_erisk(prd, results_golden)

    pred_count = [sum(rb) for rb in results_pred_thr]

    createDecisionFiles(uniq_subID, pred_count, pos_prob_wr, DecisionDir,
                        max(map(int, wrIDs)))
    gc.collect()
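Note that this function instantiates a fresh `SummaryTracker` before each report, so every `print_diff()` measures against a baseline taken moments earlier rather than the start of the run. If the goal is to see cumulative growth across the tokenization and evaluation phases, one tracker reused at each checkpoint would show that. A minimal sketch; the helper names are taken from the snippet above:

from pympler.tracker import SummaryTracker

tracker = SummaryTracker()  # single baseline taken once

tok_val, val_labels = get_texts_eRisk(df_val, 1)   # phase 1: tokenization
tracker.print_diff()        # growth during tokenization

prediction, samplIdx, pos_prob = eval_clas_eRisk_final_3(
    str(CLAS_PATH), 2, lm_id=lm_id, clas_id=clas_id, attention=False)
tracker.print_diff()        # growth during evaluation, relative to previous call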
def _start_session(self):
    if self._debugging_level >= 1:
        import cProfile
        import io
        import pstats
        pr = cProfile.Profile()
        pr.enable()
    if self._debugging_level >= 2:
        from pympler.tracker import SummaryTracker
        tracker = SummaryTracker()

    # GC can cause unplanned stalls and disrupt precisely timed collection.
    # Disable it and run it manually before sleeping.
    gc.disable()
    if self._tick_len == 0:
        self._force_collect = True
    try:
        self._logger.info('Client node {node_name} started'.format(
            node_name=self._node_name))
        while not self._stopped:
            if self._status == ClientStatus.OFFLINE:
                self._status = ClientStatus.HANDSHAKE1_AUTHENTICATING
                # Handshake
                self.timestamp_and_send_list([
                    ProtocolCode.KEY, self._api_secret_key,
                    self._cluster_name, self._node_name
                ])
                # Reset the timeout counter after sending out a command
                self._last_received_ts = monotonic_time()
                self._current_error_msg = 'Failed to connect to the gateway.'
                self._logger.info(
                    'Client node {node_name} initiated handshaking. Step 1: authenticating...'
                    .format(node_name=self._node_name))

            if self._status == ClientStatus.ALL_OK:
                if self._getters is not None and len(self._getters) > 0:
                    # We need to collect at a fixed time within each second
                    # (self._collect_time_decimal) by the wall clock (so that the
                    # wall-clock time will be saved to the database for other
                    # analyses), so we don't use the monotonic clock (we tried using
                    # monotonic time with some complex calculation in the past but
                    # it was too buggy).
                    ts = time.time()
                    # Time backflow detection: because we are not using monotonic
                    # time, if the user turns back the wall time, the program might
                    # not do a collection for a long time.
                    if self._last_collect_time - ts > self._tick_len > 0:
                        self._logger.info(
                            'Time was turned backwards. Doing a collection right now.')
                        self._last_collect_time = -1
                        self._update_next_collect_time()
                    # When self._tick_len > 0, we do a collect step at
                    # self._next_collect_time. When self._tick_len == 0, we wait
                    # until force_collect.
                    if (self._tick_len > 0 and ts >= self._next_collect_time) or \
                            (self._tick_len == 0 and self._force_collect):
                        self._logger.debug('Time is reached to do collection. Starting...')
                        # requested_collect_time will be sent to getters to indicate
                        # the time of the data we want to collect. We get it before
                        # updating next_collect_time.
                        requested_collect_time = self._next_collect_time
                        # last_collect_time and next_collect_time must be updated
                        # *before* collecting to prevent skipping a collection if
                        # the collection takes longer than a tick_len.
                        if self._tick_len > 0:
                            self._last_collect_time = ts
                            self._update_next_collect_time()
                        else:
                            # When tick_len == 0, we have to use an increasing
                            # counter instead of the real collect time to prevent
                            # collisions.
                            self._last_collect_time += 1
                        self._force_collect = False
                        pi_data = []
                        for g in self._getters:
                            self._logger.debug(
                                "Starting to collect data from getter '{getter_name}'"
                                .format(getter_name=g.name))
                            d = g.collect(self._tick_len, requested_collect_time)
                            if d is None or len(d) == 0:
                                self._logger.warning(
                                    "Client node {node_name} getter '{getter_name}' did not return"
                                    " any data".format(node_name=self._node_name,
                                                       getter_name=g.name))
                            else:
                                self._logger.debug(
                                    "Collected data from getter '{getter_name}': {data}"
                                    .format(getter_name=g.name, data=str(d)))
                                pi_data.extend(d)
                        if len(pi_data) == 0:
                            self._logger.info(
                                'Client node {node_name} all getters return no data. Skipped sending.'
                                .format(node_name=self._node_name))
                        else:
                            self._logger.debug(
                                'Client node {node_name} collected from all getters: {pi_data}'
                                .format(node_name=self._node_name, pi_data=str(pi_data)))
                            if self._tuning_goal_calculator is None:
                                tuning_goal_payload = []
                                self._logger.debug(
                                    'Client node {node_name} has no tuning goal'
                                    .format(node_name=self._node_name))
                            else:
                                tuning_goal = self._tuning_goal_calculator.get_tuning_goal(pi_data)
                                assert -1 <= tuning_goal <= 1
                                tuning_goal_payload = [tuning_goal]
                                self._logger.debug(
                                    'Client node {node_name} collected tuning goal: {tuning_goal}'
                                    .format(node_name=self._node_name,
                                            tuning_goal=str(tuning_goal)))
                            # First element of the outgoing list is tuning_goal
                            self.timestamp_and_send_list(
                                [ProtocolCode.PI, tuning_goal_payload + pi_data],
                                ts=self._last_collect_time)
                            # We don't wait for 'OK' to save time
                    else:
                        self._logger.debug('Collection time is not reached yet')
                else:
                    self._logger.debug('No getter is set. Skipped collecting.')
                    self._last_collect_time = time.time()

            gc.collect()
            flush_log()
            # Print out memory usage every minute
            if self._debugging_level >= 2 and int(time.time()) % 60 == 0:
                print('Time: ' + time.asctime(time.localtime(time.time())))
                tracker.print_diff()

            if self._status == ClientStatus.ALL_OK and self._getters is not None \
                    and len(self._getters) > 0 and self._tick_len > 0:
                # Calculate the precise time for the next collection
                sleep_second = max(self._next_collect_time - time.time(), 0)
            else:
                sleep_second = 1
            # We have to process all messages in the queue before doing the next
            # collection; otherwise, a lengthy collection above could prevent the
            # messages in msg_queue from being processed.
            self._process_all_messages(sleep_second)

        if self.notify_gateway_on_stop:
            self.timestamp_and_send_list([ProtocolCode.CLIENT_STOP])
        self._logger.info('Client node {node_name} stopped'.format(
            node_name=self._node_name))
    finally:
        self._status = ClientStatus.OFFLINE
        gc.enable()
        if self._debugging_level >= 1:
            pr.disable()
            s = io.StringIO()
            sortby = 'cumulative'
            ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
            ps.print_stats()
            print(s.getvalue())
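# --- The guard above, `int(time.time()) % 60 == 0`, only fires when an iteration
# happens to land exactly on a minute boundary, so a diff can be skipped entirely
# or printed several times within the same second. A sketch of an interval-based
# alternative; the names `last_diff_ts` and `maybe_print_diff` are hypothetical,
# not part of the original client:
import time
from pympler.tracker import SummaryTracker

tracker = SummaryTracker()
last_diff_ts = time.monotonic()

def maybe_print_diff(interval=60.0):
    """Print a memory diff at most once per `interval` seconds."""
    global last_diff_ts
    now = time.monotonic()
    if now - last_diff_ts >= interval:
        tracker.print_diff()
        last_diff_ts = now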
class GitHubLoader(Cmd): """ GitHub SSH keys loader """ prompt = '$> ' LINK_FACTOR = 70 USERS_URL = 'https://api.github.com/users?since=%s' KEYS_URL = 'https://api.github.com/users/%s/keys' USER_DETAIL_URL = 'https://api.github.com/users/%s' USER_REPOS_URL = 'https://api.github.com/users/%s/repos' USER_ORGS_URL = 'https://api.github.com/users/%s/orgs' ORG_REPOS_URL = 'https://api.github.com/orgs/%s/repos' ORG_REPO_COLAB_URL = 'https://api.github.com/repos/%s/collaborators' ORG_REPO_ASSIGNEES_URL = 'https://api.github.com/repos/%s/assignees' def __init__(self, attempts=5, threads=1, state=None, state_file=None, config_file=None, audit_file=None, max_mem=None, *args, **kwargs): Cmd.__init__(self, *args, **kwargs) self.t = Terminal() self.args = None self.attempts = int(attempts) self.total = None self.terminate = False self.last_users_count = None self.user_lock = Lock() self.processed_user_set = set() self.processed_user_set_lock = Lock() self.orgs_loaded_set = set() self.orgs_loaded_lock = Lock() self.max_mem = max_mem self.state = state self.state_file_path = state_file self.rate_limit_reset = None self.rate_limit_remaining = None self.config = None self.config_file = config_file self.audit_file = audit_file self.audit_records_buffered = [] self.audit_lock = Lock() self.stop_event = threading.Event() self.threads = int(threads) self.link_queue = Queue.PriorityQueue() # Store links to download here self.worker_threads = [] self.state_thread = None self.state_thread_lock = Lock() self.resources_list = [] self.resources_queue = Queue.PriorityQueue() self.local_data = threading.local() self.new_users_events = EvtDequeue() self.new_keys_events = EvtDequeue() self.db_config = None self.engine = None self.session = None self.mem_tracker = None def signal_handler(self, signal, frame): """ Signal handler - terminate gracefully :param signal: :param frame: :return: """ logger.info('CTRL+C pressed') self.trigger_stop() def trigger_stop(self): """ Sets terminal conditions to true :return: """ self.terminate = True self.stop_event.set() def trigger_quit(self): """ Terminal condition & file change :return: """ self.trigger_stop() utils.try_touch('.github-quit') def do_quit(self, arg): self.trigger_quit() logger.info('Waiting for thread termination') time.sleep(1) logger.info('Quitting') return Cmd.do_quit(self, arg) def do_reset(self, line): print('\033c') def do_gc(self, line): gc.collect() def do_mem_top(self, line): print(mem_top.mem_top()) def do_mem_track_init(self, line): self.mem_tracker = SummaryTracker() def do_mem_track_diff(self, line): print(self.mem_tracker.print_diff()) def do_mem_track_deinit(self, line): self.mem_tracker = None def do_mem(self, line): print('Memory usage: %s kB' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) def do_state(self, line): js = self.state_gen() if line is None or len(line) == 0: del js['link_queue'] del js['resource_stats'] elif line == '1': del js['link_queue'] print(json.dumps(js, indent=2, cls=utils.AutoJSONEncoder)) def do_deq_enable(self, line): self.new_keys_events.disabled = False self.new_users_events.disabled = False def do_deq_disable(self, line): self.new_keys_events.disabled = True self.new_users_events.disabled = True def init_config(self): """ Loads config & state files :return: """ if self.state_file_path is not None and os.path.exists( self.state_file_path): with open(self.state_file_path, 'r') as fh: self.state = json.load(fh, object_pairs_hook=OrderedDict) logger.info('State loaded: %s' % os.path.abspath(self.state_file_path)) with 
open(self.config_file, 'r') as fh: self.config = json.load(fh, object_pairs_hook=OrderedDict) logger.info('Config loaded: %s' % os.path.abspath(self.config_file)) # Process resources - randomized if 'res' in self.config: res_tmp = self.config['res'] random.shuffle(res_tmp) for idx, res in enumerate(res_tmp): r = AccessResource(usr=res['usr'], token=res['token'], idx=idx) self.resources_list.append(r) self.resources_queue.put(r) logger.info('Resource %02d loaded: %s' % (idx, r.usr)) else: # unauth r = AccessResource(usr=None, token=None) self.resources_list.append(r) self.resources_queue.put(r) def init_db(self): """ Initializes database engine & session. Has to be done on main thread. :return: """ self.db_config = databaseutils.process_db_config(self.config['db']) from sqlalchemy import create_engine from sqlalchemy.orm import sessionmaker, scoped_session self.engine = create_engine(self.db_config.constr, pool_recycle=3600) self.session = scoped_session(sessionmaker(bind=self.engine)) # Make sure tables are created DB_Base.metadata.create_all(self.engine) def init_workers(self): """ Initialize worker threads :return: """ logger.info('Starting %d working threads' % self.threads) for idx in range(self.threads): t = threading.Thread(target=self.work_thread_main, args=(idx, )) self.worker_threads.append(t) # Kick-off all threads for t in self.worker_threads: t.start() logger.info('Worker threads started') def cli(self): """ CLI thread :return: """ logger.info('CLI thread started') self.cmdloop() logger.info('Terminating CLI thread') def _init_queue(self): """ Initializes link queue :return: """ sess = self.session() logger.debug('Loading users...') users_cnt = 0 all_keys = sess.query(GitHubKey).filter( GitHubKey.is_interesting == 1).all() logger.debug('All users loaded') for rec in all_keys: users_cnt += 1 job = DownloadJob(url=self.USER_DETAIL_URL % rec.key_user_found, jtype=DownloadJob.TYPE_USER, meta={ 'user': rec.key_user_found, 'user_id': rec.key_user_id_found }) self.link_queue.put(job) job = DownloadJob(url=self.USER_ORGS_URL % rec.key_user_found, jtype=DownloadJob.TYPE_ORG, meta={ 'user': rec.key_user_found, 'user_id': rec.key_user_id_found }) self.link_queue.put(job) job = DownloadJob(url=self.USER_REPOS_URL % rec.key_user_found, jtype=DownloadJob.TYPE_REPOS_USER, meta={ 'user': rec.key_user_found, 'user_id': rec.key_user_id_found }) self.link_queue.put(job) logger.info('Queue initialized, users cnt: %s' % users_cnt) utils.silent_close(sess) def _init_queue_assignee(self): """ Init queue with assignees repos :return: """ sess = self.session() logger.debug('Loading repos...') users_cnt = 0 repos = sess.query(GitHubRepo).filter( GitHubRepo.user_repo == 0).filter( GitHubRepo.repo_stars >= 100).all() logger.debug('Repos loaded: %s' % len(repos)) for rec in repos: users_cnt += 1 job = DownloadJob(url=self.ORG_REPO_ASSIGNEES_URL % rec.repo_name, jtype=DownloadJob.TYPE_REPO_ASSIGNEE, meta={ 'user': rec.owner_login, 'user_id': rec.owner_id, 'repo': rec.repo_name, 'page': 1 }) self.link_queue.put(job) def work(self): """ Main thread work method :return: """ # Interrupt signals signal.signal(signal.SIGINT, self.signal_handler) self.init_config() self.init_db() # Resume last state self.state_resume() # Monitor threads. self.state_thread = threading.Thread(target=self.state_main, args=()) self.state_thread.start() # If there is no link to process - create from since. 
if self.link_queue.qsize() == 0: if self.args.assign_only: self._init_queue_assignee() else: self._init_queue() # Worker threads self.init_workers() logger.info('Main thread started %s %s %s' % (os.getpid(), os.getppid(), threading.current_thread())) # CLI thread cli_thread = threading.Thread(target=self.cli, args=()) cli_thread.setDaemon(True) cli_thread.start() # Join on workers self.after_loop() logger.info('Terminating main thread') return None def after_loop(self, wait_for_state=True): """ After work loop finishes :return: """ logger.info('Waiting termination of slave threads') # Wait here for termination of all workers and monitors. try: for t in self.worker_threads: t.join() if wait_for_state: self.state_thread.join() except: logger.error('Exception during thread join') logger.error(traceback.format_exc()) logger.info('All threads terminates, last state save') self.state_save() # # General link processing & queue management # def work_thread_main(self, idx): """ Worker thread main loop :return: """ self.local_data.idx = idx logger.info('Working thread %d started' % idx) while not self.terminate and not self.stop_event.is_set(): self.interruptible_sleep_delta(0.1) # Get credential to process link with resource = self.resource_allocate() if resource is None: continue # We have resource, now get the job job = None try: job = self.link_queue.get(True, timeout=1.0) except Queue.Empty: self.resource_return(resource) continue # If job last fail is too recent - put again back to queue if time.time() - job.last_fail < 3.0: self.link_queue.put( job ) # re-insert to the back of the queue for later processing self.resource_return(resource) continue # Job processing starts here - fetch data page with the resource. js_data = None try: self.local_data.job = job self.local_data.resource = resource js_data, headers, raw_response = self.load_page_local() except RateLimitHit as e: logger.error( '[%d] Rate limit hit: %s, failcnt: %d, res: %s, exception: %s' % (idx, job.url, job.fail_cnt, resource.usr, e)) continue except Exception as e: logger.error( '[%d] Exception in processing job: %s, failcnt: %d, res: %s, exception: %s' % (idx, job.url, job.fail_cnt, resource.usr, e)) self.on_job_failed(job) continue finally: self.resource_return(resource) self.local_data.resource = None self.local_data.last_usr = resource.usr self.local_data.last_remaining = resource.remaining # Process downloaded data here. try: if js_data is None: self.audit_log('404', job.url, jtype=job.type, job=job) self.flush_audit() continue self.process_downloaded(job, js_data, headers, raw_response) except Exception as e: logger.error( '[%d] Unexpected exception, processing type %s, link %s: cnt: %d, res: %s, %s' % (idx, job.type, job.url, job.fail_cnt, resource.usr, e)) traceback.print_exc() self.on_job_failed(job) finally: self.local_data.resource = None self.local_data.job = None self.local_data.last_usr = None self.local_data.last_remaining = None resource = None job = None headers = None raw_response = None pass logger.info('Terminating worker thread %d' % idx) def on_job_failed(self, job): """ If job failed, this teaches it how to behave :param job: :return: """ job.fail_cnt += 1 job.last_fail = time.time() # if failed too many times - log and discard. 
if job.fail_cnt > 35: logger.warning('Job failed too many times %s' % job.url) self.audit_log('too-many-fails', job.url, jtype=job.type, job=job) self.flush_audit() else: self.link_queue.put( job) # re-insert to the queue for later processing def load_page_local(self): """ Loads page stored in thread local :return: """ auth = None resource = self.local_data.resource if resource.usr is not None: auth = HTTPBasicAuth(resource.usr, resource.token) job = self.local_data.job res = requests.get(job.url, timeout=10, auth=auth) headers = res.headers resource.reset_time = utils.try_float(headers.get('X-RateLimit-Reset')) resource.remaining = utils.try_int( headers.get('X-RateLimit-Remaining')) resource.last_used = time.time() resource.used_cnt += 1 if res.status_code == 403 and resource.remaining is not None and resource.remaining < 10: resource.fail_cnt += 1 raise RateLimitHit if res.status_code == 404: resource.fail_cnt += 1 logger.warning('URL not found: %s' % job.url) return None, None, None if res.status_code // 100 != 2: resource.fail_cnt += 1 res.raise_for_status() data = res.content if data is None: resource.fail_cnt += 1 raise Exception('Empty response') js = json.loads(data, object_pairs_hook=OrderedDict) return js, headers, res # # Crawl specific methods # def process_downloaded(self, job, js_data, headers, raw_response): """ Process downloaded page :param self: :param job: :param js_data: :param headers: :param raw_response: :return: """ if job.type == DownloadJob.TYPE_USER: self.process_user(job, js_data, headers, raw_response) elif job.type == DownloadJob.TYPE_ORG: self.process_org(job, js_data, headers, raw_response) elif job.type == DownloadJob.TYPE_REPOS_USER: self.process_repo(job, js_data, headers, raw_response, True) elif job.type == DownloadJob.TYPE_REPOS_ORG: self.process_repo(job, js_data, headers, raw_response, False) elif job.type == DownloadJob.TYPE_REPO_COLAB: self.process_colab(job, js_data, headers, raw_response) elif job.type == DownloadJob.TYPE_REPO_ASSIGNEE: self.process_assignee(job, js_data, headers, raw_response) else: logger.error('Unrecognized type %s' % job.type) def process_user(self, job, js, headers, raw_response): """ Process user detail data :param job: :param js: :param headers: :param raw_response: :return: """ if 'id' not in js: logger.error('Field ID not found in user') return s = self.session() try: user_id = int(js['id']) dbu = s.query(GitHubUserDetails).filter( GitHubUserDetails.id == user_id).one_or_none() is_new = False if dbu is None: is_new = True dbu = GitHubUserDetails() dbu.id = user_id dbu.date_last_check = salch.func.now() dbu.username = js['login'] dbu.name = utils.utf8ize(utils.defvalkey(js, 'name')) dbu.company = utils.utf8ize(utils.defvalkey(js, 'company')) dbu.blog = utils.defvalkey(js, 'blog') dbu.email = utils.defvalkey(js, 'email') dbu.bio = utils.utf8ize(utils.defvalkey(js, 'bio')) dbu.usr_type = utils.defvalkey(js, 'type') dbu.public_repos = js['public_repos'] dbu.public_gists = js['public_gists'] dbu.followers = js['followers'] dbu.following = js['following'] dbu.created_at = utils.dt_norm( utils.try_parse_timestamp(utils.defvalkey(js, 'created_at'))) dbu.updated_at = utils.dt_norm( utils.try_parse_timestamp(utils.defvalkey(js, 'updated_at'))) if is_new: s.add(dbu) else: s.merge(dbu) s.commit() s.flush() s.expunge_all() except Exception as e: logger.error('Exception storing user details: %s: %s' % (js['id'], e)) logger.debug(traceback.format_exc()) finally: utils.silent_close(s) def process_org(self, job, js, headers, 
raw_response): """ Process user -> orgs data :param job: :param js: :param headers: :param raw_response: :return: """ new_orgs = [] for org in js: if 'id' not in org: logger.error('Field ID not found in orgs') continue s = self.session() try: org_id = int(org['id']) # delete first - avoid excs s.query(GitHubUserOrgs)\ .filter(GitHubUserOrgs.org_id == org_id)\ .filter(GitHubUserOrgs.username == job.meta['user'])\ .delete() dbu = GitHubUserOrgs() dbu.username = job.meta['user'] dbu.org_id = org['id'] dbu.org_name = org['login'] dbu.org_desc = utils.utf8ize(org['description']) new_orgs.append(org['login']) s.add(dbu) s.commit() s.flush() s.expunge_all() except Exception as e: logger.error('Exception storing user->org details: %s: %s' % (org['id'], e)) logger.debug(traceback.format_exc()) finally: utils.silent_close(s) if len(js) == 0: return # Load next page cur_page = utils.defvalkey(job.meta, 'page', 1) new_url = (self.USER_ORGS_URL % job.meta['user']) + ('?page=%s' % (cur_page + 1)) new_meta = dict(job.meta) new_meta['page'] = cur_page + 1 job = DownloadJob(url=new_url, jtype=DownloadJob.TYPE_ORG, meta=new_meta) self.link_queue.put(job) # Load repositories for new organisations not_loaded_orgs = None with self.orgs_loaded_lock: new_orgs_set = set(new_orgs) not_loaded_orgs = new_orgs_set - self.orgs_loaded_set for x in new_orgs: self.orgs_loaded_set.add(x) for x in not_loaded_orgs: new_meta = dict(job.meta) new_meta['page'] = 1 new_meta['org'] = x job = DownloadJob(url=self.ORG_REPOS_URL % x, jtype=DownloadJob.TYPE_REPOS_ORG, meta=new_meta) self.link_queue.put(job) def process_repo(self, job, js, headers, raw_response, from_user): """ Process repo list page :param job: :param js: :param headers: :param raw_response: :param from_user: :return: """ for repo in js: if 'id' not in repo: logger.error('Field ID not found in repos') continue s = self.session() try: repo_id = int(repo['id']) dbe = s.query(GitHubRepo).filter( GitHubRepo.id == repo_id).one_or_none() dbu = GitHubRepo() dbu.id = repo_id dbu.user_repo = from_user if from_user: dbu.username = job.meta['user'] else: dbu.org_name = job.meta['org'] if 'owner' in repo: dbu.owner_id = repo['owner']['id'] dbu.owner_login = repo['owner']['login'] dbu.repo_name = repo['full_name'] dbu.repo_stars = repo['stargazers_count'] dbu.repo_forks = repo['forks'] dbu.repo_watchers = repo['watchers'] dbu.repo_is_fork = repo['fork'] dbu.repo_size = repo['size'] dbu.repo_homepage = utils.defvalkey(repo, 'homepage') dbu.repo_language = utils.defvalkey(repo, 'language') dbu.created_at = utils.dt_norm( utils.try_parse_timestamp( utils.defvalkey(repo, 'created_at'))) dbu.updated_at = utils.dt_norm( utils.try_parse_timestamp( utils.defvalkey(repo, 'updated_at'))) dbu.pushed_at = utils.dt_norm( utils.try_parse_timestamp( utils.defvalkey(repo, 'pushed_at'))) dbu.repo_description = utils.utf8ize(repo['description']) dbu.repo_stargazers_url = repo['stargazers_url'] dbu.repo_forks_url = repo['forks_url'] if not from_user and repo['stargazers_count'] > 100: new_meta = dict(job.meta) new_meta['page'] = 1 new_meta['repo'] = repo['full_name'] new_meta['owner'] = repo['owner']['login'] # Colab fetch - skip, no auth job = DownloadJob(url=self.ORG_REPO_COLAB_URL % (repo['full_name']), jtype=DownloadJob.TYPE_REPO_COLAB, meta=new_meta) # Asignee fetch job = DownloadJob(url=self.ORG_REPO_ASSIGNEES_URL % (repo['full_name']), jtype=DownloadJob.TYPE_REPO_ASSIGNEE, meta=dict(new_meta)) self.link_queue.put(job) # DB save if dbe is None: s.add(dbu) else: if dbe.username != 
dbu.username: logger.warning('Username does not match for %s %s %s' % (repo_id, dbe.username, dbu.username)) if dbe.org_name != dbu.org_name: logger.warning('org_name does not match for %s %s %s' % (repo_id, dbe.org_name, dbu.org_name)) if dbe.owner_login != dbu.owner_login: logger.warning( 'owner_login does not match for %s %s %s' % (repo_id, dbe.owner_login, dbu.owner_login)) s.commit() s.flush() s.expunge_all() except Exception as e: logger.error( 'Exception storing repo details: %s:%s meta: %s, url: %s, exc: %s' % (repo['id'], repo['full_name'], json.dumps( job.meta), job.url, e)) logger.debug(traceback.format_exc()) finally: utils.silent_close(s) if len(js) == 0: return # Load next page cur_page = utils.defvalkey(job.meta, 'page', 1) new_meta = dict(job.meta) new_meta['page'] = cur_page + 1 if from_user: new_url = (self.USER_REPOS_URL % job.meta['user']) + ('?page=%s' % (cur_page + 1)) job = DownloadJob(url=new_url, jtype=DownloadJob.TYPE_REPOS_USER, meta=new_meta) else: new_url = (self.ORG_REPOS_URL % job.meta['org']) + ('?page=%s' % (cur_page + 1)) job = DownloadJob(url=new_url, jtype=DownloadJob.TYPE_REPOS_ORG, meta=new_meta) self.link_queue.put(job) def process_colab(self, job, js, headers, raw_response): """ Process colaborators for org owned repos :param job: :param js: :param headers: :param raw_response: :return: """ for colab in js: if 'id' not in colab: logger.error('Field ID not found in colab') continue s = self.session() try: # delete first - avoid excs s.query(GitHubRepoColab)\ .filter(GitHubRepoColab.user_name == colab['login'])\ .filter(GitHubRepoColab.repo_name == job.meta['repo'])\ .delete() dbu = GitHubRepoColab() dbu.repo_name = job.meta['repo'] dbu.user_name = colab['login'] dbu.can_pull = colab['permissions']['pull'] dbu.can_push = colab['permissions']['push'] dbu.can_admin = colab['permissions']['admin'] s.add(dbu) s.commit() s.flush() s.expunge_all() except Exception as e: logger.error('Exception storing colab details: %s:%s: %s' % (colab['login'], job.meta['repo'], e)) logger.debug(traceback.format_exc()) finally: utils.silent_close(s) if len(js) == 0: return # Load next page cur_page = utils.defvalkey(job.meta, 'page', 1) new_url = (self.ORG_REPO_COLAB_URL % (job.meta['repo'])) + ('?page=%s' % (cur_page + 1)) new_meta = dict(job.meta) new_meta['page'] = cur_page + 1 job = DownloadJob(url=new_url, jtype=DownloadJob.TYPE_REPO_COLAB, meta=new_meta) self.link_queue.put(job) def process_assignee(self, job, js, headers, raw_response): """ Process assignees for org owned repos :param job: :param js: :param headers: :param raw_response: :return: """ for assignee in js: if 'id' not in assignee: logger.error('Field ID not found in assignees') continue s = self.session() try: # delete first - avoid excs s.query(GitHubRepoAssignee)\ .filter(GitHubRepoAssignee.user_name == assignee['login'])\ .filter(GitHubRepoAssignee.repo_name == job.meta['repo'])\ .delete() dbu = GitHubRepoAssignee() dbu.repo_name = job.meta['repo'] dbu.user_name = assignee['login'] s.add(dbu) s.commit() s.flush() s.expunge_all() except Exception as e: logger.error('Exception storing cassignee details: %s:%s: %s' % (assignee['login'], job.meta['repo'], e)) logger.debug(traceback.format_exc()) finally: utils.silent_close(s) if len(js) == 0: return # Load next page cur_page = utils.defvalkey(job.meta, 'page', 1) new_url = (self.ORG_REPO_ASSIGNEES_URL % (job.meta['repo'])) + ('?page=%s' % (cur_page + 1)) new_meta = dict(job.meta) new_meta['page'] = cur_page + 1 job = DownloadJob(url=new_url, 
jtype=DownloadJob.TYPE_REPO_ASSIGNEE, meta=new_meta) self.link_queue.put(job) # # Resource management # def resource_allocate(self, blocking=True, timeout=1.0): """ Takes resource from the pool. If the resource has low remaining credit, thread is suspended to re-charge. :return: resource or None if not available in the time """ try: resource = self.resources_queue.get(True, timeout=1.0) if resource.remaining is not None and resource.remaining <= self.threads + 2: sleep_sec = resource.reset_time - time.time( ) if resource.reset_time is not None else 15 * 60 sleep_sec += 120 # extra 2 minutes to avoid problems with resources logger.info( 'Rate limit exceeded on resource %s, remaining: %s, sleeping till: %s, it is %d seconds, ' '%d minutes' % (resource.usr, resource.remaining, resource.reset_time, sleep_sec, sleep_sec / 60.0)) self.sleep_interruptible(time.time() + sleep_sec) logger.info('Resource sleep finished %s' % resource.usr) # Reset estimations, needs to be refreshed resource.remaining = None resource.reset_time = None return resource except Queue.Empty: return None def resource_return(self, res): """ Returns resource to the pool :param res: :return: """ self.resources_queue.put(res) def sleep_interruptible(self, until_time): """ Interruptible sleep - sleep until given time. :param until_time: :return: """ while time.time() <= until_time: time.sleep(1.0) if self.terminate or self.stop_event.is_set(): return def interruptible_sleep_delta(self, sleep_time): """ Sleeps the current thread for given amount of seconds, stop event terminates the sleep - to exit the thread. :param sleep_time: :return: """ if sleep_time is None: return sleep_time = float(sleep_time) if sleep_time == 0: return sleep_start = time.time() while not self.stop_event.is_set() and not self.terminate: time.sleep(0.1) if time.time() - sleep_start >= sleep_time: return # # State # def flush_state(self): """ Flushes state/config to the state file :return: """ self.state['rate_limit_remaining'] = self.rate_limit_remaining self.state['rate_limit_reset'] = self.rate_limit_reset utils.flush_json(self.state, self.state_file_path) # # Auditing - errors, problems for further analysis # def audit_log(self, evt=None, link=None, jtype=None, job=None): """ Appends audit log to the buffer. Lock protected. :param evt: :param link: :return: """ log = collections.OrderedDict() log['time'] = time.time() log['evt'] = evt log['jtype'] = jtype log['link'] = link if job is not None and isinstance(job, DownloadJob): log['job'] = job.to_json() with self.audit_lock: self.audit_records_buffered.append(log) def flush_audit(self): """ Flushes audit logs to the JSON append only file. Routine protected by the lock (no new audit record can be inserted while holding the lock) :return: """ if self.audit_file is None: self.audit_records_buffered = [] return self.audit_lock.acquire() try: if len(self.audit_records_buffered) == 0: return with open(self.audit_file, 'a') as fa: for x in self.audit_records_buffered: fa.write(json.dumps(x, cls=utils.AutoJSONEncoder) + "\n") self.audit_records_buffered = [] except Exception as e: logger.error('Exception in audit log dump %s' % e) finally: self.audit_lock.release() # # State save / resume # def state_main(self): """ State thread - periodical dump of the queues. 
:return: """ logger.info('State thread started %s %s %s' % (os.getpid(), os.getppid(), threading.current_thread())) try: while not self.stop_event.is_set() and not self.terminate: # Dump stats each x seconds # Sleep is here because of dumping the state for the last time just before program quits. self.interruptible_sleep_delta(2) self.state_save() # Check memory conditions self.state_ram_check() except Exception as e: traceback.print_exc() logger.error('Exception in state: %s' % e) finally: pass logger.info('State loop terminated') def state_ram_check(self): """ Checks memory terminating conditions :return: """ if self.max_mem is None: return cur_ram = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss if cur_ram <= self.max_mem: return logger.warning( 'Maximum memory threshold reached: %s kB = %s MB, threshold = %s kB' % (cur_ram, cur_ram / 1024.0, self.max_mem)) self.trigger_stop() def state_gen(self): """ Dumps state :return: """ def link_char(job): if job.type == DownloadJob.TYPE_USER: return 'U' elif job.type == DownloadJob.TYPE_ORG: return 'o' elif job.type == DownloadJob.TYPE_REPOS_USER: return '.' elif job.type == DownloadJob.TYPE_REPOS_ORG: return ',' elif job.type == DownloadJob.TYPE_REPO_COLAB: return 'c' elif job.type == DownloadJob.TYPE_REPO_ASSIGNEE: return '-' else: return '!' try: js_q = collections.OrderedDict() js_q['gen'] = time.time() js_q['link_size'] = self.link_queue.qsize() js_q['memory'] = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss # Dequeues self.new_users_events.maintain() self.new_keys_events.maintain() users_in_5min = self.new_users_events.under_limit(5 * 60) keys_in_5min = self.new_keys_events.under_limit(5 * 60) js_q['users_dequeue_size'] = self.new_users_events.len() js_q['keys_dequeue_size'] = self.new_keys_events.len() js_q['users_5min'] = users_in_5min js_q['keys_5min'] = keys_in_5min js_q['users_1min'] = users_in_5min / 5.0 js_q['keys_1min'] = keys_in_5min / 5.0 # link queue structure qdata = list(self.link_queue.queue) qdata.sort(cmp=DownloadJob.cmp) js_q['link_structure'] = ''.join([link_char(x) for x in qdata]) # Stats. 
js_q['resource_stats'] = [ x.to_json() for x in list(self.resources_list) ] # Finally - the queue js_q['link_queue'] = [x.to_json() for x in qdata] return js_q except Exception as e: traceback.print_exc() logger.error('Exception in state: %s', e) def state_save(self): """ saves the state :return: """ try: js_q = self.state_gen() utils.flush_json(js_q, self.state_file_path) except Exception as e: traceback.print_exc() logger.error('Exception in state: %s', e) def state_resume(self): """ Attempts to resume the queues from the monitoring files :return: """ try: if self.state is None: return if 'link_queue' in self.state: for rec in self.state['link_queue']: job = DownloadJob.from_json(rec) self.link_queue.put(job) logger.info('Link queue resumed, entries: %d' % len(self.state['link_queue'])) except Exception as e: traceback.print_exc() logger.warning('Exception in resuming the state %s' % e) logger.error('State resume failed, exiting') sys.exit(1) def main(self): """ Main entry point :return: """ args_src = sys.argv parser = argparse.ArgumentParser( description='Downloads GitHub User info') parser.add_argument('-c', dest='config', default=None, help='JSON config file') parser.add_argument('-s', dest='status', default=None, help='JSON status file') parser.add_argument('-t', dest='threads', default=1, type=int, help='Number of download threads to use') parser.add_argument( '--max-mem', dest='max_mem', default=None, type=int, help='Maximal memory threshold in kB when program terminates itself' ) parser.add_argument('--assign-only', dest='assign_only', default=False, action='store_const', const=True, help='fetch assignees') args = self.args = parser.parse_args(args=args_src[1:]) self.threads = args.threads self.max_mem = args.max_mem self.config_file = args.config self.audit_file = os.path.join(os.getcwd(), 'audit.json') self.state_file_path = args.status if args.status is not None else os.path.join( os.getcwd(), 'state.json') if os.path.exists(self.state_file_path): utils.file_backup(self.state_file_path, backup_dir='.') if os.path.exists('.github-quit'): os.remove('.github-quit') sys.argv = [args_src[0]] logger.info('GitHub loader started, args: %s' % args) self.work() sys.argv = args_src
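# --- A minimal sketch of the on-demand tracker commands the loader's shell exposes
# (do_mem_track_init / do_mem_track_diff / do_mem_track_deinit). One caveat from the
# class above: SummaryTracker.print_diff() writes to stdout and returns None, so
# `print(self.mem_tracker.print_diff())` also emits a stray "None" line; calling it
# directly avoids that. The Cmd subclass here is a hypothetical stand-in built on
# the stdlib cmd module, not the GitHubLoader itself.
from cmd import Cmd
from pympler.tracker import SummaryTracker

class MemShell(Cmd):
    prompt = '$> '

    def __init__(self):
        Cmd.__init__(self)
        self.mem_tracker = None

    def do_mem_track_init(self, line):
        self.mem_tracker = SummaryTracker()  # take the baseline snapshot

    def do_mem_track_diff(self, line):
        if self.mem_tracker is not None:
            self.mem_tracker.print_diff()    # prints the diff itself; no print() wrapper

    def do_mem_track_deinit(self, line):
        self.mem_tracker = None              # drop the tracker and its snapshot

    def do_quit(self, line):
        return True

if __name__ == '__main__':
    MemShell().cmdloop()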