Example 1
    @classmethod
    def from_json(cls, js):
        """
        Builds the job object from a JSON dictionary,
        using defaults for the optional fields.
        """
        tj = cls()
        tj.url = js['url']
        tj.type = js['type']
        tj.fail_cnt = js['fail_cnt']
        tj.last_fail = js['last_fail']
        tj.priority = utils.defvalkey(js, 'priority', 0)
        tj.time_added = utils.defvalkey(js, 'time_added', 0)
        tj.meta = utils.defvalkey(js, 'meta', None)
        if 'user_id' in js:
            user_url = utils.defvalkey(js, 'user_url')
            # User object construction is disabled:
            # tj.user = GitHubUser(user_id=js['user_id'], user_name=js['user_name'], user_type=js['user_type'], user_url=user_url)
        return tj
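
Every example in this listing leans on utils.defvalkey, whose implementation is not shown here. A minimal sketch consistent with the call sites (a guarded dictionary lookup with a default); the real helper may differ:

    def defvalkey(js, key, default=None):
        # Assumed semantics: js[key] when js is a dict containing key, else default.
        if js is None or key not in js:
            return default
        return js[key]

Under this assumption, tj.priority = utils.defvalkey(js, 'priority', 0) behaves like js.get('priority', 0).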
Example 2
    def process_record(self, idx, record):
        """
        Analyzes the current certificate record and writes matching
        leaf certificates to the output file.

        :param idx: index of the record in the input stream
        :param record: parsed JSON record with optional 'raw' and 'parsed' fields
        :return:
        """
        record['id'] = self.ctr
        self.last_record_seen = record
        raw = utils.defvalkey(record, 'raw')
        parsed = utils.defvalkey(record, 'parsed')

        # Process server cert
        if parsed is None:
            self.not_parsed += 1  # TODO: parse raw cert if needed
            return

        try:
            ret = collections.OrderedDict()
            if 'rsa_public_key' not in parsed['subject_key_info']:
                self.not_rsa += 1
                return

            mod16 = base64.b16encode(
                base64.b64decode(
                    parsed['subject_key_info']['rsa_public_key']['modulus']))
            if not self.fmagic.test16(mod16):
                return

            self.num_found += 1
            ret['id'] = self.ctr
            ret['fprint256'] = utils.defvalkey(parsed, 'fingerprint_sha256')
            self.fill_cn_src(ret, parsed)
            self.fill_rsa_ne(ret, parsed)
            self.fill_cert_info(ret, parsed, record)

            if raw is not None:
                rawb = base64.b64decode(raw)
                ret['fprint'] = hashlib.sha1(rawb).hexdigest()
            ret['raw'] = raw

            self.last_record_flushed = record
            if not self.is_dry():
                self.file_leafs_fh.write(json.dumps(ret) + '\n')

        except Exception as e:
            logger.warning('Certificate processing error %s : %s' %
                           (self.ctr, e))
            self.trace_logger.log(e)
            self.not_cert_ok += 1
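
The modulus above arrives base64-encoded and is re-encoded to uppercase hex before the fingerprint test; fmagic.test16 is an external detector not shown in this listing. The encoding step itself is plain stdlib and can be checked in isolation:

    import base64

    # 'qrvM' is base64 for the three bytes AA BB CC; b16encode produces the
    # uppercase hex form that the test16 oracle consumes.
    mod16 = base64.b16encode(base64.b64decode('qrvM'))
    assert mod16 == b'AABBCC'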
Example 3
    def process_user(self, job, js, headers, raw_response):
        """
        Processes user detail data and stores it in the database.

        :param job: download job that produced the response
        :param js: decoded JSON body with the user detail fields
        :param headers: response headers
        :param raw_response: raw HTTP response
        :return:
        """
        if 'id' not in js:
            logger.error('Field ID not found in user')
            return

        s = self.session()
        try:
            user_id = int(js['id'])
            dbu = s.query(GitHubUserDetails).filter(
                GitHubUserDetails.id == user_id).one_or_none()
            is_new = False

            if dbu is None:
                is_new = True
                dbu = GitHubUserDetails()
                dbu.id = user_id

            dbu.date_last_check = salch.func.now()
            dbu.username = js['login']
            dbu.name = utils.utf8ize(utils.defvalkey(js, 'name'))

            dbu.company = utils.utf8ize(utils.defvalkey(js, 'company'))
            dbu.blog = utils.defvalkey(js, 'blog')
            dbu.email = utils.defvalkey(js, 'email')
            dbu.bio = utils.utf8ize(utils.defvalkey(js, 'bio'))
            dbu.usr_type = utils.defvalkey(js, 'type')

            dbu.public_repos = js['public_repos']
            dbu.public_gists = js['public_gists']
            dbu.followers = js['followers']
            dbu.following = js['following']

            dbu.created_at = utils.dt_norm(
                utils.try_parse_timestamp(utils.defvalkey(js, 'created_at')))
            dbu.updated_at = utils.dt_norm(
                utils.try_parse_timestamp(utils.defvalkey(js, 'updated_at')))

            if is_new:
                s.add(dbu)
            else:
                s.merge(dbu)
            s.commit()
            s.flush()
            s.expunge_all()

        except Exception as e:
            logger.error('Exception storing user details: %s: %s' %
                         (js['id'], e))
            logger.debug(traceback.format_exc())

        finally:
            utils.silent_close(s)
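
process_user relies on several small helpers that this listing does not define. Hedged sketches consistent with the call sites (plausible Python 2 reimplementations, not the originals):

    def utf8ize(s):
        # Assumed: coerce text to UTF-8 bytes, passing None through untouched.
        if s is None:
            return None
        return s.encode('utf-8') if isinstance(s, unicode) else s

    def dt_norm(dt):
        # Assumed: normalize a parsed datetime (e.g. drop tzinfo), None-safe.
        return dt.replace(tzinfo=None) if dt is not None else None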
Example 4
    def process_roots(self, idx, record, server_cert):
        """
        Processes root/chain certificates attached to the server certificate.
        :param idx: index of the record in the input stream
        :param record: parsed JSON record
        :param server_cert: the 'server_certificates' subtree of the record
        :return: list of chain certificate IDs for this record
        """
        chains_ctr = []
        try:
            if 'chain' not in server_cert:
                return chains_ctr

            for cert in server_cert['chain']:
                self.chain_ctr += 1
                if 'parsed' not in cert:
                    continue

                parsed = cert['parsed']
                fprint = parsed['fingerprint_sha256']
                if fprint in self.chain_cert_db:
                    chains_ctr.append(self.chain_cert_db[fprint])
                    continue

                ret = collections.OrderedDict()
                is_rsa = parsed['subject_key_info']['key_algorithm'][
                    'name'].lower() == 'rsa'
                if not is_rsa:
                    self.not_rsa += 1

                ret['id'] = self.chain_ctr
                ret['count'] = 1
                ret['chain'] = 1
                ret['valid'] = utils.defvalkeys(parsed, ['signature', 'valid'])
                ret['ssign'] = utils.defvalkeys(parsed,
                                                ['signature', 'self_signed'])
                ret['fprint'] = fprint
                ret['fprint1'] = utils.defvalkey(parsed, 'fingerprint_sha1')
                self.fill_cn_src(ret, parsed)
                if is_rsa:
                    self.fill_rsa_ne(ret, parsed)
                ret['raw'] = cert['raw']

                if not self.is_dry():
                    self.file_roots_fh.write(json.dumps(ret) + '\n')

                self.chain_cert_db[fprint] = self.chain_ctr
                chains_ctr.append(self.chain_ctr)

        except Exception as e:
            logger.warning('Chain processing error %s : %s' %
                           (self.chain_ctr, e))
            logger.debug(traceback.format_exc())
            self.not_chain_ok += 1

        return chains_ctr
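
utils.defvalkeys (note the plural) walks a nested key path such as ['signature', 'valid']. A minimal sketch under that assumption:

    def defvalkeys(js, keys, default=None):
        # Assumed semantics: nested lookup returning default on any miss.
        cur = js
        for key in keys:
            if not isinstance(cur, dict) or key not in cur:
                return default
            cur = cur[key]
        return cur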
Example 5
    def process_colab(self, job, js, headers, raw_response):
        """
        Processes collaborators for org-owned repos.
        :param job: download job that produced the response
        :param js: decoded JSON list of collaborators
        :param headers: response headers
        :param raw_response: raw HTTP response
        :return:
        """
        for colab in js:
            if 'id' not in colab:
                logger.error('Field ID not found in colab')
                continue

            s = self.session()
            try:
                # delete first to avoid duplicate-entry exceptions
                s.query(GitHubRepoColab)\
                    .filter(GitHubRepoColab.user_name == colab['login'])\
                    .filter(GitHubRepoColab.repo_name == job.meta['repo'])\
                    .delete()

                dbu = GitHubRepoColab()
                dbu.repo_name = job.meta['repo']
                dbu.user_name = colab['login']
                dbu.can_pull = colab['permissions']['pull']
                dbu.can_push = colab['permissions']['push']
                dbu.can_admin = colab['permissions']['admin']

                s.add(dbu)
                s.commit()
                s.flush()
                s.expunge_all()

            except Exception as e:
                logger.error('Exception storing colab details: %s:%s: %s' %
                             (colab['login'], job.meta['repo'], e))
                logger.debug(traceback.format_exc())

            finally:
                utils.silent_close(s)

        if len(js) == 0:
            return

        # Load next page
        cur_page = utils.defvalkey(job.meta, 'page', 1)
        new_url = (self.ORG_REPO_COLAB_URL %
                   (job.meta['repo'])) + ('?page=%s' % (cur_page + 1))
        new_meta = dict(job.meta)
        new_meta['page'] = cur_page + 1

        job = DownloadJob(url=new_url,
                          jtype=DownloadJob.TYPE_REPO_COLAB,
                          meta=new_meta)
        self.link_queue.put(job)
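
The paging pattern terminates when GitHub returns an empty page: the empty-list check above fires before the next-page job is built. The URL constants are '%s' format strings; for illustration only, with an assumed value for ORG_REPO_COLAB_URL:

    # Hypothetical template value; the real constant is defined elsewhere.
    ORG_REPO_COLAB_URL = 'https://api.github.com/repos/%s/collaborators'

    cur_page = 1
    new_url = (ORG_REPO_COLAB_URL % 'octocat/hello-world') + ('?page=%s' % (cur_page + 1))
    # -> https://api.github.com/repos/octocat/hello-world/collaborators?page=2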
Example 6
    def store_users_list(self, users):
        """
        Stores all users in the list.
        :param users:
        :return:
        """
        # Handle gaps in the user ID space. With the user-only optimization
        # the gaps cause overlaps, so already-processed IDs are skipped here.
        reduced_by = 0
        with self.processed_user_set_lock:
            ids = [user.user_id for user in users]
            ids_ok = set()
            for uid in ids:
                if uid in self.processed_user_set:
                    reduced_by += 1
                    continue
                self.processed_user_set.add(uid)
                ids_ok.add(uid)
            users = [user for user in users if user.user_id in ids_ok]

        # Bulk user load
        s = self.session()
        id_list = sorted([user.user_id for user in users])
        db_users = s.query(GitHubUserDb).filter(
            GitHubUserDb.id.in_(id_list)).all()
        db_user_map = {user.id: user for user in db_users}

        for user in users:
            self.new_users_events.insert()

            # Store user to the DB
            try:
                db_user = utils.defvalkey(db_user_map, key=user.user_id)
                self.store_user(user, s, db_user=db_user, db_user_loaded=True)

            except Exception as e:
                logger.warning('[%02d] Exception in storing user %s' %
                               (self.local_data.idx, e))
                self.trace_logger.log(e)
                logger.info('[%02d] idlist: %s' %
                            (self.local_data.idx, id_list))
                self.trigger_quit()
                break

        try:
            s.commit()
            # logger.info('[%02d] Commited, reduced by: %s' % (self.local_data.idx, reduced_by))
        except Exception as e:
            logger.warning('[%02d] Exception in storing bulk users' %
                           self.local_data.idx)
            logger.warning(traceback.format_exc())
            logger.info('[%02d] idlist: %s' % (self.local_data.idx, id_list))
            self.trigger_quit()
        finally:
            utils.silent_close(s)
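
The claim-under-lock deduplication at the top of store_users_list generalizes to a small standalone helper; a sketch with illustrative names (not from the source):

    import threading

    class IdClaimer(object):
        def __init__(self):
            self._lock = threading.Lock()
            self._seen = set()

        def claim(self, ids):
            """Returns the IDs not seen before and marks them as seen."""
            with self._lock:
                fresh = [i for i in ids if i not in self._seen]
                self._seen.update(fresh)
                return fresh

    claimer = IdClaimer()
    claimer.claim([1, 2, 3])  # -> [1, 2, 3]
    claimer.claim([2, 3, 4])  # -> [4]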
Example 7
    def process_assignee(self, job, js, headers, raw_response):
        """
        Processes assignees for org-owned repos.
        :param job: download job that produced the response
        :param js: decoded JSON list of assignees
        :param headers: response headers
        :param raw_response: raw HTTP response
        :return:
        """
        for assignee in js:
            if 'id' not in assignee:
                logger.error('Field ID not found in assignees')
                continue

            s = self.session()
            try:
                # delete first to avoid duplicate-entry exceptions
                s.query(GitHubRepoAssignee)\
                    .filter(GitHubRepoAssignee.user_name == assignee['login'])\
                    .filter(GitHubRepoAssignee.repo_name == job.meta['repo'])\
                    .delete()

                dbu = GitHubRepoAssignee()
                dbu.repo_name = job.meta['repo']
                dbu.user_name = assignee['login']

                s.add(dbu)
                s.commit()
                s.flush()
                s.expunge_all()

            except Exception as e:
                logger.error('Exception storing assignee details: %s:%s: %s' %
                             (assignee['login'], job.meta['repo'], e))
                logger.debug(traceback.format_exc())

            finally:
                utils.silent_close(s)

        if len(js) == 0:
            return

        # Load next page
        cur_page = utils.defvalkey(job.meta, 'page', 1)
        new_url = (self.ORG_REPO_ASSIGNEES_URL %
                   (job.meta['repo'])) + ('?page=%s' % (cur_page + 1))
        new_meta = dict(job.meta)
        new_meta['page'] = cur_page + 1

        job = DownloadJob(url=new_url,
                          jtype=DownloadJob.TYPE_REPO_ASSIGNEE,
                          meta=new_meta)
        self.link_queue.put(job)
Example 8
    def fill_cert_info(self, ret, parsed, rec):
        """
        Fills certificate metadata: is_ca flag, self-signed status,
        subject/issuer DNs and NSS validity information.
        :param ret: output record being built
        :param parsed: parsed certificate subtree
        :param rec: the full scan record
        :return:
        """
        ret['ca'] = utils.defvalkeys(
            parsed, ['extensions', 'basic_constraints', 'is_ca'])
        issuer = utils.defvalkey(parsed, 'issuer')
        subject = utils.defvalkey(parsed, 'subject')
        ret['ss'] = issuer == subject
        ret['subject_dn'] = utils.defvalkey(parsed, 'subject_dn')
        ret['issuer_dn'] = utils.defvalkey(parsed, 'issuer_dn')
        ret['parents'] = utils.defvalkey(rec, 'parents')

        ret['crt_src'] = utils.defvalkey(rec, 'source')
        ret['seen_in_scan'] = utils.defvalkey(rec, 'seen_in_scan')
        ret['valid_nss'] = utils.defvalkey(rec, 'valid_nss')
        ret['was_valid_nss'] = utils.defvalkey(rec, 'was_valid_nss')
        ret['current_valid_nss'] = utils.defvalkey(rec, 'current_valid_nss')
Example 9
    def process_record(self, idx, line):
        """
        Processes one record from the PGP dump.
        :param idx: index of the record in the dump
        :param line: one JSON-encoded dump record
        :return:
        """
        rec = json.loads(line)
        master_key_id = int(utils.defvalkey(rec, 'key_id', '0'), 16)
        master_fingerprint = utils.defvalkey(rec, 'fingerprint')

        flat_keys = [rec]
        user_names = []

        # Phase 1 - info extraction
        if 'packets' in rec:
            for packet in rec['packets']:
                if packet['tag_name'] == 'User ID':
                    utils.append_not_none(user_names,
                                          utils.defvalkey(packet, 'user_id'))
                elif packet['tag_name'] == 'Public-Subkey':
                    flat_keys.append(packet)

        # Test all keys
        self.test_flat_keys(flat_keys, user_names, master_key_id,
                            master_fingerprint, rec)

        if time.time() - self.last_report > self.report_time:
            per_second = (idx - self.last_report_idx) / float(self.report_time)
            logger.debug(
                ' .. report idx: %s, per second: %2.2f, found: %s, '
                'num_master: %s, num_sub: %s, ratio: %s, cur key: %016X ' %
                (idx, per_second, self.found, self.num_master_keys,
                 self.num_sub_keys, float(self.num_sub_keys) /
                 self.num_master_keys, master_key_id))

            self.last_report = time.time()
            self.last_report_idx = idx
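
utils.append_not_none is used above without a definition; its call site implies it appends only non-None values:

    def append_not_none(lst, value):
        # Assumed semantics, matching the call above: drop missing user IDs.
        if value is not None:
            lst.append(value)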
Example 10
    def key_exp(self, rec=None):
        """
        Returns the RSA public exponent from the record, or False if missing.
        :param rec:
        :return:
        """
        if rec is None:
            return False

        e = utils.defvalkey(rec, 'e')
        if e is None:
            return False

        e = e.strip()
        e = utils.strip_hex_prefix(e)
        return int(e, 16)
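
key_exp accepts exponents serialized as hex strings with or without a 0x prefix. Assuming utils.strip_hex_prefix merely removes that prefix, the round trip is:

    def strip_hex_prefix(s):
        # Assumed behavior; the real helper is not shown in this listing.
        return s[2:] if s.lower().startswith('0x') else s

    assert int(strip_hex_prefix('0x10001'), 16) == 65537
    assert int(strip_hex_prefix('10001'), 16) == 65537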
Example 11
    def test_key(self, rec=None):
        """
        Tests the key modulus against the fingerprint detector.
        :param rec: 
        :return: 
        """
        if rec is None:
            return False

        n = utils.defvalkey(rec, 'n')
        if n is None:
            return False

        n = n.strip()
        n = utils.strip_hex_prefix(n)

        x = self.fmagic.magic16([n])
        return len(x) > 0
Example 12
    def test_flat_keys(self, flat_keys, user_names, master_key_id,
                       master_fingerprint, rec):
        """
        Tests all keys in the array and records statistics and classification data.
        :param flat_keys: master key record followed by its sub-key packets
        :param user_names: user ID strings extracted from the record
        :param master_key_id: integer ID of the master key
        :param master_fingerprint: fingerprint of the master key
        :param rec: the full dump record
        :return:
        """
        if flat_keys is None or len(flat_keys) == 0:
            return

        self.num_master_keys += 1
        self.num_sub_keys += len(flat_keys) - 1

        rsa_keys = ['n' in x and len(x['n']) > 0 for x in flat_keys]
        self.num_master_keys_rsa += rsa_keys[0]
        self.num_sub_keys_rsa += sum(rsa_keys[1:])
        self.key_counts[len(flat_keys)] += 1

        key_sizes = [self.key_size(x) for x in flat_keys]
        for x in key_sizes:
            self.key_sizes[x] += 1

        # benchmarking
        if self.args.bench:
            for rec in flat_keys:
                n = self.key_mod(rec)
                if n is None or n == 0:
                    continue

                self.bench_mods.append('%x' % n)

        # Window of interest: 1 Nov 2015 to 19 Apr 2017
        bnd_a = datetime.datetime(year=2015, month=11, day=1)
        bnd_b = datetime.datetime(year=2017,
                                  month=4,
                                  day=19,
                                  hour=23,
                                  minute=59,
                                  second=59)
        in_time = [
            'creation_time' in rec and utils.time_between(
                datetime.datetime.utcfromtimestamp(rec['creation_time']),
                bnd_a, bnd_b) for rec in flat_keys
        ]
        rsa_in_time = [
            'n' in rec and len(rec['n']) > 0 and in_time[idx]
            for idx, rec in enumerate(flat_keys)
        ]

        self.num_total_keys_date += sum(in_time)
        self.num_total_master_keys_date += in_time[0]
        self.num_rsa_keys_date += sum(rsa_in_time)
        self.num_rsa_master_keys_date += rsa_in_time[0]

        # key testing
        tested = [self.test_key(x) for x in flat_keys]

        # classification
        if self.classif_file is not None:
            for idx, rec in enumerate(flat_keys):
                if 'n' not in rec:
                    continue

                js = OrderedDict()
                ctime = datetime.datetime.utcfromtimestamp(rec['creation_time']).strftime('%Y-%m-%d') \
                    if 'creation_time' in rec else ''
                cname = user_names[0].encode('utf8').replace(
                    ';', '_') if len(user_names) > 0 else ''

                js['source'] = [cname, ctime]
                js['size'] = self.key_size(rec)
                js['msb'] = '0x%x' % self.key_msb(rec)
                js['sub'] = int(idx != 0)
                js['master_id'] = utils.format_pgp_key(master_key_id)
                js['sec'] = int(tested[idx])
                js['tot'] = len(flat_keys)
                js['e'] = '0x%x' % self.key_exp(rec)
                js['n'] = '0x%x' % self.key_mod(rec)
                self.classif_file.write('%s\n' % json.dumps(js))

        # Key detection and store
        if any(tested):
            flat_key_ids = [
                int(utils.defvalkey(x, 'key_id', '0'), 16) for x in flat_keys
            ]
            keys_hex = [utils.format_pgp_key(x) for x in flat_key_ids]
            det_key_ids = [
                x for _idx, x in enumerate(flat_key_ids) if tested[_idx]
            ]

            logger.info('------- interesting map: %s for key ids %s' %
                        (tested, keys_hex))

            js = OrderedDict()
            js['detection'] = tested
            js['key_ids'] = keys_hex
            js['names'] = user_names
            js['master_key_id'] = utils.format_pgp_key(master_key_id)
            js['master_key_fprint'] = master_fingerprint
            # js['pgp'] = rec

            self.dump_file.write(json.dumps(js) + '\n')
            self.dump_file.flush()

            self.found_no_master_key += not tested[0]
            self.found_master_key += tested[0]
            self.found_sub_key += sum(tested[1:])
            self.found += sum(tested)
            self.found_entities += 1
            self.found_entities_keynum += len(tested)
            self.found_master_not_rsa += not rsa_keys[0]
            self.found_key_counts[len(flat_keys)] += 1
            for x in det_key_ids:
                self.flat_key_ids.add(x)

            for idx, x in enumerate(key_sizes):
                if tested[idx]:
                    self.found_key_sizes[x] += 1

            for idx, x in enumerate(tested):
                if not tested[idx]:
                    continue

                # Row: creation date (e.g. 2012-04-30); rsa_bit_length; master key flag; email; MSB(modulus); modulus
                rec = flat_keys[idx]

                res = []
                res.append(
                    datetime.datetime.utcfromtimestamp(rec['creation_time']).
                    strftime('%Y-%m-%d') if 'creation_time' in rec else '')
                res.append(self.key_size(rec))
                res.append(int(idx == 0))
                res.append(user_names[0].encode('utf8').
                           replace(';', '_') if len(user_names) > 0 else '')
                res.append('%x' % self.key_msb(rec))
                res.append('%x' % self.key_mod(rec))
                self.found_info.append(res)
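
test_flat_keys renders key IDs with utils.format_pgp_key. A sketch consistent with the '%016X' formatting used in the progress log of Example 9 (an assumption; the helper itself is not shown):

    def format_pgp_key(key_id):
        # Assumed: 64-bit PGP key ID as a zero-padded 16-digit hex string.
        return '%016x' % key_id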
Example 13
    def continue_leafs(self, name):
        """
        Resumes processing of the leaf file.
        Finds the last valid record and returns it.
        Truncates everything after it.
        :param name:
        :return: last record loaded
        """
        fsize = os.path.getsize(name)
        pos = 0

        # If the file is larger than 2 GB, seek to roughly 1.5 GB before the end
        if fsize > 1024 * 1024 * 1024 * 2:
            pos = int(fsize - 1024 * 1024 * 1024 * 1.5)
            logger.info('Leafs file too big: %s, skipping to %s' %
                        (fsize, pos))

            self.file_leafs_fh.seek(pos)
            x = self.file_leafs_fh.next()  # skip the (likely partial) record at the seek point
            pos += len(x)

        record_from_state_found = False
        terminate_with_record = False
        last_record = None
        last_id_seen = None
        for line in self.file_leafs_fh:
            ln = len(line)
            try:
                last_record = json.loads(line)
                last_id_seen = last_record['id']
                self.state_loaded_ips.add(last_record['ip'])
                self.ctr = max(self.ctr, last_record['id'])
                pos += ln

                if self.last_record_flushed is not None and self.last_record_flushed[
                        'ip'] == last_record['ip']:
                    logger.info(
                        'Found last record flushed in data file, ip: %s' %
                        last_record['ip'])
                    record_from_state_found = True
                    break

            except Exception as e:
                terminate_with_record = True
                break

        logger.info('Operation resumed at leaf ctr: %s, last ip: %s' %
                    (self.ctr, utils.defvalkey(last_record, 'ip')))

        if self.last_record_flushed is not None and not record_from_state_found:
            logger.warning(
                'Could not find the record from the state in the data file. Some data may be missing.'
            )
            logger.info(
                'Last record from state id: %s, last record data file id: %s' %
                (self.last_record_resumed['id'], last_id_seen))
            raise ValueError('Incomplete data file')

        if terminate_with_record:
            logger.info('Leaf: Invalid record detected, position: %s' % pos)

            if not self.is_dry():
                self.file_leafs_fh.seek(pos)
                self.file_leafs_fh.truncate()
                self.file_leafs_fh.flush()

        return last_record
Example 14
    def process_record(self, idx, record):
        """
        Processes the current TLS scan record, e.g.:
        {"e":"0x10001","count":1,"source":["COMMON_NAME","NOT_BEFORE_2010-11-19"],
        "id":32000000,"cn":"COMMON_NAME","n":"0x...","timestamp":1475342704760}

        :param idx: index of the record in the input stream
        :param record: parsed JSON record
        :return:
        """
        record['id'] = self.ctr

        ip = utils.defvalkey(record, 'ip')
        domain = utils.defvalkey(record, 'domain')
        timestamp_fmt = utils.defvalkey(record, 'timestamp')
        self.last_record_seen = record

        if not self.is_record_tls(record):
            self.not_tls += 1
            return

        server_cert = record['data']['tls']['server_certificates']
        if 'validation' not in server_cert or 'certificate' not in server_cert:
            self.not_cert_ok += 1
            return

        # Process chains anyway as we may be interested in them even though the server is not RSA
        chains_roots = self.process_roots(idx, record, server_cert)

        # Process server cert
        trusted = utils.defvalkey(server_cert['validation'], 'browser_trusted')
        matches = utils.defvalkey(server_cert['validation'], 'matches_domain')
        cert_obj = server_cert['certificate']

        if 'parsed' not in cert_obj:
            self.not_parsed += 1
            return

        parsed = cert_obj['parsed']
        try:
            ret = collections.OrderedDict()
            if parsed['subject_key_info']['key_algorithm']['name'].lower(
            ) != 'rsa':
                self.not_rsa += 1
                return

            ret['id'] = self.ctr
            ret['ip'] = ip
            ret['count'] = 1
            ret['fprint'] = utils.defvalkey(parsed, 'fingerprint_sha256')
            ret['fprint1'] = utils.defvalkey(parsed, 'fingerprint_sha1')
            utils.set_nonempty(ret, 'dom', domain)

            tstamp = utils.try_parse_timestamp(timestamp_fmt)
            ret['timestamp'] = utils.unix_time(tstamp)
            utils.set_nonempty(ret, 'trust', trusted)
            utils.set_nonempty(ret, 'match', matches)
            utils.set_nonempty(
                ret, 'valid', utils.defvalkeys(parsed, ['signature', 'valid']))
            utils.set_nonempty(
                ret, 'ssign',
                utils.defvalkeys(parsed, ['signature', 'self_signed']))

            self.fill_cn_src(ret, parsed)
            self.fill_rsa_ne(ret, parsed)
            ret['chains'] = chains_roots
            self.last_record_flushed = record

            if not self.is_dry():
                self.file_leafs_fh.write(json.dumps(ret) + '\n')

        except Exception as e:
            logger.warning('Certificate processing error %s : %s' %
                           (self.ctr, e))
            logger.debug(traceback.format_exc())
            self.not_cert_ok += 1
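
utils.set_nonempty keeps the emitted records sparse: a key is only written when a value is present. A minimal sketch under that assumption:

    def set_nonempty(dst, key, value):
        # Assumed semantics: write the key only when the value is not None.
        if value is not None:
            dst[key] = value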
Example 15
    def process_repo(self, job, js, headers, raw_response, from_user):
        """
        Processes a repo list page.
        :param job: download job that produced the response
        :param js: decoded JSON list of repositories
        :param headers: response headers
        :param raw_response: raw HTTP response
        :param from_user: True when listing a user's repos, False for an org's
        :return:
        """
        for repo in js:
            if 'id' not in repo:
                logger.error('Field ID not found in repos')
                continue

            s = self.session()
            try:
                repo_id = int(repo['id'])
                dbe = s.query(GitHubRepo).filter(
                    GitHubRepo.id == repo_id).one_or_none()

                dbu = GitHubRepo()
                dbu.id = repo_id
                dbu.user_repo = from_user
                if from_user:
                    dbu.username = job.meta['user']
                else:
                    dbu.org_name = job.meta['org']

                if 'owner' in repo:
                    dbu.owner_id = repo['owner']['id']
                    dbu.owner_login = repo['owner']['login']

                dbu.repo_name = repo['full_name']
                dbu.repo_stars = repo['stargazers_count']
                dbu.repo_forks = repo['forks']
                dbu.repo_watchers = repo['watchers']
                dbu.repo_is_fork = repo['fork']
                dbu.repo_size = repo['size']
                dbu.repo_homepage = utils.defvalkey(repo, 'homepage')
                dbu.repo_language = utils.defvalkey(repo, 'language')
                dbu.created_at = utils.dt_norm(
                    utils.try_parse_timestamp(
                        utils.defvalkey(repo, 'created_at')))
                dbu.updated_at = utils.dt_norm(
                    utils.try_parse_timestamp(
                        utils.defvalkey(repo, 'updated_at')))
                dbu.pushed_at = utils.dt_norm(
                    utils.try_parse_timestamp(
                        utils.defvalkey(repo, 'pushed_at')))

                dbu.repo_description = utils.utf8ize(repo['description'])

                dbu.repo_stargazers_url = repo['stargazers_url']
                dbu.repo_forks_url = repo['forks_url']

                if not from_user and repo['stargazers_count'] > 100:
                    new_meta = dict(job.meta)
                    new_meta['page'] = 1
                    new_meta['repo'] = repo['full_name']
                    new_meta['owner'] = repo['owner']['login']

                    # Colab fetch - skipped (requires auth); the job below is
                    # intentionally left unqueued so only the assignee fetch
                    # reaches the queue.
                    # job = DownloadJob(url=self.ORG_REPO_COLAB_URL %
                    #                   (repo['full_name']),
                    #                   jtype=DownloadJob.TYPE_REPO_COLAB,
                    #                   meta=new_meta)

                    # Assignee fetch
                    job = DownloadJob(url=self.ORG_REPO_ASSIGNEES_URL %
                                      (repo['full_name']),
                                      jtype=DownloadJob.TYPE_REPO_ASSIGNEE,
                                      meta=dict(new_meta))

                    self.link_queue.put(job)

                # DB save
                if dbe is None:
                    s.add(dbu)

                else:
                    if dbe.username != dbu.username:
                        logger.warning('Username does not match for %s %s %s' %
                                       (repo_id, dbe.username, dbu.username))
                    if dbe.org_name != dbu.org_name:
                        logger.warning('org_name does not match for %s %s %s' %
                                       (repo_id, dbe.org_name, dbu.org_name))
                    if dbe.owner_login != dbu.owner_login:
                        logger.warning(
                            'owner_login does not match for %s %s %s' %
                            (repo_id, dbe.owner_login, dbu.owner_login))

                s.commit()
                s.flush()
                s.expunge_all()

            except Exception as e:
                logger.error(
                    'Exception storing repo details: %s:%s meta: %s, url: %s, exc: %s'
                    % (repo['id'], repo['full_name'], json.dumps(
                        job.meta), job.url, e))
                logger.debug(traceback.format_exc())

            finally:
                utils.silent_close(s)

        if len(js) == 0:
            return

        # Load next page
        cur_page = utils.defvalkey(job.meta, 'page', 1)
        new_meta = dict(job.meta)
        new_meta['page'] = cur_page + 1

        if from_user:
            new_url = (self.USER_REPOS_URL %
                       job.meta['user']) + ('?page=%s' % (cur_page + 1))
            job = DownloadJob(url=new_url,
                              jtype=DownloadJob.TYPE_REPOS_USER,
                              meta=new_meta)
        else:
            new_url = (self.ORG_REPOS_URL % job.meta['org']) + ('?page=%s' %
                                                                (cur_page + 1))
            job = DownloadJob(url=new_url,
                              jtype=DownloadJob.TYPE_REPOS_ORG,
                              meta=new_meta)

        self.link_queue.put(job)
Example 16
    def process_org(self, job, js, headers, raw_response):
        """
        Processes user -> orgs data.
        :param job: download job that produced the response
        :param js: decoded JSON list of organisations
        :param headers: response headers
        :param raw_response: raw HTTP response
        :return:
        """
        new_orgs = []
        for org in js:
            if 'id' not in org:
                logger.error('Field ID not found in orgs')
                continue

            s = self.session()
            try:
                org_id = int(org['id'])

                # delete first to avoid duplicate-entry exceptions
                s.query(GitHubUserOrgs)\
                    .filter(GitHubUserOrgs.org_id == org_id)\
                    .filter(GitHubUserOrgs.username == job.meta['user'])\
                    .delete()

                dbu = GitHubUserOrgs()
                dbu.username = job.meta['user']
                dbu.org_id = org['id']
                dbu.org_name = org['login']
                dbu.org_desc = utils.utf8ize(org['description'])
                new_orgs.append(org['login'])

                s.add(dbu)

                s.commit()
                s.flush()
                s.expunge_all()

            except Exception as e:
                logger.error('Exception storing user->org details: %s: %s' %
                             (org['id'], e))
                logger.debug(traceback.format_exc())

            finally:
                utils.silent_close(s)

        if len(js) == 0:
            return

        # Load next page
        cur_page = utils.defvalkey(job.meta, 'page', 1)
        new_url = (self.USER_ORGS_URL % job.meta['user']) + ('?page=%s' %
                                                             (cur_page + 1))
        new_meta = dict(job.meta)
        new_meta['page'] = cur_page + 1

        job = DownloadJob(url=new_url,
                          jtype=DownloadJob.TYPE_ORG,
                          meta=new_meta)
        self.link_queue.put(job)

        # Load repositories for new organisations
        with self.orgs_loaded_lock:
            new_orgs_set = set(new_orgs)
            not_loaded_orgs = new_orgs_set - self.orgs_loaded_set
            self.orgs_loaded_set.update(new_orgs_set)

        for x in not_loaded_orgs:
            new_meta = dict(job.meta)
            new_meta['page'] = 1
            new_meta['org'] = x
            job = DownloadJob(url=self.ORG_REPOS_URL % x,
                              jtype=DownloadJob.TYPE_REPOS_ORG,
                              meta=new_meta)
            self.link_queue.put(job)