def _retrieve_member(uri, member_id):
    """Fetch a member profile page and parse name, join date and company."""
    content = utils.read_uri(uri)
    if not content:
        return {}

    member = {}

    # only the first name/date occurrence on the page is relevant
    name_match = re.search(NAME_AND_DATE_PATTERN, content)
    if name_match:
        found = name_match.groupdict()
        member["member_id"] = member_id
        member["member_name"] = found["member_name"]
        member["date_joined"] = found["date_joined"]
        member["member_uri"] = uri

    # default affiliation; the last company record on the page wins
    member["company_draft"] = "*independent"
    for company_match in re.finditer(COMPANY_PATTERN, content):
        member["company_draft"] = company_match.groupdict()["company_draft"]

    return member
def process_official_list(releases):
    """Build module groups from the governance refs of every release."""
    module_groups = _make_default_module_groups()

    for release in releases:
        refs = release.get('refs')
        if not refs:
            continue
        governance = refs.get('governance')
        if not governance:
            continue

        release_name = release['release_name'].lower()
        LOG.debug('Process governance content from uri: %s',
                  governance['source'])
        content = yaml.safe_load(utils.read_uri(governance['source']))
        GOVERNANCE_PROCESSORS[governance['type']](
            module_groups, release_name, content)

    # assign ids and keep module / per-release lists deterministically ordered
    for group_id, group in six.iteritems(module_groups):
        group['id'] = group_id
        group['modules'].sort()
        if 'releases' in group:
            for release_modules in six.itervalues(group['releases']):
                release_modules.sort()

    return module_groups
def _retrieve_mails(uri):
    """Yield parsed e-mails from a gzipped mailman archive at ``uri``."""
    LOG.debug('Retrieving mail archive from uri: %s', uri)
    raw = utils.read_uri(uri)
    if not raw:
        LOG.error('Error reading mail archive from uri: %s', uri)
        return

    text = utils.gzip_decompress(raw) + TRAILING_RECORD
    LOG.debug('Mail archive is loaded, start processing')

    for match in re.finditer(MAIL_BOX_PATTERN, text):
        email = match.groupdict()
        # mailman obfuscates addresses as "user at host"
        email['author_email'] = email['author_email'].replace(' at ', '@', 1)
        if not utils.check_email_validity(email['author_email']):
            continue

        email['date'] = int(email_utils.mktime_tz(
            email_utils.parsedate_tz(email['date'])))

        # scan the body for referenced items (bugs, reviews, ...)
        for pattern_name, pattern in six.iteritems(MESSAGE_PATTERNS):
            found_ids = set()
            for item in re.finditer(pattern, email['body']):
                groups = item.groupdict()
                item_id = groups['id']
                if 'module' in groups:
                    item_id = groups['module'] + ':' + item_id
                    email['module'] = groups['module']
                found_ids.add(item_id)
            email[pattern_name] = list(found_ids)

        yield email
def _retrieve_member(uri, member_id, html_parser):
    """Fetch and parse a foundation member profile page.

    Returns a dict with member id, name, join date and company draft;
    empty dict when the page cannot be read.
    """
    content = utils.read_uri(uri)
    if not content:
        return {}
    content = six.text_type(content, 'utf8')

    member = {}

    # only the first name/date occurrence on the page matters
    name_match = re.search(NAME_AND_DATE_PATTERN, content)
    if name_match:
        parsed = name_match.groupdict()
        member['member_id'] = member_id
        member['member_name'] = strip_garbage(parsed['member_name'])
        member['date_joined'] = parsed['date_joined']
        member['member_uri'] = uri

    # default affiliation; the last company record on the page wins
    member['company_draft'] = '*independent'
    for company_match in re.finditer(COMPANY_PATTERN, content):
        parsed = company_match.groupdict()
        member['company_draft'] = strip_garbage(
            html_parser.unescape(parsed['company_draft']))

    return member
def _retrieve_mails(uri):
    """Read a gzipped mail archive and yield one dict per valid e-mail."""
    LOG.debug('Retrieving mail archive from uri: %s', uri)
    content = utils.read_uri(uri)
    if not content:
        LOG.error('Error reading mail archive from uri: %s', uri)
        return

    content = utils.gzip_decompress(content)
    LOG.debug('Mail archive is loaded, start processing')
    content += TRAILING_RECORD

    for record in re.finditer(MAIL_BOX_PATTERN, content):
        email = record.groupdict()

        # mailman obfuscates addresses as "user at host"
        email['author_email'] = email['author_email'].replace(' at ', '@', 1)
        if not utils.check_email_validity(email['author_email']):
            continue

        parsed_date = email_utils.parsedate_tz(email['date'])
        email['date'] = int(email_utils.mktime_tz(parsed_date))

        for pattern_name, pattern in six.iteritems(MESSAGE_PATTERNS):
            matched_ids = set()
            for match in re.finditer(pattern, email['body']):
                groups = match.groupdict()
                if 'module' in groups:
                    matched_ids.add(groups['module'] + ':' + groups['id'])
                    email['module'] = groups['module']
                else:
                    matched_ids.add(groups['id'])
            email[pattern_name] = list(matched_ids)

        yield email
def _read_module_groups(program_list_uri):
    """Load the programs YAML and build module groups.

    Builds one group per program, one per project type, and an aggregate
    'official-all' group.

    :param program_list_uri: location of the programs YAML file
    :returns: list of module group dicts
    """
    LOG.debug('Process list of programs from uri: %s', program_list_uri)
    content = yaml.safe_load(utils.read_uri(program_list_uri))

    module_groups = []
    modules_by_types = collections.defaultdict(list)

    for name, info in six.iteritems(content):
        group_id = name.lower()
        if 'codename' in info:
            name = '%s (%s)' % (info['codename'], name)
            group_id = '%s-group' % info['codename'].lower()

        all_modules = []
        for project_type, project_list in six.iteritems(info['projects']):
            # a valid repo must be in the form 'org/module'; skip malformed
            # entries instead of raising IndexError (consistent with
            # _read_official_projects_yaml)
            module_list = [s.split('/')[1] for s in project_list
                           if '/' in s]
            modules_by_types[project_type] += module_list
            all_modules += module_list

        module_groups.append(
            _make_module_group(group_id, name, all_modules, 'program'))

    all_modules = []
    for project_type, modules_list in six.iteritems(modules_by_types):
        all_modules += modules_list
        module_groups.append(
            _make_module_group('official-%s' % project_type,
                               project_type.capitalize(),
                               modules_list, 'project_type'))

    module_groups.append(
        _make_module_group('official-all', 'OpenStack', all_modules,
                           'project_type'))

    return module_groups
def _get_mail_archive_links(uri):
    """Return absolute links to all '.txt.gz' archives listed at ``uri``.

    Returns an empty list when the index page cannot be read.
    """
    content = utils.read_uri(uri)
    if not content:
        # re.findall would raise TypeError on None; a missing index page
        # simply means there are no archives to process
        LOG.warning('Mail archive list is not found at %s', uri)
        return []
    links = set(
        re.findall(r'\shref\s*=\s*[\'"]([^\'"]*\.txt\.gz)', content,
                   flags=re.IGNORECASE))
    return [parse.urljoin(uri, link) for link in links]
def _read_module_groups(program_list_uri):
    """Load the programs YAML and build module groups.

    Produces a group per program, a group per project type and the
    aggregate 'official-all' group.
    """
    LOG.debug('Process list of programs from uri: %s', program_list_uri)
    content = yaml.safe_load(utils.read_uri(program_list_uri))
    module_groups = []
    modules_by_types = collections.defaultdict(list)
    for name, info in six.iteritems(content):
        group_id = name.lower()
        if 'codename' in info:
            name = '%s (%s)' % (info['codename'], name)
            group_id = '%s-group' % info['codename'].lower()
        all_modules = []
        for project_type, project_list in six.iteritems(info['projects']):
            # valid repo must be in form of 'org/module'; skip malformed
            # entries instead of raising IndexError
            module_list = [s.split('/')[1] for s in project_list
                           if '/' in s]
            modules_by_types[project_type] += module_list
            all_modules += module_list
        module_groups.append(_make_module_group(
            group_id, name, all_modules, 'program'))
    all_modules = []
    for project_type, modules_list in six.iteritems(modules_by_types):
        all_modules += modules_list
        module_groups.append(
            _make_module_group(
                'official-%s' % project_type, project_type.capitalize(),
                modules_list, 'project_type'))
    module_groups.append(_make_module_group(
        'official-all', 'OpenStack', all_modules, 'project_type'))
    return module_groups
def _retrieve_member(uri, member_id):
    """Parse a member profile page into a dict of member attributes."""
    content = utils.read_uri(uri)
    if not content:
        return {}

    member = {}
    name_and_date = next(re.finditer(NAME_AND_DATE_PATTERN, content), None)
    if name_and_date is not None:
        fields = name_and_date.groupdict()
        member.update({
            'member_id': member_id,
            'member_name': fields['member_name'],
            'date_joined': fields['date_joined'],
            'member_uri': uri,
        })

    # the last company record on the page wins; default is independent
    member['company_draft'] = '*independent'
    for company in re.finditer(COMPANY_PATTERN, content):
        member['company_draft'] = company.groupdict()['company_draft']

    return member
def _retrieve_mails(uri):
    """Fetch a gzipped mail archive and yield parsed e-mail dicts."""
    LOG.debug("Retrieving mail archive from uri: %s", uri)
    compressed = utils.read_uri(uri)
    if not compressed:
        LOG.error("Error reading mail archive from uri: %s", uri)
        return

    content = gzip.GzipFile(fileobj=StringIO.StringIO(compressed)).read()
    LOG.debug("Mail archive is loaded, start processing")
    content += TRAILING_RECORD

    for record in re.finditer(MAIL_BOX_PATTERN, content):
        email = record.groupdict()

        # mailman obfuscates addresses as "user at host"
        email["author_email"] = email["author_email"].replace(" at ", "@", 1)
        if not utils.check_email_validity(email["author_email"]):
            continue

        timestamp = email_utils.mktime_tz(
            email_utils.parsedate_tz(email["date"]))
        email["date"] = int(timestamp)

        for pattern_name, pattern in six.iteritems(MESSAGE_PATTERNS):
            ids = set()
            for match in re.finditer(pattern, email["body"]):
                groups = match.groupdict()
                item_id = groups["id"]
                if "module" in groups:
                    item_id = groups["module"] + ":" + item_id
                    email["module"] = groups["module"]
                ids.add(item_id)
            email[pattern_name] = list(ids)

        yield email
def _get_mail_archive_links(uri):
    """Collect absolute URLs of all '.txt.gz' archives listed at ``uri``."""
    content = utils.read_uri(uri)
    if not content:
        LOG.warning('Mail archive list is not found at %s', uri)
        return []

    href_pattern = r'\shref\s*=\s*[\'"]([^\'"]*\.txt\.gz)'
    unique_links = set(re.findall(href_pattern, content, flags=re.IGNORECASE))
    return [parse.urljoin(uri, link) for link in unique_links]
def _read_official_projects_yaml(project_list_uri, release_names):
    """Read the governance projects.yaml and build module groups.

    :param project_list_uri: location of the projects.yaml file
    :param release_names: ordered list of release names, used to expand
        a tag's 'since' release into per-release module lists
    :returns: dict of module group dicts keyed by group id
    """
    LOG.debug('Process list of projects from uri: %s', project_list_uri)
    content = yaml.safe_load(utils.read_uri(project_list_uri))

    # every group carries a flat module list plus per-release module lists
    module_groups = collections.defaultdict(
        lambda: {
            'modules': [],
            'releases': collections.defaultdict(list)
        })

    official_integrated = module_groups['official-integrated']
    official_integrated['tag'] = 'project_type'
    official_integrated['module_group_name'] = 'official-integrated'
    official_other = module_groups['official-other']
    official_other['tag'] = 'project_type'
    official_other['module_group_name'] = 'official-other'

    for name, info in six.iteritems(content):  # take one official project
        group_id = '%s-group' % name.lower()
        module_groups[group_id]['module_group_name'] = '%s Official' % name
        module_groups[group_id]['tag'] = 'program'

        for module in info['projects']:
            repo_split = module['repo'].split('/')
            if len(repo_split) < 2:
                continue  # valid repo must be in form of 'org/module'
            module_name = repo_split[1]
            module_groups[group_id]['modules'].append(module_name)

            type_matched = False

            if 'tags' in module:
                for tag in module.get('tags'):
                    tag_name = tag.get('name')

                    if tag_name == 'integrated-release':
                        type_matched = True  # project type is matched here
                        # walk releases in order: before the tag's 'since'
                        # release the module is filed under 'official-other',
                        # from that release onward under 'official-integrated'
                        project_type = 'official-other'
                        for release_name in release_names:
                            if release_name == tag.get('since'):
                                project_type = 'official-integrated'

                            module_groups[project_type]['releases'][
                                release_name].append(module_name)

            if not type_matched:
                module_groups['official-other']['modules'].append(module_name)

    # set ids for module groups
    for group_id, group in six.iteritems(module_groups):
        group['id'] = group_id

    return module_groups
def _get_mail_archive_links(uri):
    """Return absolute links to '.txt' or '.txt.gz' archives at ``uri``."""
    content = utils.read_uri(uri)
    if not content:
        LOG.warning('Mail archive list is not found at %s', uri)
        return []

    archive_href = r'\shref\s*=\s*[\'"]([^\'"]*\.txt(?:\.gz)?)'
    found = re.findall(archive_href, content, flags=re.IGNORECASE)
    return [parse.urljoin(uri, link) for link in set(found)]
def _read_official_programs_yaml(program_list_uri, release_names):
    """Read the governance programs.yaml and build module groups.

    :param program_list_uri: location of the programs.yaml file
    :param release_names: ordered list of release names, used to expand
        'incubated-since' / 'integrated-since' into per-release lists
    :returns: dict of module group dicts keyed by group id
    """
    LOG.debug('Process list of programs from uri: %s', program_list_uri)
    content = yaml.safe_load(utils.read_uri(program_list_uri))

    # every group carries a flat module list plus per-release module lists
    module_groups = collections.defaultdict(
        lambda: {
            'modules': [],
            'releases': collections.defaultdict(list)
        })

    official_integrated = module_groups['official-integrated']
    official_integrated['tag'] = 'project_type'
    official_integrated['module_group_name'] = 'official-integrated'
    official_incubated = module_groups['official-incubated']
    official_incubated['tag'] = 'project_type'
    official_incubated['module_group_name'] = 'official-incubated'
    official_other = module_groups['official-other']
    official_other['tag'] = 'project_type'
    official_other['module_group_name'] = 'official-other'

    for name, info in six.iteritems(content):  # for one program
        group_id = name.lower()
        if 'codename' in info:
            name = '%s (%s)' % (info['codename'], name)
            group_id = '%s-group' % info['codename'].lower()
        module_groups[group_id]['module_group_name'] = name
        module_groups[group_id]['tag'] = 'program'

        for module in info['projects']:
            # NOTE(review): assumes repo is 'org/module'; raises IndexError
            # otherwise (later variants guard against this)
            module_name = module['repo'].split('/')[1]
            module_groups[group_id]['modules'].append(module_name)

            if ('integrated-since' in module) or ('incubated-since' in module):
                # walk releases in order: the project counts as 'other'
                # until its incubation release, then 'incubated' until its
                # integration release, then 'integrated'
                project_type = 'official-other'
                for release_name in release_names:
                    if release_name == module.get('incubated-since'):
                        project_type = 'official-incubated'
                    elif release_name == module.get('integrated-since'):
                        project_type = 'official-integrated'

                    module_groups[project_type]['releases'][
                        release_name].append(module_name)
            else:
                module_groups['official-other']['modules'].append(module_name)

    # set ids for module groups
    for group_id, group in six.iteritems(module_groups):
        group['id'] = group_id

    return module_groups
def _read_official_projects_yaml(project_list_uri, release_names):
    """Read the governance projects.yaml and build module groups.

    :param project_list_uri: location of the projects.yaml file
    :param release_names: ordered list of release names, used to expand
        a tag's 'since' release into per-release module lists
    :returns: dict of module group dicts keyed by group id
    """
    LOG.debug('Process list of projects from uri: %s', project_list_uri)
    content = yaml.safe_load(utils.read_uri(project_list_uri))

    # every group carries a flat module list plus per-release module lists
    module_groups = collections.defaultdict(
        lambda: {'modules': [], 'releases': collections.defaultdict(list)})

    official_integrated = module_groups['official-integrated']
    official_integrated['tag'] = 'project_type'
    official_integrated['module_group_name'] = 'official-integrated'
    official_other = module_groups['official-other']
    official_other['tag'] = 'project_type'
    official_other['module_group_name'] = 'official-other'

    for name, info in six.iteritems(content):  # take one official project
        group_id = '%s-group' % name.lower()
        module_groups[group_id]['module_group_name'] = '%s Official' % name
        module_groups[group_id]['tag'] = 'program'

        for module in info['projects']:
            repo_split = module['repo'].split('/')
            if len(repo_split) < 2:
                continue  # valid repo must be in form of 'org/module'
            module_name = repo_split[1]
            module_groups[group_id]['modules'].append(module_name)

            type_matched = False

            if 'tags' in module:
                for tag in module.get('tags'):
                    tag_name = tag.get('name')

                    if tag_name == 'integrated-release':
                        type_matched = True  # project type is matched here
                        # walk releases in order: before the tag's 'since'
                        # release the module is filed under 'official-other',
                        # from that release onward under 'official-integrated'
                        project_type = 'official-other'
                        for release_name in release_names:
                            if release_name == tag.get('since'):
                                project_type = 'official-integrated'

                            module_groups[project_type]['releases'][
                                release_name].append(module_name)

            if not type_matched:
                module_groups['official-other']['modules'].append(module_name)

    # set ids for module groups
    for group_id, group in six.iteritems(module_groups):
        group['id'] = group_id

    return module_groups
def _read_official_programs_yaml(program_list_uri, release_names):
    """Read the governance programs.yaml and build module groups.

    :param program_list_uri: location of the programs.yaml file
    :param release_names: ordered list of release names, used to expand
        'incubated-since' / 'integrated-since' into per-release lists
    :returns: dict of module group dicts keyed by group id
    """
    LOG.debug('Process list of programs from uri: %s', program_list_uri)
    content = yaml.safe_load(utils.read_uri(program_list_uri))

    # every group carries a flat module list plus per-release module lists
    module_groups = collections.defaultdict(
        lambda: {'modules': [], 'releases': collections.defaultdict(list)})

    official_integrated = module_groups['official-integrated']
    official_integrated['tag'] = 'project_type'
    official_integrated['module_group_name'] = 'official-integrated'
    official_incubated = module_groups['official-incubated']
    official_incubated['tag'] = 'project_type'
    official_incubated['module_group_name'] = 'official-incubated'
    official_other = module_groups['official-other']
    official_other['tag'] = 'project_type'
    official_other['module_group_name'] = 'official-other'

    for name, info in six.iteritems(content):  # for one program
        group_id = name.lower()
        if 'codename' in info:
            name = '%s (%s)' % (info['codename'], name)
            group_id = '%s-group' % info['codename'].lower()
        module_groups[group_id]['module_group_name'] = name
        module_groups[group_id]['tag'] = 'program'

        for module in info['projects']:
            # NOTE(review): assumes repo is 'org/module'; raises IndexError
            # otherwise (later variants guard against this)
            module_name = module['repo'].split('/')[1]
            module_groups[group_id]['modules'].append(module_name)

            if ('integrated-since' in module) or ('incubated-since' in module):
                # walk releases in order: the project counts as 'other'
                # until its incubation release, then 'incubated' until its
                # integration release, then 'integrated'
                project_type = 'official-other'
                for release_name in release_names:
                    if release_name == module.get('incubated-since'):
                        project_type = 'official-incubated'
                    elif release_name == module.get('integrated-since'):
                        project_type = 'official-integrated'

                    module_groups[project_type]['releases'][
                        release_name].append(module_name)
            else:
                module_groups['official-other']['modules'].append(module_name)

    # set ids for module groups
    for group_id, group in six.iteritems(module_groups):
        group['id'] = group_id

    return module_groups
def _retrieve_mails(uri):
    """Yield parsed e-mails from a (possibly gzipped) mail archive.

    :param uri: location of the archive; decompressed only when the uri
        contains a '.txt.gz' suffix
    """
    LOG.debug('Retrieving mail archive from uri: %s', uri)
    content = utils.read_uri(uri)
    if not content:
        LOG.error('Error reading mail archive from uri: %s', uri)
        return

    # only gunzip if the uri has a .txt.gz suffix; raw string avoids the
    # invalid-escape warning the old "'\.txt\.gz'" literal produced
    if re.search(r'\.txt\.gz', uri):
        LOG.debug('%s is a gzipped file', uri)
        gzip_fd = gzip.GzipFile(fileobj=StringIO.StringIO(content))
        content = gzip_fd.read()
    else:
        LOG.debug('%s is not a gzipped file', uri)

    LOG.debug('Mail archive is loaded, start processing')
    content += TRAILING_RECORD

    for rec in re.finditer(MAIL_BOX_PATTERN, content):
        email = rec.groupdict()
        # mailman obfuscates addresses as "user at host"
        email['author_email'] = email['author_email'].replace(' at ', '@', 1)
        if not utils.check_email_validity(email['author_email']):
            continue

        email['date'] = int(email_utils.mktime_tz(
            email_utils.parsedate_tz(email['date'])))

        # six.iteritems instead of dict.iteritems(): the latter is
        # Python-2-only and inconsistent with the rest of the module
        for pattern_name, pattern in six.iteritems(MESSAGE_PATTERNS):
            collection = set()
            for item in re.finditer(pattern, email['body']):
                groups = item.groupdict()
                item_id = groups['id']
                if 'module' in groups:
                    item_id = groups['module'] + ':' + item_id
                    email['module'] = groups['module']
                collection.add(item_id)
            email[pattern_name] = list(collection)

        yield email
def process_official_list(releases):
    """Process governance data referenced by each release record."""
    module_groups = _make_default_module_groups()

    for release in (r for r in releases if r.get('refs')):
        governance = release['refs'].get('governance')
        if not governance:
            continue

        source = governance['source']
        LOG.debug('Process governance content from uri: %s', source)
        content = yaml.safe_load(utils.read_uri(source))
        processor = GOVERNANCE_PROCESSORS[governance['type']]
        processor(module_groups, release['release_name'].lower(), content)

    # stamp each group with its own id
    for group_id, group in six.iteritems(module_groups):
        group['id'] = group_id

    return module_groups
def read_projects_yaml(project_list_uri):
    """Build module groups from the governance projects.yaml."""
    LOG.debug('Process list of projects from uri: %s', project_list_uri)
    content = yaml.safe_load(utils.read_uri(project_list_uri))

    module_groups = collections.defaultdict(lambda: {'modules': []})
    all_official = _make_module_group(module_groups, 'openstack-official')
    for known_tag in TAGS:
        _make_module_group(module_groups, known_tag)

    for project_name, project in six.iteritems(content):
        group_id = '%s-group' % project_name.lower()
        group = module_groups[group_id]
        group['module_group_name'] = '%s Official' % project_name
        group['tag'] = 'program'

        for _, deliverable in six.iteritems(project['deliverables']):
            deliverable_tags = deliverable.get('tags', [])
            for repo in deliverable['repos']:
                parts = repo.split('/')
                if len(parts) < 2:
                    continue  # valid repo must be in form of 'org/module'
                module_name = parts[1]
                group['modules'].append(module_name)
                all_official['modules'].append(module_name)
                for tag in deliverable_tags:
                    if tag in TAGS:
                        module_groups[tag]['modules'].append(module_name)

    # finalize: assign ids and keep module lists sorted
    for group_id, group in six.iteritems(module_groups):
        group['id'] = group_id
        group['modules'].sort()

    return module_groups
def _get_mail_archive_links(uri):
    """Return absolute links to '.txt' or '.txt.gz' archives listed at uri.

    Returns an empty list when the index page cannot be read.
    """
    content = utils.read_uri(uri)
    if not content:
        # re.findall would raise TypeError on None
        return []
    # non-capturing group for the optional '.gz' so findall returns plain
    # strings instead of tuples (the old capturing group forced a link[0]
    # workaround)
    links = set(re.findall(r'\shref\s*=\s*[\'"]([^\'"]*\.txt(?:\.gz)?)',
                           content, flags=re.IGNORECASE))
    return [urlparse.urljoin(uri, link) for link in links]
def _get_mail_archive_links(uri):
    """Return absolute links to all '.txt.gz' archives listed at ``uri``.

    Returns an empty list when the index page cannot be read.
    """
    content = utils.read_uri(uri)
    if not content:
        # guard: utils.read_uri yields no content on failure and
        # re.findall would raise TypeError on None
        LOG.warning('Mail archive list is not found at %s', uri)
        return []
    links = set(re.findall(r'\shref\s*=\s*[\'"]([^\'"]*\.txt\.gz)',
                           content, flags=re.IGNORECASE))
    return [parse.urljoin(uri, link) for link in links]
def zanata_get_users(user_list_uri):
    """Load and return the Zanata user list from the given uri."""
    raw = utils.read_uri(user_list_uri)
    return yaml.safe_load(raw)