def _read_official_programs_yaml(program_list_uri, release_names):
    """Build module groups from the official programs YAML document.

    The document at ``program_list_uri`` is fetched with ``utils.read_uri``
    and parsed with ``yaml.safe_load``. It is iterated as a mapping of
    program name to an info dict holding a ``projects`` list and an
    optional ``codename``. Every project must carry a ``repo`` value of
    the form ``<prefix>/<module>``; the part after the first slash is used
    as the module name.

    :param program_list_uri: location of the programs YAML document
    :param release_names: iterable of release names; they are lower-cased
        before being compared with the ``*-since`` values of a project
    :returns: ``collections.defaultdict`` keyed by group id. Every value
        holds ``modules`` (list), ``releases`` (release name -> list of
        module names) and ``id``. The four ``official-*`` project type
        groups and all program groups additionally hold ``tag`` and
        ``module_group_name``.

    NOTE(review): the ``official-other`` and ``other`` groups are created
    implicitly by the defaultdict, so they carry neither ``tag`` nor
    ``module_group_name`` - confirm that consumers can cope with that.
    """
    LOG.debug('Process list of programs from uri: %s', program_list_uri)
    content = yaml.safe_load(utils.read_uri(program_list_uri))
    module_groups = collections.defaultdict(
        lambda: {'modules': [], 'releases': collections.defaultdict(list)})

    # these project type groups are always present, even when empty
    for type_id in ('official-bootstrap', 'official-incubation',
                    'official-mature', 'official-core'):
        type_group = module_groups[type_id]
        type_group['tag'] = 'project_type'
        type_group['module_group_name'] = type_id

    # ordered by priority: the first tag equal to the release name wins
    since_tag_to_type = (
        ('bootstrapped-since', 'official-bootstrap'),
        ('incubated-since', 'official-incubation'),
        ('mature-since', 'official-mature'),
        ('core-since', 'official-core'),
    )

    # Lower-cased release names, built once on first use. Rebuilding the
    # list per project was wasted work and produced an empty list for every
    # project after the first one when release_names is a one-shot iterable.
    releases = None

    for name, info in six.iteritems(content):
        # for one program
        group_id = name.lower()
        if 'codename' in info:
            name = '%s (%s)' % (info['codename'], name)
            group_id = '%s-group' % info['codename'].lower()

        program_group = module_groups[group_id]
        program_group['module_group_name'] = name
        program_group['tag'] = 'program'

        for module in info['projects']:
            module_name = module['repo'].split('/')[1]
            program_group['modules'].append(module_name)

            if not any(tag in module for tag, _ in since_tag_to_type):
                module_groups['other']['modules'].append(module_name)
                continue

            if releases is None:
                releases = [r.lower() for r in release_names]

            # The type is sticky: once a release matches one of the tags,
            # the project keeps that type for the following releases until
            # another tag matches.
            project_type = 'official-other'
            for release_name in releases:
                for since_tag, type_id in since_tag_to_type:
                    if release_name == module.get(since_tag):
                        project_type = type_id
                        break
                module_groups[project_type]['releases'][
                    release_name].append(module_name)

    # set ids for module groups
    for group_id, group in six.iteritems(module_groups):
        group['id'] = group_id

    return module_groups
def _retrieve_member(self, member_id, html_parser):
    """Fetch the page at ``self.uri`` and extract one member record from it.

    :param member_id: value stored as-is under ``member_id``
    :param html_parser: object whose ``unescape`` method is applied to the
        raw company text found on the page
    :returns: ``{}`` when the page could not be read. Otherwise a dict that
        always holds ``company_draft`` (``'*independent'`` unless
        ``COMPANY_PATTERN`` matches; the last match wins) and, when
        ``NAME_AND_DATE_PATTERN`` matches, also ``member_id``,
        ``member_name``, ``date_joined`` and ``member_uri`` taken from the
        first match.
    """
    page = utils.read_uri(self.uri)
    if not page:
        return {}

    member = {}

    # only the first name/date occurrence on the page is of interest
    name_and_date = re.search(NAME_AND_DATE_PATTERN, page)
    if name_and_date is not None:
        fields = name_and_date.groupdict()
        member['member_id'] = member_id
        member['member_name'] = fields['member_name']
        member['date_joined'] = fields['date_joined']
        member['member_uri'] = self.uri

    # default affiliation; every company match overrides the previous one
    member['company_draft'] = '*independent'
    for company_match in re.finditer(COMPANY_PATTERN, page):
        raw_company = company_match.groupdict()['company_draft']
        member['company_draft'] = html_parser.unescape(raw_company)

    return member
def _get_mail_archive_links(uri):
    """Return absolute links to the mail archive files listed at ``uri``.

    The page is fetched with ``utils.read_uri`` and scanned for ``href``
    attributes whose target ends in ``.txt`` or ``.txt.gz`` (matched case
    insensitively). Duplicate targets are collapsed and every target is
    resolved against ``uri`` with ``parse.urljoin``.

    :param uri: location of the archive index page
    :returns: list of absolute links in no particular order; an empty list
        when the page could not be read or is empty
    """
    content = utils.read_uri(uri)
    if not content:
        # nothing to scan; re would raise TypeError when handed None
        return []

    # group 1 is the link target; the optional '.gz' group only exists to
    # make the suffix optional and is already part of group 1
    targets = set(
        match.group(1)
        for match in re.finditer(
            r'\shref\s*=\s*[\'"]([^\'"]*\.txt(\.gz)?)', content,
            flags=re.IGNORECASE))

    return [parse.urljoin(uri, target) for target in targets]
def _retrieve_mails(uri):
    """Yield the e-mail records found in the mail archive at ``uri``.

    The archive is fetched with ``utils.read_uri`` and gunzipped when the
    uri contains ``.txt.gz``. Every ``MAIL_BOX_PATTERN`` match yields one
    dict made of the named groups of the pattern, adjusted as follows:

    * ``author_email``: the first ``' at '`` is replaced with ``'@'``;
      records whose address fails ``utils.check_email_validity`` are
      skipped
    * ``date``: replaced with an integer timestamp; records whose date
      cannot be parsed are skipped
    * one key per ``MESSAGE_PATTERNS`` entry holding the list of distinct
      ids found in ``body``; when a pattern has a ``module`` group the id
      is prefixed with ``<module>:`` and ``module`` is stored on the record

    Nothing is yielded, and an error is logged, when the archive cannot be
    read.

    :param uri: location of the mail archive
    """
    LOG.debug('Retrieving mail archive from uri: %s', uri)
    content = utils.read_uri(uri)
    if not content:
        LOG.error('Error reading mail archive from uri: %s', uri)
        return

    # gunzip only when the uri contains '.txt.gz' (the search is not
    # anchored to the end of the uri)
    if re.search(r'\.txt\.gz', uri):
        LOG.debug('%s is a gzipped file', uri)
        gzip_fd = gzip.GzipFile(fileobj=StringIO.StringIO(content))
        try:
            content = gzip_fd.read()
        finally:
            gzip_fd.close()
    else:
        LOG.debug('%s is not a gzipped file', uri)

    LOG.debug('Mail archive is loaded, start processing')

    content += TRAILING_RECORD

    for rec in re.finditer(MAIL_BOX_PATTERN, content):
        email = rec.groupdict()
        email['author_email'] = email['author_email'].replace(' at ', '@', 1)
        if not utils.check_email_validity(email['author_email']):
            continue

        # parsedate_tz returns None for a date it cannot parse and mktime_tz
        # would then raise, aborting the whole archive because of one
        # malformed record
        parsed_date = email_utils.parsedate_tz(email['date'])
        if parsed_date is None:
            LOG.warning('Skipping mail with unparseable date %r from uri: %s',
                        email['date'], uri)
            continue
        email['date'] = int(email_utils.mktime_tz(parsed_date))

        for pattern_name, pattern in six.iteritems(MESSAGE_PATTERNS):
            collection = set()
            for item in re.finditer(pattern, email['body']):
                groups = item.groupdict()
                item_id = groups['id']
                if 'module' in groups:
                    item_id = groups['module'] + ':' + item_id
                    email['module'] = groups['module']
                collection.add(item_id)
            email[pattern_name] = list(collection)

        yield email
def _retrieve_mails(uri):
    """Yield the e-mail records found in the mail archive at ``uri``.

    The archive is fetched with ``utils.read_uri`` and gunzipped when the
    uri contains ``.txt.gz``. Every ``MAIL_BOX_PATTERN`` match yields one
    dict made of the named groups of the pattern, adjusted as follows:

    * ``author_email``: the first ``" at "`` is replaced with ``"@"``;
      records whose address fails ``utils.check_email_validity`` are
      skipped
    * ``date``: replaced with an integer timestamp; records whose date
      cannot be parsed are skipped
    * one key per ``MESSAGE_PATTERNS`` entry holding the list of distinct
      ids found in ``body``; when a pattern has a ``module`` group the id
      is prefixed with ``<module>:`` and ``module`` is stored on the record

    Nothing is yielded, and an error is logged, when the archive cannot be
    read.

    :param uri: location of the mail archive
    """
    LOG.debug("Retrieving mail archive from uri: %s", uri)
    content = utils.read_uri(uri)
    if not content:
        LOG.error("Error reading mail archive from uri: %s", uri)
        return

    # gunzip only when the uri contains ".txt.gz" (the search is not
    # anchored to the end of the uri)
    if re.search(r"\.txt\.gz", uri):
        LOG.debug("%s is a gzipped file", uri)
        gzip_fd = gzip.GzipFile(fileobj=StringIO.StringIO(content))
        try:
            content = gzip_fd.read()
        finally:
            gzip_fd.close()
    else:
        LOG.debug("%s is not a gzipped file", uri)

    LOG.debug("Mail archive is loaded, start processing")

    content += TRAILING_RECORD

    for rec in re.finditer(MAIL_BOX_PATTERN, content):
        email = rec.groupdict()
        email["author_email"] = email["author_email"].replace(" at ", "@", 1)
        if not utils.check_email_validity(email["author_email"]):
            continue

        # parsedate_tz returns None for a date it cannot parse and mktime_tz
        # would then raise, aborting the whole archive because of one
        # malformed record
        parsed_date = email_utils.parsedate_tz(email["date"])
        if parsed_date is None:
            LOG.warning("Skipping mail with unparseable date %r from uri: %s",
                        email["date"], uri)
            continue
        email["date"] = int(email_utils.mktime_tz(parsed_date))

        for pattern_name, pattern in six.iteritems(MESSAGE_PATTERNS):
            collection = set()
            for item in re.finditer(pattern, email["body"]):
                groups = item.groupdict()
                item_id = groups["id"]
                if "module" in groups:
                    item_id = groups["module"] + ":" + item_id
                    email["module"] = groups["module"]
                collection.add(item_id)
            email[pattern_name] = list(collection)

        yield email
def _get_mail_archive_links(uri):
    """Return absolute links to the mail archive files listed at ``uri``.

    The page is fetched with ``utils.read_uri`` and scanned for ``href``
    attributes whose target ends in ``.txt`` or ``.txt.gz`` (matched case
    insensitively). Duplicate targets are collapsed and every target is
    resolved against ``uri`` with ``parse.urljoin``.

    :param uri: location of the archive index page
    :returns: list of absolute links in no particular order; an empty list
        when the page could not be read or is empty
    """
    content = utils.read_uri(uri)
    if not content:
        # nothing to scan; re would raise TypeError when handed None
        return []

    # group 1 is the link target; the optional '.gz' group only exists to
    # make the suffix optional and is already part of group 1
    targets = set(
        match.group(1)
        for match in re.finditer(
            r'\shref\s*=\s*[\'"]([^\'"]*\.txt(\.gz)?)', content,
            flags=re.IGNORECASE))

    return [parse.urljoin(uri, target) for target in targets]
def _read_official_programs_yaml(program_list_uri, release_names):
    """Build module groups from the official programs YAML document.

    The document at ``program_list_uri`` is fetched with ``utils.read_uri``
    and parsed with ``yaml.safe_load``. It is iterated as a mapping of
    program name to an info dict holding a ``projects`` list and an
    optional ``codename``. Every project must carry a ``repo`` value of
    the form ``<prefix>/<module>``; the part after the first slash is used
    as the module name.

    :param program_list_uri: location of the programs YAML document
    :param release_names: iterable of release names; they are lower-cased
        before being compared with the ``*-since`` values of a project
    :returns: ``collections.defaultdict`` keyed by group id. Every value
        holds ``modules`` (list), ``releases`` (release name -> list of
        module names) and ``id``. The four ``official-*`` project type
        groups and all program groups additionally hold ``tag`` and
        ``module_group_name``.

    NOTE(review): the ``official-other`` and ``other`` groups are created
    implicitly by the defaultdict, so they carry neither ``tag`` nor
    ``module_group_name`` - confirm that consumers can cope with that.
    """
    LOG.debug('Process list of programs from uri: %s', program_list_uri)
    content = yaml.safe_load(utils.read_uri(program_list_uri))
    module_groups = collections.defaultdict(
        lambda: {
            'modules': [],
            'releases': collections.defaultdict(list)
        })

    # these project type groups are always present, even when empty
    for type_id in ('official-bootstrap', 'official-incubation',
                    'official-mature', 'official-core'):
        type_group = module_groups[type_id]
        type_group['tag'] = 'project_type'
        type_group['module_group_name'] = type_id

    # ordered by priority: the first tag equal to the release name wins
    since_tag_to_type = (
        ('bootstrapped-since', 'official-bootstrap'),
        ('incubated-since', 'official-incubation'),
        ('mature-since', 'official-mature'),
        ('core-since', 'official-core'),
    )

    # Lower-cased release names, built once on first use. Rebuilding the
    # list per project was wasted work and produced an empty list for every
    # project after the first one when release_names is a one-shot iterable.
    releases = None

    for name, info in six.iteritems(content):
        # for one program
        group_id = name.lower()
        if 'codename' in info:
            name = '%s (%s)' % (info['codename'], name)
            group_id = '%s-group' % info['codename'].lower()

        program_group = module_groups[group_id]
        program_group['module_group_name'] = name
        program_group['tag'] = 'program'

        for module in info['projects']:
            module_name = module['repo'].split('/')[1]
            program_group['modules'].append(module_name)

            if not any(tag in module for tag, _ in since_tag_to_type):
                module_groups['other']['modules'].append(module_name)
                continue

            if releases is None:
                releases = [r.lower() for r in release_names]

            # The type is sticky: once a release matches one of the tags,
            # the project keeps that type for the following releases until
            # another tag matches.
            project_type = 'official-other'
            for release_name in releases:
                for since_tag, type_id in since_tag_to_type:
                    if release_name == module.get(since_tag):
                        project_type = type_id
                        break
                module_groups[project_type]['releases'][
                    release_name].append(module_name)

    # set ids for module groups
    for group_id, group in six.iteritems(module_groups):
        group['id'] = group_id

    return module_groups