def _retrieve_mails(uri):
    """Fetch a mail archive and yield one dict per message found in it.

    The archive is read with ``utils.read_uri``. When the uri contains
    '.txt.gz' the payload is gunzipped before parsing. ``TRAILING_RECORD``
    is appended to the content and every ``MAIL_BOX_PATTERN`` match is
    turned into a dict built from the pattern's named groups.

    For each yielded dict:

    * ``author_email`` has its first ' at ' replaced by '@'; messages whose
      address fails ``utils.check_email_validity`` are skipped.
    * ``date`` is converted to an integer timestamp; messages whose date
      header cannot be parsed are skipped with a warning.
    * every key of ``MESSAGE_PATTERNS`` maps to a list of the distinct ids
      found in the body (prefixed with 'module:' when the pattern captured
      a module, in which case ``module`` is stored on the dict as well).

    :param uri: location of the mail archive
    :returns: generator of message dicts; yields nothing when the archive
              cannot be read
    """
    LOG.debug('Retrieving mail archive from uri: %s', uri)
    content = utils.read_uri(uri)
    if not content:
        LOG.error('Error reading mail archive from uri: %s', uri)
        return

    # only gunzip when the uri contains '.txt.gz' (the search is unanchored)
    matchgz = re.compile(r'\.txt\.gz')
    if matchgz.search(uri):
        LOG.debug('%s is a gzipped file', uri)
        # NOTE(review): StringIO.StringIO is the Python 2 module; presumably
        # content is a byte string here -- confirm before porting.
        gzip_fd = gzip.GzipFile(fileobj=StringIO.StringIO(content))
        content = gzip_fd.read()
    else:
        LOG.debug('%s is not a gzipped file', uri)

    LOG.debug('Mail archive is loaded, start processing')

    content += TRAILING_RECORD

    for rec in re.finditer(MAIL_BOX_PATTERN, content):
        email = rec.groupdict()
        email['author_email'] = email['author_email'].replace(' at ', '@', 1)
        if not utils.check_email_validity(email['author_email']):
            continue

        # parsedate_tz returns None for a date it cannot parse; passing that
        # to mktime_tz would raise TypeError and abort the whole archive.
        parsed_date = email_utils.parsedate_tz(email['date'])
        if parsed_date is None:
            LOG.warning('Skipping mail with unparseable date %r from uri: %s',
                        email['date'], uri)
            continue
        email['date'] = int(email_utils.mktime_tz(parsed_date))

        for pattern_name, pattern in six.iteritems(MESSAGE_PATTERNS):
            collection = set()
            for item in re.finditer(pattern, email['body']):
                groups = item.groupdict()
                item_id = groups['id']
                # groupdict() maps an unmatched optional group to None, so a
                # plain membership test is not enough to know it was captured.
                module = groups.get('module')
                if module is not None:
                    item_id = module + ':' + item_id
                    email['module'] = module
                collection.add(item_id)
            email[pattern_name] = list(collection)

        yield email
def _retrieve_mails(uri):
    """Download the mail archive at *uri* and yield its messages as dicts.

    Content comes from ``utils.read_uri``; it is gunzipped first when the
    uri contains ".txt.gz". After ``TRAILING_RECORD`` is appended, each
    ``MAIL_BOX_PATTERN`` match becomes a dict of the pattern's named groups,
    post-processed as follows:

    * the first " at " in ``author_email`` is replaced by "@", and the
      message is dropped if ``utils.check_email_validity`` rejects the
      result;
    * ``date`` becomes an integer timestamp; a message whose date cannot be
      parsed is dropped with a warning;
    * for every entry of ``MESSAGE_PATTERNS`` the dict gets a list of the
      distinct ids found in ``body``. When a match captured a module, the id
      is stored as "module:id" and ``module`` is set on the dict.

    :param uri: location of the mail archive
    :returns: generator of message dicts (empty if the archive is unreadable)
    """
    LOG.debug("Retrieving mail archive from uri: %s", uri)
    content = utils.read_uri(uri)
    if not content:
        LOG.error("Error reading mail archive from uri: %s", uri)
        return

    # only gunzip when the uri contains ".txt.gz" (the search is unanchored)
    matchgz = re.compile(r"\.txt\.gz")
    if matchgz.search(uri):
        LOG.debug("%s is a gzipped file", uri)
        # NOTE(review): StringIO.StringIO only exists on Python 2; presumably
        # content is a byte string at this point -- confirm before porting.
        gzip_fd = gzip.GzipFile(fileobj=StringIO.StringIO(content))
        content = gzip_fd.read()
    else:
        LOG.debug("%s is not a gzipped file", uri)

    LOG.debug("Mail archive is loaded, start processing")

    content += TRAILING_RECORD

    for rec in re.finditer(MAIL_BOX_PATTERN, content):
        email = rec.groupdict()
        email["author_email"] = email["author_email"].replace(" at ", "@", 1)
        if not utils.check_email_validity(email["author_email"]):
            continue

        # An unparseable date makes parsedate_tz return None, which would
        # crash mktime_tz; skip that single message instead.
        date_tuple = email_utils.parsedate_tz(email["date"])
        if date_tuple is None:
            LOG.warning("Skipping mail with unparseable date %r from uri: %s",
                        email["date"], uri)
            continue
        email["date"] = int(email_utils.mktime_tz(date_tuple))

        for pattern_name, pattern in six.iteritems(MESSAGE_PATTERNS):
            collection = set()
            for item in re.finditer(pattern, email["body"]):
                groups = item.groupdict()
                item_id = groups["id"]
                # An optional named group that did not take part in the match
                # is present in groupdict() with value None.
                module = groups.get("module")
                if module is not None:
                    item_id = module + ":" + item_id
                    email["module"] = module
                collection.add(item_id)
            email[pattern_name] = list(collection)

        yield email
def test_email_invalid(self):
    """Assert that check_email_validity rejects each sample address."""
    rejected_samples = ('pupkin@localhost', '222@some.(trash)')
    for address in rejected_samples:
        self.assertFalse(utils.check_email_validity(address))
def test_email_valid(self):
    """Assert that check_email_validity accepts each sample address."""
    # NOTE(review): both samples are the same masked-looking literal;
    # presumably the real addresses were redacted -- confirm against history.
    accepted_samples = ('*****@*****.**', '*****@*****.**')
    for address in accepted_samples:
        self.assertTrue(utils.check_email_validity(address))
def log(self, branch, head_commit_id):
    """Yield a dict for every commit found in the git log of *branch*.

    Changes the working directory to ``self.folder`` and checks out
    *branch* via ``self._checkout``; nothing is yielded when the checkout
    fails. ``git log`` is run over ``<head_commit_id>..HEAD`` (or the whole
    ``HEAD`` history when *head_commit_id* is falsy) and its output is
    matched against ``GIT_LOG_PATTERN``.

    Each yielded dict holds the fields named by ``GIT_LOG_PARAMS``, the
    ``files_changed`` / ``lines_added`` / ``lines_deleted`` counters,
    ``module``, ``branches``, ``release`` and, when found in the commit
    message, one list per ``MESSAGE_PATTERNS`` key. Commits whose author
    email fails ``utils.check_email_validity`` are skipped.

    :param branch: branch to check out and scan
    :param head_commit_id: commit id used as the exclusive lower bound of
                           the log range; falsy to scan everything
    """
    LOG.debug('Parsing git log for repo uri %s', self.repo['uri'])

    os.chdir(self.folder)
    if not self._checkout(branch):
        return

    if head_commit_id:
        commit_range = head_commit_id + '..HEAD'
    else:
        commit_range = 'HEAD'

    output = sh.git('log', '--pretty=%s' % GIT_LOG_FORMAT, '--shortstat',
                    '-M', '--no-merges', commit_range, _tty_out=False,
                    _decode_errors='ignore')

    for rec in re.finditer(GIT_LOG_PATTERN, str(output)):
        commit = {}

        # The leading regex groups line up one-to-one with GIT_LOG_PARAMS.
        group_no = 0
        for group_no, param in enumerate(GIT_LOG_PARAMS, 1):
            commit[param[0]] = six.text_type(rec.group(group_no), 'utf8')

        if not utils.check_email_validity(commit['author_email']):
            continue

        # The five groups after the params carry the --shortstat numbers.
        first_stat = group_no + 1
        commit['files_changed'] = int(rec.group(first_stat))
        lines_changed_group = rec.group(first_stat + 1)
        added = rec.group(first_stat + 2)
        change_kind = rec.group(first_stat + 3)
        removed = rec.group(first_stat + 4)

        # A single captured count whose label starts with 'd' is a deletion
        # count, so move it over to the deleted side.
        if lines_changed_group and not removed and change_kind[0] == 'd':
            removed, added = added, 0

        commit['lines_added'] = int(added or 0)
        commit['lines_deleted'] = int(removed or 0)

        for pattern_name, pattern in six.iteritems(MESSAGE_PATTERNS):
            found_ids = set(item.group('id') for item
                            in re.finditer(pattern, commit['message']))
            if found_ids:
                commit[pattern_name] = list(found_ids)

        commit['date'] = int(commit['date'])
        commit['module'] = self.repo['module']
        commit['branches'] = {branch}

        commit_id = commit['commit_id']
        commit['release'] = (self.release_index[commit_id]
                             if commit_id in self.release_index else None)

        if 'blueprint_id' in commit:
            prefix = commit['module'] + ':'
            commit['blueprint_id'] = [prefix + bp_name
                                      for bp_name in commit['blueprint_id']]

        if 'coauthor' in commit:
            verified = []
            for candidate in commit['coauthor']:
                match = re.match(CO_AUTHOR_PATTERN, candidate)
                if not match:
                    continue
                if utils.check_email_validity(match.group("author_email")):
                    verified.append(match.groupdict())

            if verified:
                commit['coauthor'] = verified
            else:
                del commit['coauthor']  # no valid authors

        yield commit
def log(self, branch, head_commit_id):
    """Parse the git log of *branch* and yield one dict per commit.

    The method changes directory to ``self.folder``, checks out *branch*
    with ``self._checkout`` (returning without yielding if that fails) and
    runs ``git log`` for the commits after *head_commit_id*, or for all of
    ``HEAD`` when no id is given. Every ``GIT_LOG_PATTERN`` match in the
    output becomes a commit dict.

    A commit dict contains the ``GIT_LOG_PARAMS`` fields, ``files_changed``,
    ``lines_added``, ``lines_deleted``, ``module``, ``branches`` and
    ``release``, plus a list for each ``MESSAGE_PATTERNS`` key that matched
    the commit message. Commits with an author email rejected by
    ``utils.check_email_validity`` are not yielded.

    :param branch: name of the branch to inspect
    :param head_commit_id: id of the last already-processed commit, or a
                           falsy value to process the full history
    """
    LOG.debug('Parsing git log for repo uri %s', self.repo['uri'])

    os.chdir(self.folder)
    if not self._checkout(branch):
        return

    commit_range = head_commit_id + '..HEAD' if head_commit_id else 'HEAD'
    git_args = ['log', '--pretty=%s' % GIT_LOG_FORMAT, '--shortstat', '-M',
                '--no-merges', commit_range]
    output = sh.git(*git_args, _tty_out=False, _decode_errors='ignore')
    log_text = str(output)

    for rec in re.finditer(GIT_LOG_PATTERN, log_text):
        commit = {}

        # Regex groups are consumed left to right: first one per log param...
        position = 1
        for param in GIT_LOG_PARAMS:
            commit[param[0]] = six.text_type(rec.group(position), 'utf8')
            position += 1

        if not utils.check_email_validity(commit['author_email']):
            continue

        # ...then five groups holding the --shortstat figures.
        commit['files_changed'] = int(rec.group(position))
        lines_changed_group = rec.group(position + 1)
        lines_changed = rec.group(position + 2)
        deleted_or_inserted = rec.group(position + 3)
        lines_deleted = rec.group(position + 4)

        # When only one count was captured and its label begins with 'd',
        # that count is the number of deleted lines.
        only_deletions = (lines_changed_group
                          and not lines_deleted
                          and deleted_or_inserted[0] == 'd')
        if only_deletions:
            lines_deleted = lines_changed
            lines_changed = 0

        commit.update(lines_added=int(lines_changed or 0),
                      lines_deleted=int(lines_deleted or 0))

        for pattern_name, pattern in six.iteritems(MESSAGE_PATTERNS):
            unique_ids = set()
            for hit in re.finditer(pattern, commit['message']):
                unique_ids.add(hit.group('id'))
            if not unique_ids:
                continue
            commit[pattern_name] = list(unique_ids)

        commit['date'] = int(commit['date'])
        commit['module'] = self.repo['module']
        commit['branches'] = set((branch,))

        if commit['commit_id'] not in self.release_index:
            commit['release'] = None
        else:
            commit['release'] = self.release_index[commit['commit_id']]

        if 'blueprint_id' in commit:
            qualified = []
            for bp_name in commit['blueprint_id']:
                qualified.append(commit['module'] + ':' + bp_name)
            commit['blueprint_id'] = qualified

        if 'coauthor' in commit:
            # Lazy generator keeps the match/validate calls interleaved
            # per co-author line.
            matches = (re.match(CO_AUTHOR_PATTERN, line)
                       for line in commit['coauthor'])
            verified_coauthors = [
                m.groupdict() for m in matches
                if m and utils.check_email_validity(m.group("author_email"))
            ]
            if not verified_coauthors:
                del commit['coauthor']  # no valid authors
            else:
                commit['coauthor'] = verified_coauthors

        yield commit