def _find_match(self, pattern, exact=False):
    """Locate module dict(s) in self.modules that match *pattern*.

    Tries, in order: exact name match, exact key (filepath) match,
    any-property match, then a Levenshtein-distance fuzzy match.

    Args:
        pattern (str): module name, filepath or property value to find
        exact (bool): when True, skip the property and fuzzy passes

    Returns:
        list: matching module dicts (possibly empty)
    """
    logging.debug(u'exact:{} matching on {}'.format(exact, pattern))

    matches = []

    if isinstance(pattern, six.text_type):
        # strip non-ascii characters so distance/compare behave consistently
        pattern = to_text(to_bytes(pattern, 'ascii', 'ignore'), 'ascii')

    # 1. exact name match
    for k, v in six.iteritems(self.modules):
        if v[u'name'] == pattern:
            logging.debug(u'match {} on name: {}'.format(k, v[u'name']))
            matches = [v]
            break

    if not matches:
        # 2. search by key ... aka the filepath
        for k, v in six.iteritems(self.modules):
            if k == pattern:
                logging.debug(u'match {} on key: {}'.format(k, k))
                matches = [v]
                break

    if not matches and not exact:
        # 3. search by properties
        for k, v in six.iteritems(self.modules):
            for subkey in v.keys():
                if v[subkey] == pattern:
                    logging.debug(u'match {} on subkey: {}'.format(k, subkey))
                    matches.append(v)

    if not matches and not exact:
        # 4. Levenshtein distance should workaround most typos
        distance_map = {}
        for k, v in six.iteritems(self.modules):
            mname = v.get(u'name')
            if not mname:
                continue
            if isinstance(mname, six.text_type):
                mname = to_text(to_bytes(mname, 'ascii', 'ignore'), 'ascii')
            try:
                distance = Levenshtein.distance(pattern, mname)
            except TypeError as e:
                logging.error(e)
                if C.DEFAULT_BREAKPOINTS:
                    logging.error(u'breakpoint!')
                    import epdb
                    epdb.st()
                # BUGFIX: skip this module instead of recording a stale
                # distance left over from a previous iteration (or hitting
                # a NameError on the first iteration)
                continue
            distance_map[mname] = [distance, k]

        # rank by distance; the last element holds the smallest distance
        ranked = sorted(distance_map.items(), key=lambda x: x[1][0], reverse=True)

        # BUGFIX: the original compared/indexed the [distance, key] list
        # itself (3 > [distance, key] is always False on py2 and a
        # TypeError on py3); unpack the distance and key explicitly.
        # Chained comparison: pattern longer than 3 chars AND distance < 3.
        if ranked and len(pattern) > 3 > ranked[-1][1][0]:
            best_name, (best_distance, best_key) = ranked[-1]
            logging.debug(u'levenshtein ratio match: ({}) {} {}'.format(
                best_key, best_name, pattern))
            matches = [self.modules[best_key]]

    return matches
def _find_match(self, pattern, exact=False):
    """Locate module dict(s) in self.modules matching *pattern*.

    Tries, in order: exact name match, exact key (filepath) match,
    any-property match, then a Levenshtein-distance fuzzy match.
    Returns a list of matching module dicts (possibly empty).
    """
    logging.debug(u'exact:{} matching on {}'.format(exact, pattern))
    matches = []
    if isinstance(pattern, six.text_type):
        # strip non-ascii characters so distance/compare behave consistently
        pattern = to_text(to_bytes(pattern, 'ascii', 'ignore'), 'ascii')
    for k, v in six.iteritems(self.modules):
        if v[u'name'] == pattern:
            logging.debug(u'match {} on name: {}'.format(k, v[u'name']))
            matches = [v]
            break
    if not matches:
        # search by key ... aka the filepath
        for k, v in six.iteritems(self.modules):
            if k == pattern:
                logging.debug(u'match {} on key: {}'.format(k, k))
                matches = [v]
                break
    if not matches and not exact:
        # search by properties
        for k, v in six.iteritems(self.modules):
            for subkey in v.keys():
                if v[subkey] == pattern:
                    logging.debug(u'match {} on subkey: {}'.format(k, subkey))
                    matches.append(v)
    if not matches and not exact:
        # Levenshtein distance should workaround most typos
        distance_map = {}
        for k, v in six.iteritems(self.modules):
            mname = v.get(u'name')
            if not mname:
                continue
            if isinstance(mname, six.text_type):
                mname = to_text(to_bytes(mname, 'ascii', 'ignore'), 'ascii')
            try:
                res = Levenshtein.distance(pattern, mname)
            except TypeError as e:
                logging.error(e)
                if C.DEFAULT_BREAKPOINTS:
                    logging.error(u'breakpoint!')
                    import epdb; epdb.st()
            # NOTE(review): if Levenshtein.distance raised above, 'res' here
            # is stale from a previous iteration (or undefined on the first
            # one) -- confirm this is intended
            distance_map[mname] = [res, k]
        res = sorted(distance_map.items(), key=lambda x: x[1], reverse=True)
        # chained comparison: len(pattern) > 3 and 3 > res[-1][1]
        # NOTE(review): res[-1][1] is the [distance, key] list rather than the
        # distance, so this comparison looks suspect (always False on py2,
        # TypeError on py3) -- verify against callers
        if len(pattern) > 3 > res[-1][1]:
            logging.debug(u'levenshtein ratio match: ({}) {} {}'.format(
                res[-1][-1], res[-1][0], pattern))
            matches = [self.modules[res[-1][-1]]]
    return matches
def get_summary(self, repo_url, otype, number):
    """Collect all the summary data for issues or pull requests ids

    Args:
        repo_url (str): repository URL
        otype (str): issue or pullRequest
        number (str): Identifies the pull-request or issue, for example: 12345

    Returns:
        dict or None: the graphql node for the object, or None when the
        repository has no such object
    """
    owner = repo_url.split(u'/', 1)[0]
    repo = repo_url.split(u'/', 1)[1]
    template = self.environment.from_string(QUERY_TEMPLATE_SINGLE_NODE)
    query = template.render(OWNER=owner, REPO=repo, OBJECT_TYPE=otype,
                            OBJECT_PARAMS='number: %s' % number,
                            FIELDS=QUERY_FIELDS)
    payload = {
        # strip non-ascii characters before posting to the graphql endpoint
        u'query': to_bytes(query, 'ascii', 'ignore').strip(),
        u'variables': u'{}',
        u'operationName': None
    }
    if six.PY3:
        # json.dumps needs text, not bytes, under python3
        payload[u'query'] = to_text(payload[u'query'], 'ascii')
    rr = requests.post(self.baseurl, headers=self.headers, data=json.dumps(payload))
    data = rr.json()
    node = data[u'data'][u'repository'][otype]
    if node is None:
        return
    self.update_node(node, otype, owner, repo)
    return node
def get_cached_request(self, url):
    '''Use a combination of sqlite and ondisk caching to GET an api resource'''
    url_parts = url.split('/')

    # one gzip'ed json file per url, mirrored under the cache directory
    cdf = os.path.join(self.cached_requests_dir, url.replace('https://', '') + '.json.gz')
    cdd = os.path.dirname(cdf)
    if not os.path.exists(cdd):
        os.makedirs(cdd)

    # FIXME - commits are static and can always be used from cache.
    if url_parts[-2] == 'commits' and os.path.exists(cdf):
        with gzip.open(cdf, 'r') as f:
            data = json.loads(f.read())
        return data

    headers = {
        u'Accept': u','.join(self.accepts_headers),
        u'Authorization': u'Bearer %s' % self.token,
    }

    meta = ADB.get_github_api_request_meta(url, token=self.token)
    if meta is None:
        meta = {}

    # https://developer.github.com/v3/#conditional-requests
    etag = meta.get('etag')
    if etag and os.path.exists(cdf):
        headers['If-None-Match'] = etag

    rr = requests.get(url, headers=headers)

    if rr.status_code == 304:
        # not modified; serve from the on-disk cache.
        # BUGFIX: the cache file is written with gzip.open() below, so it
        # must also be read back with gzip.open() -- the previous plain
        # open() handed compressed bytes to json.loads().
        with gzip.open(cdf, 'r') as f:
            data = json.loads(f.read())
    else:
        data = rr.json()

        # handle ratelimits ...
        if isinstance(data, dict) and data.get(u'message'):
            if data[u'message'].lower().startswith(u'api rate limit exceeded'):
                raise RateLimitError()

        # cache data to disk
        logging.debug('write %s' % cdf)
        with gzip.open(cdf, 'w') as f:
            f.write(to_bytes(json.dumps(data)))

    # save the meta
    ADB.set_github_api_request_meta(url, rr.headers, cdf, token=self.token)

    # pagination
    # NOTE(review): assumes paginated payloads are lists; a dict payload
    # would raise on += -- confirm against the endpoints used
    if hasattr(rr, u'links') and rr.links and rr.links.get(u'next'):
        _data = self.get_request(rr.links[u'next'][u'url'])
        data += _data

    return data
def get_summary(self, repo_url, otype, number):
    """Collect all the summary data for issues or pull requests ids

    Args:
        repo_url (str): repository URL
        otype (str): issue or pullRequest
        number (str): Identifies the pull-request or issue, for example: 12345
    """
    owner, repo = repo_url.split(u'/', 1)

    query = self.environment.from_string(QUERY_TEMPLATE_SINGLE_NODE).render(
        OWNER=owner,
        REPO=repo,
        OBJECT_TYPE=otype,
        OBJECT_PARAMS='number: %s' % number,
        FIELDS=QUERY_FIELDS,
    )

    # the graphql endpoint chokes on non-ascii, so scrub the query first
    payload = {
        u'query': to_bytes(query, 'ascii', 'ignore').strip(),
        u'variables': u'{}',
        u'operationName': None
    }
    if six.PY3:
        # json.dumps requires text under python3
        payload[u'query'] = to_text(payload[u'query'], 'ascii')

    response = requests.post(self.baseurl, headers=self.headers, data=json.dumps(payload))
    node = response.json()[u'data'][u'repository'][otype]
    if node is None:
        return

    self.update_node(node, otype, owner, repo)
    return node
def get_usernames_from_filename_blame(self, owner, repo, branch, filepath):
    """Map github logins to the commit oids they authored in a file's blame.

    Returns a tuple (committers, emailmap): committers maps a github login
    to a list of commit oids, emailmap maps author emails to github logins.
    """
    template = self.environment.from_string(QUERY_TEMPLATE_BLAME)
    committers = defaultdict(set)
    emailmap = {}
    query = template.render(OWNER=owner, REPO=repo, BRANCH=branch, PATH=filepath)
    payload = {
        # scrub non-ascii characters before posting to the graphql endpoint
        u'query': to_text(
            to_bytes(query, 'ascii', 'ignore'), 'ascii',
        ).strip(),
        u'variables': u'{}',
        u'operationName': None
    }
    response = self.requests(payload)
    data = response.json()
    nodes = data[u'data'][u'repository'][u'ref'][u'target'][u'blame'][u'ranges']
    """
    [
        'commit': {
            'oid': 'a3132e5dd6acc526ce575f6db134169c7090f72d',
            'author': {
                'email': '*****@*****.**',
                'user': {'login': '******'}
            }
        }
    ]
    """
    for node in nodes:
        node = node[u'commit']
        if not node[u'author'][u'user']:
            # commit author has no linked github account: skip it
            continue
        github_id = node[u'author'][u'user'][u'login']
        committers[github_id].add(node[u'oid'])
        # emails come from 'git log --follow' but all github id aren't fetch:
        # - GraphQL/git 'blame' don't list all commits
        # - GraphQL 'history' neither because 'history' is like 'git log' but without '--follow'
        email = node[u'author'].get(u'email')
        if email and email not in emailmap:
            emailmap[email] = github_id
    # convert the oid sets to lists for serializability downstream
    for github_id, commits in committers.items():
        committers[github_id] = list(commits)
    return committers, emailmap
def get_usernames_from_filename_blame(self, owner, repo, branch, filepath):
    """Collect blame information for a single file via graphql.

    Returns a (committers, emailmap) tuple: committers maps github logins
    to lists of commit oids, emailmap maps author emails to github logins.
    """
    query = self.environment.from_string(QUERY_TEMPLATE_BLAME).render(
        OWNER=owner, REPO=repo, BRANCH=branch, PATH=filepath)
    payload = {
        # scrub non-ascii characters before posting the query
        u'query': to_text(to_bytes(query, 'ascii', 'ignore'), 'ascii').strip(),
        u'variables': u'{}',
        u'operationName': None
    }
    data = self.requests(payload).json()
    blame_ranges = data[u'data'][u'repository'][u'ref'][u'target'][u'blame'][u'ranges']

    # each range looks like:
    #   {'commit': {'oid': '...',
    #               'author': {'email': '...', 'user': {'login': '...'}}}}
    committers = defaultdict(set)
    emailmap = {}
    for blame_range in blame_ranges:
        commit = blame_range[u'commit']
        author = commit[u'author']
        if not author[u'user']:
            # commit author has no linked github account: skip it
            continue
        login = author[u'user'][u'login']
        committers[login].add(commit[u'oid'])
        # emails come from 'git log --follow' but all github id aren't fetch:
        # - GraphQL/git 'blame' don't list all commits
        # - GraphQL 'history' neither because 'history' is like 'git log' but without '--follow'
        email = author.get(u'email')
        if email and email not in emailmap:
            emailmap[email] = login

    # convert the oid sets into lists (in place, keeping the defaultdict)
    for login in list(committers):
        committers[login] = list(committers[login])

    return committers, emailmap
def get_test_results(self):
    """Collect CI test results from artifacts attached to failed jobs.

    Returns (results, ci_verified): a list of normalized result dicts and
    a flag that is True only when every failed job produced a verified
    artifact.
    """
    # nothing to report while the build is still running (or unknown)
    if self.state in ('pending', 'inProgress', None):
        return [], False

    failures = [job for job in self.jobs if job['result'] == 'failed']
    if not failures:
        return [], False

    results = []
    ci_verified = True
    artifact_count = 0

    for job in failures:
        for artifact in self.artifacts:
            if artifact['source'] != job['id']:
                continue
            artifact_count += 1
            download_url = artifact['resource']['downloadUrl']
            for artifact_json in self.get_artifact(artifact['name'], download_url):
                if not artifact_json['verified']:
                    ci_verified = False
                # hash the concatenated messages/outputs into a stable job id
                blob = ''.join(
                    r['message'] + r['output'] for r in artifact_json['results'])
                results.append({
                    'contents': {
                        'results': artifact_json['results'],
                    },
                    'run_id': self.build_id,
                    'job_id': hashlib.md5(to_bytes(blob)).hexdigest(),
                    'path': None,
                })

    # every failed job must have contributed an artifact for verification
    if ci_verified and len(failures) != artifact_count:
        ci_verified = False

    return results, ci_verified
def _write_cache_file(self, cfile, data):
    """Serialize *data* as JSON and write it to *cfile* gzip-compressed."""
    payload = to_bytes(json.dumps(data))
    with gzip.open(cfile, 'w') as fh:
        fh.write(payload)
def extract_template_data(body, issue_class='issue', sections=None):
    """Parse an issue/PR body into a dict of normalized template sections.

    Args:
        body (str): the raw issue or pull-request body text
        issue_class (str): 'issue' or 'pullrequest'; controls how the
            'issue type' section is canonicalized
        sections (list): section headings to look for; defaults to SECTIONS

    Returns:
        dict: lowercased section name -> cleaned section text, plus a
        'component_raw' key holding the lightly-cleaned component section
    """
    if sections is None:
        sections = SECTIONS

    # pointless to parse a null body
    if not body:
        return {}

    # simple find or fuzzy find the sections within the body
    tdict = find_sections(body) or fuzzy_find_sections(body, sections)
    if not tdict:
        return {}

    # lowercase the keys
    ndict = {}
    for k, v in tdict.items():
        ku = k.lower()
        if ku == 'plugin name':
            ku = 'component name'
        ndict[ku] = v
    if ndict != tdict:
        tdict = ndict.copy()

    # make a raw component section for later processing
    component_raw = tdict.get('component name', '')

    # https://github.com/ansible/ansibullbot/issues/359
    if ',' in tdict.get('component name', ''):
        tdict['component name'] = tdict['component name'].replace(',', '\n')

    # https://github.com/ansible/ansibullbot/issues/385
    if ' and ' in tdict.get('component name', ''):
        tdict['component name'] = tdict['component name'].replace(' and ', '\n')

    # cleanup the sections
    for k, v in tdict.items():
        # remove markdown comments from the sections
        v = remove_markdown_comments(v)

        # remove non-ascii chars
        v = to_text(to_bytes(v, 'ascii', errors='ignore'), 'ascii')

        # normalize newlines and return chars
        v = v.replace('\r', '\n')

        # remove pre-ceding and trailing newlines
        v = v.strip()

        # remove trailing hashes
        while v.endswith('#'):
            v = v[:-1]

        # remove pre-ceding and trailing newlines (AGAIN)
        v = v.strip()

        # clean more on critical sections
        if 'step' not in k and 'result' not in k:

            # https://github.com/ansible/ansible-modules-extras/issues/2262
            if k == 'component name':
                v = v.lower()

            if k == 'component name' and 'module' in v:
                if '/modules/' in v or \
                        'module_util' in v or \
                        'module_utils/' in v or \
                        'validate-modules' in v or\
                        'module_common' in v:
                    # https://github.com/ansible/ansible/issues/20563
                    # https://github.com/ansible/ansible/issues/18179
                    pass
                else:
                    # some modules have the word "_module" in their name
                    # https://github.com/ansible/ansibullbot/issues/198
                    # https://github.com/ansible/ansible-modules-core/issues/4159
                    # https://github.com/ansible/ansible-modules-core/issues/5328
                    reg = re.compile(r'\S+_module')
                    match = reg.match(v)
                    if match:
                        v = v[match.pos:match.end()]
                    else:
                        # https://github.com/ansible/ansibullbot/issues/385
                        if 'modules' in v:
                            v = v.replace('modules', ' ')
                        else:
                            v = v.replace('module', ' ')

            # remove useless chars
            exclude = None
            if k == 'component name':
                exclude = ['__']
            v = clean_bad_characters(v, exclude=exclude)

            # clean up empty lines
            vlines = v.split('\n')
            vlines = [x for x in vlines if x.strip()]
            vlines = [x.strip() for x in vlines if x.strip()]
            v = '\n'.join(vlines)

            # remove pre-ceding special chars
            for bc in ['-', '*']:
                if v:
                    if v[0] == bc:
                        v = v[1:]
                        v = v.strip()

            # keep just the first line for types and components
            if k in ['issue type', 'component name']:
                if v:
                    vlines = v.split('\n')
                    # https://github.com/ansible/ansible-modules-core/issues/3085
                    vlines = [x for x in vlines if 'pick one' not in x]
                    v = vlines[0]

            # https://github.com/ansible/ansible-modules-core/issues/4060
            if k in ['issue type']:
                if '/' in v:
                    v = v.split('/')
                    # NOTE(review): k is a string, so k == ['issue type'] is
                    # always False and the v[0] branch is dead -- confirm
                    # whether 'issue type' (string compare) was intended
                    if k == ['issue type']:
                        v = v[0]
                    else:
                        v = v[-1]
                    v = v.strip()

            # canonicalize the issue type per object class
            if issue_class == 'issue':
                if k == 'issue type' and v != 'bug report' and 'bug' in v.lower():
                    v = 'bug report'
                elif k == 'issue type' and v != 'feature idea' and 'feature' in v.lower():
                    v = 'feature idea'
            elif issue_class == 'pullrequest':
                if k == 'issue type' and v != 'bugfix pull request' and 'bug' in v.lower():
                    v = 'bugfix pull request'
                elif k == 'issue type' and v != 'feature pull request' and 'feature' in v.lower():
                    v = 'feature pull request'
                elif k == 'issue type' and v != 'new module pull request' and 'new module' in v.lower():
                    v = 'new module pull request'
                elif k == 'issue type' and v != 'docs pull request' and 'docs' in v.lower():
                    v = 'docs pull request'
                elif k == 'issue type' and v != 'test pull request' and 'test' in v.lower():
                    v = 'test pull request'

        # a template placeholder the user never filled in
        if v == 'paste below':
            v = ''

        # save
        tdict[k] = v

    # quick clean and add raw component to the dict
    component_raw = remove_markdown_comments(component_raw)
    component_raw = clean_bad_characters(component_raw, exclude=['__'])
    component_raw = '\n'.join(
        [x.strip() for x in component_raw.split('\n') if x.strip()])
    component_raw = '\n'.join(
        [x for x in component_raw.split('\n') if not x.startswith('#')])
    tdict['component_raw'] = component_raw

    return tdict
def extract_template_data(body, issue_number=None, issue_class='issue', sections=SECTIONS):
    """Parse an issue/PR body into a dict of normalized template sections.

    Older variant that locates section headings by scanning the uppercased
    body directly instead of delegating to find_sections().

    Args:
        body (str): the raw issue or pull-request body text
        issue_number: unused here; kept for caller compatibility
        issue_class (str): 'issue' or 'pullrequest'
        sections (list): section headings to look for

    Returns:
        dict: lowercased section name -> cleaned section text, plus a
        'component_raw' key holding the lightly-cleaned component section
    """
    # this is the final result to return
    tdict = {}

    if not body:
        return tdict

    upper_body = body.upper()

    # make a map of locations where each section starts
    match_map = {}
    for section in sections:
        # http://www.tutorialspoint.com/python/string_find.htm
        # str.find(str, beg=0 end=len(string))
        match = upper_body.find(section)
        if match != -1:
            match_map[section] = match

    if not match_map:
        return {}

    # what are the header(s) being used?
    headers = []
    for k, v in match_map.items():
        try:
            before = upper_body[v - 1]
            after = upper_body[v + len(k)]
            header = before + u'${section}' + after
            headers.append(header)
        except Exception as e:
            # a section at the very start/end of the body has no
            # surrounding characters; ignore it here
            pass

    # pick the most common header and re-search with it
    if len(sorted(set(headers))) > 1:
        choices = sorted(set(headers))
        choice_totals = []
        for choice in choices:
            ctotal = len([x for x in headers if x == choice])
            choice_totals.append((ctotal, choice))
        choice_totals.sort(key=lambda tup: tup[0])
        sheader = choice_totals[-1][1]

        match_map = {}
        t = Template(sheader)
        for section in SECTIONS:
            try:
                tofind = t.substitute(section=section)
            except Exception as e:
                if C.DEFAULT_BREAKPOINTS:
                    logging.error(u'breakpoint!')
                    import epdb
                    epdb.st()
                else:
                    raise Exception(u'substitution failed: %s' % to_text(e))
            match = upper_body.find(tofind)
            if match != -1:
                match_map[section] = match + 1

        # re-do for missing sections with less common header(s)
        for section in SECTIONS:
            if section in match_map:
                continue
            for choice in choices:
                t = Template(choice)
                tofind = t.substitute(section=section)
                match = upper_body.find(tofind)
                if match != -1:
                    match_map[section] = match + 1
                    break

        if not match_map:
            return {}

    elif len(headers) <= 1:
        # a single header style that has no markup at all is probably
        # free-form text rather than the template
        if headers and \
                (u'#' not in headers[0] and
                 u':' not in headers[0] and
                 u'*' not in headers[0]):
            return {}
        else:
            if C.DEFAULT_BREAKPOINTS:
                logging.error(u'breakpoint!')
                import epdb
                epdb.st()

    # sort mapping by element id
    match_map = sorted(match_map.items(), key=operator.itemgetter(1))

    # assume the leading text is the issue type when it was not labelled
    if match_map and u'ISSUE TYPE' not in [x[0] for x in match_map]:
        if match_map[0][1] > 10:
            match_map.insert(0, (u'ISSUE TYPE', 0))

    # extract the sections based on their indexes
    total_indexes = len(match_map) - 1
    for idx, x in enumerate(match_map):
        if x[1] > 0:
            start_index = x[1] + (len(x[0]))
        else:
            start_index = 0

        # if last index, slice to the end
        if idx >= total_indexes:
            tdict[x[0]] = body[start_index:]
        else:
            # slice to the next section
            stop_index = match_map[idx + 1][1]
            tdict[x[0]] = body[start_index:stop_index]

    # lowercase the keys
    ndict = {}
    for k, v in six.iteritems(tdict):
        ku = k.lower()
        if ku == u'plugin name':
            ku = u'component name'
        ndict[ku] = v
    if ndict != tdict:
        tdict = ndict.copy()

    # make a raw component section for later processing
    component_raw = tdict.get(u'component name', u'')

    # https://github.com/ansible/ansibullbot/issues/359
    if u',' in tdict.get(u'component name', u''):
        tdict[u'component name'] = tdict[u'component name'].replace(u',', u'\n')

    # https://github.com/ansible/ansibullbot/issues/385
    if u' and ' in tdict.get(u'component name', u''):
        tdict[u'component name'] = tdict[u'component name'].replace(u' and ', u'\n')

    # cleanup the sections
    for k, v in six.iteritems(tdict):
        # remove markdown comments from the sections
        v = remove_markdown_comments(v)

        # remove non-ascii chars
        v = to_text(to_bytes(v, 'ascii', errors='ignore'), 'ascii')

        # normalize newlines and return chars
        v = v.replace(u'\r', u'\n')

        # remove pre-ceding and trailing newlines
        v = v.strip()

        # remove trailing hashes
        while v.endswith(u'#'):
            v = v[:-1]

        # remove pre-ceding and trailing newlines (AGAIN)
        v = v.strip()

        # clean more on critical sections
        if u'step' not in k and u'result' not in k:

            # https://github.com/ansible/ansible-modules-extras/issues/2262
            if k == u'component name':
                v = v.lower()

            if k == u'component name' and u'module' in v:
                if u'/modules/' in v or \
                        u'module_util' in v or \
                        u'module_utils/' in v or \
                        u'validate-modules' in v or\
                        u'module_common' in v:
                    # https://github.com/ansible/ansible/issues/20563
                    # https://github.com/ansible/ansible/issues/18179
                    pass
                else:
                    # some modules have the word "_module" in their name
                    # https://github.com/ansible/ansibullbot/issues/198
                    # https://github.com/ansible/ansible-modules-core/issues/4159
                    # https://github.com/ansible/ansible-modules-core/issues/5328
                    # NOTE(review): u'\S+_module' is not a raw string and
                    # emits an invalid-escape warning on py3.6+
                    reg = re.compile(u'\S+_module')
                    match = reg.match(v)
                    if match:
                        v = v[match.pos:match.end()]
                    else:
                        # https://github.com/ansible/ansibullbot/issues/385
                        if u'modules' in v:
                            v = v.replace(u'modules', u' ')
                        else:
                            v = v.replace(u'module', u' ')

            # remove useless chars
            v = clean_bad_characters(v)

            # clean up empty lines
            vlines = v.split(u'\n')
            vlines = [x for x in vlines if x.strip()]
            vlines = [x.strip() for x in vlines if x.strip()]
            v = u'\n'.join(vlines)

            # remove pre-ceding special chars
            for bc in [u'-', u'*']:
                if v:
                    if v[0] == bc:
                        v = v[1:]
                        v = v.strip()

            # keep just the first line for types and components
            if k in [u'issue type', u'component name']:
                if v:
                    vlines = v.split(u'\n')
                    # https://github.com/ansible/ansible-modules-core/issues/3085
                    vlines = [x for x in vlines if u'pick one' not in x]
                    v = vlines[0]

            # https://github.com/ansible/ansible-modules-core/issues/4060
            if k in [u'issue type']:
                if u'/' in v:
                    v = v.split(u'/')
                    # NOTE(review): k is a string, so k == [u'issue type'] is
                    # always False and the v[0] branch is dead -- confirm
                    # whether a string compare was intended
                    if k == [u'issue type']:
                        v = v[0]
                    else:
                        v = v[-1]
                    v = v.strip()

            # canonicalize the issue type per object class
            if issue_class == u'issue':
                if k == u'issue type' and v != u'bug report' and u'bug' in v.lower():
                    v = u'bug report'
                elif k == u'issue type' and v != u'feature idea' and u'feature' in v.lower():
                    v = u'feature idea'
            elif issue_class == u'pullrequest':
                if k == u'issue type' and v != u'bugfix pull request' and u'bug' in v.lower():
                    v = u'bugfix pull request'
                elif k == u'issue type' and v != u'feature pull request' and u'feature' in v.lower():
                    v = u'feature pull request'
                elif k == u'issue type' and v != u'new module pull request' and u'new module' in v.lower():
                    v = u'new module pull request'
                elif k == u'issue type' and v != u'docs pull request' and u'docs' in v.lower():
                    v = u'docs pull request'
                elif k == u'issue type' and v != u'test pull request' and u'test' in v.lower():
                    v = u'test pull request'

        # save
        tdict[k] = v

    # quick clean and add raw component to the dict
    component_raw = remove_markdown_comments(component_raw)
    component_raw = clean_bad_characters(component_raw, exclude=None)
    component_raw = u'\n'.join(
        [x.strip() for x in component_raw.split(u'\n') if x.strip()])
    component_raw = u'\n'.join(
        [x for x in component_raw.split(u'\n') if not x.startswith(u'#')])
    tdict[u'component_raw'] = component_raw

    return tdict
def main():
    """Dump open-issue component/maintainer cross-reference reports to disk.

    Writes byissue/byfile/bymaintainer JSON plus sorted text/html views
    into the destination directory (argv[1], defaulting to /tmp).
    """
    initialize_sentry()

    # define where to dump the resulting files
    if len(sys.argv) > 1:
        destdir = sys.argv[1]
    else:
        destdir = '/tmp'
    if not os.path.isdir(destdir):
        os.makedirs(destdir)

    ISSUES = {}
    BYFILE = {}
    BYISSUE = {}
    BYMAINTAINER = {}

    summaries = get_receiver_summaries('ansible', 'ansible', state='open')
    for summary in summaries:
        number = summary['github_number']
        this_meta = get_receiver_metadata('ansible', 'ansible', number=number)
        if not this_meta:
            continue
        this_meta = this_meta[0]
        url = this_meta['html_url']
        ISSUES[url] = this_meta
        BYISSUE[url] = []

        try:
            this_meta.get('component_matches', [])
        except Exception as e:
            print(e)
            continue

        for component in this_meta.get('component_matches', []):
            # we seem to have some variation in the keys ...
            filename = None
            try:
                filename = component['repo_filename']
            except KeyError:
                filename = component['filename']
            if not filename:
                continue
            if 'maintainers' in component:
                for maintainer in component['maintainers']:
                    if maintainer not in BYMAINTAINER:
                        BYMAINTAINER[maintainer] = []
                    if url not in BYMAINTAINER[maintainer]:
                        BYMAINTAINER[maintainer].append(url)
            BYISSUE[url].append(filename)
            if filename not in BYFILE:
                BYFILE[filename] = []
            if url not in BYFILE[filename]:
                BYFILE[filename].append(url)

    destfile = os.path.join(destdir, 'byissue.json')
    with open(destfile, 'w') as f:
        f.write(json.dumps(BYISSUE, indent=2, sort_keys=True))

    destfile = os.path.join(destdir, 'byfile.json')
    with open(destfile, 'w') as f:
        f.write(json.dumps(BYFILE, indent=2, sort_keys=True))

    # flatten each (filename, urls) pair and order by issue count, descending
    tuples = list(BYFILE.items())
    for idx, x in enumerate(tuples):
        x = [x[0]] + x[1]
        tuples[idx] = x
    tuples.sort(key=len)
    tuples.reverse()

    destfile = os.path.join(destdir, 'byfile_sorted.txt')
    with open(destfile, 'wb') as f:
        for tup in tuples:
            f.write(b'%s\n' % to_bytes(tup[0]))
            for issue in tup[1:]:
                # look up the title with the str url before byte-encoding it
                title = to_bytes(ISSUES[issue]['title'])
                f.write(b'\t%s\t%s\n' % (to_bytes(issue), title))

    destfile = os.path.join(destdir, 'byfile_sorted.html')
    with open(destfile, 'wb') as f:
        for idp, tup in enumerate(tuples):
            f.write(
                b'<div style="background-color: #cfc ; padding: 10px; border: 1px solid green;">\n'
            )
            file_ref = b'%d. <a href="https://github.com/ansible/ansible/blob/devel/%s">https://github.com/ansible/ansible/blob/devel/%s</a> %d total' % (
                (idp + 1), to_bytes(tup[0]), to_bytes(tup[0]), len(tup[1:]))
            f.write(b'%s\n' % (file_ref))
            f.write(b'</div>')
            f.write(b'<br>\n')
            for issue in tup[1:]:
                title = to_bytes(ISSUES[issue]['title'])
                issue = to_bytes(issue)
                issue_ref = b'<a href="%s">%s</a>' % (issue, issue)
                f.write(b'\t%s\t%s<br>\n' % (issue_ref, title))
            f.write(b'<br>\n')

    # flatten each (maintainer, urls) pair and order by issue count, descending
    tuples = list(BYMAINTAINER.items())
    for idx, x in enumerate(tuples):
        x = [x[0]] + x[1]
        tuples[idx] = x
    tuples.sort(key=len)
    tuples.reverse()

    destfile = os.path.join(destdir, 'bymaintainer.json')
    with open(destfile, 'w') as f:
        f.write(json.dumps(BYMAINTAINER, indent=2, sort_keys=True))

    destfile = os.path.join(destdir, 'bymaintainer_sorted.txt')
    with open(destfile, 'wb') as f:
        for tup in tuples:
            f.write(b'%s\n' % to_bytes(tup[0]))
            for issue in tup[1:]:
                f.write(b'\t%s\n' % to_bytes(issue))
def main():
    """Dump open-issue component/maintainer cross-reference reports to disk.

    Writes byissue/byfile/bymaintainer JSON plus sorted text/html views
    into the destination directory (argv[1], defaulting to /tmp).
    """
    initialize_sentry()

    # define where to dump the resulting files
    if len(sys.argv) > 1:
        destdir = sys.argv[1]
    else:
        destdir = '/tmp'
    if not os.path.isdir(destdir):
        os.makedirs(destdir)

    ISSUES = {}
    BYFILE = {}
    BYISSUE = {}
    BYMAINTAINER = {}

    summaries = get_receiver_summaries('ansible', 'ansible', state='open')
    for summary in summaries:
        number = summary['github_number']
        this_meta = get_receiver_metadata('ansible', 'ansible', number=number)
        if not this_meta:
            continue
        this_meta = this_meta[0]
        url = this_meta['html_url']
        ISSUES[url] = this_meta
        BYISSUE[url] = []

        try:
            this_meta.get('component_matches', [])
        except Exception as e:
            print(e)
            continue

        for component in this_meta.get('component_matches', []):
            # we seem to have some variation in the keys ...
            filename = None
            try:
                filename = component['repo_filename']
            except KeyError:
                filename = component['filename']
            if not filename:
                continue
            if 'maintainers' in component:
                for maintainer in component['maintainers']:
                    if maintainer not in BYMAINTAINER:
                        BYMAINTAINER[maintainer] = []
                    if url not in BYMAINTAINER[maintainer]:
                        BYMAINTAINER[maintainer].append(url)
            BYISSUE[url].append(filename)
            if filename not in BYFILE:
                BYFILE[filename] = []
            if url not in BYFILE[filename]:
                BYFILE[filename].append(url)

    destfile = os.path.join(destdir, 'byissue.json')
    with open(destfile, 'w') as f:
        f.write(json.dumps(BYISSUE, indent=2, sort_keys=True))

    destfile = os.path.join(destdir, 'byfile.json')
    with open(destfile, 'w') as f:
        f.write(json.dumps(BYFILE, indent=2, sort_keys=True))

    # BUGFIX: dict.items() returns a view on python3 which supports neither
    # item assignment nor .sort(); materialize it as a list first.
    tuples = list(BYFILE.items())
    for idx, x in enumerate(tuples):
        x = [x[0]] + x[1]
        tuples[idx] = x
    tuples.sort(key=len)
    tuples.reverse()

    destfile = os.path.join(destdir, 'byfile_sorted.txt')
    with open(destfile, 'wb') as f:
        for tup in tuples:
            f.write(b'%s\n' % to_bytes(tup[0]))
            for issue in tup[1:]:
                # BUGFIX: look up ISSUES with the original str url; the
                # previous code byte-encoded the key first, which raises
                # KeyError on python3 (ISSUES is keyed by str).
                title = to_bytes(ISSUES[issue]['title'])
                f.write(b'\t%s\t%s\n' % (to_bytes(issue), title))

    destfile = os.path.join(destdir, 'byfile_sorted.html')
    with open(destfile, 'wb') as f:
        for idp, tup in enumerate(tuples):
            f.write(
                b'<div style="background-color: #cfc ; padding: 10px; border: 1px solid green;">\n'
            )
            # BUGFIX: bytes %-formatting rejects ints for %s on python3;
            # use %d for the numeric fields.
            file_ref = b'%d. <a href="https://github.com/ansible/ansible/blob/devel/%s">https://github.com/ansible/ansible/blob/devel/%s</a> %d total' % (
                (idp + 1), to_bytes(tup[0]), to_bytes(tup[0]), len(tup[1:]))
            f.write(b'%s\n' % (file_ref))
            f.write(b'</div>')
            f.write(b'<br>\n')
            for issue in tup[1:]:
                # BUGFIX: same ordering fix as above -- str lookup first
                title = to_bytes(ISSUES[issue]['title'])
                issue = to_bytes(issue)
                issue_ref = b'<a href="%s">%s</a>' % (issue, issue)
                f.write(b'\t%s\t%s<br>\n' % (issue_ref, title))
            f.write(b'<br>\n')

    # BUGFIX: materialize the items view here as well (see above)
    tuples = list(BYMAINTAINER.items())
    for idx, x in enumerate(tuples):
        x = [x[0]] + x[1]
        tuples[idx] = x
    tuples.sort(key=len)
    tuples.reverse()

    destfile = os.path.join(destdir, 'bymaintainer.json')
    with open(destfile, 'w') as f:
        f.write(json.dumps(BYMAINTAINER, indent=2, sort_keys=True))

    destfile = os.path.join(destdir, 'bymaintainer_sorted.txt')
    with open(destfile, 'wb') as f:
        for tup in tuples:
            f.write(b'%s\n' % to_bytes(tup[0]))
            for issue in tup[1:]:
                f.write(b'\t%s\n' % to_bytes(issue))
def fuzzy_match(self, repo=None, title=None, component=None):
    '''Fuzzy matching for modules'''
    # Returns a module name (str), a list of candidate names, or None.
    logging.debug(u'fuzzy match {}'.format(
        to_text(to_bytes(component, 'ascii', 'ignore'), 'ascii')))

    # 'core' is never a module
    if component.lower() == u'core':
        return None

    # https://github.com/ansible/ansible/issues/18179
    if u'validate-modules' in component:
        return None

    # https://github.com/ansible/ansible/issues/20368
    if u'module_utils' in component:
        return None

    if u'new module' in component:
        return None

    # authorized_keys vs. authorized_key
    if component and component.endswith(u's'):
        tm = self.find_match(component[:-1])
        if tm:
            if not isinstance(tm, list):
                return tm[u'name']
            elif len(tm) == 1:
                return tm[0][u'name']
            else:
                if C.DEFAULT_BREAKPOINTS:
                    logging.error(u'breakpoint!')
                    import epdb
                    epdb.st()

    match = None
    known_modules = []

    for k, v in six.iteritems(self.modules):
        # skip the 'include' pseudo-module
        if v[u'name'] in [u'include']:
            continue
        known_modules.append(v[u'name'])

    title = title.lower()
    title = title.replace(u':', u'')
    # try '<name> module', then '<name> ' prefix, then ' <name> ' infix
    title_matches = [x for x in known_modules if x + u' module' in title]

    if not title_matches:
        title_matches = [
            x for x in known_modules if title.startswith(x + u' ')
        ]
        if not title_matches:
            title_matches = \
                [x for x in known_modules if u' ' + x + u' ' in title]

    if title_matches:
        # 'at' matches too many titles by accident
        title_matches = [x for x in title_matches if x != u'at']

    # don't do singular word matching in title for ansible/ansible
    cmatches = None
    if component:
        cmatches = [x for x in known_modules if x in component]
        # drop names that only occur as a suffix (e.g. _copy vs copy)
        cmatches = [x for x in cmatches if not u'_' + x in component]

        # globs
        if not cmatches and u'*' in component:
            fmatches = [
                x for x in known_modules if fnmatch.fnmatch(x, component)
            ]
            if fmatches:
                cmatches = fmatches[:]

        if title_matches:
            # use title ... ?
            cmatches = [
                x for x in cmatches
                if x in title_matches and x not in [u'at']
            ]

        if cmatches:
            if len(cmatches) >= 1 and (u'*' not in component
                                       and u'modules' not in component):
                match = cmatches[0]
            else:
                match = cmatches[:]
            if not match:
                if u'docs.ansible.com' in component:
                    pass
                else:
                    pass
            logging.debug("module - component matches: %s" % cmatches)

    if not match:
        # fall back to a unique title match
        if len(title_matches) == 1:
            match = title_matches[0]
        else:
            logging.debug("module - title matches: %s" % title_matches)

    return match
def write_gzip_json_file(path, data):
    """Serialize *data* as JSON and store it at *path*, gzip-compressed."""
    serialized = to_bytes(json.dumps(data))
    with gzip.open(path, 'w') as fh:
        fh.write(serialized)
def fuzzy_match(self, repo=None, title=None, component=None):
    '''Fuzzy matching for modules'''
    # Returns a module name (str), a list of candidate names, or None.
    logging.debug(u'fuzzy match {}'.format(
        to_text(to_bytes(component, 'ascii', 'ignore'), 'ascii'))
    )

    # 'core' is never a module
    if component.lower() == u'core':
        return None

    # https://github.com/ansible/ansible/issues/18179
    if u'validate-modules' in component:
        return None

    # https://github.com/ansible/ansible/issues/20368
    if u'module_utils' in component:
        return None

    if u'new module' in component:
        return None

    # authorized_keys vs. authorized_key
    if component and component.endswith(u's'):
        tm = self.find_match(component[:-1])
        if tm:
            if not isinstance(tm, list):
                return tm[u'name']
            elif len(tm) == 1:
                return tm[0][u'name']
            else:
                if C.DEFAULT_BREAKPOINTS:
                    logging.error(u'breakpoint!')
                    import epdb; epdb.st()

    match = None
    known_modules = []

    for k, v in six.iteritems(self.modules):
        # skip the 'include' pseudo-module
        if v[u'name'] in [u'include']:
            continue
        known_modules.append(v[u'name'])

    title = title.lower()
    title = title.replace(u':', u'')
    # try '<name> module', then '<name> ' prefix, then ' <name> ' infix
    title_matches = [x for x in known_modules if x + u' module' in title]

    if not title_matches:
        title_matches = [x for x in known_modules if title.startswith(x + u' ')]
        if not title_matches:
            title_matches = \
                [x for x in known_modules if u' ' + x + u' ' in title]

    if title_matches:
        # 'at' matches too many titles by accident
        title_matches = [x for x in title_matches if x != u'at']

    # don't do singular word matching in title for ansible/ansible
    cmatches = None
    if component:
        cmatches = [x for x in known_modules if x in component]
        # drop names that only occur as a suffix (e.g. _copy vs copy)
        cmatches = [x for x in cmatches if not u'_' + x in component]

        # globs
        if not cmatches and u'*' in component:
            fmatches = [x for x in known_modules if fnmatch.fnmatch(x, component)]
            if fmatches:
                cmatches = fmatches[:]

        if title_matches:
            # use title ... ?
            cmatches = [x for x in cmatches
                        if x in title_matches and x not in [u'at']]

        if cmatches:
            if len(cmatches) >= 1 and (u'*' not in component
                                       and u'modules' not in component):
                match = cmatches[0]
            else:
                match = cmatches[:]
            if not match:
                if u'docs.ansible.com' in component:
                    pass
                else:
                    pass
            logging.debug("module - component matches: %s" % cmatches)

    if not match:
        # fall back to a unique title match
        if len(title_matches) == 1:
            match = title_matches[0]
        else:
            logging.debug("module - title matches: %s" % title_matches)

    return match
def get_summaries(self, owner, repo, otype='issues', last=None, first='first: 100', states='states: OPEN', paginate=True):
    """Collect all the summary data for issues or pullreuests

    Args:
        owner     (str): the github namespace
        repo      (str): the github repository
        otype     (str): issues or pullRequests
        first     (str): number of nodes per page, oldest to newest
        last      (str): number of nodes per page, newest to oldest
        states    (str): open or closed issues
        paginate (bool): recurse through page results

    Returns:
        list: every graphql node collected across all pages
    """
    templ = self.environment.from_string(QUERY_TEMPLATE)

    # after: "$endCursor"
    after = None

    '''
    # first: 100
    first = 'first: 100'
    # states: OPEN
    states = 'states: OPEN'
    '''

    nodes = []
    pagecount = 0
    while True:
        logging.debug(u'%s/%s %s pagecount:%s nodecount: %s' %
                      (owner, repo, otype, pagecount, len(nodes)))

        # assemble the issue/PR filter parameters, skipping unset ones
        issueparams = u', '.join([x for x in [states, first, last, after] if x])
        query = templ.render(OWNER=owner, REPO=repo, OBJECT_TYPE=otype,
                             OBJECT_PARAMS=issueparams, FIELDS=QUERY_FIELDS)

        payload = {
            # scrub non-ascii characters before posting to the graphql endpoint
            # NOTE(review): unlike get_summary, there is no PY3 to_text
            # conversion here before json.dumps -- confirm this is intended
            u'query': to_bytes(query, 'ascii', 'ignore').strip(),
            u'variables': u'{}',
            u'operationName': None
        }
        rr = requests.post(self.baseurl, headers=self.headers, data=json.dumps(payload))
        if not rr.ok:
            break
        data = rr.json()
        if not data:
            break

        # keep each edge/node/issue
        for edge in data[u'data'][u'repository'][otype][u'edges']:
            node = edge[u'node']
            # otype is plural ('issues'); update_node wants the singular form
            self.update_node(node, otype.lower()[:-1], owner, repo)
            nodes.append(node)

        if not paginate:
            break

        pageinfo = data.get(u'data', {}).get(u'repository', {}).get(otype, {}).get(u'pageInfo')
        if not pageinfo:
            break
        if not pageinfo.get(u'hasNextPage'):
            break

        # request the next page starting after the cursor we just consumed
        after = u'after: "%s"' % pageinfo[u'endCursor']
        pagecount += 1

    return nodes