def relocate_pickle_files(self):
    '''Move files to the correct location to fix bad pathing'''
    srcdir = os.path.join(
        self.cachedir,
        u'issues',
        to_text(self.instance.number)
    )
    destdir = os.path.join(
        self.cachedir,
        to_text(self.instance.number)
    )
    if not os.path.isdir(srcdir):
        return True
    if not os.path.isdir(destdir):
        os.makedirs(destdir)

    # move the files
    pfiles = os.listdir(srcdir)
    for pf in pfiles:
        src = os.path.join(srcdir, pf)
        dest = os.path.join(destdir, pf)
        shutil.move(src, dest)

    # get rid of the bad dir
    shutil.rmtree(srcdir)

def create_checkout(self):
    """checkout ansible"""
    # cleanup
    if os.path.isdir(self.checkoutdir):
        shutil.rmtree(self.checkoutdir)
    cmd = "git clone %s %s" % (self.repo, self.checkoutdir)
    (rc, so, se) = run_command(cmd)
    print(to_text(so) + to_text(se))

def pullrequest_filepath_exists(self, filepath):
    '''Check if a file exists on the submitter's branch'''
    # https://github.com/ansible/ansibullbot/issues/406
    # https://developer.github.com/v3/repos/contents/
    #   GET /repos/:owner/:repo/readme
    # "contents_url":
    # "https://api.github.com/repos/ganeshrn/ansible/contents/{+path}",
    # self.pullrequest.head
    #   - ref --> branch name
    #   - repo.full_name

    sha = self.pullrequest.head.sha
    pdata = None
    resp = None
    cachefile = os.path.join(
        self.cachedir,
        u'issues',
        to_text(self.number),
        u'shippable_yml.pickle'
    )

    try:
        if os.path.isfile(cachefile):
            with open(cachefile, 'rb') as f:
                pdata = pickle_load(f)
    except Exception as e:
        logging.error(u'failed to unpickle %s %s' % (cachefile, to_text(e)))

    if not pdata or pdata[0] != sha:
        if self.pullrequest.head.repo:
            url = self.pullrequest.head.repo.url + u'/contents/' + filepath
            resp = self.pullrequest._requester.requestJson(
                u"GET",
                url,
                input={u'ref': self.pullrequest.head.ref}
            )
        else:
            # https://github.com/ansible/ansible/pull/19891
            # Sometimes the repo/branch has disappeared
            resp = [None]

        pdata = [sha, resp]
        with open(cachefile, 'wb') as f:
            pickle_dump(pdata, f)
    else:
        resp = pdata[1]

    result = False
    if resp[0]:
        result = True

    return result

def _find_match(self, pattern, exact=False):

    logging.debug(u'exact:{} matching on {}'.format(exact, pattern))

    matches = []

    if isinstance(pattern, six.text_type):
        pattern = to_text(to_bytes(pattern, 'ascii', 'ignore'), 'ascii')

    for k, v in six.iteritems(self.modules):
        if v[u'name'] == pattern:
            logging.debug(u'match {} on name: {}'.format(k, v[u'name']))
            matches = [v]
            break

    if not matches:
        # search by key ... aka the filepath
        for k, v in six.iteritems(self.modules):
            if k == pattern:
                logging.debug(u'match {} on key: {}'.format(k, k))
                matches = [v]
                break

    if not matches and not exact:
        # search by properties
        for k, v in six.iteritems(self.modules):
            for subkey in v.keys():
                if v[subkey] == pattern:
                    logging.debug(u'match {} on subkey: {}'.format(k, subkey))
                    matches.append(v)

    if not matches and not exact:
        # Levenshtein distance should workaround most typos
        distance_map = {}
        for k, v in six.iteritems(self.modules):
            mname = v.get(u'name')
            if not mname:
                continue
            if isinstance(mname, six.text_type):
                mname = to_text(to_bytes(mname, 'ascii', 'ignore'), 'ascii')
            try:
                res = Levenshtein.distance(pattern, mname)
            except TypeError as e:
                logging.error(e)
                if C.DEFAULT_BREAKPOINTS:
                    logging.error(u'breakpoint!')
                    import epdb; epdb.st()
                # skip this name rather than storing a stale distance
                continue
            distance_map[mname] = [res, k]

        # sort by [distance, key] descending; the closest name ends up last
        res = sorted(distance_map.items(), key=lambda x: x[1], reverse=True)
        # compare and index into the [distance, key] value; comparing the
        # whole list against an int is a TypeError on py3
        if res and len(pattern) > 3 and res[-1][1][0] < 3:
            logging.debug(u'levenshtein ratio match: ({}) {} {}'.format(res[-1][1][1], res[-1][0], pattern))
            matches = [self.modules[res[-1][1][1]]]

    return matches

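# A minimal sketch of the Levenshtein fallback above, assuming the
# python-Levenshtein package that the indexer imports as `Levenshtein`.
# The module names here are illustrative, not from a real index.
import Levenshtein

def closest_module(pattern, names):
    # pick the name with the smallest edit distance to the pattern
    return min(names, key=lambda n: Levenshtein.distance(pattern, n))

# a trailing typo like 'ec2_metric_alarms' still resolves to 'ec2_metric_alarm'
print(closest_module('ec2_metric_alarms', ['ec2_metric_alarm', 'ec2_vol', 'copy']))
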
def test_component_matching(self):

    print('')
    AT = AnsibleTriage(args={})
    AT.file_indexer.get_files()

    jfile = 'tests/fixtures/issue_template_meta.json'
    with open(jfile, 'rb') as f:
        jdata = json.load(f)

    keys = sorted([int(x) for x in jdata.keys()])
    for key in keys:
        k = to_text(key)
        v = jdata[k]

        if '/pull/' in v['html_url']:
            continue
        if not v.get('labels'):
            continue
        if 'module' in v['labels']:
            continue

        clabels = [x for x in v['labels'] if x.startswith('c:')]
        #if not clabels:
        #    continue

        print(v['html_url'])

        # extract fields from the body
        td = extract_template_data(
            v['body'],
            issue_number=key,
            issue_class=None
        )

        components = AT.file_indexer.find_component_match(
            v['title'],
            v['body'],
            td
        )
        if components and clabels:
            comp_labels = AT.file_indexer.get_component_labels(
                AT.valid_labels,
                components
            )
            print('\t' + to_text(comp_labels))

def get_files(self):
    '''Cache a list of filenames in the checkout'''
    cmd = u'cd {}; git ls-files'.format(self.gitrepo.checkoutdir)
    (rc, so, se) = run_command(cmd)
    files = to_text(so).split(u'\n')
    files = [x.strip() for x in files if x.strip()]
    self.files = files

def get_single_issue_summary(
        self,
        repo_url,
        number,
        cachefile=None,
        force=False
):
    '''Scrape the summary for a specific issue'''

    # get cached
    issues = self.load_summaries(repo_url)

    if number in issues and not force:
        return issues[number]
    else:
        if repo_url.startswith(u'http'):
            url = repo_url
        else:
            url = self.baseurl + u'/' + repo_url
        url += u'/issues/'
        url += to_text(number)

        rr = self._request_url(url)
        soup = BeautifulSoup(rr.text, u'html.parser')
        if soup.text.lower().strip() != u'not found':
            summary = self.parse_issue_page_to_summary(soup, url=rr.url)
            if summary:
                issues[number] = summary

    if number in issues:
        return issues[number]
    else:
        return {}

def get_issue_summaries(self, repo_url, baseurl=None, cachefile=None):
    """Return a dict of all issue summaries with numbers as keys

    Adds a compatibility method for the webscraper

    Args:
        repo_url  (str): username/repository
        baseurl   (str): not used
        cachefile (str): not used
    """
    owner = repo_url.split(u'/', 1)[0]
    repo = repo_url.split(u'/', 1)[1]
    summaries = self.get_all_summaries(owner, repo)

    issues = {}
    for x in summaries:
        issues[to_text(x[u'number'])] = x

    # keep the summaries for out of band analysis
    repodata = {
        u'user': repo_url.split(u'/', 1)[0],
        u'repo': repo_url.split(u'/', 1)[1],
    }
    post_to_receiver(u'summaries', repodata, issues)

    return issues

def get_summary(self, repo_url, otype, number):
    """Collect all the summary data for an issue or pull request id

    Args:
        repo_url (str): repository URL
        otype    (str): issue or pullRequest
        number   (str): identifies the pull-request or issue, for example: 12345
    """
    owner = repo_url.split(u'/', 1)[0]
    repo = repo_url.split(u'/', 1)[1]

    template = self.environment.from_string(QUERY_TEMPLATE_SINGLE_NODE)

    query = template.render(OWNER=owner, REPO=repo, OBJECT_TYPE=otype,
                            OBJECT_PARAMS='number: %s' % number, FIELDS=QUERY_FIELDS)

    payload = {
        u'query': to_bytes(query, 'ascii', 'ignore').strip(),
        u'variables': u'{}',
        u'operationName': None
    }
    if six.PY3:
        payload[u'query'] = to_text(payload[u'query'], 'ascii')

    rr = requests.post(self.baseurl, headers=self.headers, data=json.dumps(payload))
    data = rr.json()

    node = data[u'data'][u'repository'][otype]
    if node is None:
        return

    self.update_node(node, otype, owner, repo)

    return node

def post_to_receiver(path, params, data):
    if not data:
        return

    if not C.DEFAULT_RECEIVER_HOST or u'none' in C.DEFAULT_RECEIVER_HOST.lower():
        return

    rr = None
    if C.DEFAULT_RECEIVER_HOST and data:
        receiverurl = u'http://'
        receiverurl += C.DEFAULT_RECEIVER_HOST
        receiverurl += u':'
        receiverurl += to_text(C.DEFAULT_RECEIVER_PORT)
        receiverurl += u'/'
        receiverurl += path
        logging.info(u'RECEIVER: POST to %s' % receiverurl)
        try:
            rr = requests.post(receiverurl, params=params, json=data)
        except Exception as e:
            logging.warning(e)

    try:
        if rr is not None:
            for k, v in rr.json().items():
                logging.info(u'RECEIVER: %s %s' % (v, k))
    except ValueError as e:
        logging.debug(u'RECEIVER: status_code = %s' % rr.status_code)
        logging.warning(e)

def set_missing(self, number):
    mfile = os.path.join(self.cachedir, u'issues', to_text(number), u'missing')
    mdir = os.path.dirname(mfile)
    if not os.path.isdir(mdir):
        os.makedirs(mdir)
    with open(mfile, 'wb') as f:
        # the file is opened in binary mode, so write bytes
        f.write(b'\n')

def merge_commits(self, commits):
    for xc in commits:
        '''
        # 'Thu, 12 Jan 2017 15:06:46 GMT'
        tfmt = '%a, %d %b %Y %H:%M:%S %Z'
        ts = xc.last_modified
        dts = datetime.datetime.strptime(ts, tfmt)
        '''
        # committer.date: "2016-12-19T08:05:45Z"
        dts = xc.commit.committer.date
        adts = pytz.utc.localize(dts)

        event = {}
        event[u'id'] = xc.sha
        if hasattr(xc.committer, u'login'):
            event[u'actor'] = xc.committer.login
        else:
            event[u'actor'] = to_text(xc.committer)
        #event[u'created_at'] = dts
        event[u'created_at'] = adts
        event[u'event'] = u'committed'
        event[u'message'] = xc.commit.message
        self.history.append(event)

    self.fix_history_tz()
    self.history = sorted(self.history, key=itemgetter(u'created_at'))

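# The commit dates arrive as naive datetimes (hence the localize call above);
# pytz.utc.localize makes them timezone-aware so they sort cleanly against the
# other aware timestamps in the history. A standalone illustration:
import datetime
import pytz

naive = datetime.datetime(2016, 12, 19, 8, 5, 45)
aware = pytz.utc.localize(naive)
print(aware.isoformat())  # -> 2016-12-19T08:05:45+00:00
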
def get_pullrequest_runs(self, number):
    '''All runs for the given PR number'''
    nruns = []
    for x in self.runs:
        if x[u'commitUrl'].endswith(u'/' + to_text(number)):
            nruns.append(x)
    return nruns

def clean_issue_cache(self, number):
    # https://github.com/ansible/ansibullbot/issues/610
    cdir = os.path.join(
        self.cachedir,
        u'issues',
        to_text(number)
    )
    shutil.rmtree(cdir)

def get_files(self):
    cmd = u'find %s' % self.gitrepo.checkoutdir
    (rc, so, se) = run_command(cmd)
    files = to_text(so).split(u'\n')
    files = [x.strip() for x in files if x.strip()]
    files = [x.replace(self.gitrepo.checkoutdir + u'/', u'') for x in files]
    files = [x for x in files if not x.startswith(u'.git')]
    self.files = files

def get_files(self, force=False):
    '''Cache a list of filenames in the checkout'''
    if not self._files or force:
        cmd = u'cd {}; git ls-files'.format(self.checkoutdir)
        logging.debug(cmd)
        (rc, so, se) = run_command(cmd)
        files = to_text(so).split(u'\n')
        files = [x.strip() for x in files if x.strip()]
        self._files = files

def dump_action_dict(self, issue, actions):
    '''Serialize the action dict to disk for quick(er) debugging'''
    fn = os.path.join(u'/tmp', u'actions', issue.repo_full_name, to_text(issue.number) + u'.json')
    dn = os.path.dirname(fn)
    if not os.path.isdir(dn):
        os.makedirs(dn)

    logging.info('dumping {}'.format(fn))
    # json.dumps returns text, so open the file in text mode
    with open(fn, 'w') as f:
        f.write(json.dumps(actions, indent=2, sort_keys=True))

def last_commit_for_file(self, filepath):
    if filepath in self.commits:
        return self.commits[filepath][0][u'hash']

    # git log --pretty=format:'%H' -1
    #   lib/ansible/modules/cloud/amazon/ec2_metric_alarm.py
    cmd = u'cd %s; git log --pretty=format:\'%%H\' -1 %s' % (
        self.gitrepo.checkoutdir, filepath
    )
    (rc, so, se) = run_command(cmd)
    return to_text(so).strip()

def get_files_by_commit(self, commit):
    if commit not in self.files_by_commit:
        cmd = u'cd {}; git show --pretty="" --name-only {}'.format(self.checkoutdir, commit)
        (rc, so, se) = run_command(cmd)
        filenames = [x.strip() for x in to_text(so).split(u'\n') if x.strip()]
        self.files_by_commit[commit] = filenames[:]
    else:
        filenames = self.files_by_commit[commit]

    return filenames

def get_module_metadata(self, module_file):
    meta = {}

    if not os.path.isfile(module_file):
        return meta

    rawmeta = u''
    inphase = False
    with io.open(module_file, 'r', encoding='utf-8') as f:
        for line in f:
            if line.startswith(u'ANSIBLE_METADATA'):
                inphase = True
            if line.startswith(u'DOCUMENTATION'):
                break
            if inphase:
                rawmeta += line

    rawmeta = rawmeta.replace(u'ANSIBLE_METADATA =', u'', 1)
    rawmeta = rawmeta.strip()
    try:
        meta = ast.literal_eval(rawmeta)
        tmp_meta = {}
        for k, v in meta.items():
            if isinstance(k, six.binary_type):
                k = to_text(k)
            if isinstance(v, six.binary_type):
                v = to_text(v)
            if isinstance(v, list):
                tmp_list = []
                for i in v:
                    if isinstance(i, six.binary_type):
                        i = to_text(i)
                    tmp_list.append(i)
                v = tmp_list
                del tmp_list
            tmp_meta[k] = v
        meta = tmp_meta
        del tmp_meta
    except SyntaxError:
        pass

    return meta

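# The metadata parse above reduces to: take the right-hand side of the
# ANSIBLE_METADATA assignment and ast.literal_eval it. A standalone sketch
# with made-up sample text:
import ast

sample = "ANSIBLE_METADATA = {'status': ['preview'], 'supported_by': 'community'}"
rawmeta = sample.replace('ANSIBLE_METADATA =', '', 1).strip()
print(ast.literal_eval(rawmeta))  # {'status': ['preview'], 'supported_by': 'community'}
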
def version_by_date(self, dateobj, devel=False):

    if not self.DATEVERSIONS:
        self.DATEVERSIONS = []
        cmd = u'cd %s;' % self.checkoutdir
        cmd += u'git log --date=short --pretty=format:"%ad;%H"'
        (rc, so, se) = run_command(cmd)
        lines = (x.strip() for x in to_text(so).split(u'\n'))
        lines = filter(bool, lines)
        for x in lines:
            parts = x.split(u';')
            self.DATEVERSIONS.append(parts)

    last_commit_date = self.DATEVERSIONS[0][0]
    last_commit_date = datetime.datetime.strptime(
        last_commit_date,
        u'%Y-%m-%d'
    )

    # use last commit version if older than incoming date
    if dateobj >= last_commit_date:
        acommit = self.DATEVERSIONS[0][1]
    else:
        acommit = None
        datestr = to_text(dateobj).split()[0]
        for dv in reversed(self.DATEVERSIONS):
            if dv[0] == datestr:
                # take the oldest commit on the exact date
                acommit = dv[1]
                break

        if not acommit:
            # fall back to the first commit in the same year-month
            datestr = u'-'.join(datestr.split(u'-')[0:2])
            for dv in self.DATEVERSIONS:
                dvs = u'-'.join(dv[0].split(u'-')[0:2])
                if dvs == datestr:
                    acommit = dv[1]
                    break

    aversion = None
    if acommit:
        aversion = self.ansible_version_by_commit(acommit)

    return aversion

def update_checkout(self):
    """rebase + pull + update the checkout"""

    changed = False

    cmd = "cd %s ; git pull --rebase" % self.checkoutdir
    (rc, so, se) = run_command(cmd)
    so = to_text(so)
    print(so + to_text(se))

    # If rebase failed, recreate the checkout
    if rc != 0:
        self.create_checkout()
        return True
    else:
        if u'current branch devel is up to date.' not in so.lower():
            changed = True

    self.commits_by_email = None

    return changed

def save_pullrequest(self, issue):
    cfile = os.path.join(
        self.cachedir,
        u'issues',
        to_text(issue.number),
        u'pullrequest.pickle'
    )
    cdir = os.path.dirname(cfile)
    if not os.path.isdir(cdir):
        os.makedirs(cdir)
    with open(cfile, 'wb') as f:
        pickle_dump(issue, f)

def clean_list_items(inlist):
    if isinstance(inlist, list):
        inlist = to_text(inlist)
        if u'&' in inlist:
            if C.DEFAULT_BREAKPOINTS:
                logging.error(u'breakpoint!')
                import epdb; epdb.st()
        inlist = inlist.replace(u"[", u'')
        inlist = inlist.replace(u"]", u'')
        inlist = inlist.replace(u"'", u'')
        inlist = inlist.replace(u",", u'')
        inlist = inlist.split()
    return inlist

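# Example of what clean_list_items does, assuming the module-level imports
# (to_text, C, logging) used above are in place: a list is stringified via
# its repr and the bracket/quote/comma noise stripped back out into tokens.
#
#   clean_list_items([u'jctanner', u'gundalow'])  -> [u'jctanner', u'gundalow']
#   clean_list_items(u'not-a-list')               -> u'not-a-list' (unchanged)
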
def save_issue(self, issue):
    cfile = os.path.join(
        self.cachedir,
        u'issues',
        to_text(issue.number),
        u'issue.pickle'
    )
    cdir = os.path.dirname(cfile)
    if not os.path.isdir(cdir):
        os.makedirs(cdir)
    logging.debug(u'dump %s' % cfile)
    with open(cfile, 'wb') as f:
        pickle_dump(issue, f)

def load_issue(self, number):
    pfile = os.path.join(
        self.cachedir,
        u'issues',
        to_text(number),
        u'issue.pickle'
    )
    if os.path.isfile(pfile):
        with open(pfile, 'rb') as f:
            issue = pickle_load(f)
        return issue
    else:
        return False

def update_checkout(self):
    """rebase + pull + update the checkout"""

    changed = False

    # get a specific commit or do a rebase
    if self.commit:
        cmd = "cd %s; git log -1 | head -n1 | awk '{print $2}'" % self.checkoutdir
        (rc, so, se) = run_command(cmd)
        so = to_text(so).strip()

        if so != self.commit:
            cmd = "cd %s; git checkout %s" % (self.checkoutdir, self.commit)
            (rc, so, se) = run_command(cmd)
            changed = True

        if rc != 0:
            self.create_checkout()
            changed = True
    else:
        changed = False
        cmd = "cd %s ; git pull --rebase" % self.checkoutdir
        (rc, so, se) = run_command(cmd)
        so = to_text(so)
        print(so + to_text(se))

        # If rebase failed, recreate the checkout
        if rc != 0:
            self.create_checkout()
            return True
        else:
            if u'current branch devel is up to date.' not in so.lower():
                changed = True

    self.commits_by_email = None

    return changed

def get_usernames_from_filename_blame(self, owner, repo, branch, filepath):

    template = self.environment.from_string(QUERY_TEMPLATE_BLAME)

    committers = defaultdict(set)
    emailmap = {}

    query = template.render(OWNER=owner, REPO=repo, BRANCH=branch, PATH=filepath)
    payload = {
        u'query': to_text(
            to_bytes(query, 'ascii', 'ignore'),
            'ascii',
        ).strip(),
        u'variables': u'{}',
        u'operationName': None
    }
    response = self.requests(payload)
    data = response.json()

    nodes = data[u'data'][u'repository'][u'ref'][u'target'][u'blame'][u'ranges']
    """
    [
        'commit': {
            'oid': 'a3132e5dd6acc526ce575f6db134169c7090f72d',
            'author': {
                'email': '*****@*****.**',
                'user': {'login': '******'}
            }
        }
    ]
    """
    for node in nodes:
        node = node[u'commit']
        if not node[u'author'][u'user']:
            continue
        github_id = node[u'author'][u'user'][u'login']
        committers[github_id].add(node[u'oid'])
        # emails come from 'git log --follow' but not all github ids are fetched:
        # - GraphQL/git 'blame' doesn't list all commits
        # - GraphQL 'history' doesn't either, because 'history' is like
        #   'git log' but without '--follow'
        email = node[u'author'].get(u'email')
        if email and email not in emailmap:
            emailmap[email] = github_id

    for github_id, commits in committers.items():
        committers[github_id] = list(commits)

    return committers, emailmap

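# The accumulation pattern above, isolated: defaultdict(set) dedupes commit
# oids per github login, and the sets are flattened to lists at the end.
# The sample logins and oids are made up.
from collections import defaultdict

committers = defaultdict(set)
for login, oid in [('alice', 'a1'), ('alice', 'a1'), ('bob', 'b1')]:
    committers[login].add(oid)
committers = {k: sorted(v) for k, v in committers.items()}
print(committers)  # -> {'alice': ['a1'], 'bob': ['b1']}
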
def save_issue(self):
    pfile = os.path.join(
        self.cachedir,
        u'issues',
        to_text(self.instance.number),
        u'issue.pickle'
    )
    pdir = os.path.dirname(pfile)
    if not os.path.isdir(pdir):
        os.makedirs(pdir)
    logging.debug(u'dump %s' % pfile)
    with open(pfile, 'wb') as f:
        pickle_dump(self.instance, f)

def test_module_matching(self):

    print('')
    AT = AnsibleTriage(args={})

    jfile = 'tests/fixtures/issue_template_meta.json'
    with open(jfile, 'rb') as f:
        jdata = json.load(f)

    keys = sorted([int(x) for x in jdata.keys()])
    for key in keys:
        k = to_text(key)
        v = jdata[k]

        if '/pull/' in v['html_url']:
            continue

        print(v['html_url'])

        # extract fields from the body
        td = extract_template_data(
            v['body'],
            issue_number=key,
            issue_class=None
        )

        # schema tests
        assert isinstance(td, dict)
        assert 'component_raw' in td
        assert 'component name' in td

        # confirm the raw converted to the component name
        assert td['component_raw'] == v['component_raw']
        assert td['component name'] == v['component_name']

        # confirm module matching works.
        mm = AT.find_module_match(v['title'], td)
        if v['module_match']:
            if mm is None:
                import epdb; epdb.st()
            elif mm['filepath'] != v['module_match'] and \
                    mm['name'] != v['module_match']:
                import epdb; epdb.st()
        elif mm is not None:
            import epdb; epdb.st()

def strip_ansible_version(self, rawtext, logprefix=''):

    # any
    # all
    # all?
    # all ?
    # all recent releases
    # a55c6625d4771c44017fce1d487b38749b12b381 (latest dev)
    # ansible devel
    # devel
    # latest
    # latest devel branch
    # v2.0.0-0.9.rc4
    # N/A
    # NA
    # current head
    # master
    # not applicable
    # >2.0
    # - 1.8.2
    # - devel head f9c203feb68e224cd3d445568b39293f8a3d32ad
    # ansible@devel
    # 1.x
    # 2.x

    devel = ['devel', 'master', 'head', 'latest', 'all', 'all?', 'all ?', 'any',
             'n/a', 'na', 'not applicable', 'latest devel',
             'latest devel branch', 'ansible devel', '', 'future',
             'git version', 'ansible@devel', 'all recent releases']

    if not self.VALIDVERSIONS:
        self._get_versions()

    if rawtext is None:
        return 'devel'

    aversion = False

    rawtext = rawtext.replace('`', '')
    rawtext = rawtext.strip()
    rawtext = rawtext.lower()
    rawlines = rawtext.split('\n')
    rawlines = [x.strip() for x in rawlines]

    # exit early for "devel" variations ...
    if rawtext in devel:
        return 'devel'

    # handle 1.x/2.x globs (the dot is escaped so it only matches a literal dot)
    xver = re.compile(r'^-?[1-9]\.x')
    if len(rawlines) == 1:
        if xver.match(rawlines[0]):
            major_ver = rawlines[0].split('.')[0]

            # get the highest minor version for this major
            cversions = reversed(sorted(self.VALIDVERSIONS.keys()))
            for cver in cversions:
                if cver[0] == major_ver:
                    aversion = cver
                    break

            if aversion:
                return aversion

    xver = re.compile(r'^-?[1-9]\.[1-9]\.x')
    if len(rawlines) == 1:
        if xver.match(rawlines[0]):
            major_ver = rawlines[0].split('.')[0]
            minor_ver = rawlines[0].split('.')[1]

            # get the highest patch version for this major.minor
            cversions = reversed(sorted(self.VALIDVERSIONS.keys()))
            for cver in cversions:
                if cver[0:3] == (major_ver + '.' + minor_ver):
                    aversion = cver
                    break

            if aversion:
                return aversion

    # check for copy/paste from --version output
    for idx, x in enumerate(rawlines):
        if len(rawlines) < (idx + 2):
            continue
        if x.startswith('ansible') and \
                (rawlines[idx + 1].startswith('config file') or
                 rawlines[idx + 1].startswith('configured module search path')):
            parts = x.replace(')', '').split()
            aversion = parts[1]

            # is this a checkout with a hash? ...
            if len(parts) > 3:
                pass
            elif len(parts) > 2:
                # ['ansible', '2.2.0.0', 'rc1']
                pass

            return aversion

    # try to find a vstring ...
    pidx = rawtext.find('.')
    if pidx > -1:
        fver = ''
        # get chars to the end of the vstring ...
        for char in rawtext[pidx:]:
            if char == ' ' or char == '\n' or char == '\r' \
                    or (not char.isalnum() and char != '.'):
                break
            else:
                fver += char
        head = rawtext[:pidx]
        head = head[::-1]
        # get chars to the beginning of the vstring ...
        for char in head:
            if char == ' ' or char == '\n' or char == '\r' \
                    or (not char.isalnum() and char != '.'):
                break
            else:
                fver = char + fver
        if fver[0] == 'v':
            fver = fver[1:]
        if fver:
            sver = None
            lver = None
            try:
                sver = StrictVersion(fver)
            except Exception:
                pass
            try:
                lver = LooseVersion(fver)
            except Exception:
                pass

            if sver:
                return fver
            elif lver and fver[0].isdigit():
                return fver

    lines = rawtext.split('\n')
    lines = [x.strip() for x in lines if x.strip()]
    lines = [x for x in lines if not x.startswith('config')]
    lines = [x for x in lines if not x.startswith('<')]
    lines = [x for x in lines if not x.startswith('-')]
    lines = [x for x in lines if not x.startswith('lib')]

    for idx, x in enumerate(lines):
        if "'" in x:
            x = x.replace("'", '').strip()
        if '"' in x:
            x = x.replace('"', '').strip()
        if '`' in x:
            x = x.replace('`', '').strip()
        if ',' in x:
            x = x.replace(',', '').strip()
        if '*' in x:
            x = x.replace('*', '').strip()
        if ')' in x:
            x = x.replace(')', '').strip()
        lines[idx] = x

    lines = [x.strip() for x in lines if x.strip()]
    lines = [x for x in lines if x.startswith('ansible') or x[0].isdigit() or x[0] == 'v']

    # https://github.com/ansible/ansible-modules-extras/issues/809
    # false positives from this issue ...
    lines = [x for x in lines if 'versions: []' not in x]

    # try to narrow down to a single line
    if len(lines) > 1:
        candidate = None
        for x in lines:
            pidx = x.find('.')
            if pidx == -1:
                continue
            if (len(x) - 1) < (pidx + 1):
                continue
            if not x[pidx + 1].isdigit():
                continue
            if (x.startswith('ansible') or x[0].isdigit()) and '.' in x:
                candidate = x
                break
        if candidate:
            lines = [candidate]

    if len(lines) > 0:
        try:
            StrictVersion(lines[0])
            aversion = lines[0]
        except Exception as e:
            words = lines[0].split()
            words = [x.strip() for x in words if x.strip()]
            words = [x for x in words if x != 'stable']
            words = [x for x in words if x != 'ansible']
            words = [x for x in words if x != 'ansible-doc']
            words = [x for x in words if x != 'ansible-playbook']
            if not words:
                print(logprefix + "NO VERSIONABLE WORDS!!")
            else:
                if words[0].startswith('ansible-'):
                    words[0] = words[0].replace('ansible-', '')
                if words[0][0] == 'v':
                    words[0] = words[0][1:]
                characters = words[0].split('.')
                digits = [x.isdigit() for x in characters]
                digits = sorted(set(digits))
                if digits == [True]:
                    try:
                        aversion = words[0]
                    except Exception as e:
                        logging.error(e)
                        raise
                elif characters[0].isdigit():
                    aversion = words[0]
                else:
                    print(logprefix + "INVALID VER STRING !!!")
                    print(logprefix + 'Exception: ' + to_text(e))
                    for line in lines:
                        print(logprefix + line)

    return aversion

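# Expected behavior of strip_ansible_version for a few of the raw inputs
# listed in the comment block above (traced by hand, not exhaustive):
#
#   'devel'                               -> 'devel' (early exit)
#   'ansible 2.2.0.0\nconfig file = ...'  -> '2.2.0.0' (--version paste branch)
#   'v2.0.0-0.9.rc4'                      -> '2.0.0' (the vstring scan drops the
#                                            leading 'v' and stops at the '-')
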
def get_pullrequest_status(self, force_fetch=False):

    def sort_unique_statuses(statuses):
        '''reduce redundant statuses to the final run for each id'''
        result = []
        groups = []
        thisgroup = []
        for idx, x in enumerate(statuses):
            if not thisgroup:
                thisgroup.append(x)
                if idx == len(statuses) - 1:
                    groups.append(thisgroup)
                continue
            else:
                if thisgroup[-1][u'target_url'] == x[u'target_url']:
                    thisgroup.append(x)
                else:
                    groups.append(thisgroup)
                    thisgroup = []
                    thisgroup.append(x)

                if idx == len(statuses) - 1:
                    groups.append(thisgroup)

        for group in groups:
            group.sort(key=operator.itemgetter(u'updated_at'))
            result.append(group[-1])

        return result

    fetched = False
    jdata = None
    pdata = None

    # pull out the status url from the raw data
    rd = self.pullrequest_raw_data
    surl = rd[u'statuses_url']

    pfile = os.path.join(self.cachedir, u'issues', to_text(self.number), u'pr_status.pickle')
    pdir = os.path.dirname(pfile)
    if not os.path.isdir(pdir):
        os.makedirs(pdir)

    if os.path.isfile(pfile):
        logging.info(u'pullrequest_status load pfile')
        with open(pfile, 'rb') as f:
            pdata = pickle_load(f)

    if pdata:
        # is the data stale?
        if pdata[0] < self.pullrequest.updated_at or force_fetch:
            logging.info(u'fetching pr status: stale, previous from %s' % pdata[0])
            jdata = self._fetch_api_url(surl)
            self.log_ci_status(jdata)
            fetched = True
        else:
            jdata = pdata[1]

    # missing?
    if not jdata:
        logging.info(u'fetching pr status: !data')
        jdata = self._fetch_api_url(surl)
        fetched = True

    if fetched or not os.path.isfile(pfile):
        logging.info(u'writing %s' % pfile)
        pdata = (self.pullrequest.updated_at, jdata)
        with open(pfile, 'wb') as f:
            pickle_dump(pdata, f)

    # remove intermediate duplicates
    #jdata = sort_unique_statuses(jdata)

    return jdata

def get_version_major_minor(self, vstring):
    '''Return an X.Y version'''
    lver = LooseVersion(vstring)
    rval = '.'.join([to_text(x) for x in lver.version[0:2]])
    return rval

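# For example, LooseVersion('2.4.3.0').version == [2, 4, 3, 0], so slicing the
# first two components yields the X.Y string:
from distutils.version import LooseVersion

print('.'.join(str(x) for x in LooseVersion('2.4.3.0').version[0:2]))  # -> 2.4
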
def extract_template_data(body, issue_number=None, issue_class='issue', sections=None, find_extras=True):
    if sections is None:
        sections = SECTIONS

    # pointless to parse a null body
    if not body:
        return {}

    # simple find or fuzzy find the sections within the body
    tdict = find_sections(body) or fuzzy_find_sections(body, sections)
    if not tdict:
        return {}

    # lowercase the keys
    ndict = {}
    for k, v in six.iteritems(tdict):
        ku = k.lower()
        if ku == u'plugin name':
            ku = u'component name'
        ndict[ku] = v
    if ndict != tdict:
        tdict = ndict.copy()

    # make a raw component section for later processing
    component_raw = tdict.get(u'component name', u'')

    # https://github.com/ansible/ansibullbot/issues/359
    if u',' in tdict.get(u'component name', u''):
        tdict[u'component name'] = tdict[u'component name'].replace(u',', u'\n')

    # https://github.com/ansible/ansibullbot/issues/385
    if u' and ' in tdict.get(u'component name', u''):
        tdict[u'component name'] = tdict[u'component name'].replace(u' and ', u'\n')

    # cleanup the sections
    for k, v in six.iteritems(tdict):
        # remove markdown comments from the sections
        v = remove_markdown_comments(v)

        # remove non-ascii chars
        v = to_text(to_bytes(v, 'ascii', errors='ignore'), 'ascii')

        # normalize newlines and return chars
        v = v.replace(u'\r', u'\n')

        # remove preceding and trailing newlines
        v = v.strip()

        # remove trailing hashes
        while v.endswith(u'#'):
            v = v[:-1]

        # remove preceding and trailing newlines (AGAIN)
        v = v.strip()

        # clean more on critical sections
        if u'step' not in k and u'result' not in k:

            # https://github.com/ansible/ansible-modules-extras/issues/2262
            if k == u'component name':
                v = v.lower()

            if k == u'component name' and u'module' in v:
                if u'/modules/' in v or \
                        u'module_util' in v or \
                        u'module_utils/' in v or \
                        u'validate-modules' in v or \
                        u'module_common' in v:
                    # https://github.com/ansible/ansible/issues/20563
                    # https://github.com/ansible/ansible/issues/18179
                    pass
                else:
                    # some modules have the word "_module" in their name
                    # https://github.com/ansible/ansibullbot/issues/198
                    # https://github.com/ansible/ansible-modules-core/issues/4159
                    # https://github.com/ansible/ansible-modules-core/issues/5328
                    reg = re.compile(r'\S+_module')
                    match = reg.match(v)
                    if match:
                        v = v[match.pos:match.end()]
                    else:
                        # https://github.com/ansible/ansibullbot/issues/385
                        if u'modules' in v:
                            v = v.replace(u'modules', u' ')
                        else:
                            v = v.replace(u'module', u' ')

            # remove useless chars
            v = clean_bad_characters(v)

            # clean up empty lines
            vlines = v.split(u'\n')
            vlines = [x for x in vlines if x.strip()]
            vlines = [x.strip() for x in vlines if x.strip()]
            v = u'\n'.join(vlines)

            # remove preceding special chars
            for bc in [u'-', u'*']:
                if v:
                    if v[0] == bc:
                        v = v[1:]
                    v = v.strip()

            # keep just the first line for types and components
            if k in [u'issue type', u'component name']:
                if v:
                    vlines = v.split(u'\n')
                    # https://github.com/ansible/ansible-modules-core/issues/3085
                    vlines = [x for x in vlines if u'pick one' not in x]
                    v = vlines[0]

            # https://github.com/ansible/ansible-modules-core/issues/4060
            if k in [u'issue type']:
                if u'/' in v:
                    v = v.split(u'/')
                    # compare against the string; the earlier comparison
                    # against the list [u'issue type'] could never be true
                    if k == u'issue type':
                        v = v[0]
                    else:
                        v = v[-1]
                    v = v.strip()

            if issue_class == u'issue':
                if k == u'issue type' and v != u'bug report' and u'bug' in v.lower():
                    v = u'bug report'
                elif k == u'issue type' and v != u'feature idea' and u'feature' in v.lower():
                    v = u'feature idea'
            elif issue_class == u'pullrequest':
                if k == u'issue type' and v != u'bugfix pull request' and u'bug' in v.lower():
                    v = u'bugfix pull request'
                elif k == u'issue type' and v != u'feature pull request' and u'feature' in v.lower():
                    v = u'feature pull request'
                elif k == u'issue type' and v != u'new module pull request' and u'new module' in v.lower():
                    v = u'new module pull request'
                elif k == u'issue type' and v != u'docs pull request' and u'docs' in v.lower():
                    v = u'docs pull request'
                elif k == u'issue type' and v != u'test pull request' and u'test' in v.lower():
                    v = u'test pull request'

        # save
        tdict[k] = v

    # quick clean and add raw component to the dict
    component_raw = remove_markdown_comments(component_raw)
    component_raw = clean_bad_characters(component_raw, exclude=None)
    component_raw = u'\n'.join([x.strip() for x in component_raw.split(u'\n') if x.strip()])
    component_raw = u'\n'.join([x for x in component_raw.split(u'\n') if not x.startswith(u'#')])
    tdict[u'component_raw'] = component_raw

    return tdict

def get_last_shippable_full_run_date(ci_status, shippable):
    '''Map partial re-runs back to their last full run date'''

    # https://github.com/ansible/ansibullbot/issues/935
    # (Epdb) pp [x['target_url'] for x in ci_status]
    # [u'https://app.shippable.com/github/ansible/ansible/runs/67039/summary',
    #  u'https://app.shippable.com/github/ansible/ansible/runs/67039/summary',
    #  u'https://app.shippable.com/github/ansible/ansible/runs/67039',
    #  u'https://app.shippable.com/github/ansible/ansible/runs/67037/summary',
    #  u'https://app.shippable.com/github/ansible/ansible/runs/67037/summary',
    #  u'https://app.shippable.com/github/ansible/ansible/runs/67037']

    if shippable is None:
        return None

    # extract and unique the run ids from the target urls
    runids = [get_runid_from_status(x) for x in ci_status]

    # get rid of duplicates and sort
    runids = sorted(set(runids))

    # always use the numerically higher run id
    runid = runids[-1]

    # build a datastructure to hold the info collected
    rundata = {
        u'runid': runid,
        u'created_at': None,
        u'rerun_batch_id': None,
        u'rerun_batch_createdat': None
    }

    # query the api for all data on this runid
    try:
        rdata = shippable.get_run_data(to_text(runid), usecache=False)
    except ShippableNoData:
        return None

    # whoops ...
    if rdata is None:
        return None

    # get the referenced run for the last runid if it exists
    pbag = rdata.get(u'propertyBag')
    if pbag:
        rundata[u'rerun_batch_id'] = pbag.get(u'originalRunId')

    # keep the timestamp too
    rundata[u'created_at'] = rdata.get(u'createdAt')

    # if it had a rerunbatchid it was a partial run and
    # we need to go get the date on the original run
    while rundata[u'rerun_batch_id']:
        # the original run data
        rjdata = shippable.get_run_data(rundata[u'rerun_batch_id'])
        # swap the timestamp
        rundata[u'rerun_batch_createdat'] = rundata[u'created_at']
        # get the old timestamp
        rundata[u'created_at'] = rjdata.get(u'createdAt')
        # get the new batchid
        #rundata['rerun_batch_id'] = rjdata.get('propertyBag', {}).get('originalRunId')
        pbag = rjdata.get(u'propertyBag')
        if pbag:
            rundata[u'rerun_batch_id'] = pbag.get(u'originalRunId')
        else:
            rundata[u'rerun_batch_id'] = None

    # return only the timestamp from the last full run
    return rundata[u'created_at']

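# A toy version of the re-run walk above: each partial run points back at the
# run it was rerun from via propertyBag.originalRunId, and the loop follows
# that chain to the original full run's timestamp. The run data is made up.
runs = {
    '67039': {'createdAt': '2018-03-03T00:00:00Z', 'propertyBag': {'originalRunId': '67037'}},
    '67037': {'createdAt': '2018-03-01T00:00:00Z', 'propertyBag': {}},
}

created_at = runs['67039']['createdAt']
batch_id = runs['67039']['propertyBag'].get('originalRunId')
while batch_id:
    created_at = runs[batch_id]['createdAt']
    batch_id = runs[batch_id]['propertyBag'].get('originalRunId')
print(created_at)  # -> '2018-03-01T00:00:00Z', the last full run
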
def ansible_version_by_commit(self, commithash, config=None):
    # $ git branch --contains e620fed755a9c7e07df846b7deb32bbbf3164ac7
    # * devel
    # $ git branch -r --contains 6d9949698bd6a5693ef64cfde845c029f0e02b91 | egrep -e 'release' -e 'stable' | head
    #   origin/release1.5.0
    #   origin/release1.5.1
    #   origin/release1.5.2
    #   origin/release1.5.3
    #   origin/release1.5.4
    #   origin/release1.5.5
    #   origin/release1.6.0
    #   origin/release1.6.1
    #   origin/release1.6.10
    #   origin/release1.6.2

    '''
    # make sure the checkout cache is still valid
    self.update_checkout()
    '''

    aversion = None

    if not self.COMMITVERSIONS:
        self.COMMITVERSIONS = {}

    if commithash in self.COMMITVERSIONS:
        aversion = self.COMMITVERSIONS[commithash]
    else:
        # get devel's version
        devel_version = self._get_devel_version()

        cmd = u'cd %s;' % self.checkoutdir
        cmd += u'git branch -r --contains %s' % commithash
        (rc, so, se) = run_command(cmd)
        lines = (x.strip() for x in to_text(so).split(u'\n'))
        lines = list(filter(bool, lines))

        rlines = (x for x in lines if x.startswith((u'origin/release', u'origin/stable')))
        rlines = (x.split(u'/')[-1] for x in rlines)
        rlines = (x.replace(u'release', u'') for x in rlines)
        rlines = [x.replace(u'stable-', u'') for x in rlines]

        if rc != 0:
            logging.error(u"rc != 0")
            if C.DEFAULT_BREAKPOINTS:
                logging.error(u'breakpoint!')
                import epdb
                epdb.st()
            else:
                raise Exception(u'bad returncode')

        if len(rlines) > 0:
            aversion = rlines[0]
        else:
            if u'HEAD' in lines[0] or lines[0].endswith(u'/devel'):
                '''
                cmd = 'cd %s;' % self.checkoutdir
                cmd += 'git branch -a | fgrep -e release -e stable | tail -n 1'
                (rc, so, se) = run_command(cmd)
                cver = so.strip()
                cver = cver.replace('remotes/origin/stable-', '')
                cver = cver.replace('remotes/upstream/stable-', '')
                cver = cver.replace('remotes/origin/release', '')
                cver = cver.replace('remotes/upstream/release', '')
                assert cver, "cver is null"
                assert cver[0].isdigit(), "cver[0] is not digit: %s" % cver
                aversion = cver
                '''
                aversion = devel_version
            else:
                logging.error(u"WTF!? ...")
                if C.DEFAULT_BREAKPOINTS:
                    logging.error(u'breakpoint!')
                    import epdb
                    epdb.st()
                else:
                    raise Exception(u'HEAD not found')

        self.COMMITVERSIONS[commithash] = aversion

    return aversion

def fuzzy_find_sections(body, sections):

    upper_body = body.upper()

    # make a map of locations where each section starts
    match_map = {}
    for section in sections:
        # http://www.tutorialspoint.com/python/string_find.htm
        # str.find(str, beg=0 end=len(string))
        match = upper_body.find(section)
        if match != -1:
            match_map[section] = match

    if not match_map:
        return {}

    # what are the header(s) being used?
    headers = []
    for k, v in match_map.items():
        try:
            before = upper_body[v - 1]
            after = upper_body[v + len(k)]
            header = before + u'${section}' + after
            headers.append(header)
        except Exception as e:
            pass

    # pick the most common header and re-search with it
    if len(sorted(set(headers))) > 1:
        choices = sorted(set(headers))
        choice_totals = []
        for choice in choices:
            ctotal = len([x for x in headers if x == choice])
            choice_totals.append((ctotal, choice))
        choice_totals.sort(key=lambda tup: tup[0])
        sheader = choice_totals[-1][1]

        match_map = {}
        t = Template(sheader)
        for section in sections:
            try:
                tofind = t.substitute(section=section)
            except Exception as e:
                if C.DEFAULT_BREAKPOINTS:
                    logging.error(u'breakpoint!')
                    import epdb
                    epdb.st()
                else:
                    raise Exception(u'substitution failed: %s' % to_text(e))
            match = upper_body.find(tofind)
            if match != -1:
                match_map[section] = match + 1

        # re-do for missing sections with less common header(s)
        for section in sections:
            if section in match_map:
                continue
            for choice in choices:
                t = Template(choice)
                tofind = t.substitute(section=section)
                match = upper_body.find(tofind)
                if match != -1:
                    match_map[section] = match + 1
                    break

    elif len(headers) <= 1:
        if headers and \
                (u'#' not in headers[0] and
                 u':' not in headers[0] and
                 u'*' not in headers[0]):
            return {}
        else:
            if C.DEFAULT_BREAKPOINTS:
                logging.error(u'breakpoint!')
                import epdb
                epdb.st()

    # sort mapping by element id and inject itype if needed
    match_map = sorted(match_map.items(), key=operator.itemgetter(1))
    if match_map and u'ISSUE TYPE' not in [x[0] for x in match_map]:
        if match_map[0][1] > 10:
            match_map.insert(0, (u'ISSUE TYPE', 0))

    # extract the sections based on their indexes
    tdict = {}
    total_indexes = len(match_map) - 1
    for idx, x in enumerate(match_map):

        if x[1] > 0:
            start_index = x[1] + (len(x[0]))
        else:
            start_index = 0

        # if last index, slice to the end
        if idx >= total_indexes:
            tdict[x[0]] = body[start_index:]
        else:
            # slice to the next section
            stop_index = match_map[idx + 1][1]
            tdict[x[0]] = body[start_index:stop_index]

    return tdict

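# How the header-shape trick above works: the single character before and
# after a matched section name become a string.Template, which is re-rendered
# for every section. For a body line like '##### ISSUE TYPE' the shape is
# ' ${section}\n', for example:
from string import Template

header = u' ${section}\n'
print(repr(Template(header).substitute(section=u'COMPONENT NAME')))  # ' COMPONENT NAME\n'
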
def process(self):
    """Merge all events into chronological order"""

    # FIXME - load this just once for later reference
    cache = self._load_cache()

    events = self.issue.events
    comments = self.issue.comments
    reactions = self.issue.reactions

    processed_events = []
    for ide, event in enumerate(events):
        if isinstance(event, dict):
            event = Event(
                event,
                id='%s_%s_%s' % (self.issue.repo_full_name, self.issue.number, ide)
            )

        cdict = self.get_event_from_cache(event.id, cache)
        if cdict:
            edict = cdict.copy()
        else:
            edict = {}
            edict[u'id'] = event.id
            if not hasattr(event.actor, u'login'):
                edict[u'actor'] = None
            else:
                edict[u'actor'] = event.actor.login
            edict[u'event'] = event.event
            edict[u'created_at'] = event.created_at

            if edict[u'event'] in [u'labeled', u'unlabeled']:
                raw_data = self._raw_data_from_event(event)
                edict[u'label'] = raw_data.get(u'label', {}).get(u'name', None)
            elif edict[u'event'] == u'mentioned':
                pass
            elif edict[u'event'] == u'subscribed':
                pass
            elif edict[u'event'] == u'referenced':
                edict[u'commit_id'] = event.commit_id
            elif edict[u'event'] == u'assigned':
                edict[u'assignee'] = event.raw_data[u'assignee'][u'login']
                edict[u'assigner'] = event.raw_data[u'assigner'][u'login']

        processed_events.append(edict)

    for comment in comments:
        edict = {
            u'id': comment.id,
            u'event': u'commented',
            u'actor': comment.user.login,
            u'created_at': comment.created_at,
            u'body': comment.body,
        }
        processed_events.append(edict)

    for reaction in reactions:
        # 2016-07-26T20:08:20Z
        if not isinstance(reaction, dict):
            # FIXME - not sure what's happening here
            pass
        else:
            edict = {
                u'id': reaction[u'id'],
                u'event': u'reacted',
                u'created_at': reaction[u'created_at'],
                u'actor': reaction[u'user'][u'login'],
                u'content': reaction[u'content'],
            }

            if isinstance(edict[u'created_at'], six.binary_type):
                edict[u'created_at'] = to_text(edict[u'created_at'])

            # convert the timestamp the same way the lib does it
            if isinstance(edict[u'created_at'], six.text_type):
                edict[u'created_at'] = self.parse_timestamp(edict[u'created_at'])

            processed_events.append(edict)

    # get rid of events with no created_at =(
    processed_events = [x for x in processed_events if x.get(u'created_at')]

    # sort by created_at
    sorted_events = sorted(processed_events, key=itemgetter(u'created_at'))

    # return ...
    return sorted_events

def __init__(self, issue, usecache=True, cachedir=None, exclude_users=[]):
    self.issue = issue
    self.maincache = cachedir
    self._waffled_labels = None

    if issue.repo.repo_path not in cachedir and u'issues' not in cachedir:
        self.cachefile = os.path.join(
            self.maincache,
            issue.repo.repo_path,
            u'issues',
            to_text(issue.instance.number),
            u'history.pickle'
        )
    elif issue.repo.repo_path not in cachedir:
        self.cachefile = os.path.join(
            self.maincache,
            issue.repo.repo_path,
            u'issues',
            to_text(issue.instance.number),
            u'history.pickle'
        )
    elif u'issues' not in cachedir:
        self.cachefile = os.path.join(
            self.maincache,
            u'issues',
            to_text(issue.instance.number),
            u'history.pickle'
        )
    else:
        self.cachefile = os.path.join(
            self.maincache,
            to_text(issue.instance.number),
            u'history.pickle'
        )

    self.cachedir = os.path.join(self.maincache, os.path.dirname(self.cachefile))

    if u'issues' not in self.cachedir:
        logging.error(self.cachedir)
        if C.DEFAULT_BREAKPOINTS:
            logging.error(u'breakpoint!')
            import epdb
            epdb.st()
        else:
            raise Exception(u'history cachedir has no "issues" component: %s' % self.cachedir)

    if not usecache:
        self.history = self.process()
    else:
        """Building history is expensive and slow"""
        cache = self._load_cache()
        if not cache:
            logging.info(u'empty history cache, rebuilding')
            self.history = self.process()
            logging.info(u'dumping newly created history cache')
            self._dump_cache()
        else:
            reprocess = False

            # use a versioned schema to track changes
            if not cache.get('version') or cache['version'] < self.SCHEMA_VERSION:
                reprocess = True

            if cache[u'updated_at'] < self.issue.instance.updated_at:
                reprocess = True

            if reprocess:
                logging.info(u'history out of date, updating')
                self.history = self.process()
                logging.info(u'dumping newly created history cache')
                self._dump_cache()
            else:
                logging.info(u'use cached history')
                self.history = cache[u'history']

    if exclude_users:
        tmp_history = [x for x in self.history]
        for x in tmp_history:
            if x[u'actor'] in exclude_users:
                self.history.remove(x)

    self.fix_history_tz()
    self.history = self._fix_comments_with_no_body(self.history)
    self.history = self._fix_commits_with_no_message(self.history)
    self.history = sorted(self.history, key=itemgetter(u'created_at'))

def parse_yaml(data):

    def clean_list_items(inlist):
        if isinstance(inlist, list):
            inlist = to_text(inlist)
            inlist = inlist.replace("[", '')
            inlist = inlist.replace("]", '')
            inlist = inlist.replace("'", '')
            inlist = inlist.replace(",", '')
            inlist = inlist.split()
        return inlist

    def join_if_list(list_or_str):
        if not isinstance(list_or_str, list):
            return list_or_str
        return ' '.join(list_or_str)

    def fix_lists(data):
        string_macros = {
            k: join_if_list(v)
            for k, v in data['macros'].items()
        }
        for k, v in data['files'].items():
            if v is None:
                continue

            for k2, v2 in v.items():
                if isinstance(v2, str) and '$' in v2:
                    tmpl = Template(v2)
                    newv2 = tmpl.substitute(**string_macros)
                    newv2 = clean_list_items(newv2)
                    data['files'][k][k2] = newv2
                    v2 = newv2
                if isinstance(v2, str):
                    data['files'][k][k2] = v2.split()
        return data

    def fix_keys(data):
        replace = []
        for k in data['files'].keys():
            if '$' in k:
                replace.append(k)
        for x in replace:
            tmpl = Template(x)
            newkey = tmpl.substitute(**data['macros'])
            data['files'][newkey] = data['files'][x]
            data['files'].pop(x, None)

        paths = list(data['files'].keys())
        for p in paths:
            normpath = os.path.normpath(p)
            if p != normpath:
                metadata = data['files'].pop(p)
                data['files'][normpath] = metadata
        return data

    def extend_labels(data):
        for k, v in data['files'].items():
            # labels from path(s)
            if v is None:
                continue
            labels = v.get('labels', [])
            if isinstance(labels, str):
                labels = labels.split()
                labels = [x.strip() for x in labels if x.strip()]
            path_labels = [x.strip() for x in k.split('/') if x.strip()]
            for x in path_labels:
                x = x.replace('.py', '')
                x = x.replace('.ps1', '')
                if x not in labels:
                    labels.append(x)
            data['files'][k]['labels'] = sorted(set(labels))
        return data

    def fix_teams(data):
        for k, v in data['macros'].items():
            if v is None:
                continue
            if not k.startswith('team_') or isinstance(v, list):
                continue
            names = v.split()
            data['macros'][k] = names
        return data

    def _propagate(files, top, child, field, multivalued=True):
        '''Copy key named 'field' from top to child

        - with multivalued, child inherits from all ancestors
        - else child inherits from the nearest ancestor and only if field is
          not already set at child level
        '''
        top_entries = files[top].get(field, [])
        if top_entries:
            if field not in files[child]:
                files[child][field] = []

            # track the origin of the data
            field_keys = '%s_keys' % field
            if field_keys not in files[child]:
                files[child][field_keys] = []

            if multivalued:
                files[child][field_keys].append(top)
                for entry in top_entries:
                    if entry not in files[child][field]:
                        files[child][field].append(entry)
            elif not files[child][field] or (
                    files[child][field_keys] and
                    len(files[child][field_keys][0]) < len(top)):
                # use parent keyword only if:
                # 1. either keyword is not set
                # 2. or keyword has been already inherited from a less specific path
                files[child][field_keys] = [top]
                files[child][field] = top_entries[:]

    def propagate_keys(data):
        '''maintainers and ignored keys defined at a directory level are copied to subpath'''
        files = data['files']
        iterfiles = compute_file_children(files.keys())
        for file1, files2 in iterfiles.items():
            for file2 in files2:
                top = min(file1, file2)
                child = max(file1, file2)
                _propagate(files, top, child, 'maintainers')
                _propagate(files, top, child, 'ignored')
                _propagate(files, top, child, 'labels')
                _propagate(files, top, child, 'support', multivalued=False)
                _propagate(files, top, child, 'supported_by', multivalued=False)

    #################################
    #   PARSE
    #################################
    # https://github.com/ansible/ansibullbot/issues/1155#issuecomment-457731630
    logging.info('botmeta: load yaml')
    ydata_orig = yaml.load(data, BotYAMLLoader)
    ydata = yaml.load(yaml.dump(ydata_orig, Dumper=NoAliasDumper), BotYAMLLoader)

    # fix the team macros
    logging.info('botmeta: fix teams')
    ydata = fix_teams(ydata)

    # fix the macro'ized file keys
    logging.info('botmeta: fix keys')
    ydata = fix_keys(ydata)

    logging.info('botmeta: iterate files')
    for k, v in ydata['files'].items():
        if v is None:
            # convert empty val in dict
            ydata['files'][k] = {}
            continue

        if isinstance(v, bytes):
            v = to_text(v)
        if isinstance(v, str):
            # convert string vals to a maintainers key in a dict
            ydata['files'][k] = {'maintainers': v}

        ydata['files'][k]['maintainers_keys'] = [k]

    # replace macros in files section
    logging.info('botmeta: fix lists')
    ydata = fix_lists(ydata)

    # extend labels by filepath
    logging.info('botmeta: extend labels')
    ydata = extend_labels(ydata)

    # key inheritance
    logging.info('botmeta: propagate keys')
    propagate_keys(ydata)

    return ydata

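# A minimal sketch of the directory-to-subpath inheritance that _propagate
# implements, assuming it were callable at module scope; the paths and
# maintainers below are made up:
#
#   files = {
#       'lib/ansible/modules/cloud': {'maintainers': ['alice']},
#       'lib/ansible/modules/cloud/amazon/ec2.py': {},
#   }
#   _propagate(files, 'lib/ansible/modules/cloud',
#              'lib/ansible/modules/cloud/amazon/ec2.py', 'maintainers')
#   # files['lib/ansible/modules/cloud/amazon/ec2.py'] ==
#   #   {'maintainers': ['alice'],
#   #    'maintainers_keys': ['lib/ansible/modules/cloud']}
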
def get_issue_summaries(self, repo_url, baseurl=None, cachefile=None):
    '''Paginate through github's web interface and scrape summaries'''

    # repo_url - https://github.com/ansible/ansible for example
    # baseurl - an entrypoint for one-off utils to scrape specific issue
    #           query urls. NOTE: this disables writing a cache

    # get cached
    if not baseurl:
        issues = self.load_summaries(repo_url)
    else:
        issues = {}

    if not baseurl:
        url = repo_url
        url += '/issues'
        url += '?'
        url += 'q='
        url += urllib.parse.quote('sort:updated-desc')
    else:
        url = baseurl

    namespace = repo_url.split('/')[-2]
    reponame = repo_url.split('/')[-1]

    rr = self._request_url(url)
    soup = BeautifulSoup(rr.text, 'html.parser')
    data = self._parse_issue_summary_page(soup)
    if data['issues']:
        # send to receiver
        post_to_receiver('html_summaries', {'user': namespace, 'repo': reponame}, data['issues'])

        # update master list
        issues.update(data['issues'])

        if not baseurl:
            self.dump_summaries_tmp(repo_url, issues)

    while data['next_page']:
        rr = self._request_url(self.baseurl + data['next_page'])
        soup = BeautifulSoup(rr.text, 'html.parser')
        data = self._parse_issue_summary_page(soup)

        # send to receiver
        post_to_receiver('html_summaries', {'user': namespace, 'repo': reponame}, data['issues'])

        if not data['next_page'] or not data['issues']:
            break

        changed = []
        changes = False
        for k, v in data['issues'].items():
            # six.text_type stands in for the py2-only 'unicode' builtin
            if not isinstance(k, six.text_type):
                k = '%s' % k
            if k not in issues:
                changed.append(k)
                changes = True
            elif v != issues[k]:
                changed.append(k)
                changes = True
            issues[k] = v

        if changed:
            logging.info('changed: %s' % ','.join(x for x in changed))

        if not baseurl:
            self.dump_summaries_tmp(repo_url, issues)

        if not changes:
            break

    # get missing
    if not baseurl:
        numbers = sorted([int(x) for x in issues.keys()])
        # range replaces the py2-only xrange
        missing = [x for x in range(1, numbers[-1]) if x not in numbers]
        for x in missing:
            summary = self.get_single_issue_summary(repo_url, x, force=True)
            if summary:
                post_to_receiver('html_summaries', {'user': namespace, 'repo': reponame}, {x: summary})
                if not isinstance(x, six.text_type):
                    x = '%s' % x
                issues[x] = summary

    # get missing timestamps
    if not baseurl:
        numbers = sorted([int(x) for x in issues.keys()])
        missing = [
            x for x in numbers
            if to_text(x) not in issues or not issues[to_text(x)]['updated_at']
        ]
        for x in missing:
            summary = self.get_single_issue_summary(repo_url, x, force=True)
            if summary:
                post_to_receiver('html_summaries', {'user': namespace, 'repo': reponame}, {x: summary})
                if not isinstance(x, six.text_type):
                    x = '%s' % x
                issues[x] = summary

    # save the cache
    if not baseurl:
        self.dump_summaries(repo_url, issues)

    return issues

def get_module_commits(self):

    keys = self.modules.keys()
    keys = sorted(keys)
    for k in keys:
        self.commits[k] = []
        cpath = os.path.join(self.gitrepo.checkoutdir, k)
        if not os.path.isfile(cpath):
            continue

        mtime = os.path.getmtime(cpath)
        refresh = False
        pfile = os.path.join(
            self.scraper_cache,
            k.replace(u'/', u'_') + u'.commits.pickle'
        )

        if not os.path.isfile(pfile):
            refresh = True
        else:
            pickle_kwargs = {'encoding': 'bytes'} if six.PY3 else {}
            print(pfile)
            with open(pfile, 'rb') as f:
                pdata = pickle_load(f, **pickle_kwargs)
            if pdata[0] == mtime:
                self.commits[k] = pdata[1]
            else:
                refresh = True

        if refresh:
            logging.info(u'refresh commit cache for %s' % k)
            cmd = u'cd %s; git log --follow %s' % (self.gitrepo.checkoutdir, k)
            (rc, so, se) = run_command(cmd)
            for line in to_text(so).split(u'\n'):
                if line.startswith(u'commit '):
                    commit = {
                        u'name': None,
                        u'email': None,
                        u'login': None,
                        u'hash': line.split()[-1],
                        u'date': None
                    }

                # Author: Matt Clay <*****@*****.**>
                if line.startswith(u'Author: '):
                    line = line.replace(u'Author: ', u'')
                    line = line.replace(u'<', u'')
                    line = line.replace(u'>', u'')
                    lparts = line.split()

                    if u'@' in lparts[-1]:
                        commit[u'email'] = lparts[-1]
                        commit[u'name'] = u' '.join(lparts[:-1])

                    if commit[u'email'] and \
                            u'noreply.github.com' in commit[u'email']:
                        commit[u'login'] = commit[u'email'].split(u'@')[0]

                # Date: Sat Jan 28 23:28:53 2017 -0800
                if line.startswith(u'Date:'):
                    dstr = line.split(u':', 1)[1].strip()
                    dstr = u' '.join(dstr.split(u' ')[:-1])
                    ds = datetime.datetime.strptime(
                        to_text(dstr),
                        u'%a %b %d %H:%M:%S %Y'
                    )
                    commit[u'date'] = ds
                    self.commits[k].append(commit)

            with open(pfile, 'wb') as f:
                pickle_dump((mtime, self.commits[k]), f)

def clean_issue_cache(self, number):
    # https://github.com/ansible/ansibullbot/issues/610
    cdir = os.path.join(self.cachedir, u'issues', to_text(number))
    shutil.rmtree(cdir)

def get_template_data(iw):
    """Extract templated data from an issue body"""

    if iw.is_issue():
        tfile = u'.github/ISSUE_TEMPLATE/bug_report.md'
    else:
        tfile = u'.github/PULL_REQUEST_TEMPLATE.md'

    # use the fileindexer whenever possible to conserve ratelimits
    if iw.gitrepo:
        tf_content = iw.gitrepo.get_file_content(tfile)
    else:
        try:
            tf = iw.repo.get_file_contents(tfile)
            tf_content = tf.decoded_content
        except Exception:
            logging.warning(u'repo does not have {}'.format(tfile))
            tf_content = u''

    # pull out the section names from the template
    tf_sections = extract_template_sections(tf_content, header=TEMPLATE_HEADER)

    # what is required?
    iw._required_template_sections = \
        [x.lower() for x in tf_sections.keys()
         if tf_sections[x][u'required']]

    # extract ...
    template_data = extract_template_data(
        iw.instance.body,
        issue_number=iw.number,
        issue_class=iw.github_type,
        sections=tf_sections.keys()
    )

    # try comments if the description was insufficient
    if len(template_data.keys()) <= 2:
        s_comments = iw.history.get_user_comments(iw.submitter)
        for s_comment in s_comments:

            _template_data = extract_template_data(
                s_comment,
                issue_number=iw.number,
                issue_class=iw.github_type,
                sections=tf_sections.keys()
            )

            if _template_data:
                for k, v in _template_data.items():
                    if not v:
                        continue
                    if v and (k not in template_data or not template_data.get(k)):
                        template_data[k] = v

    if u'ANSIBLE VERSION' in tf_sections and u'ansible version' not in template_data:

        # FIXME - abstract this into a historywrapper method
        vlabels = [x for x in iw.history.history if x[u'event'] == u'labeled']
        vlabels = [x for x in vlabels if x[u'actor'] not in [u'ansibot', u'ansibotdev']]
        vlabels = [x[u'label'] for x in vlabels if x[u'label'].startswith(u'affects_')]
        vlabels = [x for x in vlabels if x.startswith(u'affects_')]
        versions = [x.split(u'_')[1] for x in vlabels]
        versions = [float(x) for x in versions]

        if versions:
            version = versions[-1]
            template_data[u'ansible version'] = to_text(version)

    if u'COMPONENT NAME' in tf_sections and u'component name' not in template_data:
        if iw.is_pullrequest():
            fns = iw.files
            if fns:
                template_data[u'component name'] = u'\n'.join(fns)
                template_data[u'component_raw'] = u'\n'.join(fns)
        else:
            clabels = [x for x in iw.labels if x.startswith(u'c:')]
            if clabels:
                fns = []
                for clabel in clabels:
                    clabel = clabel.replace(u'c:', u'')
                    fns.append(u'lib/ansible/' + clabel)
                template_data[u'component name'] = u'\n'.join(fns)
                template_data[u'component_raw'] = u'\n'.join(fns)
            elif u'documentation' in template_data.get(u'issue type', u'').lower():
                template_data[u'component name'] = u'docs'
                template_data[u'component_raw'] = u'docs'

    if u'ISSUE TYPE' in tf_sections and u'issue type' not in template_data:

        # FIXME - turn this into a real classifier based on work done in
        # jctanner/pr-triage repo.
        itype = None

        while not itype:

            for label in iw.labels:
                if label.startswith(u'bug'):
                    itype = u'bug'
                    break
                elif label.startswith(u'feature'):
                    itype = u'feature'
                    break
                elif label.startswith(u'doc'):
                    itype = u'docs'
                    break
            if itype:
                break

            if iw.is_pullrequest():
                fns = iw.files
                for fn in fns:
                    if fn.startswith(u'doc'):
                        itype = u'docs'
                        break
            if itype:
                break

            msgs = [iw.title, iw.body]
            if iw.is_pullrequest():
                msgs += [x[u'message'] for x in iw.history.history if x[u'event'] == u'committed']

            msgs = [x for x in msgs if x]
            msgs = [x.lower() for x in msgs]

            for msg in msgs:
                if u'fix' in msg:
                    itype = u'bug'
                    break
                if u'addresses' in msg:
                    itype = u'bug'
                    break
                if u'broke' in msg:
                    itype = u'bug'
                    break
                if u'add' in msg:
                    itype = u'feature'
                    break
                if u'should' in msg:
                    itype = u'feature'
                    break
                if u'please' in msg:
                    itype = u'feature'
                    break
                if u'feature' in msg:
                    itype = u'feature'
                    break

            # quit now
            break

        if itype and itype == u'bug' and iw.is_issue():
            template_data[u'issue type'] = u'bug report'
        elif itype and itype == u'bug' and not iw.is_issue():
            template_data[u'issue type'] = u'bugfix pullrequest'
        elif itype and itype == u'feature' and iw.is_issue():
            template_data[u'issue type'] = u'feature idea'
        elif itype and itype == u'feature' and not iw.is_issue():
            template_data[u'issue type'] = u'feature pullrequest'
        elif itype and itype == u'docs' and iw.is_issue():
            template_data[u'issue type'] = u'documentation report'
        elif itype and itype == u'docs' and not iw.is_issue():
            template_data[u'issue type'] = u'documentation pullrequest'

    return template_data

def parse_yaml(data):

    def clean_list_items(inlist):
        if isinstance(inlist, list):
            inlist = to_text(inlist)
            if u'&' in inlist:
                if C.DEFAULT_BREAKPOINTS:
                    logging.error(u'breakpoint!')
                    import epdb; epdb.st()
            inlist = inlist.replace(u"[", u'')
            inlist = inlist.replace(u"]", u'')
            inlist = inlist.replace(u"'", u'')
            inlist = inlist.replace(u",", u'')
            inlist = inlist.split()
        return inlist

    def join_if_list(list_or_str):
        if not isinstance(list_or_str, list):
            return list_or_str
        return u' '.join(list_or_str)

    def fix_lists(data):
        string_macros = {
            k: join_if_list(v)
            for k, v in data[u'macros'].items()
        }
        for k, v in data[u'files'].items():
            if v is None:
                continue

            for k2, v2 in v.items():
                if isinstance(v2, six.text_type) and u'$' in v2:
                    tmpl = Template(v2)
                    newv2 = tmpl.substitute(**string_macros)
                    newv2 = clean_list_items(newv2)
                    data[u'files'][k][k2] = newv2
                    v2 = newv2
                if isinstance(v2, six.text_type):
                    data[u'files'][k][k2] = v2.split()
        return data

    def fix_keys(data):
        replace = []
        for k in data[u'files'].keys():
            if u'$' in k:
                replace.append(k)
        for x in replace:
            tmpl = Template(x)
            newkey = tmpl.substitute(**data[u'macros'])
            data[u'files'][newkey] = data[u'files'][x]
            data[u'files'].pop(x, None)

        paths = list(data[u'files'].keys())
        for p in paths:
            normpath = os.path.normpath(p)
            if p != normpath:
                metadata = data[u'files'].pop(p)
                data[u'files'][normpath] = metadata
        return data

    def extend_labels(data):
        for k, v in data[u'files'].items():
            # labels from path(s)
            if v is None:
                continue
            labels = v.get(u'labels', [])
            if isinstance(labels, six.text_type):
                labels = labels.split()
                labels = [x.strip() for x in labels if x.strip()]
            path_labels = [x.strip() for x in k.split(u'/') if x.strip()]
            for x in path_labels:
                x = x.replace(u'.py', u'')
                x = x.replace(u'.ps1', u'')
                if x not in labels:
                    labels.append(x)
            data[u'files'][k][u'labels'] = sorted(set(labels))
        return data

    def fix_teams(data):
        for k, v in data[u'macros'].items():
            if v is None:
                continue
            if not k.startswith(u'team_') or isinstance(v, list):
                continue
            names = v.split()
            data[u'macros'][k] = names
        return data

    def _propagate(files, top, child, field, multivalued=True):
        '''Copy key named 'field' from top to child

        - with multivalued, child inherits from all ancestors
        - else child inherits from the nearest ancestor and only if field is
          not already set at child level
        '''
        top_entries = files[top].get(field, [])
        if top_entries:
            if field not in files[child]:
                files[child][field] = []

            # track the origin of the data
            field_keys = u'%s_keys' % field
            if field_keys not in files[child]:
                files[child][field_keys] = []

            if multivalued:
                files[child][field_keys].append(top)
                for entry in top_entries:
                    if entry not in files[child][field]:
                        files[child][field].append(entry)
            elif not files[child][field] or (
                    files[child][field_keys] and
                    len(files[child][field_keys][0]) < len(top)):
                # use parent keyword only if:
                # 1. either keyword is not set
                # 2. or keyword has been already inherited from a less specific path
                files[child][field_keys] = [top]
                files[child][field] = top_entries[:]

    def propagate_keys(data):
        '''maintainers and ignored keys defined at a directory level are copied to subpath'''
        files = data[u'files']
        for file1, file2 in itertools.combinations(files.keys(), 2):
            # Python 2.7 doesn't provide os.path.commonpath
            common = os.path.commonprefix([file1, file2])
            top = min(file1, file2)
            child = max(file1, file2)
            top_components = top.split(u'/')
            child_components = child.split(u'/')
            if common == top and top_components == child_components[:len(top_components)]:
                _propagate(files, top, child, u'maintainers')
                _propagate(files, top, child, u'ignored')
                _propagate(files, top, child, u'labels')
                _propagate(files, top, child, u'support', multivalued=False)
                _propagate(files, top, child, u'supported_by', multivalued=False)

    #################################
    #   PARSE
    #################################
    # https://github.com/ansible/ansibullbot/issues/1155#issuecomment-457731630
    ydata_orig = yaml.load(data, BotYAMLLoader)
    ydata = yaml.load(yaml.dump(ydata_orig, Dumper=NoAliasDumper), BotYAMLLoader)

    # fix the team macros
    ydata = fix_teams(ydata)

    # fix the macro'ized file keys
    ydata = fix_keys(ydata)

    for k, v in ydata[u'files'].items():
        if v is None:
            # convert empty val in dict
            ydata[u'files'][k] = {}
            continue

        if isinstance(v, six.binary_type):
            v = to_text(v)
        if isinstance(v, six.text_type):
            # convert string vals to a maintainers key in a dict
            ydata[u'files'][k] = {
                u'maintainers': v
            }

        ydata[u'files'][k][u'maintainers_keys'] = [k]

    # replace macros in files section
    ydata = fix_lists(ydata)

    # extend labels by filepath
    ydata = extend_labels(ydata)

    propagate_keys(ydata)

    return ydata

def get_summaries(self, owner, repo, otype='issues', last=None,
                  first='first: 100', states='states: OPEN',
                  paginate=True):
    """Collect all the summary data for issues or pull requests

    Args:
        owner (str): the github namespace
        repo (str): the github repository
        otype (str): issues or pullRequests
        first (str): number of nodes per page, oldest to newest
        last (str): number of nodes per page, newest to oldest
        states (str): open or closed issues
        paginate (bool): recurse through page results
    """
    templ = self.environment.from_string(QUERY_TEMPLATE)

    # after: "$endCursor"
    after = None

    nodes = []
    pagecount = 0
    while True:
        logging.debug(u'%s/%s %s pagecount:%s nodecount: %s' %
                      (owner, repo, otype, pagecount, len(nodes)))

        issueparams = u', '.join(
            [x for x in [states, first, last, after] if x])
        query = templ.render(OWNER=owner, REPO=repo, OBJECT_TYPE=otype,
                             OBJECT_PARAMS=issueparams, FIELDS=QUERY_FIELDS)

        payload = {
            u'query': to_text(query, 'ascii', 'ignore').strip(),
            u'variables': u'{}',
            u'operationName': None
        }
        rr = requests.post(self.baseurl, headers=self.headers,
                           data=json.dumps(payload))
        if not rr.ok:
            break
        data = rr.json()
        if not data:
            break

        # keep each edge/node/issue
        for edge in data[u'data'][u'repository'][otype][u'edges']:
            node = edge[u'node']
            self.update_node(node, otype.lower()[:-1], owner, repo)
            nodes.append(node)

        if not paginate:
            break

        pageinfo = data.get(u'data', {}).get(u'repository', {}).get(otype, {}).get(u'pageInfo')
        if not pageinfo:
            break
        if not pageinfo.get(u'hasNextPage'):
            break

        after = u'after: "%s"' % pageinfo[u'endCursor']
        pagecount += 1

    return nodes
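# Hedged sketch (not part of the original source): the endCursor pagination
# loop used by get_summaries(), reduced to its core. 'baseurl', 'headers'
# and 'build_query' are placeholders supplied by the caller; build_query(after)
# is assumed to return a complete GraphQL query string for the given cursor.
import json

import requests

def _demo_graphql_paginate(baseurl, headers, build_query):
    nodes = []
    after = None
    while True:
        payload = {u'query': build_query(after), u'variables': u'{}'}
        rr = requests.post(baseurl, headers=headers, data=json.dumps(payload))
        if not rr.ok:
            break
        repository = rr.json()[u'data'][u'repository']
        nodes.extend(edge[u'node'] for edge in repository[u'issues'][u'edges'])
        pageinfo = repository[u'issues'][u'pageInfo']
        if not pageinfo.get(u'hasNextPage'):
            # no more pages; stop and return what was collected
            break
        after = pageinfo[u'endCursor']
    return nodes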
def _collect_repo(self, repo, issuenums=None):
    '''Collect issues for an individual repo'''
    logging.info('getting repo obj for %s' % repo)

    if repo not in self.repos:
        gitrepo = GitRepoWrapper(
            cachedir=self.cachedir_base,
            repo=f'https://github.com/{repo}',
            commit=self.args.ansible_commit,
        )
        self.repos[repo] = {
            'repo': self.ghw.get_repo(repo),
            'issues': [],
            'processed': [],
            'since': None,
            'stale': [],
            'loopcount': 0,
            'labels': self.ghw.get_valid_labels(repo),
            'gitrepo': gitrepo,
        }
    else:
        # force a clean repo object to limit caching problems
        logging.info('updating repo')
        self.repos[repo]['repo'] = self.ghw.get_repo(repo)
        logging.info('updating checkout')
        self.repos[repo]['gitrepo'].update()

        # clear the issues
        self.repos[repo]['issues'] = {}
        # increment the loopcount
        self.repos[repo]['loopcount'] += 1

    logging.info('getting issue objs for %s' % repo)
    self.update_issue_summaries(repopath=repo, issuenums=issuenums)

    issuecache = {}
    numbers = self.issue_summaries[repo].keys()
    numbers = {int(x) for x in numbers}
    if issuenums:
        numbers.intersection_update(issuenums)
        numbers = list(numbers)
    logging.info('%s known numbers' % len(numbers))

    if self.args.daemonize:
        if not self.repos[repo]['since']:
            ts = [
                x[1]['updated_at'] for x in
                self.issue_summaries[repo].items()
                if x[1]['updated_at']
            ]
            ts += [
                x[1]['created_at'] for x in
                self.issue_summaries[repo].items()
                if x[1]['created_at']
            ]
            ts = sorted(set(ts))
            if ts:
                self.repos[repo]['since'] = ts[-1]
        else:
            since = strip_time_safely(self.repos[repo]['since'])
            api_since = self.repos[repo]['repo'].get_issues(since=since)

            numbers = []
            for x in api_since:
                numbers.append(x.number)
                issuecache[x.number] = x

            numbers = sorted({int(n) for n in numbers})
            logging.info('%s numbers after [api] since == %s' % (len(numbers), since))

            for k, v in self.issue_summaries[repo].items():
                if v['created_at'] is None:
                    # issue is closed and was never processed
                    continue
                if v['created_at'] > self.repos[repo]['since']:
                    numbers.append(k)

            numbers = sorted({int(n) for n in numbers})
            logging.info('%s numbers after [www] since == %s' % (len(numbers), since))

    if self.args.start_at and self.repos[repo]['loopcount'] == 0:
        numbers = [x for x in numbers if x <= self.args.start_at]
        logging.info('%s numbers after start-at' % len(numbers))

    # Get stale numbers if not targeting
    if self.args.daemonize and self.repos[repo]['loopcount'] > 0:
        logging.info('checking for stale numbers')
        stale = self.get_stale_numbers(repo)
        self.repos[repo]['stale'] = [int(x) for x in stale]
        numbers += [int(x) for x in stale]
        numbers = sorted(set(numbers))
        logging.info('%s numbers after stale check' % len(numbers))

    ################################################################
    # PRE-FILTERING TO PREVENT EXCESSIVE API CALLS
    ################################################################

    # filter just the open numbers
    if not self.args.only_closed and not self.args.ignore_state:
        numbers = [
            x for x in numbers
            if (to_text(x) in self.issue_summaries[repo] and
                self.issue_summaries[repo][to_text(x)]['state'] == 'open')
        ]
        logging.info('%s numbers after checking state' % len(numbers))

    # filter by type
    if self.args.only_issues:
        numbers = [
            x for x in numbers
            if self.issue_summaries[repo][to_text(x)]['type'] == 'issue'
        ]
        logging.info('%s numbers after checking type' % len(numbers))
    elif self.args.only_prs:
        numbers = [
            x for x in numbers
            if self.issue_summaries[repo][to_text(x)]['type'] == 'pullrequest'
        ]
        logging.info('%s numbers after checking type' % len(numbers))

    numbers = sorted({int(x) for x in numbers})
    if self.args.sort == 'desc':
        numbers = list(reversed(numbers))

    if self.args.last and len(numbers) > self.args.last:
        numbers = numbers[-self.args.last:]

    # Use an iterator to avoid requesting all issues upfront
    self.repos[repo]['issues'] = RepoIssuesIterator(
        self.repos[repo]['repo'],
        numbers,
        issuecache=issuecache
    )

    logging.info('getting repo objs for %s complete' % repo)
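# Hedged sketch (example data invented): how the daemon path in
# _collect_repo() derives its initial 'since' marker from the cached
# summaries, taking the newest of all created_at/updated_at timestamps.
def _demo_initial_since(summaries):
    ts = [v['updated_at'] for v in summaries.values() if v['updated_at']]
    ts += [v['created_at'] for v in summaries.values() if v['created_at']]
    ts = sorted(set(ts))
    return ts[-1] if ts else None

# _demo_initial_since({
#     '1': {'created_at': '2018-12-31T00:00:00Z',
#           'updated_at': '2019-01-01T00:00:00Z'},
# })
# -> '2019-01-01T00:00:00Z'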
def load_update_fetch(self, property_name, obj=None, force=False):
    '''Fetch a property for an issue object'''

    # A pygithub issue object has methods such as ...
    #   - get_events()
    #   - get_comments()
    # Those methods return a list with no update() property,
    # so we can't take advantage of the caching scheme used
    # for the issue itself. Instead, this function calls
    # those methods by their given name and writes the data
    # to a pickle file with a timestamp for the fetch time.
    # Upon later loading of the pickle, the timestamp is
    # compared to the issue's updated_at timestamp; if the
    # pickled data is behind, the process is repeated.

    edata = None
    events = []
    updated = None
    update = False
    write_cache = False

    pfile = os.path.join(self.full_cachedir, u'%s.pickle' % property_name)
    pdir = os.path.dirname(pfile)
    logging.debug(pfile)

    if not os.path.isdir(pdir):
        os.makedirs(pdir)

    if os.path.isfile(pfile):
        try:
            with open(pfile, 'rb') as f:
                edata = pickle_load(f)
        except Exception:
            update = True
            write_cache = True

    # check the timestamp on the cache
    if edata:
        updated = edata[0]
        events = edata[1]
        if updated < self.instance.updated_at:
            update = True
            write_cache = True

    baseobj = None
    if obj:
        if obj == u'issue':
            baseobj = self.instance
        elif obj == u'pullrequest':
            baseobj = self.pullrequest
    else:
        if hasattr(self.instance, u'get_' + property_name):
            baseobj = self.instance
        else:
            if self.pullrequest:
                if hasattr(self.pullrequest, u'get_' + property_name):
                    baseobj = self.pullrequest

    if not baseobj:
        logging.error(
            u'%s was not a property for the issue or the pullrequest'
            % property_name
        )
        if C.DEFAULT_BREAKPOINTS:
            logging.error(u'breakpoint!')
            import epdb; epdb.st()
        else:
            raise Exception(u'property error')

    # pull all events if the timestamp is behind or if no events are cached
    if update or not events or force:
        write_cache = True
        updated = datetime.datetime.utcnow()

        if not hasattr(baseobj, u'get_' + property_name) \
                and hasattr(baseobj, property_name):
            # non-callable properties
            try:
                methodToCall = getattr(baseobj, property_name)
            except Exception as e:
                logging.error(e)
                if C.DEFAULT_BREAKPOINTS:
                    logging.error(u'breakpoint!')
                    import epdb; epdb.st()
                else:
                    raise Exception(to_text(e))
            events = methodToCall
        else:
            # callable properties
            try:
                methodToCall = getattr(baseobj, u'get_' + property_name)
            except Exception as e:
                logging.error(e)
                if C.DEFAULT_BREAKPOINTS:
                    logging.error(u'breakpoint!')
                    import epdb; epdb.st()
                else:
                    raise Exception(to_text(e))
            events = [x for x in methodToCall()]

    if C.DEFAULT_PICKLE_ISSUES:
        if write_cache or not os.path.isfile(pfile) or force:
            # dump the pickle back to disk
            edata = [updated, events]
            with open(pfile, 'wb') as f:
                pickle_dump(edata, f)

    return events
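# Hedged sketch (not part of the original source): the timestamp-gated
# pickle cache used by load_update_fetch(), isolated. 'fetch' is a
# hypothetical zero-argument callable, and both timestamps are assumed to
# be naive UTC datetimes as pygithub provides them.
import datetime
import os
import pickle

def _demo_cached_fetch(pfile, issue_updated_at, fetch):
    if os.path.isfile(pfile):
        with open(pfile, 'rb') as f:
            fetched_at, payload = pickle.load(f)
        if fetched_at >= issue_updated_at:
            # cache is still current; skip the API call
            return payload
    payload = fetch()
    with open(pfile, 'wb') as f:
        pickle.dump([datetime.datetime.utcnow(), payload], f)
    return payload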
def get_module_authors(self, module_file):
    """Grep the authors out of the module docstrings"""
    if not os.path.exists(module_file):
        return []

    documentation = b''
    inphase = False

    with io.open(module_file, 'rb') as f:
        for line in f:
            if b'DOCUMENTATION' in line:
                inphase = True
                continue
            if line.strip().endswith((b"'''", b'"""')):
                break
            if inphase:
                documentation += line

    if not documentation:
        return []

    # clean out any other yaml besides author to save time
    inphase = False
    author_lines = u''
    doc_lines = to_text(documentation).split(u'\n')
    for x in doc_lines:
        if x.startswith(u'author'):
            inphase = True
        if inphase and not x.strip().startswith((u'-', u'author')):
            inphase = False
            break
        if inphase:
            author_lines += x + u'\n'

    if not author_lines:
        return []

    ydata = {}
    try:
        ydata = yaml.load(author_lines, BotYAMLLoader)
    except Exception as e:
        print(e)
        return []

    # quit early if the yaml was not valid
    if not ydata:
        return []

    # quit if the author key was not found
    if u'author' not in ydata:
        return []

    if not isinstance(ydata[u'author'], list):
        ydata[u'author'] = [ydata[u'author']]

    authors = []
    for author in ydata[u'author']:
        github_ids = self.extract_github_id(author)
        if github_ids:
            authors.extend(github_ids)
    return authors
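# Hedged sketch (names invented): the shape of the author block that
# get_module_authors() extracts from a module's DOCUMENTATION string, and
# the value the yaml parse would yield for it.
import yaml

def _demo_author_block():
    author_lines = (
        u'author:\n'
        u'    - Jane Doe (@janedoe)\n'
        u'    - John Roe (@johnroe)\n'
    )
    ydata = yaml.safe_load(author_lines)
    authors = ydata[u'author']
    if not isinstance(authors, list):
        authors = [authors]
    # -> [u'Jane Doe (@janedoe)', u'John Roe (@johnroe)']
    return authors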
def scrape_pullrequest_review(self, repo_path, number):
    reviews = {'users': {}, 'reviews': {}}

    url = self.baseurl
    url += '/'
    url += repo_path
    url += '/pull/'
    url += to_text(number)

    rr = self._request_url(url)
    soup = BeautifulSoup(rr.text, 'html.parser')

    # <span class="reviewers-status-icon tooltipped tooltipped-nw
    #  float-right d-block text-center" aria-label="nerzhul requested
    #  changes">
    spans = soup.findAll(
        'span',
        {'class': lambda L: L and 'reviewers-status-icon' in L})
    for span in spans:
        # nerzhul requested changes
        # bcoca left review comments
        # gundalow approved these changes
        # requested review from gundalow
        txt = span.attrs['aria-label']
        tparts = txt.split(None, 1)
        if tparts[0].lower() != 'awaiting':
            reviews['users'][tparts[0]] = tparts[1]

    # <div class="discussion-item discussion-item-review_requested">
    # <div id="pullrequestreview-15502866" class="timeline-comment
    #  js-comment">
    rdivs = soup.findAll(
        'div',
        {'class': lambda L: L and 'discussion-item-review' in L})

    count = 0
    for rdiv in rdivs:
        count += 1

        author = rdiv.find('a', {'class': ['author']}).text

        id_div = rdiv.find(
            'div',
            {'id': lambda L: L and L.startswith('pullrequestreview-')})
        if id_div:
            rid = id_div.attrs['id']
        else:
            rid = count

        tdiv = rdiv.find('relative-time')
        if tdiv:
            timestamp = tdiv['datetime']
        else:
            timestamp = None

        obutton = rdiv.findAll(
            'button',
            {'class': lambda L: L and 'outdated-comment-label' in L})
        outdated = bool(obutton)

        reviewer = None

        # https://github.com/ansible/ansibullbot/issues/523
        adiv = rdiv.find(
            'div',
            {'class': lambda L: L and L.startswith('discussion-item-header')})
        if not adiv:
            adiv = rdiv.find('div', {'class': 'discussion-item'})
        if not adiv:
            adiv = rdiv.find(
                'h3',
                {'class': lambda L: L and L.startswith('discussion-item-header')})

        atxt = adiv.text
        atxt = atxt.lower()
        if 'suggested changes' in atxt:
            action = 'suggested changes'
        elif 'requested changes' in atxt:
            action = 'requested changes'
        elif 'self-requested a review' in atxt:
            # <a href="/resmo" class="author">resmo</a>
            action = 'requested review'
            ra = rdiv.find('a', {'class': 'author'})
            if ra:
                reviewer = ra.text.strip()
        elif 'requested a review' in atxt:
            action = 'requested review'
            tparts = atxt.split()
            findex = tparts.index('from')
            reviewer = tparts[findex + 1]
        elif 'requested review' in atxt:
            action = 'requested review'
            tparts = atxt.split()
            findex = tparts.index('from')
            reviewer = tparts[findex + 1]
        elif 'approved these changes' in atxt:
            action = 'approved'
        elif 'left review comments' in atxt:
            action = 'review comment'
        elif 'reviewed' in atxt:
            action = 'reviewed'
        elif 'dismissed' in atxt:
            action = 'dismissed'
        elif 'removed ' in atxt:
            action = 'removed'
            tparts = atxt.split()
            if 'from' in tparts:
                findex = tparts.index('from')
                reviewer = tparts[findex + 1]
        else:
            raise Exception('parsing error on %s' % atxt)

        reviews['reviews'][rid] = {
            'actor': author,
            'action': action,
            'reviewer': reviewer,
            'timestamp': timestamp,
            'outdated': outdated
        }

    # force the usernames and actions to plain ascii text
    x = {}
    for k, v in reviews['users'].items():
        k = k.encode('ascii', 'ignore').decode('ascii')
        v = v.encode('ascii', 'ignore').decode('ascii')
        x[k] = v
    reviews['users'] = x.copy()

    return reviews
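# Hedged sketch (markup invented): the class-lambda findAll pattern that
# scrape_pullrequest_review() relies on, shown against a minimal HTML
# snippet instead of a live pull request page.
from bs4 import BeautifulSoup

def _demo_class_lambda():
    html = (u'<span class="reviewers-status-icon float-right" '
            u'aria-label="someuser requested changes"></span>')
    soup = BeautifulSoup(html, 'html.parser')
    spans = soup.findAll(
        'span',
        {'class': lambda L: L and 'reviewers-status-icon' in L})
    # -> [u'someuser requested changes']
    return [span.attrs['aria-label'] for span in spans]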
def fuzzy_match(self, repo=None, title=None, component=None):
    '''Fuzzy matching for modules'''

    logging.debug(u'fuzzy match {}'.format(
        to_text(to_bytes(component, 'ascii', 'ignore'), 'ascii')))

    if component.lower() == u'core':
        return None

    # https://github.com/ansible/ansible/issues/18179
    if u'validate-modules' in component:
        return None

    # https://github.com/ansible/ansible/issues/20368
    if u'module_utils' in component:
        return None

    if u'new module' in component:
        return None

    # authorized_keys vs. authorized_key
    if component and component.endswith(u's'):
        tm = self.find_match(component[:-1])
        if tm:
            if not isinstance(tm, list):
                return tm[u'name']
            elif len(tm) == 1:
                return tm[0][u'name']
            else:
                if C.DEFAULT_BREAKPOINTS:
                    logging.error(u'breakpoint!')
                    import epdb
                    epdb.st()

    match = None
    known_modules = []

    for k, v in six.iteritems(self.modules):
        if v[u'name'] in [u'include']:
            continue
        known_modules.append(v[u'name'])

    title = title.lower()
    title = title.replace(u':', u'')
    title_matches = [x for x in known_modules if x + u' module' in title]

    if not title_matches:
        title_matches = [
            x for x in known_modules if title.startswith(x + u' ')
        ]
    if not title_matches:
        title_matches = \
            [x for x in known_modules if u' ' + x + u' ' in title]

    if title_matches:
        title_matches = [x for x in title_matches if x != u'at']

    # don't do singular word matching in the title for ansible/ansible
    cmatches = None
    if component:
        cmatches = [x for x in known_modules if x in component]
        cmatches = [x for x in cmatches if u'_' + x not in component]

        # globs
        if not cmatches and u'*' in component:
            fmatches = [
                x for x in known_modules if fnmatch.fnmatch(x, component)
            ]
            if fmatches:
                cmatches = fmatches[:]

        if title_matches:
            # disambiguate with the title matches
            cmatches = [
                x for x in cmatches
                if x in title_matches and x not in [u'at']
            ]

        if cmatches:
            if len(cmatches) >= 1 and (u'*' not in component and u'modules' not in component):
                match = cmatches[0]
            else:
                match = cmatches[:]

        logging.debug("module - component matches: %s" % cmatches)

    if not match:
        if len(title_matches) == 1:
            match = title_matches[0]
        else:
            logging.debug("module - title matches: %s" % title_matches)

    return match
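# Hedged sketch (module names invented): the fnmatch glob fallback that
# fuzzy_match() uses when a component string contains '*'.
import fnmatch

def _demo_glob_component(component, known_modules=None):
    if known_modules is None:
        known_modules = [u'ec2', u'ec2_group', u'ec2_vol', u'gce']
    return [x for x in known_modules if fnmatch.fnmatch(x, component)]

# _demo_glob_component(u'ec2*') -> [u'ec2', u'ec2_group', u'ec2_vol']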
def get_module_commits(self):
    keys = sorted(self.modules.keys())
    for k in keys:
        self.commits[k] = []
        cpath = os.path.join(self.gitrepo.checkoutdir, k)
        if not os.path.isfile(cpath):
            continue

        mtime = os.path.getmtime(cpath)
        refresh = False
        pfile = os.path.join(
            self.scraper_cache,
            k.replace('/', '_') + '.commits.pickle'
        )

        if not os.path.isfile(pfile):
            refresh = True
        else:
            logging.debug(pfile)
            with open(pfile, 'rb') as f:
                pdata = pickle.load(f)
            if pdata[0] == mtime:
                self.commits[k] = pdata[1]
            else:
                refresh = True

        if refresh:
            logging.info('refresh commit cache for %s' % k)
            cmd = 'cd %s; git log --follow %s' % (self.gitrepo.checkoutdir, k)
            (rc, so, se) = run_command(cmd)
            for line in to_text(so).split('\n'):
                if line.startswith('commit '):
                    commit = {
                        'name': None,
                        'email': None,
                        'login': None,
                        'hash': line.split()[-1],
                        'date': None
                    }

                # Author: Matt Clay <*****@*****.**>
                if line.startswith('Author: '):
                    line = line.replace('Author: ', '')
                    line = line.replace('<', '')
                    line = line.replace('>', '')
                    lparts = line.split()

                    if '@' in lparts[-1]:
                        commit['email'] = lparts[-1]
                        commit['name'] = ' '.join(lparts[:-1])

                    if commit['email'] and \
                            'noreply.github.com' in commit['email']:
                        commit['login'] = commit['email'].split('@')[0]

                # Date:   Sat Jan 28 23:28:53 2017 -0800
                if line.startswith('Date:'):
                    dstr = line.split(':', 1)[1].strip()
                    dstr = ' '.join(dstr.split(' ')[:-1])
                    commit['date'] = strip_time_safely(to_text(dstr))
                    self.commits[k].append(commit)

            with open(pfile, 'wb') as f:
                pickle.dump((mtime, self.commits[k]), f)
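# Hedged sketch (log text invented): how get_module_commits() folds the
# 'commit'/'Author:' lines of `git log --follow` output into commit dicts;
# date handling is omitted here for brevity.
def _demo_parse_git_log(log_text):
    commits = []
    commit = None
    for line in log_text.split('\n'):
        if line.startswith('commit '):
            commit = {'hash': line.split()[-1], 'name': None, 'email': None}
            commits.append(commit)
        elif line.startswith('Author: ') and commit is not None:
            raw = line[len('Author: '):].replace('<', '').replace('>', '')
            lparts = raw.split()
            if '@' in lparts[-1]:
                commit['email'] = lparts[-1]
                commit['name'] = ' '.join(lparts[:-1])
    return commits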