def __init__(self, commits=True, blames=True, botmeta=None, botmetafile=None,
             maintainers=None, gh_client=None,
             cachedir=u'~/.ansibullbot/cache', gitrepo=None):
    '''
    Maintainers: defaultdict(dict) where keys are filepath and values are dict
    gh_client: GraphQL GitHub client
    '''
    self.get_commits = commits
    self.get_blames = blames
    self.botmetafile = botmetafile
    if botmeta:
        self.botmeta = botmeta
    else:
        # BOTMETA.yml file with minor updates (macro rendered, empty default values fixed)
        self.botmeta = {}
    self.modules = {}  # keys: paths of files belonging to the repository
    self.maintainers = maintainers or {}
    self.importmap = {}
    self.scraper_cache = os.path.join(cachedir, u'ansible.modules.scraper')
    self.scraper_cache = os.path.expanduser(self.scraper_cache)
    self.gws = GithubWebScraper(cachedir=self.scraper_cache)
    self.gqlc = gh_client
    self.files = []

    if gitrepo:
        self.gitrepo = gitrepo
    else:
        self.gitrepo = GitRepoWrapper(
            cachedir=cachedir,
            repo=u'https://github.com/ansible/ansible')

    # sqlalchemy
    unc = os.path.join(cachedir, u'ansible_module_indexer.db')
    unc = os.path.expanduser(unc)
    unc = u'sqlite:///' + unc
    self.engine = create_engine(unc)
    self.Session = sessionmaker(bind=self.engine)
    self.session = self.Session()
    Email.metadata.create_all(self.engine)
    Blame.metadata.create_all(self.engine)

    # committers by module
    self.committers = {}
    # commits by module
    self.commits = {}
    # map of email to github login
    self.emails_cache = {}

    # load the bot meta
    self.update(force=True)
def main():
    if len(sys.argv) != 2:
        usage()
        sys.exit(1)

    ec = EmailCache()
    cm = AnsibleComponentMatcher(
        gitrepo=GitRepoWrapper(cachedir=CACHEDIR, repo=REPO),
        botmetafile=None,
        cachedir=CACHEDIR,
        email_cache=ec
    )
    cm.update()

    # This gets the rendered meta for just modules ...
    #meta = cm.BOTMETA
    #print(json.dumps(cm.BOTMETA, indent=2, sort_keys=True))

    # This is how the bot gets full meta for a file ...
    FULLMETA = {}
    for filen in cm.gitrepo.files:
        FULLMETA[filen] = cm.get_meta_for_file(filen)

    with open(sys.argv[1], 'w') as f:
        f.write(json.dumps(FULLMETA, indent=2, sort_keys=True))
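# --- hedged usage sketch (added for illustration, not in the original file) ---
# main() writes one JSON object per tracked file to the path given in argv[1].
# Reading the dump back looks roughly like this; the dump path and the module
# key below are assumed example values only.
def _example_read_fullmeta(dump_path='/tmp/fullmeta.json',
                           key='lib/ansible/modules/system/ping.py'):
    import json
    with open(dump_path) as f:
        fullmeta = json.load(f)
    # each entry mirrors get_meta_for_file(): maintainers, labels, support, ...
    return fullmeta.get(key, {}).get('maintainers', [])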
def get_repo_for_collection(self, fqcn):
    today = datetime.datetime.now()

    if fqcn not in self._gitrepos:

        # reduce the number of requests ...
        try:
            rurl = self._checkout_index.get(fqcn, {}).get('url')
        except AttributeError as e:
            print(e)
            import epdb; epdb.st()

        if rurl is None:
            # https://galaxy.ansible.com/api/v2/collections/devoperate/base/
            curl = self._baseurl + '/api/v2/collections/' + fqcn.replace('.', '/') + '/'
            rr = requests.get(curl)
            jdata = rr.json()
            vurl = jdata['latest_version']['href']
            rr2 = requests.get(vurl)
            jdata2 = rr2.json()
            rurl = jdata2.get('metadata', {}).get('repository')

        # reduce the number of clones and rebases ...
        needs_rebase = False
        if fqcn not in self._checkout_index:
            needs_rebase = True
        elif not self._checkout_index.get(fqcn, {}).get('checkout'):
            needs_rebase = True
        elif not self._checkout_index.get(fqcn, {}).get('updated'):
            needs_rebase = True
        elif (today - self._checkout_index[fqcn]['updated']).days > 0:
            needs_rebase = True

        logging.info('checkout %s -> %s' % (fqcn, rurl))
        grepo = GitRepoWrapper(cachedir=self.cachedir, repo=rurl, rebase=needs_rebase)
        self._gitrepos[fqcn] = grepo

        # keep the last updated time if not rebased ...
        if needs_rebase:
            updated = datetime.datetime.now()
        else:
            updated = self._checkout_index[fqcn]['updated']

        self._checkout_index[fqcn] = {
            'url': rurl,
            'fqcn': fqcn,
            'checkout': grepo.checkoutdir,
            'updated': updated
        }
        self._save_checkout_index()

    return self._gitrepos[fqcn]
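# --- hedged usage sketch (added for illustration, not in the original file) ---
# Given an instance of the enclosing class (it carries _gitrepos, _checkout_index,
# _baseurl and cachedir), a collection FQCN resolves to a cached git checkout;
# 'community.general' is only an example FQCN, not a value taken from this code.
def _example_collection_checkout(galaxy_indexer, fqcn='community.general'):
    repo = galaxy_indexer.get_repo_for_collection(fqcn)
    # GitRepoWrapper exposes the on-disk clone via .checkoutdir
    return repo.checkoutdir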
def __init__(self):
    self.cachedir = '/tmp/ansibot.cache'
    self.gitrepo = GitRepoWrapper(
        cachedir=self.cachedir,
        repo='https://github.com/ansible/ansible',
        commit='a76d78f6919f62698341be2f102297a2ce30897c')
    self.component_matcher = AnsibleComponentMatcher(
        usecache=True,
        gitrepo=self.gitrepo,
        cachedir='/tmp/ansibot.cache.components',
        email_cache={})
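# --- hedged usage sketch (added for illustration, not in the original file) ---
# The fixture above exposes an AnsibleComponentMatcher as self.component_matcher;
# matching is normally driven through match_components(title, body, component).
# The title/body/component strings below are made-up examples.
def _example_component_match(fixture):
    return fixture.component_matcher.match_components(
        u'copy module ignores remote_src',   # issue title
        u'steps to reproduce ...',           # issue body
        u'copy'                              # component_raw from the issue template
    )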
def __init__(self, gitrepo=None, botmetafile=None, cachedir=None,
             email_cache=None, file_indexer=None):
    self.cachedir = cachedir
    self.botmetafile = botmetafile
    self.email_cache = email_cache

    if file_indexer:
        self.file_indexer = file_indexer
    else:
        self.file_indexer = FileIndexer(
            botmetafile=self.botmetafile,
            checkoutdir=self.cachedir
        )

    if gitrepo:
        self.gitrepo = gitrepo
    else:
        self.gitrepo = GitRepoWrapper(cachedir=self.cachedir, repo=self.REPO)

    self.strategy = None
    self.strategies = []

    self.indexed_at = False
    self.updated_at = None

    self.update()
def __init__(self, commits=True, blames=True, botmetafile=None, maintainers=None,
             gh_client=None, cachedir=u'~/.ansibullbot/cache', gitrepo=None):
    '''
    Maintainers: defaultdict(dict) where keys are filepath and values are dict
    gh_client: GraphQL GitHub client
    '''
    self.get_commits = commits
    self.get_blames = blames
    self.botmetafile = botmetafile
    # BOTMETA.yml file with minor updates (macro rendered, empty default values fixed)
    self.botmeta = {}
    self.modules = {}  # keys: paths of files belonging to the repository
    self.maintainers = maintainers or {}
    self.importmap = {}
    self.scraper_cache = os.path.join(cachedir, u'ansible.modules.scraper')
    self.scraper_cache = os.path.expanduser(self.scraper_cache)
    self.gws = GithubWebScraper(cachedir=self.scraper_cache)
    self.gqlc = gh_client
    self.files = []

    if gitrepo:
        self.gitrepo = gitrepo
    else:
        self.gitrepo = GitRepoWrapper(cachedir=cachedir,
                                      repo=u'https://github.com/ansible/ansible')

    # sqlalchemy
    unc = os.path.join(cachedir, u'ansible_module_indexer.db')
    unc = os.path.expanduser(unc)
    unc = u'sqlite:///' + unc
    self.engine = create_engine(unc)
    self.Session = sessionmaker(bind=self.engine)
    self.session = self.Session()
    Email.metadata.create_all(self.engine)
    Blame.metadata.create_all(self.engine)

    # committers by module
    self.committers = {}
    # commits by module
    self.commits = {}
    # map of email to github login
    self.emails_cache = {}

    # load the bot meta
    self.update(force=True)
class ModuleIndexer(object): EMPTY_MODULE = { u'authors': [], u'name': None, u'namespaced_module': None, u'namespace_maintainers': [], u'deprecated': False, u'deprecated_filename': None, u'dirpath': None, u'filename': None, u'filepath': None, u'fulltopic': None, u'maintainers': [], u'_maintainers': [], u'maintainers_keys': None, u'metadata': {}, u'repo_filename': None, u'repository': u'ansible', u'subtopic': None, u'topic': None, u'imports': [] } def __init__(self, commits=True, blames=True, botmetafile=None, maintainers=None, gh_client=None, cachedir=u'~/.ansibullbot/cache', gitrepo=None): ''' Maintainers: defaultdict(dict) where keys are filepath and values are dict gh_client: GraphQL GitHub client ''' self.get_commits = commits self.get_blames = blames self.botmetafile = botmetafile self.botmeta = { } # BOTMETA.yml file with minor updates (macro rendered, empty default values fixed) self.modules = {} # keys: paths of files belonging to the repository self.maintainers = maintainers or {} self.importmap = {} self.scraper_cache = os.path.join(cachedir, u'ansible.modules.scraper') self.scraper_cache = os.path.expanduser(self.scraper_cache) self.gws = GithubWebScraper(cachedir=self.scraper_cache) self.gqlc = gh_client self.files = [] if gitrepo: self.gitrepo = gitrepo else: self.gitrepo = GitRepoWrapper( cachedir=cachedir, repo=u'https://github.com/ansible-collections/community.general' ) # sqlalchemy unc = os.path.join(cachedir, u'ansible_module_indexer.db') unc = os.path.expanduser(unc) unc = u'sqlite:///' + unc self.engine = create_engine(unc) self.Session = sessionmaker(bind=self.engine) self.session = self.Session() Email.metadata.create_all(self.engine) Blame.metadata.create_all(self.engine) # committers by module self.committers = {} # commits by module self.commits = {} # map of email to github login self.emails_cache = {} # load the bot meta self.update(force=True) def update(self, force=False): '''Reload everything if there are new commits''' changed = self.gitrepo.manage_checkout() if changed or force: self.get_files() self.parse_metadata() def get_files(self): '''Cache a list of filenames in the checkout''' cmd = u'cd {}; git ls-files'.format(self.gitrepo.checkoutdir) (rc, so, se) = run_command(cmd) files = to_text(so).split(u'\n') files = [x.strip() for x in files if x.strip()] self.files = files def parse_metadata(self): if self.botmetafile is not None: with open(self.botmetafile, 'rb') as f: rdata = f.read() else: fp = u'.github/BOTMETA.yml' rdata = self.get_file_content(fp) self.botmeta = BotMetadataParser.parse_yaml(rdata) # load the modules logging.info(u'loading modules') self.get_ansible_modules() def _find_match(self, pattern, exact=False): logging.debug(u'exact:{} matching on {}'.format(exact, pattern)) matches = [] if isinstance(pattern, six.text_type): pattern = to_text(to_bytes(pattern, 'ascii', 'ignore'), 'ascii') for k, v in six.iteritems(self.modules): if v[u'name'] == pattern: logging.debug(u'match {} on name: {}'.format(k, v[u'name'])) matches = [v] break if not matches: # search by key ... 
aka the filepath for k, v in six.iteritems(self.modules): if k == pattern: logging.debug(u'match {} on key: {}'.format(k, k)) matches = [v] break if not matches and not exact: # search by properties for k, v in six.iteritems(self.modules): for subkey in v.keys(): if v[subkey] == pattern: logging.debug(u'match {} on subkey: {}'.format( k, subkey)) matches.append(v) if not matches and not exact: # Levenshtein distance should workaround most typos distance_map = {} for k, v in six.iteritems(self.modules): mname = v.get(u'name') if not mname: continue if isinstance(mname, six.text_type): mname = to_text(to_bytes(mname, 'ascii', 'ignore'), 'ascii') try: res = Levenshtein.distance(pattern, mname) except TypeError as e: logging.error(e) if C.DEFAULT_BREAKPOINTS: logging.error(u'breakpoint!') import epdb epdb.st() distance_map[mname] = [res, k] res = sorted(distance_map.items(), key=lambda x: x[1], reverse=True) if len(pattern) > 3 > res[-1][1]: logging.debug(u'levenshtein ratio match: ({}) {} {}'.format( res[-1][-1], res[-1][0], pattern)) matches = [self.modules[res[-1][-1]]] return matches def find_match(self, pattern, exact=False): '''Exact module name matching''' logging.debug(u'find_match for "{}"'.format(pattern)) BLACKLIST = [ u'module_utils', u'callback', u'network modules', u'networking modules' u'windows modules' ] if not pattern or pattern is None: return None if pattern.lower() == u'core': return None ''' if 'docs.ansible.com' in pattern and '_module.html' in pattern: # http://docs.ansible.com/ansible/latest/copy_module.html # http://docs.ansible.com/ansible/latest/dev_guide/developing_modules.html # http://docs.ansible.com/ansible/latest/postgresql_db_module.html # [helm module](https//docs.ansible.com/ansible/2.4/helm_module.html) # Windows module: win_robocopy\nhttp://docs.ansible.com/ansible/latest/win_robocopy_module.html # Examples:\n* archive (https://docs.ansible.com/ansible/archive_module.html)\n* s3_sync (https://docs.ansible.com/ansible/s3_sync_module.html) urls = re.findall( 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', pattern ) #urls = [x for x in urls if '_module.html' in x] #if urls: # import epdb; epdb.st() import epdb; epdb.st() ''' # https://github.com/ansible/ansible/issues/19755 if pattern == u'setup': pattern = u'system/setup.py' if u'/facts.py' in pattern or u' facts.py' in pattern: pattern = u'system/setup.py' # https://github.com/ansible/ansible/issues/18527 # docker-container -> docker_container if u'-' in pattern: pattern = pattern.replace(u'-', u'_') if u'module_utils' in pattern: # https://github.com/ansible/ansible/issues/20368 return None elif u'callback' in pattern: return None elif u'lookup' in pattern: return None elif u'contrib' in pattern and u'inventory' in pattern: return None elif pattern.lower() in BLACKLIST: return None elif u'/' in pattern and not self._find_match(pattern, exact=True): # https://github.com/ansible/ansible/issues/20520 # FIXME what's this for? 
if not pattern.startswith(u'plugins/'): keys = self.modules.keys() for k in keys: if pattern in k: ppy = pattern + u'.py' if k.endswith(pattern) or k.endswith(ppy): return self.modules[k] elif pattern.endswith(u'.py') and self._find_match(pattern, exact=False): # https://github.com/ansible/ansible/issues/19889 candidate = self._find_match(pattern, exact=False) if isinstance(candidate, list): if len(candidate) == 1: candidate = candidate[0] if candidate[u'filename'] == pattern: return candidate match = self._find_match(pattern, exact=exact) if not match and not exact: # check for just the basename # 2617: ansible-s-extras/network/cloudflare_dns.py bname = os.path.basename(pattern) match = self._find_match(bname) if not match: # check for deprecated name # _fireball -> fireball match = self._find_match(u'_' + bname) # unique the results if isinstance(match, list) and len(match) > 1: _match = [] for m in match: if m not in _match: _match.append(m) match = _match[:] return match def is_valid(self, mname): match = self.find_match(mname, exact=True) if match: return True else: return False def get_repository_for_module(self, mname): match = self.find_match(mname, exact=True) if match: return match[u'repository'] else: return None def get_ansible_modules(self): """Make a list of known modules""" matches = [] module_dir = os.path.join(self.gitrepo.checkoutdir, u'plugins/modules') module_dir = os.path.expanduser(module_dir) for root, _, filenames in os.walk(module_dir): for filename in filenames: if u'plugins/modules' in root and not filename == u'__init__.py': matches.append(os.path.join(root, filename)) matches = sorted(set(matches)) self.populate_modules(matches) # custom fixes newitems = [] for k, v in six.iteritems(self.modules): # include* is almost always an ansible/ansible issue # https://github.com/ansible/ansibullbot/issues/214 if k.endswith(u'/include.py'): self.modules[k][u'repository'] = u'ansible' # https://github.com/ansible/ansibullbot/issues/214 if k.endswith(u'/include_vars.py'): self.modules[k][u'repository'] = u'ansible' if k.endswith(u'/include_role.py'): self.modules[k][u'repository'] = u'ansible' # ansible maintains these if u'include' in k: self.modules[k][u'maintainers'] = [u'ansible'] # deprecated modules are annoying if v[u'name'].startswith(u'_'): dkey = os.path.dirname(v[u'filepath']) dkey = os.path.join(dkey, v[u'filename'].replace(u'_', u'', 1)) if dkey not in self.modules: nd = v.copy() nd[u'name'] = nd[u'name'].replace(u'_', u'', 1) newitems.append((dkey, nd)) for ni in newitems: self.modules[ni[0]] = ni[1] # parse metadata logging.debug(u'set module metadata') self.set_module_metadata() # parse imports logging.debug(u'set module imports') self.set_module_imports() # last modified if self.get_commits: logging.debug(u'set module commits') self.get_module_commits() # parse blame if self.get_blames and self.get_commits: logging.debug(u'set module blames') self.get_module_blames() # depends on metadata now ... 
logging.debug(u'set module maintainers') self.set_maintainers() return self.modules def populate_modules(self, matches): # figure out the names for match in matches: mdict = copy.deepcopy(self.EMPTY_MODULE) mdict[u'filename'] = os.path.basename(match) dirpath = os.path.dirname(match) dirpath = dirpath.replace(self.gitrepo.checkoutdir + u'/', u'') mdict[u'dirpath'] = dirpath filepath = match.replace(self.gitrepo.checkoutdir + u'/', u'') mdict[u'filepath'] = filepath mdict.update(self.split_topics_from_path(filepath)) mdict[u'repo_filename'] = mdict[u'filepath']\ .replace(u'plugins/modules/%s/' % mdict[u'repository'], u'') # clustering/consul mdict[u'namespaced_module'] = mdict[u'repo_filename'] mdict[u'namespaced_module'] = \ mdict[u'namespaced_module'].replace(u'.py', u'') mdict[u'namespaced_module'] = \ mdict[u'namespaced_module'].replace(u'.ps1', u'') mname = os.path.basename(match) mname = mname.replace(u'.py', u'') mname = mname.replace(u'.ps1', u'') mdict[u'name'] = mname # deprecated modules if mname.startswith(u'_'): mdict[u'deprecated'] = True deprecated_filename = \ os.path.dirname(mdict[u'namespaced_module']) deprecated_filename = \ os.path.join(deprecated_filename, mname[1:] + u'.py') mdict[u'deprecated_filename'] = deprecated_filename else: mdict[u'deprecated_filename'] = mdict[u'repo_filename'] self.modules[filepath] = mdict # meta is a special module self.modules[u'meta'] = copy.deepcopy(self.EMPTY_MODULE) self.modules[u'meta'][u'name'] = u'meta' self.modules[u'meta'][u'repo_filename'] = u'meta' def get_module_commits(self): keys = self.modules.keys() keys = sorted(keys) for k in keys: self.commits[k] = [] cpath = os.path.join(self.gitrepo.checkoutdir, k) if not os.path.isfile(cpath): continue mtime = os.path.getmtime(cpath) refresh = False pfile = os.path.join(self.scraper_cache, k.replace(u'/', u'_') + u'.commits.pickle') if not os.path.isfile(pfile): refresh = True else: pickle_kwargs = {'encoding': 'bytes'} if six.PY3 else {} print(pfile) with open(pfile, 'rb') as f: pdata = pickle_load(f, **pickle_kwargs) if pdata[0] == mtime: self.commits[k] = pdata[1] else: refresh = True if refresh: logging.info(u'refresh commit cache for %s' % k) cmd = u'cd %s; git log --follow %s' % ( self.gitrepo.checkoutdir, k) (rc, so, se) = run_command(cmd) for line in to_text(so).split(u'\n'): if line.startswith(u'commit '): commit = { u'name': None, u'email': None, u'login': None, u'hash': line.split()[-1], u'date': None } # Author: Matt Clay <*****@*****.**> if line.startswith(u'Author: '): line = line.replace(u'Author: ', u'') line = line.replace(u'<', u'') line = line.replace(u'>', u'') lparts = line.split() if u'@' in lparts[-1]: commit[u'email'] = lparts[-1] commit[u'name'] = u' '.join(lparts[:-1]) else: pass if commit[u'email'] and \ u'noreply.github.com' in commit[u'email']: commit[u'login'] = commit[u'email'].split(u'@')[0] # Date: Sat Jan 28 23:28:53 2017 -0800 if line.startswith(u'Date:'): dstr = line.split(u':', 1)[1].strip() dstr = u' '.join(dstr.split(u' ')[:-1]) ds = datetime.datetime.strptime( to_text(dstr), u'%a %b %d %H:%M:%S %Y') commit[u'date'] = ds self.commits[k].append(commit) with open(pfile, 'wb') as f: pickle_dump((mtime, self.commits[k]), f) def last_commit_for_file(self, filepath): if filepath in self.commits and u'hash' in self.commits[filepath][0]: return self.commits[filepath][0][u'hash'] # git log --pretty=format:'%H' -1 # lib/ansible/modules/cloud/amazon/ec2_metric_alarm.py cmd = u'cd %s; git log --pretty=format:\'%%H\' -1 %s' % \ (self.gitrepo.checkoutdir, 
filepath) (rc, so, se) = run_command(cmd) return to_text(so).strip() def get_module_blames(self): logging.debug(u'build email cache') emails_cache = self.session.query(Email) emails_cache = [(x.email, x.login) for x in emails_cache] self.emails_cache = dict(emails_cache) logging.debug(u'build blame cache') blame_cache = self.session.query(Blame).all() blame_cache = [x.file_commit for x in blame_cache] blame_cache = sorted(set(blame_cache)) logging.debug(u'eval module hashes') changed = False keys = sorted(self.modules.keys()) for k in keys: if k not in self.files: self.committers[k] = {} continue ghash = self.last_commit_for_file(k) if ghash in blame_cache: continue logging.debug(u'checking hash for {}'.format(k)) res = self.session.query(Blame).filter_by(file_name=k, file_commit=ghash).all() hashes = [x.file_commit for x in res] if ghash not in hashes: logging.debug( u'hash {} not found for {}, updating blames'.format( ghash, k)) scraper_args = [u'ansible', u'ansible', u'devel', k] uns, emailmap = self.gqlc.get_usernames_from_filename_blame( *scraper_args) # check the emails for email, login in emailmap.items(): if email in self.emails_cache: continue exists = self.session.query(Email).filter_by( email=email).first() if not exists: logging.debug(u'insert {}:{}'.format(login, email)) _email = Email(email=email, login=login) self.session.add(_email) changed = True # check the blames for login, commits in uns.items(): for commit in commits: kwargs = { u'file_name': k, u'file_commit': ghash, u'author_commit': commit, u'author_login': login } exists = self.session.query(Blame).filter_by( **kwargs).first() if not exists: logging.debug(u'insert {}:{}:{}'.format( k, commit, login)) _blame = Blame(**kwargs) self.session.add(_blame) changed = True if changed: self.session.commit() logging.debug(u're-build email cache') emails_cache = self.session.query(Email) emails_cache = [(x.email, x.login) for x in emails_cache] self.emails_cache = dict(emails_cache) # fill in what we can ... 
logging.debug(u'fill in commit logins') for k in keys: for idc, commit in enumerate(self.commits[k][:]): if not commit.get(u'login'): continue login = self.emails_cache.get(commit[u'email']) if not login and u'@users.noreply.github.com' in commit[ u'email']: login = commit[u'email'].split(u'@')[0] self.emails_cache[commit[u'email']] = login if not login: print(u'unknown: {}'.format(commit[u'email'])) self.commits[k][idc][u'login'] = self.emails_cache.get(login) def get_emails_by_login(self, login): res = self.session.query(Email).filter_by(login=login) emails = [x.email for x in res.values()] return emails def _get_module_blames(self): ''' Scrape the blame page for each module and store it ''' keys = sorted(self.modules.keys()) # scrape the data for k in keys: cpath = os.path.join(self.gitrepo.checkoutdir, k) if not os.path.isfile(cpath): self.committers[k] = {} continue ghash = self.last_commit_for_file(k) pfile = os.path.join(self.scraper_cache, k.replace(u'/', u'_') + u'.blame.pickle') sargs = [u'ansible', u'ansible', u'devel', k] refresh = False if not os.path.isfile(pfile): refresh = True else: logging.debug(u'load {}'.format(pfile)) with open(pfile, 'rb') as f: pdata = pickle_load(f) if C.DEFAULT_BREAKPOINTS: logging.error(u'breakpoint!') import epdb epdb.st() if pdata[0] == ghash: self.committers[k] = pdata[1] if len(pdata) == 3: # use emailmap if available emailmap = pdata[2] else: emailmap = {} else: refresh = True if refresh: if self.gqlc: logging.debug(u'graphql blame usernames {}'.format(pfile)) uns, emailmap = self.gqlc.get_usernames_from_filename_blame( *sargs) else: emailmap = {} # scrapping: emails not available logging.debug(u'www blame usernames {}'.format(pfile)) uns = self.gws.get_usernames_from_filename_blame(*sargs) self.committers[k] = uns with open(pfile, 'wb') as f: pickle_dump((ghash, uns, emailmap), f) for email, github_id in emailmap.items(): if email not in self.emails_cache: self.emails_cache[email] = github_id # add scraped logins to the map for k in keys: for idx, x in enumerate(self.commits[k]): if x[u'email'] in [u'@']: continue if x[u'email'] not in self.emails_cache: self.emails_cache[x[u'email']] = None if x[u'login']: self.emails_cache[x[u'email']] = x[u'login'] continue xhash = x[u'hash'] for ck, cv in six.iteritems(self.committers[k]): if xhash in cv: self.emails_cache[x[u'email']] = ck break # fill in what we can ... 
for k in keys: for idx, x in enumerate(self.commits[k]): if not x[u'login']: if x[u'email'] in [u'@']: continue if self.emails_cache[x[u'email']]: login = self.emails_cache[x[u'email']] xhash = x[u'hash'] self.commits[k][idx][u'login'] = login if login not in self.committers[k]: self.committers[k][login] = [] if xhash not in self.committers[k][login]: self.committers[k][login].append(xhash) def set_maintainers(self): '''Define the maintainers for each module''' # grep the authors: for k, v in six.iteritems(self.modules): if v[u'filepath'] is None: continue mfile = os.path.join(self.gitrepo.checkoutdir, v[u'filepath']) authors = self.get_module_authors(mfile) self.modules[k][u'authors'] = authors # authors are maintainers by -default- self.modules[k][u'maintainers'] += authors self.modules[k][u'maintainers'] = \ sorted(set(self.modules[k][u'maintainers'])) metadata = self.botmeta[u'files'].keys() for k, v in six.iteritems(self.modules): if k == u'meta': continue if k in self.botmeta[u'files']: # There are metadata in .github/BOTMETA.yml for this file # copy maintainers_keys self.modules[k][u'maintainers_keys'] = self.botmeta[u'files'][ k][u'maintainers_keys'][:] if self.botmeta[u'files'][k]: maintainers = self.botmeta[u'files'][k].get( u'maintainers', []) for maintainer in maintainers: if maintainer not in self.modules[k][u'maintainers']: self.modules[k][u'maintainers'].append(maintainer) # remove the people who want to be ignored if u'ignored' in self.botmeta[u'files'][k]: ignored = self.botmeta[u'files'][k][u'ignored'] for x in ignored: if x in self.modules[k][u'maintainers']: self.modules[k][u'maintainers'].remove(x) else: # There isn't metadata in .github/BOTMETA.yml for this file best_match = None for mkey in metadata: if v[u'filepath'].startswith(mkey): if not best_match: best_match = mkey continue if len(mkey) > len(best_match): best_match = mkey if best_match: self.modules[k][u'maintainers_keys'] = [best_match] for maintainer in self.botmeta[u'files'][best_match].get( u'maintainers', []): if maintainer not in self.modules[k][u'maintainers']: self.modules[k][u'maintainers'].append(maintainer) # remove the people who want to be ignored for ignored in self.botmeta[u'files'][best_match].get( u'ignored', []): if ignored in self.modules[k][u'maintainers']: self.modules[k][u'maintainers'].remove(ignored) # save a pristine copy so that higher level code can still use it self.modules[k][u'maintainers'] = sorted( set(self.modules[k][u'maintainers'])) self.modules[k][u'_maintainers'] = \ [x for x in self.modules[k][u'maintainers']] # set the namespace maintainers ... 
for k, v in six.iteritems(self.modules): if u'namespace_maintainers' not in self.modules[k]: self.modules[k][u'namespace_maintainers'] = [] if v.get(u'namespace'): ns = v.get(u'namespace') nms = self.get_maintainers_for_namespace(ns) self.modules[k][u'namespace_maintainers'] = nms def split_topics_from_path(self, module_file): subpath = module_file.replace(u'plugins/modules/', u'') path_parts = subpath.split(u'/') topic = path_parts[0] if len(path_parts) > 2: subtopic = path_parts[1] fulltopic = u'/'.join(path_parts[0:2]) else: subtopic = None fulltopic = path_parts[0] tdata = { u'fulltopic': fulltopic, u'namespace': fulltopic, u'topic': topic, u'subtopic': subtopic } return tdata def get_module_authors(self, module_file): """Grep the authors out of the module docstrings""" if not os.path.exists(module_file): return [] documentation = b'' inphase = False with io.open(module_file, 'rb') as f: for line in f: if b'DOCUMENTATION' in line: inphase = True continue if line.strip().endswith((b"'''", b'"""')): break if inphase: documentation += line if not documentation: return [] # clean out any other yaml besides author to save time inphase = False author_lines = u'' doc_lines = to_text(documentation).split(u'\n') for idx, x in enumerate(doc_lines): if x.startswith(u'author'): inphase = True if inphase and not x.strip().startswith((u'-', u'author')): inphase = False break if inphase: author_lines += x + u'\n' if not author_lines: return [] ydata = {} try: ydata = yaml.load(author_lines, BotYAMLLoader) except Exception as e: print(e) return [] # quit early if the yaml was not valid if not ydata: return [] # quit if the key was not found if u'author' not in ydata: return [] if not isinstance(ydata[u'author'], list): ydata[u'author'] = [ydata[u'author']] authors = [] for author in ydata[u'author']: github_ids = self.extract_github_id(author) if github_ids: authors.extend(github_ids) return authors def extract_github_id(self, author): authors = set() if author is None: return [] if u'ansible core team' in author.lower(): authors.add(u'ansible') elif u'@' in author: # match github ids but not emails authors.update(re.findall(r'(?<!\w)@([\w-]+)(?![\w.])', author)) elif u'github.com/' in author: # {'author': 'Henrique Rodrigues (github.com/Sodki)'} idx = author.find(u'github.com/') author = author[idx + 11:] authors.add(author.replace(u')', u'')) elif u'(' in author and len(author.split()) == 3: # Mathieu Bultel (matbu) idx = author.find(u'(') author = author[idx + 1:] authors.add(author.replace(u')', u'')) # search for emails for email in re.findall(r'[<(]([^@]+@[^)>]+)[)>]', author): github_id = self.emails_cache.get(email) if github_id: authors.add(github_id) return list(authors) def fuzzy_match(self, repo=None, title=None, component=None): '''Fuzzy matching for modules''' logging.debug(u'fuzzy match {}'.format( to_text(to_bytes(component, 'ascii', 'ignore'), 'ascii'))) if component.lower() == u'core': return None # https://github.com/ansible/ansible/issues/18179 if u'validate-modules' in component: return None # https://github.com/ansible/ansible/issues/20368 if u'module_utils' in component: return None if u'new module' in component: return None # authorized_keys vs. 
authorized_key if component and component.endswith(u's'): tm = self.find_match(component[:-1]) if tm: if not isinstance(tm, list): return tm[u'name'] elif len(tm) == 1: return tm[0][u'name'] else: if C.DEFAULT_BREAKPOINTS: logging.error(u'breakpoint!') import epdb epdb.st() match = None known_modules = [] for k, v in six.iteritems(self.modules): if v[u'name'] in [u'include']: continue known_modules.append(v[u'name']) title = title.lower() title = title.replace(u':', u'') title_matches = [x for x in known_modules if x + u' module' in title] if not title_matches: title_matches = [ x for x in known_modules if title.startswith(x + u' ') ] if not title_matches: title_matches = \ [x for x in known_modules if u' ' + x + u' ' in title] if title_matches: title_matches = [x for x in title_matches if x != u'at'] # don't do singular word matching in title for ansible/ansible cmatches = None if component: cmatches = [x for x in known_modules if x in component] cmatches = [x for x in cmatches if not u'_' + x in component] # globs if not cmatches and u'*' in component: fmatches = [ x for x in known_modules if fnmatch.fnmatch(x, component) ] if fmatches: cmatches = fmatches[:] if title_matches: # use title ... ? cmatches = [ x for x in cmatches if x in title_matches and x not in [u'at'] ] if cmatches: if len(cmatches) >= 1 and (u'*' not in component and u'modules' not in component): match = cmatches[0] else: match = cmatches[:] if not match: if u'docs.ansible.com' in component: pass else: pass logging.debug("module - component matches: %s" % cmatches) if not match: if len(title_matches) == 1: match = title_matches[0] else: logging.debug("module - title matches: %s" % title_matches) return match def is_multi(self, rawtext): '''Is the string a list or a glob of modules?''' if rawtext: lines = rawtext.split(u'\n') # clean up lines lines = [x.strip() for x in lines if x.strip()] lines = [x for x in lines if len(x) > 2] if len(lines) > 1: return True if lines: if lines[0].strip().endswith(u'*'): return True return False # https://github.com/ansible/ansible-modules-core/issues/3831 def multi_match(self, rawtext): '''Return a list of matches for a given glob or list of names''' matches = [] lines = rawtext.split(u'\n') lines = [x.strip() for x in lines if x.strip()] for line in lines: # is it an exact name, a path, a globbed name, a globbed path? if line.endswith(u'*'): thiskey = line.replace(u'*', u'') keymatches = [] for k in self.modules.keys(): if thiskey in k: keymatches.append(k) for k in keymatches: matches.append(self.modules[k].copy()) else: match = self.find_match(line) if match: matches.append(match) # unique the list tmplist = [] for x in matches: if x not in tmplist: tmplist.append(x) if matches != tmplist: matches = [x for x in tmplist] return matches def set_module_metadata(self): for k, v in six.iteritems(self.modules): if not v[u'filepath']: continue mfile = os.path.join(self.gitrepo.checkoutdir, v[u'filepath']) if not mfile.endswith(u'.py'): # metadata is only the .py files ... ext = mfile.split(u'.')[-1] mfile = mfile.replace(u'.' 
+ ext, u'.py', 1) self.modules[k][u'metadata'].update( self.get_module_metadata(mfile)) def get_module_metadata(self, module_file): meta = {} if not os.path.isfile(module_file): return meta rawmeta = u'' inphase = False with io.open(module_file, 'r', encoding='utf-8') as f: for line in f: if line.startswith(u'ANSIBLE_METADATA'): inphase = True if line.startswith(u'DOCUMENTATION'): break if inphase: rawmeta += line rawmeta = rawmeta.replace(u'ANSIBLE_METADATA =', u'', 1) rawmeta = rawmeta.strip() try: meta = ast.literal_eval(rawmeta) tmp_meta = {} for k, v in meta.items(): if isinstance(k, six.binary_type): k = to_text(k) if isinstance(v, six.binary_type): v = to_text(v) if isinstance(v, list): tmp_list = [] for i in v: if isinstance(i, six.binary_type): i = to_text(i) tmp_list.append(i) v = tmp_list del tmp_list tmp_meta[k] = v meta = tmp_meta del tmp_meta except SyntaxError: pass return meta def set_module_imports(self): for k, v in six.iteritems(self.modules): if not v[u'filepath']: continue mfile = os.path.join(self.gitrepo.checkoutdir, v[u'filepath']) self.modules[k][u'imports'] = self.get_module_imports(mfile) def get_module_imports(self, module_file): mimports = [] if not os.path.isfile(module_file): return mimports else: with open(module_file, 'rb') as f: for line in f: line = line.strip() line = line.replace(b',', b'') if line.startswith(b'import') or \ (b'import' in line and b'from' in line): lparts = line.split() if line.startswith(b'import '): mimports.append(lparts[1]) elif line.startswith(b'from '): mpath = lparts[1] + b'.' for spath in lparts[3:]: mimports.append(mpath + spath) return [to_text(m) for m in mimports] @property def all_maintainers(self): maintainers = set() for path, metadata in self.botmeta[u'files'].items(): maintainers.update(metadata.get(u'maintainers', [])) return maintainers @property def all_authors(self): authors = set() for key, metadata in self.modules.items(): authors.update(metadata.get(u'authors', [])) return authors def get_maintainers_for_namespace(self, namespace): maintainers = [] for k, v in self.modules.items(): if u'namespace' not in v or u'maintainers' not in v: continue if v[u'namespace'] == namespace: for m in v[u'maintainers']: if m not in maintainers: maintainers.append(m) maintainers = [x for x in maintainers if x.strip()] return maintainers @staticmethod def replace_ansible(maintainers, ansible_members, bots=[]): '''Replace -ansible- with the -humans- in the org''' newlist = [] for m in maintainers: if m != u'ansible': newlist.append(m) else: newlist += ansible_members newlist = sorted(set(newlist)) newlist = [x for x in newlist if x not in bots] return newlist def get_file_content(self, filepath): fpath = os.path.join(self.gitrepo.checkoutdir, filepath) if not os.path.isfile(fpath): return None with io.open(fpath, 'r', encoding='utf-8') as f: data = f.read() return data
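# --- hedged usage sketch (added for illustration, not in the original file) ---
# ModuleIndexer wires itself to a GitRepoWrapper checkout and, optionally, a
# GraphQL client for blame data.  The helper below shows a minimal way to
# exercise find_match() without GitHub access (commits/blames disabled, so the
# blame/commit scraping paths are skipped); the cachedir value is only an
# example, and constructing the indexer still clones and parses the repository.
def _example_module_lookup(pattern=u'docker_container',
                           cachedir=u'~/.ansibullbot/cache'):
    mi = ModuleIndexer(commits=False, blames=False, cachedir=cachedir)
    # exact matching returns the module dict(s) keyed by plugins/modules/ path
    return mi.find_match(pattern, exact=True)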
class AnsibleComponentMatcher(object): BOTMETA = {} INDEX = {} REPO = 'https://github.com/ansible/ansible' STOPWORDS = ['ansible', 'core', 'plugin'] STOPCHARS = ['"', "'", '(', ')', '?', '*', '`', ',', ':', '?', '-'] BLACKLIST = ['new module', 'new modules'] FILE_NAMES = [] MODULES = {} MODULE_NAMES = [] MODULE_NAMESPACE_DIRECTORIES = [] # FIXME: THESE NEED TO GO INTO BOTMETA # ALSO SEE search_by_regex_generic ... KEYWORDS = { 'all': None, 'ansiballz': 'lib/ansible/executor/module_common.py', 'ansible-console': 'lib/ansible/cli/console.py', 'ansible-galaxy': 'lib/ansible/galaxy', 'ansible-inventory': 'lib/ansible/cli/inventory.py', 'ansible-playbook': 'lib/ansible/playbook', 'ansible playbook': 'lib/ansible/playbook', 'ansible playbooks': 'lib/ansible/playbook', 'ansible-pull': 'lib/ansible/cli/pull.py', 'ansible-vault': 'lib/ansible/parsing/vault', 'ansible-vault edit': 'lib/ansible/parsing/vault', 'ansible-vault show': 'lib/ansible/parsing/vault', 'ansible-vault decrypt': 'lib/ansible/parsing/vault', 'ansible-vault encrypt': 'lib/ansible/parsing/vault', 'async': 'lib/ansible/modules/utilities/logic/async_wrapper.py', 'become': 'lib/ansible/playbook/become.py', 'block': 'lib/ansible/playbook/block.py', 'blocks': 'lib/ansible/playbook/block.py', 'callback plugin': 'lib/ansible/plugins/callback', 'callback plugins': 'lib/ansible/plugins/callback', 'conditional': 'lib/ansible/playbook/conditional.py', 'docs': 'docs', 'delegate_to': 'lib/ansible/playbook/task.py', 'facts': 'lib/ansible/module_utils/facts', 'galaxy': 'lib/ansible/galaxy', 'groupvars': 'lib/ansible/vars/hostvars.py', 'group vars': 'lib/ansible/vars/hostvars.py', 'handlers': 'lib/ansible/playbook/handler.py', 'hostvars': 'lib/ansible/vars/hostvars.py', 'host vars': 'lib/ansible/vars/hostvars.py', 'integration tests': 'test/integration', 'inventory script': 'contrib/inventory', 'jinja2 template system': 'lib/ansible/template', 'module_utils': 'lib/ansible/module_utils', 'multiple modules': None, 'new module(s) request': None, 'new modules request': None, 'new module request': None, 'new module': None, 'network_cli': 'lib/ansible/plugins/connection/network_cli.py', 'network_cli.py': 'lib/ansible/plugins/connection/network_cli.py', 'network modules': 'lib/ansible/modules/network', 'paramiko': 'lib/ansible/plugins/connection/paramiko_ssh.py', 'role': 'lib/ansible/playbook/role', 'roles': 'lib/ansible/playbook/role', 'ssh': 'lib/ansible/plugins/connection/ssh.py', 'ssh authentication': 'lib/ansible/plugins/connection/ssh.py', 'setup / facts': 'lib/ansible/modules/system/setup.py', 'setup': 'lib/ansible/modules/system/setup.py', 'task executor': 'lib/ansible/executor/task_executor.py', 'testing': 'test/', 'validate-modules': 'test/sanity/validate-modules', 'vault': 'lib/ansible/parsing/vault', 'vault edit': 'lib/ansible/parsing/vault', 'vault documentation': 'lib/ansible/parsing/vault', 'with_items': 'lib/ansible/playbook/loop_control.py', 'windows modules': 'lib/ansible/modules/windows', 'winrm': 'lib/ansible/plugins/connection/winrm.py' } def __init__(self, gitrepo=None, botmetafile=None, cachedir=None, email_cache=None, file_indexer=None): self.cachedir = cachedir self.botmetafile = botmetafile self.email_cache = email_cache if file_indexer: self.file_indexer = file_indexer else: self.file_indexer = FileIndexer( botmetafile=self.botmetafile, checkoutdir=self.cachedir ) if gitrepo: self.gitrepo = gitrepo else: self.gitrepo = GitRepoWrapper(cachedir=self.cachedir, repo=self.REPO) self.strategy = None self.strategies = [] 
self.indexed_at = False self.updated_at = None self.update() def update(self, email_cache=None): if email_cache: self.email_cache = email_cache self.gitrepo.update() self.index_files() self.indexed_at = datetime.datetime.now() self.cache_keywords() self.updated_at = datetime.datetime.now() def index_files(self): self.BOTMETA = {} self.MODULES = {} self.MODULE_NAMES = [] self.MODULE_NAMESPACE_DIRECTORIES = [] self.load_meta() for fn in self.gitrepo.module_files: if os.path.isdir(fn): continue mname = os.path.basename(fn) mname = mname.replace('.py', '').replace('.ps1', '') if mname.startswith('__'): continue mdata = { 'name': mname, 'repo_filename': fn, 'filename': fn } if fn not in self.MODULES: self.MODULES[fn] = mdata.copy() else: self.MODULES[fn].update(mdata) self.MODULE_NAMESPACE_DIRECTORIES = [os.path.dirname(x) for x in self.gitrepo.module_files] self.MODULE_NAMESPACE_DIRECTORIES = sorted(set(self.MODULE_NAMESPACE_DIRECTORIES)) # make a list of names by enumerating the files self.MODULE_NAMES = [os.path.basename(x) for x in self.gitrepo.module_files] self.MODULE_NAMES = [x for x in self.MODULE_NAMES if x.endswith('.py') or x.endswith('.ps1')] self.MODULE_NAMES = [x.replace('.ps1', '').replace('.py', '') for x in self.MODULE_NAMES] self.MODULE_NAMES = [x for x in self.MODULE_NAMES if not x.startswith('__')] self.MODULE_NAMES = sorted(set(self.MODULE_NAMES)) # make a list of names by calling ansible-doc checkoutdir = self.gitrepo.checkoutdir checkoutdir = os.path.abspath(checkoutdir) cmd = '. {}/hacking/env-setup; ansible-doc -t module -F'.format(checkoutdir) logging.debug(cmd) (rc, so, se) = run_command(cmd, cwd=checkoutdir) if rc: raise Exception("'ansible-doc' command failed (%s, %s %s)" % (rc, so, se)) lines = so.split('\n') for line in lines: parts = line.split() parts = [x.strip() for x in parts] if len(parts) != 2 or checkoutdir not in line: continue mname = parts[0] if mname not in self.MODULE_NAMES: self.MODULE_NAMES.append(mname) fpath = parts[1] fpath = fpath.replace(checkoutdir + '/', '') if fpath not in self.MODULES: self.MODULES[fpath] = { 'name': mname, 'repo_filename': fpath, 'filename': fpath } _modules = self.MODULES.copy() for k, v in _modules.items(): kparts = os.path.splitext(k) if kparts[-1] == '.ps1': _k = kparts[0] + '.py' checkpath = os.path.join(checkoutdir, _k) if not os.path.isfile(checkpath): _k = k else: _k = k ME = ModuleExtractor(os.path.join(checkoutdir, _k), email_cache=self.email_cache) if k not in self.BOTMETA['files']: self.BOTMETA['files'][k] = { 'deprecated': os.path.basename(k).startswith('_'), 'labels': os.path.dirname(k).split('/'), 'authors': ME.authors, 'maintainers': ME.authors, 'maintainers_keys': [], 'notified': ME.authors, 'ignored': [], 'support': ME.metadata.get('supported_by', 'community'), 'metadata': ME.metadata.copy() } else: bmeta = self.BOTMETA['files'][k].copy() bmeta['metadata'] = ME.metadata.copy() if 'notified' not in bmeta: bmeta['notified'] = [] if 'maintainers' not in bmeta: bmeta['maintainers'] = [] if not bmeta.get('supported_by'): bmeta['supported_by'] = ME.metadata.get('supported_by', 'community') if 'authors' not in bmeta: bmeta['authors'] = [] for x in ME.authors: if x not in bmeta['authors']: bmeta['authors'].append(x) if x not in bmeta['maintainers']: bmeta['maintainers'].append(x) if x not in bmeta['notified']: bmeta['notified'].append(x) if not bmeta.get('labels'): bmeta['labels'] = os.path.dirname(k).split('/') bmeta['deprecated'] = os.path.basename(k).startswith('_') self.BOTMETA['files'][k].update(bmeta) # 
clean out the ignorees if 'ignored' in self.BOTMETA['files'][k]: for ignoree in self.BOTMETA['files'][k]['ignored']: for thiskey in ['maintainers', 'notified']: while ignoree in self.BOTMETA['files'][k][thiskey]: self.BOTMETA['files'][k][thiskey].remove(ignoree) # write back to the modules self.MODULES[k].update(self.BOTMETA['files'][k]) def load_meta(self): if self.botmetafile is not None: with open(self.botmetafile, 'rb') as f: rdata = f.read() else: fp = '.github/BOTMETA.yml' rdata = self.gitrepo.get_file_content(fp) self.BOTMETA = BotMetadataParser.parse_yaml(rdata) def cache_keywords(self): for k, v in self.BOTMETA['files'].items(): if not v.get('keywords'): continue for kw in v['keywords']: if kw not in self.KEYWORDS: self.KEYWORDS[kw] = k def clean_body(self, body, internal=False): body = body.lower() body = body.strip() for SC in self.STOPCHARS: if body.startswith(SC): body = body.lstrip(SC) body = body.strip() if body.endswith(SC): body = body.rstrip(SC) body = body.strip() if internal and SC in body: body = body.replace(SC, '') body = body.strip() body = body.strip() return body def match(self, issuewrapper): iw = issuewrapper matchdata = self.match_components( iw.title, iw.body, iw.template_data.get('component_raw'), files=iw.files ) return matchdata def match_components(self, title, body, component, files=None): """Make a list of matching files with metadata""" self.strategy = None self.strategies = [] # No matching necessary for PRs, but should provide consistent api if files: matched_filenames = files[:] else: matched_filenames = [] if component is None: return matched_filenames component = component.encode('ascii', 'ignore') logging.debug('match "{}"'.format(component)) delimiters = ['\n', ',', ' + ', ' & '] delimited = False for delimiter in delimiters: if delimiter in component: delimited = True components = component.split(delimiter) for _component in components: _matches = self._match_component(title, body, _component) self.strategies.append(self.strategy) # bypass for blacklist if None in _matches: _matches = [] matched_filenames += _matches # do not process any more delimiters break if not delimited: matched_filenames += self._match_component(title, body, component) self.strategies.append(self.strategy) # bypass for blacklist if None in matched_filenames: return [] # reduce subpaths if matched_filenames: matched_filenames = self.reduce_filepaths(matched_filenames) # create metadata for each matched file component_matches = [] matched_filenames = sorted(set(matched_filenames)) for fn in matched_filenames: component_matches.append(self.get_meta_for_file(fn)) return component_matches def _match_component(self, title, body, component): """Find matches for a single line""" matched_filenames = [] # context sets the path prefix to narrow the search window if 'module_util' in title.lower() or 'module_util' in component.lower(): context = 'lib/ansible/module_utils' elif 'module util' in title.lower() or 'module util' in component.lower(): context = 'lib/ansible/module_utils' elif 'module' in title.lower() or 'module' in component.lower(): context = 'lib/ansible/modules' elif 'dynamic inventory' in title.lower() or 'dynamic inventory' in component.lower(): context = 'contrib/inventory' elif 'inventory script' in title.lower() or 'inventory script' in component.lower(): context = 'contrib/inventory' elif 'inventory plugin' in title.lower() or 'inventory plugin' in component.lower(): context = 'lib/ansible/plugins/inventory' else: context = None if not component: return [] if 
component not in self.STOPWORDS and component not in self.STOPCHARS: if not matched_filenames: matched_filenames += self.search_by_keywords(component, exact=True) if matched_filenames: self.strategy = 'search_by_keywords' if not matched_filenames: matched_filenames += self.search_by_module_name(component) if matched_filenames: self.strategy = 'search_by_module_name' if not matched_filenames: matched_filenames += self.search_by_regex_module_globs(component) if matched_filenames: self.strategy = 'search_by_regex_module_globs' if not matched_filenames: matched_filenames += self.search_by_regex_modules(component) if matched_filenames: self.strategy = 'search_by_regex_modules' if not matched_filenames: matched_filenames += self.search_by_regex_generic(component) if matched_filenames: self.strategy = 'search_by_regex_generic' if not matched_filenames: matched_filenames += self.search_by_regex_urls(component) if matched_filenames: self.strategy = 'search_by_regex_urls' if not matched_filenames: matched_filenames += self.search_by_tracebacks(component) if matched_filenames: self.strategy = 'search_by_tracebacks' if not matched_filenames: matched_filenames += self.search_by_filepath(component, context=context) if matched_filenames: self.strategy = 'search_by_filepath' if not matched_filenames: matched_filenames += self.search_by_filepath(component, partial=True) if matched_filenames: self.strategy = 'search_by_filepath[partial]' if not matched_filenames: matched_filenames += self.search_by_keywords(component, exact=False) if matched_filenames: self.strategy = 'search_by_keywords!exact' if matched_filenames: matched_filenames += self.include_modules_from_test_targets(matched_filenames) return matched_filenames def search_by_module_name(self, component): matches = [] component = self.clean_body(component) # docker-container vs. 
docker_container if component not in self.MODULE_NAMES: component = component.replace('-', '_') if component in self.MODULE_NAMES: mmatch = self.find_module_match(component) if mmatch: if isinstance(mmatch, list): for x in mmatch: matches.append(x['repo_filename']) else: matches.append(mmatch['repo_filename']) return matches def search_by_keywords(self, component, exact=True): """Simple keyword search""" component = component.lower() matches = [] if component in self.STOPWORDS: matches = [None] elif component in self.KEYWORDS: matches = [self.KEYWORDS[component]] elif not exact: for k, v in self.KEYWORDS.items(): if ' ' + k + ' ' in component or ' ' + k + ' ' in component.lower(): logging.debug('keyword match: {}'.format(k)) matches.append(v) elif ' ' + k + ':' in component or ' ' + k + ':' in component: logging.debug('keyword match: {}'.format(k)) matches.append(v) elif component.endswith(' ' + k) or component.lower().endswith(' ' + k): logging.debug('keyword match: {}'.format(k)) matches.append(v) elif (k in component or k in component.lower()) and k in self.BLACKLIST: logging.debug('blacklist match: {}'.format(k)) matches.append(None) return matches def search_by_regex_urls(self, body): # http://docs.ansible.com/ansible/latest/copy_module.html # http://docs.ansible.com/ansible/latest/dev_guide/developing_modules.html # http://docs.ansible.com/ansible/latest/postgresql_db_module.html # [helm module](https//docs.ansible.com/ansible/2.4/helm_module.html) # Windows module: win_robocopy\nhttp://docs.ansible.com/ansible/latest/win_robocopy_module.html # Examples:\n* archive (https://docs.ansible.com/ansible/archive_module.html)\n* s3_sync (https://docs.ansible.com/ansible/s3_sync_module.html) # https//github.com/ansible/ansible/blob/devel/lib/ansible/modules/windows/win_dsc.ps1L228 matches = [] urls = re.findall( 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', body ) if urls: for url in urls: url = url.rstrip(')') if '/blob' in url and url.endswith('.py'): parts = url.split('/') bindex = parts.index('blob') fn = '/'.join(parts[bindex+2:]) matches.append(fn) elif '_module.html' in url: parts = url.split('/') fn = parts[-1].replace('_module.html', '') choices = [x for x in self.gitrepo.files if '/' + fn in x or '/_' + fn in x] choices = [x for x in choices if 'lib/ansible/modules' in x] if len(choices) > 1: choices = [x for x in choices if '/' + fn + '.py' in x or '/' + fn + '.ps1' in x or '/_' + fn + '.py' in x] if not choices: pass elif len(choices) == 1: matches.append(choices[0]) else: pass else: pass return matches def search_by_regex_modules(self, body): # foo module # foo and bar modules # foo* modules # foo* module body = body.lower() logging.debug('attempt regex match on: {}'.format(body)) # https://www.tutorialspoint.com/python/python_reg_expressions.htm patterns = [ r'\:\n(\S+)\.py', r'(\S+)\.py', r'\-(\s+)(\S+)(\s+)module', r'\`ansible_module_(\S+)\.py\`', r'module(\s+)\-(\s+)(\S+)', r'module(\s+)(\S+)', r'\`(\S+)\`(\s+)module', r'(\S+)(\s+)module', r'the (\S+) command', r'(\S+) \(.*\)', r'(\S+)\-module', r'modules/(\S+)', r'module\:(\s+)\`(\S+)\`', r'module\: (\S+)', r'module (\S+)', r'module `(\S+)`', r'module: (\S+)', r'new (\S+) module', r'the (\S+) module', r'the \"(\S+)\" module', r':\n(\S+) module', r'(\S+) module', r'(\S+) core module', r'(\S+) extras module', r':\n\`(\S+)\` module', r'\`(\S+)\` module', r'`(\S+)` module', r'(\S+)\* modules', r'(\S+) and (\S+)', r'(\S+) or (\S+)', r'(\S+) \+ (\S+)', r'(\S+) \& (\S)', r'(\S+) and (\S+) 
modules', r'(\S+) or (\S+) module', r'(\S+)_module', r'action: (\S+)', r'action (\S+)', r'ansible_module_(\S+)\.py', r'ansible_module_(\S+)', r'ansible_modules_(\S+)\.py', r'ansible_modules_(\S+)', r'(\S+) task', r'(\s+)\((\S+)\)', r'(\S+)(\s+)(\S+)(\s+)modules', r'(\S+)(\s+)module\:(\s+)(\S+)', r'\-(\s+)(\S+)(\s+)module', r'\:(\s+)(\S+)(\s+)module', r'\-(\s+)ansible(\s+)(\S+)(\s+)(\S+)(\s+)module', r'.*(\s+)(\S+)(\s+)module.*' ] matches = [] logging.debug('check patterns against: {}'.format(body)) for pattern in patterns: mobj = re.match(pattern, body, re.M | re.I) if mobj: logging.debug('pattern {} matched on "{}"'.format(pattern, body)) for x in range(0, mobj.lastindex+1): try: mname = mobj.group(x) logging.debug('mname: {}'.format(mname)) if mname == body: continue mname = self.clean_body(mname) if not mname.strip(): continue mname = mname.strip().lower() if ' ' in mname: continue if '/' in mname: continue mname = mname.replace('.py', '').replace('.ps1', '') logging.debug('--> {}'.format(mname)) # attempt to match a module module_match = self.find_module_match(mname) if not module_match: pass elif isinstance(module_match, list): for m in module_match: matches.append(m['repo_filename']) elif isinstance(module_match, dict): matches.append(module_match['repo_filename']) except Exception as e: logging.error(e) if matches: break return matches def search_by_regex_module_globs(self, body): # All AWS modules # BigIP modules # NXOS modules # azurerm modules matches = [] body = self.clean_body(body) logging.debug('try globs on: {}'.format(body)) keymap = { 'all': None, 'ec2': 'lib/ansible/modules/cloud/amazon', 'ec2_*': 'lib/ansible/modules/cloud/amazon', 'aws': 'lib/ansible/modules/cloud/amazon', 'amazon': 'lib/ansible/modules/cloud/amazon', 'google': 'lib/ansible/modules/cloud/google', 'gce': 'lib/ansible/modules/cloud/google', 'gcp': 'lib/ansible/modules/cloud/google', 'bigip': 'lib/ansible/modules/network/f5', 'nxos': 'lib/ansible/modules/network/nxos', 'azure': 'lib/ansible/modules/cloud/azure', 'azurerm': 'lib/ansible/modules/cloud/azure', 'openstack': 'lib/ansible/modules/cloud/openstack', 'ios': 'lib/ansible/modules/network/ios', } regexes = [ r'(\S+) ansible modules', r'all (\S+) based modules', r'all (\S+) modules', r'.* all (\S+) modules.*', r'(\S+) modules', r'(\S+\*) modules', r'all cisco (\S+\*) modules', ] mobj = None for x in regexes: mobj = re.match(x, body) if mobj: logging.debug('matched glob: {}'.format(x)) break if not mobj: logging.debug('no glob matches') if mobj: keyword = mobj.group(1) if not keyword.strip(): pass elif keyword in keymap: if keymap[keyword]: matches.append(keymap[keyword]) else: if '*' in keyword: keyword = keyword.replace('*', '') # check for directories first fns = [x for x in self.MODULE_NAMESPACE_DIRECTORIES if keyword in x] # check for files second if not fns: fns = [x for x in self.gitrepo.module_files if 'lib/ansible/modules' in x and keyword in x] if fns: matches += fns if matches: matches = sorted(set(matches)) return matches def search_by_regex_generic(self, body): # foo dynamic inventory script # foo filter # https://www.tutorialspoint.com/python/python_reg_expressions.htm patterns = [ [r'(.*) action plugin', 'lib/ansible/plugins/action'], [r'(.*) inventory plugin', 'lib/ansible/plugins/inventory'], [r'(.*) dynamic inventory', 'contrib/inventory'], [r'(.*) dynamic inventory (script|file)', 'contrib/inventory'], [r'(.*) inventory script', 'contrib/inventory'], [r'(.*) filter', 'lib/ansible/plugins/filter'], [r'(.*) jinja filter', 
'lib/ansible/plugins/filter'], [r'(.*) jinja2 filter', 'lib/ansible/plugins/filter'], [r'(.*) template filter', 'lib/ansible/plugins/filter'], [r'(.*) fact caching plugin', 'lib/ansible/plugins/cache'], [r'(.*) fact caching module', 'lib/ansible/plugins/cache'], [r'(.*) lookup plugin', 'lib/ansible/plugins/lookup'], [r'(.*) lookup', 'lib/ansible/plugins/lookup'], [r'(.*) callback plugin', 'lib/ansible/plugins/callback'], [r'(.*)\.py callback', 'lib/ansible/plugins/callback'], [r'callback plugin (.*)', 'lib/ansible/plugins/callback'], [r'(.*) stdout callback', 'lib/ansible/plugins/callback'], [r'stdout callback (.*)', 'lib/ansible/plugins/callback'], [r'stdout_callback (.*)', 'lib/ansible/plugins/callback'], [r'(.*) callback plugin', 'lib/ansible/plugins/callback'], [r'(.*) connection plugin', 'lib/ansible/plugins/connection'], [r'(.*) connection type', 'lib/ansible/plugins/connection'], [r'(.*) connection', 'lib/ansible/plugins/connection'], [r'(.*) transport', 'lib/ansible/plugins/connection'], [r'connection=(.*)', 'lib/ansible/plugins/connection'], [r'connection: (.*)', 'lib/ansible/plugins/connection'], [r'connection (.*)', 'lib/ansible/plugins/connection'], [r'strategy (.*)', 'lib/ansible/plugins/strategy'], [r'(.*) strategy plugin', 'lib/ansible/plugins/strategy'], [r'(.*) module util', 'lib/ansible/module_utils'], [r'ansible-galaxy (.*)', 'lib/ansible/galaxy'], [r'ansible-playbook (.*)', 'lib/ansible/playbook'], [r'ansible/module_utils/(.*)', 'lib/ansible/module_utils'], [r'module_utils/(.*)', 'lib/ansible/module_utils'], [r'lib/ansible/module_utils/(.*)', 'lib/ansible/module_utils'], [r'(\S+) documentation fragment', 'lib/ansible/utils/module_docs_fragments'], ] body = self.clean_body(body) matches = [] for pattern in patterns: mobj = re.match(pattern[0], body, re.M | re.I) if mobj: logging.debug('pattern hit: {}'.format(pattern)) fname = mobj.group(1) fname = fname.lower() fpath = os.path.join(pattern[1], fname) if fpath in self.gitrepo.files: matches.append(fpath) elif os.path.join(pattern[1], fname + '.py') in self.gitrepo.files: fname = os.path.join(pattern[1], fname + '.py') matches.append(fname) else: # fallback to the directory matches.append(pattern[1]) return matches def search_by_tracebacks(self, body): matches = [] if 'Traceback (most recent call last)' in body: lines = body.split('\n') for line in lines: line = line.strip() if line.startswith('DistributionNotFound'): matches = ['setup.py'] break elif line.startswith('File'): fn = line.split()[1] for SC in self.STOPCHARS: fn = fn.replace(SC, '') if 'ansible_module_' in fn: fn = os.path.basename(fn) fn = fn.replace('ansible_module_', '') matches = [fn] elif 'cli/playbook.py' in fn: fn = 'lib/ansible/cli/playbook.py' elif 'module_utils' in fn: idx = fn.find('module_utils/') fn = 'lib/ansible/' + fn[idx:] elif 'ansible/' in fn: idx = fn.find('ansible/') fn1 = fn[idx:] if 'bin/' in fn1: if not fn1.startswith('bin'): idx = fn1.find('bin/') fn1 = fn1[idx:] if fn1.endswith('.py'): fn1 = fn1.rstrip('.py') elif 'cli/' in fn1: idx = fn1.find('cli/') fn1 = fn1[idx:] fn1 = 'lib/ansible/' + fn1 elif 'lib' not in fn1: fn1 = 'lib/' + fn1 if fn1 not in self.files: pass return matches def search_by_filepath(self, body, partial=False, context=None): """Find known filepaths in body""" matches = [] body = self.clean_body(body) if not body: return [] if body.lower() in self.STOPCHARS: return [] if body.lower() in self.STOPWORDS: return [] # 'inventory manager' vs. 
'inventory/manager' if partial and ' ' in body: body = body.replace(' ', '/') if 'site-packages' in body: res = re.match('(.*)/site-packages/(.*)', body) body = res.group(2) if 'modules/core/' in body: body = body.replace('modules/core/', 'modules/') if 'modules/extras/' in body: body = body.replace('modules/extras/', 'modules/') if 'ansible-modules-core/' in body: body = body.replace('ansible-modules-core/', '/') if 'ansible-modules-extras/' in body: body = body.replace('ansible-modules-extras/', '/') if body.startswith('ansible/lib/ansible'): body = body.replace('ansible/lib', 'lib') if body.startswith('ansible/') and not body.startswith('ansible/modules'): body = body.replace('ansible/', '', 1) if 'module/' in body: body = body.replace('module/', 'modules/') logging.debug('search filepath [{}] [{}]: {}'.format(context, partial, body)) if len(body) < 2: return [] if '/' in body: body_paths = body.split('/') elif ' ' in body: body_paths = body.split() body_paths = [x.strip() for x in body_paths if x.strip()] else: body_paths = [body] if 'networking' in body_paths: ix = body_paths.index('networking') body_paths[ix] = 'network' if 'plugin' in body_paths: ix = body_paths.index('plugin') body_paths[ix] = 'plugins' if not context or 'lib/ansible/modules' in context: mmatch = self.find_module_match(body) if mmatch: if isinstance(mmatch, list) and len(mmatch) > 1: # only allow for exact prefix globbing here ... if [x for x in mmatch if x['repo_filename'].startswith(body)]: return [x['repo_filename'] for x in mmatch] elif isinstance(mmatch, list): return [x['repo_filename'] for x in mmatch] else: return [mmatch['repo_filename']] if body in self.gitrepo.files: matches = [body] else: for fn in self.gitrepo.files: # limit the search set if a context is given if context is not None and not fn.startswith(context): continue if fn.endswith(body) or fn.endswith(body + '.py') or fn.endswith(body + '.ps1'): # ios_config.py -> test_ios_config.py vs. 
ios_config.py bn1 = os.path.basename(body) bn2 = os.path.basename(fn) if bn2.startswith(bn1): matches = [fn] break if partial: # netapp_e_storagepool storage module # lib/ansible/modules/storage/netapp/netapp_e_storagepool.py # if all subpaths are in this filepath, it is a match bp_total = 0 fn_paths = fn.split('/') fn_paths.append(fn_paths[-1].replace('.py', '').replace('.ps1', '')) for bp in body_paths: if bp in fn_paths: bp_total += 1 if bp_total == len(body_paths): matches = [fn] break elif bp_total > 1: if (float(bp_total) / float(len(body_paths))) >= (2.0 / 3.0): if fn not in matches: matches.append(fn) if matches: tr = [] for match in matches[:]: # reduce to longest path for m in matches: if match == m: continue if len(m) < match and match.startswith(m): tr.append(m) for r in tr: if r in matches: logging.debug('trimming {}'.format(r)) matches.remove(r) matches = sorted(set(matches)) logging.debug('return: {}'.format(matches)) return matches def reduce_filepaths(self, matches): # unique _matches = [] for _match in matches: if _match not in _matches: _matches.append(_match) matches = _matches[:] # squash to longest path if matches: tr = [] for match in matches[:]: # reduce to longest path for m in matches: if match == m: continue if m is None or match is None: continue if len(m) < match and match.startswith(m) or match.endswith(m): tr.append(m) for r in tr: if r in matches: matches.remove(r) return matches def include_modules_from_test_targets(self, matches): """Map test targets to the module files""" new_matches = [] for match in matches: if not match: continue # include modules from test targets if 'test/integration/targets' in match: paths = match.split('/') tindex = paths.index('targets') mname = paths[tindex+1] mrs = self.find_module_match(mname, exact=True) if mrs: if not isinstance(mrs, list): mrs = [mrs] for mr in mrs: new_matches.append(mr['repo_filename']) return new_matches def get_meta_for_file(self, filename): meta = { 'repo_filename': filename, 'name': os.path.basename(filename).split('.')[0], 'notify': [], 'assign': [], 'authors': [], 'committers': [], 'maintainers': [], 'labels': [], 'ignore': [], 'support': None, 'supported_by': None, 'deprecated': False, 'topic': None, 'subtopic': None, 'namespace': None, 'namespace_maintainers': [] } populated = False filenames = [filename, os.path.splitext(filename)[0]] # powershell meta is in the python file if filename.endswith('.ps1'): pyfile = filename.replace('.ps1', '.py') if pyfile in self.BOTMETA['files']: filenames.append(pyfile) botmeta_entries = self.file_indexer._filenames_to_keys(filenames) for entry in botmeta_entries: fdata = self.BOTMETA['files'][entry].copy() if 'authors' in fdata: meta['authors'] = fdata['authors'] if 'maintainers' in fdata: meta['notify'] += fdata['maintainers'] meta['assign'] += fdata['maintainers'] meta['maintainers'] += fdata['maintainers'] if 'notified' in fdata: meta['notify'] += fdata['notified'] if 'labels' in fdata: meta['labels'] += fdata['labels'] if 'ignore' in fdata: meta['ignore'] += fdata['ignore'] if 'ignored' in fdata: meta['ignore'] += fdata['ignored'] if 'support' in fdata: if isinstance(fdata['support'], list): meta['support'] = fdata['support'][0] else: meta['support'] = fdata['support'] elif 'supported_by' in fdata: if isinstance(fdata['supported_by'], list): meta['support'] = fdata['supported_by'][0] else: meta['support'] = fdata['supported_by'] if 'deprecated' in fdata: meta['deprecated'] = fdata['deprecated'] populated = True # walk up the tree for more meta paths = 
filename.split('/') for idx, x in enumerate(paths): thispath = '/'.join(paths[:(0-idx)]) if thispath in self.BOTMETA['files']: fdata = self.BOTMETA['files'][thispath].copy() if 'support' in fdata and not meta['support']: if isinstance(fdata['support'], list): meta['support'] = fdata['support'][0] else: meta['support'] = fdata['support'] if 'labels' in fdata: meta['labels'] += fdata['labels'] if 'maintainers' in fdata: meta['notify'] += fdata['maintainers'] meta['assign'] += fdata['maintainers'] meta['maintainers'] += fdata['maintainers'] if 'ignore' in fdata: meta['ignore'] += fdata['ignore'] if 'notified' in fdata: meta['notify'] += fdata['notified'] if 'lib/ansible/modules' in filename: topics = [x for x in paths if x not in ['lib', 'ansible', 'modules']] topics = [x for x in topics if x != os.path.basename(filename)] if len(topics) == 2: meta['topic'] = topics[0] meta['subtopic'] = topics[1] elif len(topics) == 1: meta['topic'] = topics[0] meta['namespace'] = '/'.join(topics) # set namespace maintainers (skip !modules for now) if filename.startswith('lib/ansible/modules'): ns = meta.get('namespace') keys = self.BOTMETA['files'].keys() keys = [x for x in keys if x.startswith(os.path.join('lib/ansible/modules', ns))] ignored = [] for key in keys: meta['namespace_maintainers'] += self.BOTMETA['files'][key].get('maintainers', []) ignored += self.BOTMETA['files'][key].get('ignored', []) for ignoree in ignored: while ignoree in meta['namespace_maintainers']: meta['namespace_maintainers'].remove(ignoree) # new modules should default to "community" support if filename.startswith('lib/ansible/modules') and filename not in self.gitrepo.files: meta['support'] = 'community' meta['supported_by'] = 'community' # test targets for modules should inherit from their modules if filename.startswith('test/integration/targets') and filename not in self.BOTMETA['files']: whitelist = [ 'labels', 'ignore', 'deprecated', 'authors', 'assign', 'maintainers', 'notify', 'topic', 'subtopic', 'support' ] paths = filename.split('/') tindex = paths.index('targets') mname = paths[tindex+1] mmatch = self._find_module_match(mname, exact=True) if mmatch: mmeta = self.get_meta_for_file(mmatch[0]['repo_filename']) for k, v in mmeta.items(): if k in whitelist and v: if isinstance(meta[k], list): meta[k] = sorted(set(meta[k] + v)) elif not meta[k]: meta[k] = v # make new test targets community by default if not meta['support'] and not meta['supported_by']: meta['support'] = 'community' # it's okay to remove things from legacy-files.txt if filename == 'test/sanity/pep8/legacy-files.txt' and not meta['support']: meta['support'] = 'community' # fallback to core support if not meta['support']: meta['support'] = 'core' # align support and supported_by if meta['support'] != meta['supported_by']: if meta['support'] and not meta['supported_by']: meta['supported_by'] = meta['support'] elif not meta['support'] and meta['supported_by']: meta['support'] = meta['supported_by'] # clean up the result _meta = meta.copy() for k, v in _meta.items(): if isinstance(v, list): meta[k] = sorted(set(v)) # walk up the botmeta tree looking for ignores to include if meta.get('repo_filename'): namespace_paths = os.path.dirname(meta['repo_filename']) namespace_paths = namespace_paths.split('/') for x in reversed(range(0, len(namespace_paths) + 1)): this_ns_path = '/'.join(namespace_paths[:x]) if not this_ns_path: continue print('check {}'.format(this_ns_path)) if this_ns_path in self.BOTMETA['files']: this_ignore = 
self.BOTMETA['files'][this_ns_path].get('ignore') or \ self.BOTMETA['files'][this_ns_path].get('ignored') or \ self.BOTMETA['files'][this_ns_path].get('ignores') print('ignored: {}'.format(this_ignore)) if this_ignore: for username in this_ignore: if username not in meta['ignore']: meta['ignore'].append(username) # process ignores AGAIN. if meta.get('ignore'): for k, v in meta.items(): if k == 'ignore': continue if not isinstance(v, list): continue for ignoree in meta['ignore']: if ignoree in v: meta[k].remove(ignoree) return meta def find_module_match(self, pattern, exact=False): '''Exact module name matching''' logging.debug('find_module_match for "{}"'.format(pattern)) candidate = None BLACKLIST = [ 'module_utils', 'callback', 'network modules', 'networking modules' 'windows modules' ] if not pattern or pattern is None: return None # https://github.com/ansible/ansible/issues/19755 if pattern == 'setup': pattern = 'lib/ansible/modules/system/setup.py' if '/facts.py' in pattern or ' facts.py' in pattern: pattern = 'lib/ansible/modules/system/setup.py' # https://github.com/ansible/ansible/issues/18527 # docker-container -> docker_container if '-' in pattern: pattern = pattern.replace('-', '_') if 'module_utils' in pattern: # https://github.com/ansible/ansible/issues/20368 return None elif 'callback' in pattern: return None elif 'lookup' in pattern: return None elif 'contrib' in pattern and 'inventory' in pattern: return None elif pattern.lower() in BLACKLIST: return None candidate = self._find_module_match(pattern, exact=exact) if not candidate: candidate = self._find_module_match(os.path.basename(pattern)) if not candidate and '/' in pattern and not pattern.startswith('lib/'): ppy = None ps1 = None if not pattern.endswith('.py') and not pattern.endswith('.ps1'): ppy = pattern + '.py' if not pattern.endswith('.py') and not pattern.endswith('.ps1'): ps1 = pattern + '.ps1' for mf in self.gitrepo.module_files: if pattern in mf: if mf.endswith(pattern) or mf.endswith(ppy) or mf.endswith(ps1): candidate = mf break return candidate def _find_module_match(self, pattern, exact=False): logging.debug('matching on {}'.format(pattern)) matches = [] if isinstance(pattern, unicode): pattern = pattern.encode('ascii', 'ignore') logging.debug('_find_module_match: {}'.format(pattern)) noext = pattern.replace('.py', '').replace('.ps1', '') # exact is looking for a very precise name such as "vmware_guest" if exact: candidates = [pattern] else: candidates = [pattern, '_' + pattern, noext, '_' + noext] for k, v in self.MODULES.items(): if v['name'] in candidates: logging.debug('match {} on name: {}'.format(k, v['name'])) matches = [v] break if not matches: # search by key ... aka the filepath for k, v in self.MODULES.items(): if k == pattern: logging.debug('match {} on key: {}'.format(k, k)) matches = [v] break # spellcheck if not exact and not matches and '/' not in pattern: _pattern = pattern if not isinstance(_pattern, unicode): _pattern = _pattern.decode('utf-8') candidates = [] for k, v in self.MODULES.items(): vname = v['name'] if not isinstance(vname, unicode): vname = vname.decode('utf-8') jw = jaro_winkler(vname, _pattern) if jw > .9: candidates.append((jw, k)) for candidate in candidates: matches.append(self.MODULES[candidate[1]]) return matches
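# Illustrative sketch only: the spellcheck fallback in _find_module_match above
# resolves near-miss module names with a string-similarity score (jaro_winkler > 0.9).
# This standalone example shows the same idea, but substitutes
# difflib.SequenceMatcher for the jaro_winkler call and uses a made-up module
# table, so the names and the threshold here are assumptions, not the indexer's
# real data.
import difflib


EXAMPLE_MODULES = {
    'lib/ansible/modules/cloud/vmware/vmware_guest.py': {'name': 'vmware_guest'},
    'lib/ansible/modules/files/copy.py': {'name': 'copy'},
    'lib/ansible/modules/system/setup.py': {'name': 'setup'},
}


def fuzzy_module_match(pattern, threshold=0.9):
    '''Return module records whose names score above the threshold'''
    scored = []
    for path, record in EXAMPLE_MODULES.items():
        # SequenceMatcher stands in for the Jaro-Winkler similarity used above;
        # both return a score between 0.0 and 1.0.
        score = difflib.SequenceMatcher(None, record['name'], pattern).ratio()
        if score > threshold:
            scored.append((score, path, record))
    scored.sort(key=lambda x: x[0], reverse=True)
    return [record for _, _, record in scored]


if __name__ == '__main__':
    print(fuzzy_module_match('vmware_guest'))   # exact name
    print(fuzzy_module_match('vmwre_guest'))    # hypothetical typo still resolves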
def index_ecosystem(self): # index the ansible-collections org token = C.DEFAULT_GITHUB_TOKEN gh = Github(login_or_token=token) gw = GithubWrapper(gh, cachedir=self.cachedir) ac = gw.get_org('ansible-collections') cloneurls = set() for repo in ac.get_repos(): #print(repo) cloneurls.add(repo.clone_url) cloneurls = [x.replace('.git', '') for x in cloneurls] for curl in cloneurls: if curl.endswith('/overview'): continue if curl.endswith('/collection_template'): continue if curl.endswith('/.github'): continue if curl.endswith('/hub'): continue grepo = GitRepoWrapper(cachedir=self.cachedir, repo=curl, rebase=False) # is there a galaxy.yml at the root level? if grepo.exists('galaxy.yml'): meta = yaml.load(grepo.get_file_content('galaxy.yml')) fqcn = '%s.%s' % (meta['namespace'], meta['name']) self._gitrepos[fqcn] = grepo else: # multi-collection repos ... sigh. galaxyfns = grepo.find('galaxy.yml') if galaxyfns: for gfn in galaxyfns: meta = yaml.load(grepo.get_file_content(gfn)) fqcn = '%s.%s' % (meta['namespace'], meta['name']) _grepo = GitRepoWrapper(cachedir=self.cachedir, repo=curl, rebase=False, context=os.path.dirname(gfn)) self._gitrepos[fqcn] = _grepo else: fqcn = None bn = os.path.basename(curl) # enumerate the url? if '.' in bn: fqcn = bn # try the README? if fqcn is None: for fn in ['README.rst', 'README.md']: if fqcn: break if not grepo.exists(fn): continue fdata = grepo.get_file_content(fn) if not '.' in fdata: continue lines = fdata.split('\n') for line in lines: line = line.strip() if line.lower().startswith( 'ansible collection:'): fqcn = line.split(':')[-1].strip() break # lame ... if fqcn is None: fqcn = bn + '._community' self._gitrepos[fqcn] = grepo # scrape the galaxy collections api nexturl = self._baseurl + '/api/v2/collections/?page_size=1000' while nexturl: jdata = self._get_cached_url(nexturl) nexturl = jdata.get('next_link') if nexturl: nexturl = self._baseurl + nexturl for res in jdata.get('results', []): fqcn = '%s.%s' % (res['namespace']['name'], res['name']) if res.get('deprecated'): continue if fqcn in self._gitrepos: continue lv = res['latest_version']['href'] lvdata = self._get_cached_url(lv) rurl = lvdata.get('metadata', {}).get('repository') if rurl is None: rurl = lvdata['download_url'] grepo = GitRepoWrapper(cachedir=self.cachedir, repo=rurl, rebase=False) self._gitrepos[fqcn] = grepo # reconcile all things ... self.GALAXY_FQCNS = sorted(set(self._gitrepos.keys())) self.GALAXY_FILES = {} for fqcn, gr in self._gitrepos.items(): if fqcn.startswith('testing.'): continue for fn in gr.files: if fn not in self.GALAXY_FILES: self.GALAXY_FILES[fn] = set() self.GALAXY_FILES[fn].add(fqcn)
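# Minimal sketch of the Galaxy pagination loop used in index_ecosystem above:
# page through /api/v2/collections/, skip deprecated entries and collect each
# namespace.name. The assumption that 'next_link' is a path relative to the
# base url follows the code above rather than independent API documentation,
# and the page size is arbitrary.
import requests


GALAXY_BASEURL = 'https://galaxy.ansible.com'


def iter_galaxy_fqcns(page_size=100):
    '''Yield fully qualified collection names from the Galaxy v2 API'''
    nexturl = GALAXY_BASEURL + '/api/v2/collections/?page_size=%s' % page_size
    while nexturl:
        jdata = requests.get(nexturl).json()
        for res in jdata.get('results', []):
            if res.get('deprecated'):
                continue
            yield '%s.%s' % (res['namespace']['name'], res['name'])
        nexturl = jdata.get('next_link')
        if nexturl:
            # same base-url prefixing as the scrape loop above
            nexturl = GALAXY_BASEURL + nexturl


if __name__ == '__main__':
    for idx, fqcn in enumerate(iter_galaxy_fqcns()):
        print(fqcn)
        if idx >= 10:
            break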
def main(): tocheck = [ #32226, #30361, #31006, #58674, #63611, #64320, #66891, #68784, 69010, ] redirect = set() noredirect = set() nometa = set() cachedir = '/home/jtanner/.ansibullbot/cache' gitrepo = GitRepoWrapper(cachedir=cachedir, repo='https://github.com/ansible/ansible', commit=None, rebase=False) rdata = gitrepo.get_file_content(u'.github/BOTMETA.yml') botmeta = BotMetadataParser.parse_yaml(rdata) cm = AnsibleComponentMatcher(cachedir=cachedir, gitrepo=gitrepo, botmeta=botmeta, botmetafile=None, email_cache=None, usecache=True, use_galaxy=True) ''' mr = parse_match_results() for issue in sorted(mr.keys(), key=lambda x: int(x.split('/')[-1]), reverse=True): print(issue) number = int(issue.split('/')[-1]) #if number != 68709: # continue print(number) mfile = os.path.join('~/.ansibullbot/cache/ansible/ansible/issues/%s' % number, 'meta.json') mfile = os.path.expanduser(mfile) if os.path.exists(mfile): with open(mfile, 'r') as f: imeta = json.loads(f.read()) else: nometa.add(issue) imeta = {} if imeta: iw = MockIssueWrapper(issue, meta=imeta) cfacts = get_collection_facts(iw, cm, imeta) #pprint(cfacts) if cfacts.get('needs_collection_redirect') == True: redirect.add(issue) else: noredirect.add(issue) #if not imeta['is_backport']: # import epdb; epdb.st() ''' mmap = {} #gmatches = cm.search_ecosystem('contrib/inventory/ec2.py') #import epdb; epdb.st() mfiles = get_issues() for mfile in mfiles: with open(mfile, 'r') as f: imeta = json.loads(f.read()) print(imeta['html_url']) number = int(imeta['html_url'].split('/')[-1]) if number not in tocheck: continue newmeta = copy.deepcopy(imeta) iw = MockIssueWrapper(imeta['html_url'], meta=newmeta, gitrepo=gitrepo) #cmatches = cm.match_components(iw.title, iw.body, iw.component) cmmeta = get_component_match_facts(iw, cm, []) newmeta.update(cmmeta) cfmeta = get_collection_facts(iw, cm, newmeta) # check api deltas ... #cm1 = cm.match(iw) #cm2 = cm.match_components(iw.title, iw.body, iw.component, files=iw.files) #import epdb; epdb.st() print('component: %s' % iw.component) print(cmmeta['component_filenames']) #pprint(cfmeta) cf2vals = [x for x in list(cfmeta['collection_filemap'].values()) if x] cf1vals = [x for x in list(imeta['collection_filemap'].values()) if x] ''' if cf1vals or cf2vals: pprint(cf1vals) pprint(cf2vals) #import epdb; epdb.st() ''' ''' if cf2vals != cf1vals: pprint(cf1vals) pprint(cf2vals) import epdb; epdb.st() ''' pprint(cfmeta) import epdb epdb.st() print('# %s total issues|PRs without meta' % len(list(nometa))) print('# %s total issues|PRs not redirected to collections' % len(list(noredirect))) print('# %s total issues|PRs redirected to collections' % len(list(redirect))) import epdb epdb.st()
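# Small helper along the lines of the debug loop in main() above: walk the
# ansibullbot issue cache and load each meta.json. The cache layout (one
# numbered directory per issue containing meta.json) is inferred from the
# paths used in that script and is an assumption, not a documented contract.
import glob
import json
import os


ISSUE_CACHE = os.path.expanduser('~/.ansibullbot/cache/ansible/ansible/issues')


def load_cached_issue_meta(cachedir=ISSUE_CACHE):
    '''Yield (issue_number, meta_dict) for every cached meta.json'''
    for mfile in sorted(glob.glob(os.path.join(cachedir, '*', 'meta.json'))):
        with open(mfile, 'r') as f:
            imeta = json.loads(f.read())
        number = int(imeta['html_url'].split('/')[-1])
        yield number, imeta


if __name__ == '__main__':
    for number, imeta in load_cached_issue_meta():
        filemap = imeta.get('collection_filemap') or {}
        redirected = [k for k, v in filemap.items() if v]
        print(number, 'files redirected to collections:', len(redirected))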
class ModuleIndexer(object): EMPTY_MODULE = { u'authors': [], u'name': None, u'namespaced_module': None, u'namespace_maintainers': [], u'deprecated': False, u'deprecated_filename': None, u'dirpath': None, u'filename': None, u'filepath': None, u'fulltopic': None, u'maintainers': [], u'_maintainers': [], u'maintainers_keys': None, u'metadata': {}, u'repo_filename': None, u'repository': u'ansible', u'subtopic': None, u'topic': None, u'imports': [] } def __init__(self, commits=True, blames=True, botmetafile=None, maintainers=None, gh_client=None, cachedir=u'~/.ansibullbot/cache', gitrepo=None): ''' Maintainers: defaultdict(dict) where keys are filepath and values are dict gh_client: GraphQL GitHub client ''' self.get_commits = commits self.get_blames = blames self.botmetafile = botmetafile self.botmeta = {} # BOTMETA.yml file with minor updates (macro rendered, empty default values fixed) self.modules = {} # keys: paths of files belonging to the repository self.maintainers = maintainers or {} self.importmap = {} self.scraper_cache = os.path.join(cachedir, u'ansible.modules.scraper') self.scraper_cache = os.path.expanduser(self.scraper_cache) self.gws = GithubWebScraper(cachedir=self.scraper_cache) self.gqlc = gh_client self.files = [] if gitrepo: self.gitrepo = gitrepo else: self.gitrepo = GitRepoWrapper(cachedir=cachedir, repo=u'https://github.com/ansible/ansible') # sqlalchemy unc = os.path.join(cachedir, u'ansible_module_indexer.db') unc = os.path.expanduser(unc) unc = u'sqlite:///' + unc self.engine = create_engine(unc) self.Session = sessionmaker(bind=self.engine) self.session = self.Session() Email.metadata.create_all(self.engine) Blame.metadata.create_all(self.engine) # committers by module self.committers = {} # commits by module self.commits = {} # map of email to github login self.emails_cache = {} # load the bot meta self.update(force=True) def update(self, force=False): '''Reload everything if there are new commits''' changed = self.gitrepo.manage_checkout() if changed or force: self.get_files() self.parse_metadata() def get_files(self): '''Cache a list of filenames in the checkout''' cmd = u'cd {}; git ls-files'.format(self.gitrepo.checkoutdir) (rc, so, se) = run_command(cmd) files = to_text(so).split(u'\n') files = [x.strip() for x in files if x.strip()] self.files = files def parse_metadata(self): if self.botmetafile is not None: with open(self.botmetafile, 'rb') as f: rdata = f.read() else: fp = u'.github/BOTMETA.yml' rdata = self.get_file_content(fp) self.botmeta = BotMetadataParser.parse_yaml(rdata) # load the modules logging.info(u'loading modules') self.get_ansible_modules() def _find_match(self, pattern, exact=False): logging.debug(u'exact:{} matching on {}'.format(exact, pattern)) matches = [] if isinstance(pattern, six.text_type): pattern = to_text(to_bytes(pattern,'ascii', 'ignore'), 'ascii') for k, v in six.iteritems(self.modules): if v[u'name'] == pattern: logging.debug(u'match {} on name: {}'.format(k, v[u'name'])) matches = [v] break if not matches: # search by key ... 
aka the filepath for k, v in six.iteritems(self.modules): if k == pattern: logging.debug(u'match {} on key: {}'.format(k, k)) matches = [v] break if not matches and not exact: # search by properties for k, v in six.iteritems(self.modules): for subkey in v.keys(): if v[subkey] == pattern: logging.debug(u'match {} on subkey: {}'.format(k, subkey)) matches.append(v) if not matches and not exact: # Levenshtein distance should workaround most typos distance_map = {} for k, v in six.iteritems(self.modules): mname = v.get(u'name') if not mname: continue if isinstance(mname, six.text_type): mname = to_text(to_bytes(mname, 'ascii', 'ignore'), 'ascii') try: res = Levenshtein.distance(pattern, mname) except TypeError as e: logging.error(e) if C.DEFAULT_BREAKPOINTS: logging.error(u'breakpoint!') import epdb; epdb.st() distance_map[mname] = [res, k] res = sorted(distance_map.items(), key=lambda x: x[1], reverse=True) if len(pattern) > 3 > res[-1][1]: logging.debug(u'levenshtein ratio match: ({}) {} {}'.format(res[-1][-1], res[-1][0], pattern)) matches = [self.modules[res[-1][-1]]] return matches def find_match(self, pattern, exact=False): '''Exact module name matching''' logging.debug(u'find_match for "{}"'.format(pattern)) BLACKLIST = [ u'module_utils', u'callback', u'network modules', u'networking modules' u'windows modules' ] if not pattern or pattern is None: return None if pattern.lower() == u'core': return None ''' if 'docs.ansible.com' in pattern and '_module.html' in pattern: # http://docs.ansible.com/ansible/latest/copy_module.html # http://docs.ansible.com/ansible/latest/dev_guide/developing_modules.html # http://docs.ansible.com/ansible/latest/postgresql_db_module.html # [helm module](https//docs.ansible.com/ansible/2.4/helm_module.html) # Windows module: win_robocopy\nhttp://docs.ansible.com/ansible/latest/win_robocopy_module.html # Examples:\n* archive (https://docs.ansible.com/ansible/archive_module.html)\n* s3_sync (https://docs.ansible.com/ansible/s3_sync_module.html) urls = re.findall( 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', pattern ) #urls = [x for x in urls if '_module.html' in x] #if urls: # import epdb; epdb.st() import epdb; epdb.st() ''' # https://github.com/ansible/ansible/issues/19755 if pattern == u'setup': pattern = u'system/setup.py' if u'/facts.py' in pattern or u' facts.py' in pattern: pattern = u'system/setup.py' # https://github.com/ansible/ansible/issues/18527 # docker-container -> docker_container if u'-' in pattern: pattern = pattern.replace(u'-', u'_') if u'module_utils' in pattern: # https://github.com/ansible/ansible/issues/20368 return None elif u'callback' in pattern: return None elif u'lookup' in pattern: return None elif u'contrib' in pattern and u'inventory' in pattern: return None elif pattern.lower() in BLACKLIST: return None elif u'/' in pattern and not self._find_match(pattern, exact=True): # https://github.com/ansible/ansible/issues/20520 if not pattern.startswith(u'lib/'): keys = self.modules.keys() for k in keys: if pattern in k: ppy = pattern + u'.py' if k.endswith(pattern) or k.endswith(ppy): return self.modules[k] elif pattern.endswith(u'.py') and self._find_match(pattern, exact=False): # https://github.com/ansible/ansible/issues/19889 candidate = self._find_match(pattern, exact=False) if isinstance(candidate, list): if len(candidate) == 1: candidate = candidate[0] if candidate[u'filename'] == pattern: return candidate match = self._find_match(pattern, exact=exact) if not match and not exact: # check for just 
the basename # 2617: ansible-s-extras/network/cloudflare_dns.py bname = os.path.basename(pattern) match = self._find_match(bname) if not match: # check for deprecated name # _fireball -> fireball match = self._find_match(u'_' + bname) # unique the results if isinstance(match, list) and len(match) > 1: _match = [] for m in match: if m not in _match: _match.append(m) match = _match[:] return match def is_valid(self, mname): match = self.find_match(mname, exact=True) if match: return True else: return False def get_repository_for_module(self, mname): match = self.find_match(mname, exact=True) if match: return match[u'repository'] else: return None def get_ansible_modules(self): """Make a list of known modules""" matches = [] module_dir = os.path.join(self.gitrepo.checkoutdir, u'lib/ansible/modules') module_dir = os.path.expanduser(module_dir) for root, _, filenames in os.walk(module_dir): for filename in filenames: if u'lib/ansible/modules' in root and not filename == u'__init__.py': matches.append(os.path.join(root, filename)) matches = sorted(set(matches)) self.populate_modules(matches) # custom fixes newitems = [] for k, v in six.iteritems(self.modules): # include* is almost always an ansible/ansible issue # https://github.com/ansible/ansibullbot/issues/214 if k.endswith(u'/include.py'): self.modules[k][u'repository'] = u'ansible' # https://github.com/ansible/ansibullbot/issues/214 if k.endswith(u'/include_vars.py'): self.modules[k][u'repository'] = u'ansible' if k.endswith(u'/include_role.py'): self.modules[k][u'repository'] = u'ansible' # ansible maintains these if u'include' in k: self.modules[k][u'maintainers'] = [u'ansible'] # deprecated modules are annoying if v[u'name'].startswith(u'_'): dkey = os.path.dirname(v[u'filepath']) dkey = os.path.join(dkey, v[u'filename'].replace(u'_', u'', 1)) if dkey not in self.modules: nd = v.copy() nd[u'name'] = nd[u'name'].replace(u'_', u'', 1) newitems.append((dkey, nd)) for ni in newitems: self.modules[ni[0]] = ni[1] # parse metadata logging.debug(u'set module metadata') self.set_module_metadata() # parse imports logging.debug(u'set module imports') self.set_module_imports() # last modified if self.get_commits: logging.debug(u'set module commits') self.get_module_commits() # parse blame if self.get_blames and self.get_commits: logging.debug(u'set module blames') self.get_module_blames() # depends on metadata now ... 
logging.debug(u'set module maintainers') self.set_maintainers() return self.modules def populate_modules(self, matches): # figure out the names for match in matches: mdict = copy.deepcopy(self.EMPTY_MODULE) mdict[u'filename'] = os.path.basename(match) dirpath = os.path.dirname(match) dirpath = dirpath.replace(self.gitrepo.checkoutdir + u'/', u'') mdict[u'dirpath'] = dirpath filepath = match.replace(self.gitrepo.checkoutdir + u'/', u'') mdict[u'filepath'] = filepath mdict.update( self.split_topics_from_path(filepath) ) mdict[u'repo_filename'] = mdict[u'filepath']\ .replace(u'lib/ansible/modules/%s/' % mdict[u'repository'], u'') # clustering/consul mdict[u'namespaced_module'] = mdict[u'repo_filename'] mdict[u'namespaced_module'] = \ mdict[u'namespaced_module'].replace(u'.py', u'') mdict[u'namespaced_module'] = \ mdict[u'namespaced_module'].replace(u'.ps1', u'') mname = os.path.basename(match) mname = mname.replace(u'.py', u'') mname = mname.replace(u'.ps1', u'') mdict[u'name'] = mname # deprecated modules if mname.startswith(u'_'): mdict[u'deprecated'] = True deprecated_filename = \ os.path.dirname(mdict[u'namespaced_module']) deprecated_filename = \ os.path.join(deprecated_filename, mname[1:] + u'.py') mdict[u'deprecated_filename'] = deprecated_filename else: mdict[u'deprecated_filename'] = mdict[u'repo_filename'] self.modules[filepath] = mdict # meta is a special module self.modules[u'meta'] = copy.deepcopy(self.EMPTY_MODULE) self.modules[u'meta'][u'name'] = u'meta' self.modules[u'meta'][u'repo_filename'] = u'meta' def get_module_commits(self): keys = self.modules.keys() keys = sorted(keys) for k in keys: self.commits[k] = [] cpath = os.path.join(self.gitrepo.checkoutdir, k) if not os.path.isfile(cpath): continue mtime = os.path.getmtime(cpath) refresh = False pfile = os.path.join( self.scraper_cache, k.replace(u'/', u'_') + u'.commits.pickle' ) if not os.path.isfile(pfile): refresh = True else: pickle_kwargs = {'encoding': 'bytes'} if six.PY3 else {} print(pfile) with open(pfile, 'rb') as f: pdata = pickle_load(f, **pickle_kwargs) if pdata[0] == mtime: self.commits[k] = pdata[1] else: refresh = True if refresh: logging.info(u'refresh commit cache for %s' % k) cmd = u'cd %s; git log --follow %s' % (self.gitrepo.checkoutdir, k) (rc, so, se) = run_command(cmd) for line in to_text(so).split(u'\n'): if line.startswith(u'commit '): commit = { u'name': None, u'email': None, u'login': None, u'hash': line.split()[-1], u'date': None } # Author: Matt Clay <*****@*****.**> if line.startswith(u'Author: '): line = line.replace(u'Author: ', u'') line = line.replace(u'<', u'') line = line.replace(u'>', u'') lparts = line.split() if u'@' in lparts[-1]: commit[u'email'] = lparts[-1] commit[u'name'] = u' '.join(lparts[:-1]) else: pass if commit[u'email'] and \ u'noreply.github.com' in commit[u'email']: commit[u'login'] = commit[u'email'].split(u'@')[0] # Date: Sat Jan 28 23:28:53 2017 -0800 if line.startswith(u'Date:'): dstr = line.split(u':', 1)[1].strip() dstr = u' '.join(dstr.split(u' ')[:-1]) ds = datetime.datetime.strptime( to_text(dstr), u'%a %b %d %H:%M:%S %Y' ) commit[u'date'] = ds self.commits[k].append(commit) with open(pfile, 'wb') as f: pickle_dump((mtime, self.commits[k]), f) def last_commit_for_file(self, filepath): if filepath in self.commits: return self.commits[filepath][0][u'hash'] # git log --pretty=format:'%H' -1 # lib/ansible/modules/cloud/amazon/ec2_metric_alarm.py cmd = u'cd %s; git log --pretty=format:\'%%H\' -1 %s' % \ (self.gitrepo.checkoutdir, filepath) (rc, so, se) = 
run_command(cmd) return to_text(so).strip() def get_module_blames(self): logging.debug(u'build email cache') emails_cache = self.session.query(Email) emails_cache = [(x.email, x.login) for x in emails_cache] self.emails_cache = dict(emails_cache) logging.debug(u'build blame cache') blame_cache = self.session.query(Blame).all() blame_cache = [x.file_commit for x in blame_cache] blame_cache = sorted(set(blame_cache)) logging.debug(u'eval module hashes') changed = False keys = sorted(self.modules.keys()) for k in keys: if k not in self.files: self.committers[k] = {} continue ghash = self.last_commit_for_file(k) if ghash in blame_cache: continue logging.debug(u'checking hash for {}'.format(k)) res = self.session.query(Blame).filter_by(file_name=k, file_commit=ghash).all() hashes = [x.file_commit for x in res] if ghash not in hashes: logging.debug(u'hash {} not found for {}, updating blames'.format(ghash, k)) scraper_args = [u'ansible', u'ansible', u'devel', k] uns, emailmap = self.gqlc.get_usernames_from_filename_blame(*scraper_args) # check the emails for email, login in emailmap.items(): if email in self.emails_cache: continue exists = self.session.query(Email).filter_by(email=email).first() if not exists: logging.debug(u'insert {}:{}'.format(login, email)) _email = Email(email=email, login=login) self.session.add(_email) changed = True # check the blames for login, commits in uns.items(): for commit in commits: kwargs = { u'file_name': k, u'file_commit': ghash, u'author_commit': commit, u'author_login': login } exists = self.session.query(Blame).filter_by(**kwargs).first() if not exists: logging.debug(u'insert {}:{}:{}'.format(k, commit, login)) _blame = Blame(**kwargs) self.session.add(_blame) changed = True if changed: self.session.commit() logging.debug(u're-build email cache') emails_cache = self.session.query(Email) emails_cache = [(x.email, x.login) for x in emails_cache] self.emails_cache = dict(emails_cache) # fill in what we can ... 
logging.debug(u'fill in commit logins') for k in keys: for idc, commit in enumerate(self.commits[k][:]): if not commit.get(u'login'): continue login = self.emails_cache.get(commit[u'email']) if not login and u'@users.noreply.github.com' in commit[u'email']: login = commit[u'email'].split(u'@')[0] self.emails_cache[commit[u'email']] = login if not login: print(u'unknown: {}'.format(commit[u'email'])) self.commits[k][idc][u'login'] = self.emails_cache.get(login) def get_emails_by_login(self, login): res = self.session.query(Email).filter_by(login=login) emails = [x.email for x in res.values()] return emails def _get_module_blames(self): ''' Scrape the blame page for each module and store it ''' keys = sorted(self.modules.keys()) # scrape the data for k in keys: cpath = os.path.join(self.gitrepo.checkoutdir, k) if not os.path.isfile(cpath): self.committers[k] = {} continue ghash = self.last_commit_for_file(k) pfile = os.path.join( self.scraper_cache, k.replace(u'/', u'_') + u'.blame.pickle' ) sargs = [u'ansible', u'ansible', u'devel', k] refresh = False if not os.path.isfile(pfile): refresh = True else: logging.debug(u'load {}'.format(pfile)) with open(pfile, 'rb') as f: pdata = pickle_load(f) if C.DEFAULT_BREAKPOINTS: logging.error(u'breakpoint!') import epdb; epdb.st() if pdata[0] == ghash: self.committers[k] = pdata[1] if len(pdata) == 3: # use emailmap if available emailmap = pdata[2] else: emailmap = {} else: refresh = True if refresh: if self.gqlc: logging.debug(u'graphql blame usernames {}'.format(pfile)) uns, emailmap = self.gqlc.get_usernames_from_filename_blame(*sargs) else: emailmap = {} # scrapping: emails not available logging.debug(u'www blame usernames {}'.format(pfile)) uns = self.gws.get_usernames_from_filename_blame(*sargs) self.committers[k] = uns with open(pfile, 'wb') as f: pickle_dump((ghash, uns, emailmap), f) for email, github_id in emailmap.items(): if email not in self.emails_cache: self.emails_cache[email] = github_id # add scraped logins to the map for k in keys: for idx, x in enumerate(self.commits[k]): if x[u'email'] in [u'@']: continue if x[u'email'] not in self.emails_cache: self.emails_cache[x[u'email']] = None if x[u'login']: self.emails_cache[x[u'email']] = x[u'login'] continue xhash = x[u'hash'] for ck, cv in six.iteritems(self.committers[k]): if xhash in cv: self.emails_cache[x[u'email']] = ck break # fill in what we can ... 
for k in keys: for idx, x in enumerate(self.commits[k]): if not x[u'login']: if x[u'email'] in [u'@']: continue if self.emails_cache[x[u'email']]: login = self.emails_cache[x[u'email']] xhash = x[u'hash'] self.commits[k][idx][u'login'] = login if login not in self.committers[k]: self.committers[k][login] = [] if xhash not in self.committers[k][login]: self.committers[k][login].append(xhash) def set_maintainers(self): '''Define the maintainers for each module''' # grep the authors: for k, v in six.iteritems(self.modules): if v[u'filepath'] is None: continue mfile = os.path.join(self.gitrepo.checkoutdir, v[u'filepath']) authors = self.get_module_authors(mfile) self.modules[k][u'authors'] = authors # authors are maintainers by -default- self.modules[k][u'maintainers'] += authors self.modules[k][u'maintainers'] = \ sorted(set(self.modules[k][u'maintainers'])) metadata = self.botmeta[u'files'].keys() for k, v in six.iteritems(self.modules): if k == u'meta': continue if k in self.botmeta[u'files']: # There are metadata in .github/BOTMETA.yml for this file # copy maintainers_keys self.modules[k][u'maintainers_keys'] = self.botmeta[u'files'][k][u'maintainers_keys'][:] if self.botmeta[u'files'][k]: maintainers = self.botmeta[u'files'][k].get(u'maintainers', []) for maintainer in maintainers: if maintainer not in self.modules[k][u'maintainers']: self.modules[k][u'maintainers'].append(maintainer) # remove the people who want to be ignored if u'ignored' in self.botmeta[u'files'][k]: ignored = self.botmeta[u'files'][k][u'ignored'] for x in ignored: if x in self.modules[k][u'maintainers']: self.modules[k][u'maintainers'].remove(x) else: # There isn't metadata in .github/BOTMETA.yml for this file best_match = None for mkey in metadata: if v[u'filepath'].startswith(mkey): if not best_match: best_match = mkey continue if len(mkey) > len(best_match): best_match = mkey if best_match: self.modules[k][u'maintainers_keys'] = [best_match] for maintainer in self.botmeta[u'files'][best_match].get(u'maintainers', []): if maintainer not in self.modules[k][u'maintainers']: self.modules[k][u'maintainers'].append(maintainer) # remove the people who want to be ignored for ignored in self.botmeta[u'files'][best_match].get(u'ignored', []): if ignored in self.modules[k][u'maintainers']: self.modules[k][u'maintainers'].remove(ignored) # save a pristine copy so that higher level code can still use it self.modules[k][u'maintainers'] = sorted(set(self.modules[k][u'maintainers'])) self.modules[k][u'_maintainers'] = \ [x for x in self.modules[k][u'maintainers']] # set the namespace maintainers ... 
for k, v in six.iteritems(self.modules): if u'namespace_maintainers' not in self.modules[k]: self.modules[k][u'namespace_maintainers'] = [] if v.get(u'namespace'): ns = v.get(u'namespace') nms = self.get_maintainers_for_namespace(ns) self.modules[k][u'namespace_maintainers'] = nms def split_topics_from_path(self, module_file): subpath = module_file.replace(u'lib/ansible/modules/', u'') path_parts = subpath.split(u'/') topic = path_parts[0] if len(path_parts) > 2: subtopic = path_parts[1] fulltopic = u'/'.join(path_parts[0:2]) else: subtopic = None fulltopic = path_parts[0] tdata = { u'fulltopic': fulltopic, u'namespace': fulltopic, u'topic': topic, u'subtopic': subtopic } return tdata def get_module_authors(self, module_file): """Grep the authors out of the module docstrings""" if not os.path.exists(module_file): return [] documentation = b'' inphase = False with io.open(module_file, 'rb') as f: for line in f: if b'DOCUMENTATION' in line: inphase = True continue if line.strip().endswith((b"'''", b'"""')): break if inphase: documentation += line if not documentation: return [] # clean out any other yaml besides author to save time inphase = False author_lines = u'' doc_lines = to_text(documentation).split(u'\n') for idx, x in enumerate(doc_lines): if x.startswith(u'author'): inphase = True if inphase and not x.strip().startswith((u'-', u'author')): inphase = False break if inphase: author_lines += x + u'\n' if not author_lines: return [] ydata = {} try: ydata = yaml.load(author_lines, BotYAMLLoader) except Exception as e: print(e) return [] # quit early if the yaml was not valid if not ydata: return [] # quit if the key was not found if u'author' not in ydata: return [] if not isinstance(ydata[u'author'], list): ydata[u'author'] = [ydata[u'author']] authors = [] for author in ydata[u'author']: github_ids = self.extract_github_id(author) if github_ids: authors.extend(github_ids) return authors def extract_github_id(self, author): authors = set() if author is None: return [] if u'ansible core team' in author.lower(): authors.add(u'ansible') elif u'@' in author: # match github ids but not emails authors.update(re.findall(r'(?<!\w)@([\w-]+)(?![\w.])', author)) elif u'github.com/' in author: # {'author': 'Henrique Rodrigues (github.com/Sodki)'} idx = author.find(u'github.com/') author = author[idx+11:] authors.add(author.replace(u')', u'')) elif u'(' in author and len(author.split()) == 3: # Mathieu Bultel (matbu) idx = author.find(u'(') author = author[idx+1:] authors.add(author.replace(u')', u'')) # search for emails for email in re.findall(r'[<(]([^@]+@[^)>]+)[)>]', author): github_id = self.emails_cache.get(email) if github_id: authors.add(github_id) return list(authors) def fuzzy_match(self, repo=None, title=None, component=None): '''Fuzzy matching for modules''' logging.debug(u'fuzzy match {}'.format( to_text(to_bytes(component, 'ascii', 'ignore'), 'ascii')) ) if component.lower() == u'core': return None # https://github.com/ansible/ansible/issues/18179 if u'validate-modules' in component: return None # https://github.com/ansible/ansible/issues/20368 if u'module_utils' in component: return None if u'new module' in component: return None # authorized_keys vs. 
authorized_key if component and component.endswith(u's'): tm = self.find_match(component[:-1]) if tm: if not isinstance(tm, list): return tm[u'name'] elif len(tm) == 1: return tm[0][u'name'] else: if C.DEFAULT_BREAKPOINTS: logging.error(u'breakpoint!') import epdb; epdb.st() match = None known_modules = [] for k, v in six.iteritems(self.modules): if v[u'name'] in [u'include']: continue known_modules.append(v[u'name']) title = title.lower() title = title.replace(u':', u'') title_matches = [x for x in known_modules if x + u' module' in title] if not title_matches: title_matches = [x for x in known_modules if title.startswith(x + u' ')] if not title_matches: title_matches = \ [x for x in known_modules if u' ' + x + u' ' in title] if title_matches: title_matches = [x for x in title_matches if x != u'at'] # don't do singular word matching in title for ansible/ansible cmatches = None if component: cmatches = [x for x in known_modules if x in component] cmatches = [x for x in cmatches if not u'_' + x in component] # globs if not cmatches and u'*' in component: fmatches = [x for x in known_modules if fnmatch.fnmatch(x, component)] if fmatches: cmatches = fmatches[:] if title_matches: # use title ... ? cmatches = [x for x in cmatches if x in title_matches and x not in [u'at']] if cmatches: if len(cmatches) >= 1 and (u'*' not in component and u'modules' not in component): match = cmatches[0] else: match = cmatches[:] if not match: if u'docs.ansible.com' in component: pass else: pass logging.debug("module - component matches: %s" % cmatches) if not match: if len(title_matches) == 1: match = title_matches[0] else: logging.debug("module - title matches: %s" % title_matches) return match def is_multi(self, rawtext): '''Is the string a list or a glob of modules?''' if rawtext: lines = rawtext.split(u'\n') # clean up lines lines = [x.strip() for x in lines if x.strip()] lines = [x for x in lines if len(x) > 2] if len(lines) > 1: return True if lines: if lines[0].strip().endswith(u'*'): return True return False # https://github.com/ansible/ansible-modules-core/issues/3831 def multi_match(self, rawtext): '''Return a list of matches for a given glob or list of names''' matches = [] lines = rawtext.split(u'\n') lines = [x.strip() for x in lines if x.strip()] for line in lines: # is it an exact name, a path, a globbed name, a globbed path? if line.endswith(u'*'): thiskey = line.replace(u'*', u'') keymatches = [] for k in self.modules.keys(): if thiskey in k: keymatches.append(k) for k in keymatches: matches.append(self.modules[k].copy()) else: match = self.find_match(line) if match: matches.append(match) # unique the list tmplist = [] for x in matches: if x not in tmplist: tmplist.append(x) if matches != tmplist: matches = [x for x in tmplist] return matches def set_module_metadata(self): for k, v in six.iteritems(self.modules): if not v[u'filepath']: continue mfile = os.path.join(self.gitrepo.checkoutdir, v[u'filepath']) if not mfile.endswith(u'.py'): # metadata is only the .py files ... ext = mfile.split(u'.')[-1] mfile = mfile.replace(u'.' 
+ ext, u'.py', 1) self.modules[k][u'metadata'].update(self.get_module_metadata(mfile)) def get_module_metadata(self, module_file): meta = {} if not os.path.isfile(module_file): return meta rawmeta = u'' inphase = False with io.open(module_file, 'r', encoding='utf-8') as f: for line in f: if line.startswith(u'ANSIBLE_METADATA'): inphase = True if line.startswith(u'DOCUMENTATION'): break if inphase: rawmeta += line rawmeta = rawmeta.replace(u'ANSIBLE_METADATA =', u'', 1) rawmeta = rawmeta.strip() try: meta = ast.literal_eval(rawmeta) tmp_meta = {} for k, v in meta.items(): if isinstance(k, six.binary_type): k = to_text(k) if isinstance(v, six.binary_type): v = to_text(v) if isinstance(v, list): tmp_list = [] for i in v: if isinstance(i, six.binary_type): i = to_text(i) tmp_list.append(i) v = tmp_list del tmp_list tmp_meta[k] = v meta = tmp_meta del tmp_meta except SyntaxError: pass return meta def set_module_imports(self): for k, v in six.iteritems(self.modules): if not v[u'filepath']: continue mfile = os.path.join(self.gitrepo.checkoutdir, v[u'filepath']) self.modules[k][u'imports'] = self.get_module_imports(mfile) def get_module_imports(self, module_file): mimports = [] if not os.path.isfile(module_file): return mimports else: with open(module_file, 'rb') as f: for line in f: line = line.strip() line = line.replace(b',', b'') if line.startswith(b'import') or \ (b'import' in line and b'from' in line): lparts = line.split() if line.startswith(b'import '): mimports.append(lparts[1]) elif line.startswith(b'from '): mpath = lparts[1] + b'.' for spath in lparts[3:]: mimports.append(mpath + spath) return [to_text(m) for m in mimports] @property def all_maintainers(self): maintainers = set() for path, metadata in self.botmeta[u'files'].items(): maintainers.update(metadata.get(u'maintainers', [])) return maintainers @property def all_authors(self): authors = set() for key, metadata in self.modules.items(): authors.update(metadata.get(u'authors', [])) return authors def get_maintainers_for_namespace(self, namespace): maintainers = [] for k, v in self.modules.items(): if u'namespace' not in v or u'maintainers' not in v: continue if v[u'namespace'] == namespace: for m in v[u'maintainers']: if m not in maintainers: maintainers.append(m) maintainers = [x for x in maintainers if x.strip()] return maintainers @staticmethod def replace_ansible(maintainers, ansible_members, bots=[]): '''Replace -ansible- with the -humans- in the org''' newlist = [] for m in maintainers: if m != u'ansible': newlist.append(m) else: newlist += ansible_members newlist = sorted(set(newlist)) newlist = [x for x in newlist if x not in bots] return newlist def get_file_content(self, filepath): fpath = os.path.join(self.gitrepo.checkoutdir, filepath) if not os.path.isfile(fpath): return None with io.open(fpath, 'r', encoding='utf-8') as f: data = f.read() return data
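# get_module_metadata above recovers ANSIBLE_METADATA by slicing the raw
# source text and handing it to ast.literal_eval, which avoids importing the
# module. Standalone sketch of the same trick; SAMPLE_MODULE is invented for
# illustration.
import ast


SAMPLE_MODULE = u"""
ANSIBLE_METADATA = {'metadata_version': '1.1',
                    'status': ['preview'],
                    'supported_by': 'community'}

DOCUMENTATION = '''
module: example
'''
"""


def parse_ansible_metadata(source_text):
    '''Return the ANSIBLE_METADATA dict without importing the module'''
    rawmeta = u''
    inphase = False
    for line in source_text.splitlines(True):
        if line.startswith(u'ANSIBLE_METADATA'):
            inphase = True
        if line.startswith(u'DOCUMENTATION'):
            break
        if inphase:
            rawmeta += line
    rawmeta = rawmeta.replace(u'ANSIBLE_METADATA =', u'', 1).strip()
    try:
        # literal_eval only evaluates literals, so arbitrary code in the
        # module source cannot run here
        return ast.literal_eval(rawmeta)
    except SyntaxError:
        return {}


if __name__ == '__main__':
    print(parse_ansible_metadata(SAMPLE_MODULE))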
def _collect_repo(self, repo, issuenums=None): '''Collect issues for an individual repo''' logging.info('getting repo obj for %s' % repo) if repo not in self.repos: gitrepo = GitRepoWrapper( cachedir=self.cachedir_base, repo=f'https://github.com/{repo}', commit=self.args.ansible_commit, ) self.repos[repo] = { 'repo': self.ghw.get_repo(repo), 'issues': [], 'processed': [], 'since': None, 'stale': [], 'loopcount': 0, 'labels': self.ghw.get_valid_labels(repo), 'gitrepo': gitrepo, } else: # force a clean repo object to limit caching problems logging.info('updating repo') self.repos[repo]['repo'] = self.ghw.get_repo(repo) logging.info('updating checkout') self.repos[repo]['gitrepo'].update() # clear the issues self.repos[repo]['issues'] = {} # increment the loopcount self.repos[repo]['loopcount'] += 1 logging.info('getting issue objs for %s' % repo) self.update_issue_summaries(repopath=repo, issuenums=issuenums) issuecache = {} numbers = self.issue_summaries[repo].keys() numbers = {int(x) for x in numbers} if issuenums: numbers.intersection_update(issuenums) numbers = list(numbers) logging.info('%s known numbers' % len(numbers)) if self.args.daemonize: if not self.repos[repo]['since']: ts = [ x[1]['updated_at'] for x in self.issue_summaries[repo].items() if x[1]['updated_at'] ] ts += [ x[1]['created_at'] for x in self.issue_summaries[repo].items() if x[1]['created_at'] ] ts = sorted(set(ts)) if ts: self.repos[repo]['since'] = ts[-1] else: since = strip_time_safely(self.repos[repo]['since']) api_since = self.repos[repo]['repo'].get_issues(since=since) numbers = [] for x in api_since: numbers.append(x.number) issuecache[x.number] = x numbers = sorted({int(n) for n in numbers}) logging.info('%s numbers after [api] since == %s' % (len(numbers), since)) for k, v in self.issue_summaries[repo].items(): if v['created_at'] is None: # issue is closed and was never processed continue if v['created_at'] > self.repos[repo]['since']: numbers.append(k) numbers = sorted({int(n) for n in numbers}) logging.info('%s numbers after [www] since == %s' % (len(numbers), since)) if self.args.start_at and self.repos[repo]['loopcount'] == 0: numbers = [x for x in numbers if x <= self.args.start_at] logging.info('%s numbers after start-at' % len(numbers)) # Get stale numbers if not targeting if self.args.daemonize and self.repos[repo]['loopcount'] > 0: logging.info('checking for stale numbers') stale = self.get_stale_numbers(repo) self.repos[repo]['stale'] = [int(x) for x in stale] numbers += [int(x) for x in stale] numbers = sorted(set(numbers)) logging.info('%s numbers after stale check' % len(numbers)) ################################################################ # PRE-FILTERING TO PREVENT EXCESSIVE API CALLS ################################################################ # filter just the open numbers if not self.args.only_closed and not self.args.ignore_state: numbers = [ x for x in numbers if (to_text(x) in self.issue_summaries[repo] and self.issue_summaries[repo][to_text(x)]['state'] == 'open') ] logging.info('%s numbers after checking state' % len(numbers)) # filter by type if self.args.only_issues: numbers = [ x for x in numbers if self.issue_summaries[repo][to_text(x)]['type'] == 'issue' ] logging.info('%s numbers after checking type' % len(numbers)) elif self.args.only_prs: numbers = [ x for x in numbers if self.issue_summaries[repo][to_text(x)] ['type'] == 'pullrequest' ] logging.info('%s numbers after checking type' % len(numbers)) numbers = sorted({int(x) for x in numbers}) if self.args.sort == 'desc': 
numbers = [x for x in reversed(numbers)] if self.args.last and len(numbers) > self.args.last: numbers = numbers[0 - self.args.last:] # Use iterator to avoid requesting all issues upfront self.repos[repo]['issues'] = RepoIssuesIterator( self.repos[repo]['repo'], numbers, issuecache=issuecache) logging.info('getting repo objs for %s complete' % repo)
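# Condensed, dependency-free version of the pre-filtering pipeline in
# _collect_repo above. The summary dict shape ('state' and 'type' keys)
# mirrors the issue summaries used there; the sample data and the defaults
# are invented for illustration.
def prefilter_numbers(summaries, only_issues=False, only_prs=False,
                      start_at=None, last=None, sort='asc'):
    '''Reduce candidate issue numbers before any per-issue API calls'''
    # keep only open items, as in the state check above
    numbers = [int(n) for n, s in summaries.items() if s['state'] == 'open']
    if only_issues:
        numbers = [n for n in numbers if summaries[str(n)]['type'] == 'issue']
    elif only_prs:
        numbers = [n for n in numbers if summaries[str(n)]['type'] == 'pullrequest']
    if start_at:
        numbers = [n for n in numbers if n <= start_at]
    numbers = sorted(set(numbers))
    if sort == 'desc':
        numbers = list(reversed(numbers))
    if last and len(numbers) > last:
        numbers = numbers[0 - last:]
    return numbers


if __name__ == '__main__':
    summaries = {
        '100': {'state': 'open', 'type': 'issue'},
        '101': {'state': 'closed', 'type': 'issue'},
        '102': {'state': 'open', 'type': 'pullrequest'},
    }
    print(prefilter_numbers(summaries, only_issues=True))  # -> [100]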
def __init__(self, repo, cachedir='/tmp'):
    self.cachedir = cachedir
    if not os.path.isdir(self.cachedir):
        os.makedirs(self.cachedir)
    self.gitrepo = GitRepoWrapper(cachedir=self.cachedir, repo=repo)
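# Usage sketch for the GitRepoWrapper calls exercised throughout this file:
# construct it with a cache directory and clone url, then read files out of
# the checkout. The import path is assumed from the ansibullbot layout, and
# the constructor keywords are limited to the ones shown above.
import os

from ansibullbot.utils.git_tools import GitRepoWrapper  # import path assumed


def inspect_checkout(cachedir='~/.ansibullbot/cache'):
    cachedir = os.path.expanduser(cachedir)
    gitrepo = GitRepoWrapper(
        cachedir=cachedir,
        repo='https://github.com/ansible/ansible',
    )

    # attribute and method names used elsewhere in this module
    if gitrepo.exists('.github/BOTMETA.yml'):
        botmeta_text = gitrepo.get_file_content('.github/BOTMETA.yml')
        print('BOTMETA is %s bytes' % len(botmeta_text))

    module_files = [fn for fn in gitrepo.files if fn.startswith('lib/ansible/modules')]
    print('%s module files in the checkout' % len(module_files))


if __name__ == '__main__':
    inspect_checkout()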