Example #1
    def __init__(self,
                 commits=True,
                 blames=True,
                 botmeta=None,
                 botmetafile=None,
                 maintainers=None,
                 gh_client=None,
                 cachedir=u'~/.ansibullbot/cache',
                 gitrepo=None):
        '''
        maintainers: defaultdict(dict) whose keys are filepaths and whose values are dicts
        gh_client: GraphQL GitHub client
        '''
        self.get_commits = commits
        self.get_blames = blames
        self.botmetafile = botmetafile
        if botmeta:
            self.botmeta = botmeta
        else:
            # BOTMETA.yml file with minor updates (macros rendered, empty
            # default values fixed)
            self.botmeta = {}
        self.modules = {}  # keys: paths of files belonging to the repository
        self.maintainers = maintainers or {}
        self.importmap = {}
        self.scraper_cache = os.path.join(cachedir, u'ansible.modules.scraper')
        self.scraper_cache = os.path.expanduser(self.scraper_cache)
        self.gws = GithubWebScraper(cachedir=self.scraper_cache)
        self.gqlc = gh_client
        self.files = []

        if gitrepo:
            self.gitrepo = gitrepo
        else:
            self.gitrepo = GitRepoWrapper(
                cachedir=cachedir, repo=u'https://github.com/ansible/ansible')

        # sqlalchemy
        unc = os.path.join(cachedir, u'ansible_module_indexer.db')
        unc = os.path.expanduser(unc)
        unc = u'sqlite:///' + unc

        self.engine = create_engine(unc)
        self.Session = sessionmaker(bind=self.engine)
        self.session = self.Session()

        Email.metadata.create_all(self.engine)
        Blame.metadata.create_all(self.engine)

        # committers by module
        self.committers = {}
        # commits by module
        self.commits = {}
        # map of email to github login
        self.emails_cache = {}

        # load the bot meta
        self.update(force=True)
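
A minimal construction sketch, assuming the class above is importable as
ansibullbot.utils.moduletools.ModuleIndexer (the import path is an
assumption, not confirmed by the examples):

# hypothetical usage; __init__ already ran update(force=True), so BOTMETA is
# parsed, self.modules is populated, and the sqlite cache exists at
# ~/.ansibullbot/cache/ansible_module_indexer.db
from ansibullbot.utils.moduletools import ModuleIndexer

mi = ModuleIndexer(commits=False, blames=False)
print(len(mi.modules))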
Example #2
def main():

    if len(sys.argv) != 2:
        usage()
        sys.exit(1)

    ec = EmailCache()
    cm = AnsibleComponentMatcher(gitrepo=GitRepoWrapper(cachedir=CACHEDIR,
                                                        repo=REPO),
                                 botmetafile=None,
                                 cachedir=CACHEDIR,
                                 email_cache=ec)
    cm.update()

    # This gets the rendered meta for just modules ...
    #meta = cm.BOTMETA
    #print(json.dumps(cm.BOTMETA, indent=2, sort_keys=True))

    # This is how the bot gets full meta for a file ...
    FULLMETA = {}
    for filen in cm.gitrepo.files:
        FULLMETA[filen] = cm.get_meta_for_file(filen)

    with open(sys.argv[1], 'w') as f:
        f.write(json.dumps(FULLMETA, indent=2, sort_keys=True))
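
Invocation sketch for the script above (the script name is hypothetical):

# $ python dump_full_meta.py /tmp/fullmeta.json
# sys.argv[1] becomes the JSON output path written at the end of main().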
Example #3
    def get_repo_for_collection(self, fqcn):
        today = datetime.datetime.now()

        if fqcn not in self._gitrepos:

            # reduce the number of requests ...
            try:
                rurl = self._checkout_index.get(fqcn, {}).get('url')
            except AttributeError as e:
                print(e)
                import epdb
                epdb.st()

            if rurl is None:
                # https://galaxy.ansible.com/api/v2/collections/devoperate/base/
                curl = self._baseurl + '/api/v2/collections/' + fqcn.replace(
                    '.', '/') + '/'
                rr = requests.get(curl)
                jdata = rr.json()
                vurl = jdata['latest_version']['href']
                rr2 = requests.get(vurl)
                jdata2 = rr2.json()
                rurl = jdata2.get('metadata', {}).get('repository')

            # reduce the number of clones and rebases ...
            needs_rebase = False
            if fqcn not in self._checkout_index:
                needs_rebase = True
            elif not self._checkout_index.get(fqcn, {}).get('checkout'):
                needs_rebase = True
            elif not self._checkout_index.get(fqcn, {}).get('updated'):
                needs_rebase = True
            elif (today - self._checkout_index[fqcn]['updated']).days > 0:
                needs_rebase = True

            logging.info('checkout %s -> %s' % (fqcn, rurl))
            grepo = GitRepoWrapper(cachedir=self.cachedir,
                                   repo=rurl,
                                   rebase=needs_rebase)
            self._gitrepos[fqcn] = grepo

            # keep the last updated time if not rebased ...
            if needs_rebase:
                updated = datetime.datetime.now()
            else:
                updated = self._checkout_index[fqcn]['updated']

            self._checkout_index[fqcn] = {
                'url': rurl,
                'fqcn': fqcn,
                'checkout': grepo.checkoutdir,
                'updated': updated
            }
            self._save_checkout_index()

        return self._gitrepos[fqcn]
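
For reference, a sketch of the checkout-index entry the method writes; the
values are illustrative:

# self._checkout_index['devoperate.base'] == {
#     'url': 'https://github.com/devoperate/base',
#     'fqcn': 'devoperate.base',
#     'checkout': '<cachedir>/devoperate.base',   # grepo.checkoutdir
#     'updated': datetime.datetime(2020, 1, 1, 0, 0),
# }
# needs_rebase flips when the entry is missing or incomplete, or when
# 'updated' is at least one day old, so a checkout is rebased at most once
# per day.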
Example #4
    def __init__(self):
        self.cachedir = '/tmp/ansibot.cache'
        self.gitrepo = GitRepoWrapper(
            cachedir=self.cachedir,
            repo='https://github.com/ansible/ansible',
            commit='a76d78f6919f62698341be2f102297a2ce30897c')
        self.component_matcher = AnsibleComponentMatcher(
            usecache=True,
            gitrepo=self.gitrepo,
            cachedir='/tmp/ansibot.cache.components',
            email_cache={})
Example #5
    def __init__(self, gitrepo=None, botmetafile=None, cachedir=None, email_cache=None, file_indexer=None):
        self.cachedir = cachedir
        self.botmetafile = botmetafile
        self.email_cache = email_cache

        if file_indexer:
            self.file_indexer = file_indexer
        else:
            self.file_indexer = FileIndexer(
                botmetafile=self.botmetafile,
                checkoutdir=self.cachedir
            )

        if gitrepo:
            self.gitrepo = gitrepo
        else:
            self.gitrepo = GitRepoWrapper(cachedir=self.cachedir, repo=self.REPO)

        self.strategy = None
        self.strategies = []

        self.indexed_at = False
        self.updated_at = None
        self.update()
Example #6
    def __init__(self, commits=True, blames=True, botmetafile=None, maintainers=None, gh_client=None, cachedir=u'~/.ansibullbot/cache', gitrepo=None):
        '''
        maintainers: defaultdict(dict) whose keys are filepaths and whose values are dicts
        gh_client: GraphQL GitHub client
        '''
        self.get_commits = commits
        self.get_blames = blames
        self.botmetafile = botmetafile
        self.botmeta = {}  # BOTMETA.yml file with minor updates (macros rendered, empty default values fixed)
        self.modules = {}  # keys: paths of files belonging to the repository
        self.maintainers = maintainers or {}
        self.importmap = {}
        self.scraper_cache = os.path.join(cachedir, u'ansible.modules.scraper')
        self.scraper_cache = os.path.expanduser(self.scraper_cache)
        self.gws = GithubWebScraper(cachedir=self.scraper_cache)
        self.gqlc = gh_client
        self.files = []

        if gitrepo:
            self.gitrepo = gitrepo
        else:
            self.gitrepo = GitRepoWrapper(cachedir=cachedir, repo=u'https://github.com/ansible/ansible')

        # sqlalchemy
        unc = os.path.join(cachedir, u'ansible_module_indexer.db')
        unc = os.path.expanduser(unc)
        unc = u'sqlite:///' + unc

        self.engine = create_engine(unc)
        self.Session = sessionmaker(bind=self.engine)
        self.session = self.Session()

        Email.metadata.create_all(self.engine)
        Blame.metadata.create_all(self.engine)

        # committers by module
        self.committers = {}
        # commits by module
        self.commits = {}
        # map of email to github login
        self.emails_cache = {}

        # load the bot meta
        self.update(force=True)
Example #7
class ModuleIndexer(object):

    EMPTY_MODULE = {
        u'authors': [],
        u'name': None,
        u'namespaced_module': None,
        u'namespace_maintainers': [],
        u'deprecated': False,
        u'deprecated_filename': None,
        u'dirpath': None,
        u'filename': None,
        u'filepath': None,
        u'fulltopic': None,
        u'maintainers': [],
        u'_maintainers': [],
        u'maintainers_keys': None,
        u'metadata': {},
        u'repo_filename': None,
        u'repository': u'ansible',
        u'subtopic': None,
        u'topic': None,
        u'imports': []
    }

    def __init__(self,
                 commits=True,
                 blames=True,
                 botmetafile=None,
                 maintainers=None,
                 gh_client=None,
                 cachedir=u'~/.ansibullbot/cache',
                 gitrepo=None):
        '''
        maintainers: defaultdict(dict) whose keys are filepaths and whose values are dicts
        gh_client: GraphQL GitHub client
        '''
        self.get_commits = commits
        self.get_blames = blames
        self.botmetafile = botmetafile
        # BOTMETA.yml file with minor updates (macros rendered, empty default
        # values fixed)
        self.botmeta = {}
        self.modules = {}  # keys: paths of files belonging to the repository
        self.maintainers = maintainers or {}
        self.importmap = {}
        self.scraper_cache = os.path.join(cachedir, u'ansible.modules.scraper')
        self.scraper_cache = os.path.expanduser(self.scraper_cache)
        self.gws = GithubWebScraper(cachedir=self.scraper_cache)
        self.gqlc = gh_client
        self.files = []

        if gitrepo:
            self.gitrepo = gitrepo
        else:
            self.gitrepo = GitRepoWrapper(
                cachedir=cachedir,
                repo=u'https://github.com/ansible-collections/community.general'
            )

        # sqlalchemy
        unc = os.path.join(cachedir, u'ansible_module_indexer.db')
        unc = os.path.expanduser(unc)
        unc = u'sqlite:///' + unc

        self.engine = create_engine(unc)
        self.Session = sessionmaker(bind=self.engine)
        self.session = self.Session()

        Email.metadata.create_all(self.engine)
        Blame.metadata.create_all(self.engine)

        # committers by module
        self.committers = {}
        # commits by module
        self.commits = {}
        # map of email to github login
        self.emails_cache = {}

        # load the bot meta
        self.update(force=True)

    def update(self, force=False):
        '''Reload everything if there are new commits'''
        changed = self.gitrepo.manage_checkout()
        if changed or force:
            self.get_files()
            self.parse_metadata()

    def get_files(self):
        '''Cache a list of filenames in the checkout'''
        cmd = u'cd {}; git ls-files'.format(self.gitrepo.checkoutdir)
        (rc, so, se) = run_command(cmd)
        files = to_text(so).split(u'\n')
        files = [x.strip() for x in files if x.strip()]
        self.files = files

    def parse_metadata(self):

        if self.botmetafile is not None:
            with open(self.botmetafile, 'rb') as f:
                rdata = f.read()
        else:
            fp = u'.github/BOTMETA.yml'
            rdata = self.get_file_content(fp)
        self.botmeta = BotMetadataParser.parse_yaml(rdata)

        # load the modules
        logging.info(u'loading modules')
        self.get_ansible_modules()

    def _find_match(self, pattern, exact=False):

        logging.debug(u'exact:{} matching on {}'.format(exact, pattern))

        matches = []

        if isinstance(pattern, six.text_type):
            pattern = to_text(to_bytes(pattern, 'ascii', 'ignore'), 'ascii')

        for k, v in six.iteritems(self.modules):
            if v[u'name'] == pattern:
                logging.debug(u'match {} on name: {}'.format(k, v[u'name']))
                matches = [v]
                break

        if not matches:
            # search by key ... aka the filepath
            for k, v in six.iteritems(self.modules):
                if k == pattern:
                    logging.debug(u'match {} on key: {}'.format(k, k))
                    matches = [v]
                    break

        if not matches and not exact:
            # search by properties
            for k, v in six.iteritems(self.modules):
                for subkey in v.keys():
                    if v[subkey] == pattern:
                        logging.debug(u'match {} on subkey: {}'.format(
                            k, subkey))
                        matches.append(v)

        if not matches and not exact:
            # Levenshtein distance should workaround most typos
            distance_map = {}
            for k, v in six.iteritems(self.modules):
                mname = v.get(u'name')
                if not mname:
                    continue
                if isinstance(mname, six.text_type):
                    mname = to_text(to_bytes(mname, 'ascii', 'ignore'),
                                    'ascii')
                try:
                    res = Levenshtein.distance(pattern, mname)
                except TypeError as e:
                    logging.error(e)
                    if C.DEFAULT_BREAKPOINTS:
                        logging.error(u'breakpoint!')
                        import epdb
                        epdb.st()
                    continue
                distance_map[mname] = [res, k]
            res = sorted(distance_map.items(),
                         key=lambda x: x[1],
                         reverse=True)
            # the closest name sorts last; only accept a near miss
            # (distance < 3) for reasonably long patterns
            if res and len(pattern) > 3 > res[-1][1][0]:
                logging.debug(u'levenshtein distance match: ({}) {} {}'.format(
                    res[-1][1][1], res[-1][0], pattern))
                matches = [self.modules[res[-1][1][1]]]

        return matches
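
    # Matching order, with a hypothetical worked example:
    #   1. exact module name  (pattern == v['name'])
    #   2. exact key/filepath (pattern == the self.modules key)
    #   3. any property value (skipped when exact=True)
    #   4. Levenshtein fallback (skipped when exact=True): 'coppy' is
    #      distance 1 from 'copy'; since len('coppy') > 3 and the best
    #      distance is below 3, [modules['.../copy.py']] is returned.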

    def find_match(self, pattern, exact=False):
        '''Exact module name matching'''

        logging.debug(u'find_match for "{}"'.format(pattern))

        BLACKLIST = [
            u'module_utils', u'callback', u'network modules',
            u'networking modules',
            u'windows modules'
        ]

        if not pattern:
            return None

        if pattern.lower() == u'core':
            return None
        '''
        if 'docs.ansible.com' in pattern and '_module.html' in pattern:
            # http://docs.ansible.com/ansible/latest/copy_module.html
            # http://docs.ansible.com/ansible/latest/dev_guide/developing_modules.html
            # http://docs.ansible.com/ansible/latest/postgresql_db_module.html
            # [helm module](https//docs.ansible.com/ansible/2.4/helm_module.html)
            # Windows module: win_robocopy\nhttp://docs.ansible.com/ansible/latest/win_robocopy_module.html
            # Examples:\n* archive (https://docs.ansible.com/ansible/archive_module.html)\n* s3_sync (https://docs.ansible.com/ansible/s3_sync_module.html)
            urls = re.findall(
                'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                pattern
            )
            #urls = [x for x in urls if '_module.html' in x]
            #if urls:
            #    import epdb; epdb.st()
            import epdb; epdb.st()
        '''

        # https://github.com/ansible/ansible/issues/19755
        if pattern == u'setup':
            pattern = u'system/setup.py'

        if u'/facts.py' in pattern or u' facts.py' in pattern:
            pattern = u'system/setup.py'

        # https://github.com/ansible/ansible/issues/18527
        #   docker-container -> docker_container
        if u'-' in pattern:
            pattern = pattern.replace(u'-', u'_')

        if u'module_utils' in pattern:
            # https://github.com/ansible/ansible/issues/20368
            return None
        elif u'callback' in pattern:
            return None
        elif u'lookup' in pattern:
            return None
        elif u'contrib' in pattern and u'inventory' in pattern:
            return None
        elif pattern.lower() in BLACKLIST:
            return None
        elif u'/' in pattern and not self._find_match(pattern, exact=True):
            # https://github.com/ansible/ansible/issues/20520
            # FIXME what's this for?
            if not pattern.startswith(u'plugins/'):
                keys = self.modules.keys()
                for k in keys:
                    if pattern in k:
                        ppy = pattern + u'.py'
                        if k.endswith(pattern) or k.endswith(ppy):
                            return self.modules[k]
        elif pattern.endswith(u'.py') and self._find_match(pattern,
                                                           exact=False):
            # https://github.com/ansible/ansible/issues/19889
            candidate = self._find_match(pattern, exact=False)

            if isinstance(candidate, list):
                if len(candidate) == 1:
                    candidate = candidate[0]

            if not isinstance(candidate, list) and \
                    candidate[u'filename'] == pattern:
                return candidate

        match = self._find_match(pattern, exact=exact)
        if not match and not exact:
            # check for just the basename
            #   2617: ansible-s-extras/network/cloudflare_dns.py
            bname = os.path.basename(pattern)
            match = self._find_match(bname)

            if not match:
                # check for deprecated name
                #   _fireball -> fireball
                match = self._find_match(u'_' + bname)

        # unique the results
        if isinstance(match, list) and len(match) > 1:
            _match = []
            for m in match:
                if m not in _match:
                    _match.append(m)
            match = _match[:]

        return match

    def is_valid(self, mname):
        match = self.find_match(mname, exact=True)
        if match:
            return True
        else:
            return False

    def get_repository_for_module(self, mname):
        match = self.find_match(mname, exact=True)
        if match:
            return match[u'repository']
        else:
            return None

    def get_ansible_modules(self):
        """Make a list of known modules"""

        matches = []
        module_dir = os.path.join(self.gitrepo.checkoutdir, u'plugins/modules')
        module_dir = os.path.expanduser(module_dir)
        for root, _, filenames in os.walk(module_dir):
            for filename in filenames:
                if u'plugins/modules' in root and filename != u'__init__.py':
                    matches.append(os.path.join(root, filename))

        matches = sorted(set(matches))

        self.populate_modules(matches)

        # custom fixes
        newitems = []
        for k, v in six.iteritems(self.modules):

            # include* is almost always an ansible/ansible issue
            # https://github.com/ansible/ansibullbot/issues/214
            if k.endswith(u'/include.py'):
                self.modules[k][u'repository'] = u'ansible'
            # https://github.com/ansible/ansibullbot/issues/214
            if k.endswith(u'/include_vars.py'):
                self.modules[k][u'repository'] = u'ansible'
            if k.endswith(u'/include_role.py'):
                self.modules[k][u'repository'] = u'ansible'

            # ansible maintains these
            if u'include' in k:
                self.modules[k][u'maintainers'] = [u'ansible']

            # deprecated modules are annoying
            if v[u'name'].startswith(u'_'):

                dkey = os.path.dirname(v[u'filepath'])
                dkey = os.path.join(dkey, v[u'filename'].replace(u'_', u'', 1))
                if dkey not in self.modules:
                    nd = v.copy()
                    nd[u'name'] = nd[u'name'].replace(u'_', u'', 1)
                    newitems.append((dkey, nd))

        for ni in newitems:
            self.modules[ni[0]] = ni[1]

        # parse metadata
        logging.debug(u'set module metadata')
        self.set_module_metadata()

        # parse imports
        logging.debug(u'set module imports')
        self.set_module_imports()

        # last modified
        if self.get_commits:
            logging.debug(u'set module commits')
            self.get_module_commits()

        # parse blame
        if self.get_blames and self.get_commits:
            logging.debug(u'set module blames')
            self.get_module_blames()

        # depends on metadata now ...
        logging.debug(u'set module maintainers')
        self.set_maintainers()

        return self.modules

    def populate_modules(self, matches):
        # figure out the names
        for match in matches:
            mdict = copy.deepcopy(self.EMPTY_MODULE)

            mdict[u'filename'] = os.path.basename(match)

            dirpath = os.path.dirname(match)
            dirpath = dirpath.replace(self.gitrepo.checkoutdir + u'/', u'')
            mdict[u'dirpath'] = dirpath

            filepath = match.replace(self.gitrepo.checkoutdir + u'/', u'')
            mdict[u'filepath'] = filepath

            mdict.update(self.split_topics_from_path(filepath))

            mdict[u'repo_filename'] = mdict[u'filepath']\
                .replace(u'plugins/modules/%s/' % mdict[u'repository'], u'')

            # clustering/consul
            mdict[u'namespaced_module'] = mdict[u'repo_filename']
            mdict[u'namespaced_module'] = \
                mdict[u'namespaced_module'].replace(u'.py', u'')
            mdict[u'namespaced_module'] = \
                mdict[u'namespaced_module'].replace(u'.ps1', u'')

            mname = os.path.basename(match)
            mname = mname.replace(u'.py', u'')
            mname = mname.replace(u'.ps1', u'')
            mdict[u'name'] = mname

            # deprecated modules
            if mname.startswith(u'_'):
                mdict[u'deprecated'] = True
                deprecated_filename = \
                    os.path.dirname(mdict[u'namespaced_module'])
                deprecated_filename = \
                    os.path.join(deprecated_filename, mname[1:] + u'.py')
                mdict[u'deprecated_filename'] = deprecated_filename
            else:
                mdict[u'deprecated_filename'] = mdict[u'repo_filename']

            self.modules[filepath] = mdict

        # meta is a special module
        self.modules[u'meta'] = copy.deepcopy(self.EMPTY_MODULE)
        self.modules[u'meta'][u'name'] = u'meta'
        self.modules[u'meta'][u'repo_filename'] = u'meta'

    def get_module_commits(self):
        keys = self.modules.keys()
        keys = sorted(keys)
        for k in keys:
            self.commits[k] = []
            cpath = os.path.join(self.gitrepo.checkoutdir, k)
            if not os.path.isfile(cpath):
                continue

            mtime = os.path.getmtime(cpath)
            refresh = False
            pfile = os.path.join(self.scraper_cache,
                                 k.replace(u'/', u'_') + u'.commits.pickle')

            if not os.path.isfile(pfile):
                refresh = True
            else:
                pickle_kwargs = {'encoding': 'bytes'} if six.PY3 else {}
                logging.debug(pfile)
                with open(pfile, 'rb') as f:
                    pdata = pickle_load(f, **pickle_kwargs)
                if pdata[0] == mtime:
                    self.commits[k] = pdata[1]
                else:
                    refresh = True

            if refresh:
                logging.info(u'refresh commit cache for %s' % k)
                cmd = u'cd %s; git log --follow %s' % (
                    self.gitrepo.checkoutdir, k)
                (rc, so, se) = run_command(cmd)
                for line in to_text(so).split(u'\n'):
                    if line.startswith(u'commit '):
                        commit = {
                            u'name': None,
                            u'email': None,
                            u'login': None,
                            u'hash': line.split()[-1],
                            u'date': None
                        }

                    # Author: Matt Clay <*****@*****.**>
                    if line.startswith(u'Author: '):
                        line = line.replace(u'Author: ', u'')
                        line = line.replace(u'<', u'')
                        line = line.replace(u'>', u'')
                        lparts = line.split()

                        if u'@' in lparts[-1]:
                            commit[u'email'] = lparts[-1]
                            commit[u'name'] = u' '.join(lparts[:-1])

                        if commit[u'email'] and \
                                u'noreply.github.com' in commit[u'email']:
                            commit[u'login'] = commit[u'email'].split(u'@')[0]

                    # Date:   Sat Jan 28 23:28:53 2017 -0800
                    if line.startswith(u'Date:'):
                        dstr = line.split(u':', 1)[1].strip()
                        dstr = u' '.join(dstr.split(u' ')[:-1])
                        ds = datetime.datetime.strptime(
                            to_text(dstr), u'%a %b %d %H:%M:%S %Y')
                        commit[u'date'] = ds
                        self.commits[k].append(commit)

                with open(pfile, 'wb') as f:
                    pickle_dump((mtime, self.commits[k]), f)
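
    # Worked example of the parser above on one (made up) log entry:
    #
    #   commit 1a2b3c4d...
    #   Author: Jane Doe <12345+janedoe@users.noreply.github.com>
    #   Date:   Sat Jan 28 23:28:53 2017 -0800
    #
    # yields:
    #   {u'name': u'Jane Doe',
    #    u'email': u'12345+janedoe@users.noreply.github.com',
    #    u'login': u'12345+janedoe',   # derived from the noreply address
    #    u'hash': u'1a2b3c4d...',
    #    u'date': datetime.datetime(2017, 1, 28, 23, 28, 53)}  # tz dropped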

    def last_commit_for_file(self, filepath):
        if self.commits.get(filepath) and u'hash' in self.commits[filepath][0]:
            return self.commits[filepath][0][u'hash']

        # git log --pretty=format:'%H' -1
        # lib/ansible/modules/cloud/amazon/ec2_metric_alarm.py
        cmd = u'cd %s; git log --pretty=format:\'%%H\' -1 %s' % \
            (self.gitrepo.checkoutdir, filepath)
        (rc, so, se) = run_command(cmd)
        return to_text(so).strip()

    def get_module_blames(self):

        logging.debug(u'build email cache')
        emails_cache = self.session.query(Email)
        emails_cache = [(x.email, x.login) for x in emails_cache]
        self.emails_cache = dict(emails_cache)

        logging.debug(u'build blame cache')
        blame_cache = self.session.query(Blame).all()
        blame_cache = [x.file_commit for x in blame_cache]
        blame_cache = sorted(set(blame_cache))

        logging.debug(u'eval module hashes')
        changed = False
        keys = sorted(self.modules.keys())
        for k in keys:
            if k not in self.files:
                self.committers[k] = {}
                continue

            ghash = self.last_commit_for_file(k)

            if ghash in blame_cache:
                continue

            logging.debug(u'checking hash for {}'.format(k))
            res = self.session.query(Blame).filter_by(file_name=k,
                                                      file_commit=ghash).all()
            hashes = [x.file_commit for x in res]

            if ghash not in hashes:

                logging.debug(
                    u'hash {} not found for {}, updating blames'.format(
                        ghash, k))

                scraper_args = [u'ansible', u'ansible', u'devel', k]
                uns, emailmap = self.gqlc.get_usernames_from_filename_blame(
                    *scraper_args)

                # check the emails
                for email, login in emailmap.items():
                    if email in self.emails_cache:
                        continue
                    exists = self.session.query(Email).filter_by(
                        email=email).first()
                    if not exists:
                        logging.debug(u'insert {}:{}'.format(login, email))
                        _email = Email(email=email, login=login)
                        self.session.add(_email)
                        changed = True

                # check the blames
                for login, commits in uns.items():
                    for commit in commits:
                        kwargs = {
                            u'file_name': k,
                            u'file_commit': ghash,
                            u'author_commit': commit,
                            u'author_login': login
                        }
                        exists = self.session.query(Blame).filter_by(
                            **kwargs).first()
                        if not exists:
                            logging.debug(u'insert {}:{}:{}'.format(
                                k, commit, login))
                            _blame = Blame(**kwargs)
                            self.session.add(_blame)
                            changed = True

        if changed:
            self.session.commit()
            logging.debug(u're-build email cache')
            emails_cache = self.session.query(Email)
            emails_cache = [(x.email, x.login) for x in emails_cache]
            self.emails_cache = dict(emails_cache)

        # fill in what we can ...
        logging.debug(u'fill in commit logins')
        for k in keys:
            for idc, commit in enumerate(self.commits[k][:]):
                if commit.get(u'login'):
                    continue
                login = self.emails_cache.get(commit[u'email'])
                if not login and u'@users.noreply.github.com' in commit[
                        u'email']:
                    login = commit[u'email'].split(u'@')[0]
                    self.emails_cache[commit[u'email']] = login
                if not login:
                    logging.debug(u'unknown: {}'.format(commit[u'email']))
                self.commits[k][idc][u'login'] = login

    def get_emails_by_login(self, login):
        res = self.session.query(Email).filter_by(login=login)
        emails = [x.email for x in res]
        return emails

    def _get_module_blames(self):
        ''' Scrape the blame page for each module and store it '''

        keys = sorted(self.modules.keys())

        # scrape the data
        for k in keys:

            cpath = os.path.join(self.gitrepo.checkoutdir, k)
            if not os.path.isfile(cpath):
                self.committers[k] = {}
                continue

            ghash = self.last_commit_for_file(k)
            pfile = os.path.join(self.scraper_cache,
                                 k.replace(u'/', u'_') + u'.blame.pickle')
            sargs = [u'ansible', u'ansible', u'devel', k]

            refresh = False
            if not os.path.isfile(pfile):
                refresh = True
            else:
                logging.debug(u'load {}'.format(pfile))
                with open(pfile, 'rb') as f:
                    pdata = pickle_load(f)
                if C.DEFAULT_BREAKPOINTS:
                    logging.error(u'breakpoint!')
                    import epdb
                    epdb.st()
                if pdata[0] == ghash:
                    self.committers[k] = pdata[1]
                    if len(pdata) == 3:
                        # use emailmap if available
                        emailmap = pdata[2]
                    else:
                        emailmap = {}
                else:
                    refresh = True

            if refresh:
                if self.gqlc:
                    logging.debug(u'graphql blame usernames {}'.format(pfile))
                    uns, emailmap = self.gqlc.get_usernames_from_filename_blame(
                        *sargs)
                else:
                    emailmap = {}  # scraping: emails not available
                    logging.debug(u'www blame usernames {}'.format(pfile))
                    uns = self.gws.get_usernames_from_filename_blame(*sargs)
                self.committers[k] = uns
                with open(pfile, 'wb') as f:
                    pickle_dump((ghash, uns, emailmap), f)

            for email, github_id in emailmap.items():
                if email not in self.emails_cache:
                    self.emails_cache[email] = github_id

        # add scraped logins to the map
        for k in keys:
            for idx, x in enumerate(self.commits[k]):
                if x[u'email'] in [u'@']:
                    continue
                if x[u'email'] not in self.emails_cache:
                    self.emails_cache[x[u'email']] = None
                if x[u'login']:
                    self.emails_cache[x[u'email']] = x[u'login']
                    continue

                xhash = x[u'hash']
                for ck, cv in six.iteritems(self.committers[k]):
                    if xhash in cv:
                        self.emails_cache[x[u'email']] = ck
                        break

        # fill in what we can ...
        for k in keys:
            for idx, x in enumerate(self.commits[k]):
                if not x[u'login']:
                    if x[u'email'] in [u'@']:
                        continue
                    if self.emails_cache[x[u'email']]:
                        login = self.emails_cache[x[u'email']]
                        xhash = x[u'hash']
                        self.commits[k][idx][u'login'] = login
                        if login not in self.committers[k]:
                            self.committers[k][login] = []
                        if xhash not in self.committers[k][login]:
                            self.committers[k][login].append(xhash)

    def set_maintainers(self):
        '''Define the maintainers for each module'''

        # grep the authors:
        for k, v in six.iteritems(self.modules):
            if v[u'filepath'] is None:
                continue
            mfile = os.path.join(self.gitrepo.checkoutdir, v[u'filepath'])
            authors = self.get_module_authors(mfile)
            self.modules[k][u'authors'] = authors

            # authors are maintainers by -default-
            self.modules[k][u'maintainers'] += authors
            self.modules[k][u'maintainers'] = \
                sorted(set(self.modules[k][u'maintainers']))

        metadata = self.botmeta[u'files'].keys()
        for k, v in six.iteritems(self.modules):
            if k == u'meta':
                continue

            if k in self.botmeta[u'files']:
                # There are metadata in .github/BOTMETA.yml for this file
                # copy maintainers_keys
                self.modules[k][u'maintainers_keys'] = self.botmeta[u'files'][
                    k][u'maintainers_keys'][:]

                if self.botmeta[u'files'][k]:
                    maintainers = self.botmeta[u'files'][k].get(
                        u'maintainers', [])

                    for maintainer in maintainers:
                        if maintainer not in self.modules[k][u'maintainers']:
                            self.modules[k][u'maintainers'].append(maintainer)

                    # remove the people who want to be ignored
                    if u'ignored' in self.botmeta[u'files'][k]:
                        ignored = self.botmeta[u'files'][k][u'ignored']
                        for x in ignored:
                            if x in self.modules[k][u'maintainers']:
                                self.modules[k][u'maintainers'].remove(x)

            else:
                # There isn't metadata in .github/BOTMETA.yml for this file
                best_match = None
                for mkey in metadata:
                    if v[u'filepath'].startswith(mkey):
                        if not best_match:
                            best_match = mkey
                            continue
                        if len(mkey) > len(best_match):
                            best_match = mkey
                if best_match:
                    self.modules[k][u'maintainers_keys'] = [best_match]
                    for maintainer in self.botmeta[u'files'][best_match].get(
                            u'maintainers', []):
                        if maintainer not in self.modules[k][u'maintainers']:
                            self.modules[k][u'maintainers'].append(maintainer)

                    # remove the people who want to be ignored
                    for ignored in self.botmeta[u'files'][best_match].get(
                            u'ignored', []):
                        if ignored in self.modules[k][u'maintainers']:
                            self.modules[k][u'maintainers'].remove(ignored)

            # save a pristine copy so that higher level code can still use it
            self.modules[k][u'maintainers'] = sorted(
                set(self.modules[k][u'maintainers']))
            self.modules[k][u'_maintainers'] = \
                [x for x in self.modules[k][u'maintainers']]

        # set the namespace maintainers ...
        for k, v in six.iteritems(self.modules):
            if u'namespace_maintainers' not in self.modules[k]:
                self.modules[k][u'namespace_maintainers'] = []
            if v.get(u'namespace'):
                ns = v.get(u'namespace')
                nms = self.get_maintainers_for_namespace(ns)
                self.modules[k][u'namespace_maintainers'] = nms

    def split_topics_from_path(self, module_file):
        subpath = module_file.replace(u'plugins/modules/', u'')
        path_parts = subpath.split(u'/')
        topic = path_parts[0]

        if len(path_parts) > 2:
            subtopic = path_parts[1]
            fulltopic = u'/'.join(path_parts[0:2])
        else:
            subtopic = None
            fulltopic = path_parts[0]

        tdata = {
            u'fulltopic': fulltopic,
            u'namespace': fulltopic,
            u'topic': topic,
            u'subtopic': subtopic
        }

        return tdata
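
    # Worked example (hypothetical path):
    #   split_topics_from_path(u'plugins/modules/cloud/amazon/ec2.py')
    #   -> {u'fulltopic': u'cloud/amazon',
    #       u'namespace': u'cloud/amazon',
    #       u'topic': u'cloud',
    #       u'subtopic': u'amazon'}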

    def get_module_authors(self, module_file):
        """Grep the authors out of the module docstrings"""

        if not os.path.exists(module_file):
            return []

        documentation = b''
        inphase = False

        with io.open(module_file, 'rb') as f:
            for line in f:
                if b'DOCUMENTATION' in line:
                    inphase = True
                    continue
                if line.strip().endswith((b"'''", b'"""')):
                    break
                if inphase:
                    documentation += line

        if not documentation:
            return []

        # clean out any other yaml besides author to save time
        inphase = False
        author_lines = u''
        doc_lines = to_text(documentation).split(u'\n')
        for idx, x in enumerate(doc_lines):
            if x.startswith(u'author'):
                inphase = True
            if inphase and not x.strip().startswith((u'-', u'author')):
                inphase = False
                break
            if inphase:
                author_lines += x + u'\n'

        if not author_lines:
            return []

        ydata = {}
        try:
            ydata = yaml.load(author_lines, BotYAMLLoader)
        except Exception as e:
            logging.error(e)
            return []

        # quit early if the yaml was not valid
        if not ydata:
            return []

        # quit if the key was not found
        if u'author' not in ydata:
            return []

        if not isinstance(ydata[u'author'], list):
            ydata[u'author'] = [ydata[u'author']]

        authors = []
        for author in ydata[u'author']:
            github_ids = self.extract_github_id(author)
            if github_ids:
                authors.extend(github_ids)
        return authors

    def extract_github_id(self, author):
        authors = set()

        if author is None:
            return []
        if u'ansible core team' in author.lower():
            authors.add(u'ansible')
        elif u'@' in author:
            # match github ids but not emails
            authors.update(re.findall(r'(?<!\w)@([\w-]+)(?![\w.])', author))
        elif u'github.com/' in author:
            # {'author': 'Henrique Rodrigues (github.com/Sodki)'}
            idx = author.find(u'github.com/')
            author = author[idx + 11:]
            authors.add(author.replace(u')', u''))
        elif u'(' in author and len(author.split()) == 3:
            # Mathieu Bultel (matbu)
            idx = author.find(u'(')
            author = author[idx + 1:]
            authors.add(author.replace(u')', u''))

        # search for emails
        for email in re.findall(r'[<(]([^@]+@[^)>]+)[)>]', author):
            github_id = self.emails_cache.get(email)
            if github_id:
                authors.add(github_id)

        return list(authors)
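
    # Worked examples (author strings are illustrative):
    #   extract_github_id(u'John Doe (@johndoe)')                   -> ['johndoe']
    #   extract_github_id(u'Henrique Rodrigues (github.com/Sodki)') -> ['Sodki']
    #   extract_github_id(u'Mathieu Bultel (matbu)')                -> ['matbu']
    #   extract_github_id(u'Ansible Core Team')                     -> ['ansible']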

    def fuzzy_match(self, repo=None, title=None, component=None):
        '''Fuzzy matching for modules'''

        if not component:
            return None

        logging.debug(u'fuzzy match {}'.format(
            to_text(to_bytes(component, 'ascii', 'ignore'), 'ascii')))

        if component.lower() == u'core':
            return None

        # https://github.com/ansible/ansible/issues/18179
        if u'validate-modules' in component:
            return None

        # https://github.com/ansible/ansible/issues/20368
        if u'module_utils' in component:
            return None

        if u'new module' in component:
            return None

        # authorized_keys vs. authorized_key
        if component and component.endswith(u's'):
            tm = self.find_match(component[:-1])
            if tm:
                if not isinstance(tm, list):
                    return tm[u'name']
                elif len(tm) == 1:
                    return tm[0][u'name']
                else:
                    if C.DEFAULT_BREAKPOINTS:
                        logging.error(u'breakpoint!')
                        import epdb
                        epdb.st()

        match = None
        known_modules = []

        for k, v in six.iteritems(self.modules):
            if v[u'name'] in [u'include']:
                continue
            known_modules.append(v[u'name'])

        title = title.lower()
        title = title.replace(u':', u'')
        title_matches = [x for x in known_modules if x + u' module' in title]

        if not title_matches:
            title_matches = [
                x for x in known_modules if title.startswith(x + u' ')
            ]
            if not title_matches:
                title_matches = \
                    [x for x in known_modules if u' ' + x + u' ' in title]

            if title_matches:
                title_matches = [x for x in title_matches if x != u'at']

        # don't do singular word matching in title for ansible/ansible
        cmatches = None
        if component:
            cmatches = [x for x in known_modules if x in component]
            cmatches = [x for x in cmatches if not u'_' + x in component]

        # globs
        if not cmatches and u'*' in component:
            fmatches = [
                x for x in known_modules if fnmatch.fnmatch(x, component)
            ]
            if fmatches:
                cmatches = fmatches[:]

        if title_matches:
            # use title ... ?
            cmatches = [
                x for x in cmatches if x in title_matches and x not in [u'at']
            ]

        if cmatches:
            if len(cmatches) >= 1 and (u'*' not in component
                                       and u'modules' not in component):
                match = cmatches[0]
            else:
                match = cmatches[:]
            logging.debug("module - component matches: %s" % cmatches)

        if not match:
            if len(title_matches) == 1:
                match = title_matches[0]
            else:
                logging.debug("module - title matches: %s" % title_matches)

        return match

    def is_multi(self, rawtext):
        '''Is the string a list or a glob of modules?'''
        if rawtext:
            lines = rawtext.split(u'\n')

            # clean up lines
            lines = [x.strip() for x in lines if x.strip()]
            lines = [x for x in lines if len(x) > 2]

            if len(lines) > 1:
                return True

            if lines:
                if lines[0].strip().endswith(u'*'):
                    return True

        return False

    # https://github.com/ansible/ansible-modules-core/issues/3831
    def multi_match(self, rawtext):
        '''Return a list of matches for a given glob or list of names'''
        matches = []
        lines = rawtext.split(u'\n')
        lines = [x.strip() for x in lines if x.strip()]
        for line in lines:
            # is it an exact name, a path, a globbed name, a globbed path?
            if line.endswith(u'*'):
                thiskey = line.replace(u'*', u'')
                keymatches = []
                for k in self.modules.keys():
                    if thiskey in k:
                        keymatches.append(k)
                for k in keymatches:
                    matches.append(self.modules[k].copy())
            else:
                match = self.find_match(line)
                if match:
                    matches.append(match)

        # unique the list while preserving order
        tmplist = []
        for x in matches:
            if x not in tmplist:
                tmplist.append(x)
        matches = tmplist

        return matches
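
    # Worked example (module names are illustrative):
    #   multi_match(u'ec2_*')
    # strips the glob to u'ec2_' and returns a copy of every entry whose
    # filepath key contains that substring, whereas a plain line such as
    # u'copy' goes through find_match() for a single lookup.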

    def set_module_metadata(self):
        for k, v in six.iteritems(self.modules):
            if not v[u'filepath']:
                continue
            mfile = os.path.join(self.gitrepo.checkoutdir, v[u'filepath'])
            if not mfile.endswith(u'.py'):
                # metadata is only the .py files ...
                ext = mfile.split(u'.')[-1]
                mfile = mfile.replace(u'.' + ext, u'.py', 1)

            self.modules[k][u'metadata'].update(
                self.get_module_metadata(mfile))

    def get_module_metadata(self, module_file):
        meta = {}

        if not os.path.isfile(module_file):
            return meta

        rawmeta = u''
        inphase = False
        with io.open(module_file, 'r', encoding='utf-8') as f:
            for line in f:
                if line.startswith(u'ANSIBLE_METADATA'):
                    inphase = True
                if line.startswith(u'DOCUMENTATION'):
                    break
                if inphase:
                    rawmeta += line
        rawmeta = rawmeta.replace(u'ANSIBLE_METADATA =', u'', 1)
        rawmeta = rawmeta.strip()
        try:
            meta = ast.literal_eval(rawmeta)
            tmp_meta = {}
            for k, v in meta.items():
                if isinstance(k, six.binary_type):
                    k = to_text(k)
                if isinstance(v, six.binary_type):
                    v = to_text(v)
                if isinstance(v, list):
                    tmp_list = []
                    for i in v:
                        if isinstance(i, six.binary_type):
                            i = to_text(i)
                        tmp_list.append(i)
                    v = tmp_list
                    del tmp_list
                tmp_meta[k] = v
            meta = tmp_meta
            del tmp_meta
        except SyntaxError:
            pass

        return meta
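
    # Worked example: a module containing the typical metadata shape
    #
    #   ANSIBLE_METADATA = {'metadata_version': '1.1',
    #                       'status': ['preview'],
    #                       'supported_by': 'community'}
    #
    # is captured line-by-line until DOCUMENTATION, the assignment prefix is
    # stripped, and ast.literal_eval() returns the dict with all keys and
    # values coerced to text.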

    def set_module_imports(self):
        for k, v in six.iteritems(self.modules):
            if not v[u'filepath']:
                continue
            mfile = os.path.join(self.gitrepo.checkoutdir, v[u'filepath'])
            self.modules[k][u'imports'] = self.get_module_imports(mfile)

    def get_module_imports(self, module_file):
        mimports = []

        if not os.path.isfile(module_file):
            return mimports

        with open(module_file, 'rb') as f:
            for line in f:
                line = line.strip()
                line = line.replace(b',', b'')
                if line.startswith(b'import') or \
                        (b'import' in line and b'from' in line):
                    lparts = line.split()
                    if line.startswith(b'import '):
                        mimports.append(lparts[1])
                    elif line.startswith(b'from '):
                        mpath = lparts[1] + b'.'
                        for spath in lparts[3:]:
                            mimports.append(mpath + spath)

        return [to_text(m) for m in mimports]
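
    # Worked example of the line parser above:
    #   b'from ansible.module_utils.basic import AnsibleModule'
    #     -> u'ansible.module_utils.basic.AnsibleModule'
    #   b'import os, sys'
    #     -> u'os' only; commas are stripped and just lparts[1] is kept,
    #        so trailing names on a plain import line are dropped.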

    @property
    def all_maintainers(self):
        maintainers = set()
        for path, metadata in self.botmeta[u'files'].items():
            maintainers.update(metadata.get(u'maintainers', []))
        return maintainers

    @property
    def all_authors(self):
        authors = set()
        for key, metadata in self.modules.items():
            authors.update(metadata.get(u'authors', []))
        return authors

    def get_maintainers_for_namespace(self, namespace):
        maintainers = []
        for k, v in self.modules.items():
            if u'namespace' not in v or u'maintainers' not in v:
                continue
            if v[u'namespace'] == namespace:
                for m in v[u'maintainers']:
                    if m not in maintainers:
                        maintainers.append(m)
        maintainers = [x for x in maintainers if x.strip()]
        return maintainers

    @staticmethod
    def replace_ansible(maintainers, ansible_members, bots=None):
        '''Replace -ansible- with the -humans- in the org'''
        bots = bots or []
        newlist = []
        for m in maintainers:
            if m != u'ansible':
                newlist.append(m)
            else:
                newlist += ansible_members
        newlist = sorted(set(newlist))
        newlist = [x for x in newlist if x not in bots]
        return newlist
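
    # Worked example (names are illustrative):
    #   replace_ansible([u'ansible', u'jdoe'],
    #                   [u'alice', u'ansibot', u'bob'],
    #                   bots=[u'ansibot'])
    #   -> [u'alice', u'bob', u'jdoe']   # org expanded, bot filtered out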

    def get_file_content(self, filepath):
        fpath = os.path.join(self.gitrepo.checkoutdir, filepath)
        if not os.path.isfile(fpath):
            return None
        with io.open(fpath, 'r', encoding='utf-8') as f:
            data = f.read()
        return data
Example #8
class AnsibleComponentMatcher(object):

    BOTMETA = {}
    INDEX = {}
    REPO = 'https://github.com/ansible/ansible'
    STOPWORDS = ['ansible', 'core', 'plugin']
    STOPCHARS = ['"', "'", '(', ')', '?', '*', '`', ',', ':', '-']
    BLACKLIST = ['new module', 'new modules']
    FILE_NAMES = []
    MODULES = {}
    MODULE_NAMES = []
    MODULE_NAMESPACE_DIRECTORIES = []

    # FIXME: THESE NEED TO GO INTO BOTMETA
    # ALSO SEE search_by_regex_generic ...
    KEYWORDS = {
        'all': None,
        'ansiballz': 'lib/ansible/executor/module_common.py',
        'ansible-console': 'lib/ansible/cli/console.py',
        'ansible-galaxy': 'lib/ansible/galaxy',
        'ansible-inventory': 'lib/ansible/cli/inventory.py',
        'ansible-playbook': 'lib/ansible/playbook',
        'ansible playbook': 'lib/ansible/playbook',
        'ansible playbooks': 'lib/ansible/playbook',
        'ansible-pull': 'lib/ansible/cli/pull.py',
        'ansible-vault': 'lib/ansible/parsing/vault',
        'ansible-vault edit': 'lib/ansible/parsing/vault',
        'ansible-vault show': 'lib/ansible/parsing/vault',
        'ansible-vault decrypt': 'lib/ansible/parsing/vault',
        'ansible-vault encrypt': 'lib/ansible/parsing/vault',
        'async': 'lib/ansible/modules/utilities/logic/async_wrapper.py',
        'become': 'lib/ansible/playbook/become.py',
        'block': 'lib/ansible/playbook/block.py',
        'blocks': 'lib/ansible/playbook/block.py',
        'callback plugin': 'lib/ansible/plugins/callback',
        'callback plugins': 'lib/ansible/plugins/callback',
        'conditional': 'lib/ansible/playbook/conditional.py',
        'docs': 'docs',
        'delegate_to': 'lib/ansible/playbook/task.py',
        'facts': 'lib/ansible/module_utils/facts',
        'galaxy': 'lib/ansible/galaxy',
        'groupvars': 'lib/ansible/vars/hostvars.py',
        'group vars': 'lib/ansible/vars/hostvars.py',
        'handlers': 'lib/ansible/playbook/handler.py',
        'hostvars': 'lib/ansible/vars/hostvars.py',
        'host vars': 'lib/ansible/vars/hostvars.py',
        'integration tests': 'test/integration',
        'inventory script': 'contrib/inventory',
        'jinja2 template system': 'lib/ansible/template',
        'module_utils': 'lib/ansible/module_utils',
        'multiple modules': None,
        'new module(s) request': None,
        'new modules request': None,
        'new module request': None,
        'new module': None,
        'network_cli': 'lib/ansible/plugins/connection/network_cli.py',
        'network_cli.py': 'lib/ansible/plugins/connection/network_cli.py',
        'network modules': 'lib/ansible/modules/network',
        'paramiko': 'lib/ansible/plugins/connection/paramiko_ssh.py',
        'role': 'lib/ansible/playbook/role',
        'roles': 'lib/ansible/playbook/role',
        'ssh': 'lib/ansible/plugins/connection/ssh.py',
        'ssh authentication': 'lib/ansible/plugins/connection/ssh.py',
        'setup / facts': 'lib/ansible/modules/system/setup.py',
        'setup': 'lib/ansible/modules/system/setup.py',
        'task executor': 'lib/ansible/executor/task_executor.py',
        'testing': 'test/',
        'validate-modules': 'test/sanity/validate-modules',
        'vault': 'lib/ansible/parsing/vault',
        'vault edit': 'lib/ansible/parsing/vault',
        'vault documentation': 'lib/ansible/parsing/vault',
        'with_items': 'lib/ansible/playbook/loop_control.py',
        'windows modules': 'lib/ansible/modules/windows',
        'winrm': 'lib/ansible/plugins/connection/winrm.py'
    }

    def __init__(self, gitrepo=None, botmetafile=None, cachedir=None, email_cache=None, file_indexer=None):
        self.cachedir = cachedir
        self.botmetafile = botmetafile
        self.email_cache = email_cache

        if file_indexer:
            self.file_indexer = file_indexer
        else:
            self.file_indexer = FileIndexer(
                botmetafile=self.botmetafile,
                checkoutdir=self.cachedir
            )

        if gitrepo:
            self.gitrepo = gitrepo
        else:
            self.gitrepo = GitRepoWrapper(cachedir=self.cachedir, repo=self.REPO)

        self.strategy = None
        self.strategies = []

        self.indexed_at = False
        self.updated_at = None
        self.update()

    def update(self, email_cache=None):
        if email_cache:
            self.email_cache = email_cache
        self.gitrepo.update()
        self.index_files()
        self.indexed_at = datetime.datetime.now()
        self.cache_keywords()
        self.updated_at = datetime.datetime.now()

    def index_files(self):

        self.BOTMETA = {}
        self.MODULES = {}
        self.MODULE_NAMES = []
        self.MODULE_NAMESPACE_DIRECTORIES = []

        self.load_meta()

        for fn in self.gitrepo.module_files:
            if os.path.isdir(fn):
                continue
            mname = os.path.basename(fn)
            mname = mname.replace('.py', '').replace('.ps1', '')
            if mname.startswith('__'):
                continue
            mdata = {
                'name': mname,
                'repo_filename': fn,
                'filename': fn
            }
            if fn not in self.MODULES:
                self.MODULES[fn] = mdata.copy()
            else:
                self.MODULES[fn].update(mdata)

        self.MODULE_NAMESPACE_DIRECTORIES = [os.path.dirname(x) for x in self.gitrepo.module_files]
        self.MODULE_NAMESPACE_DIRECTORIES = sorted(set(self.MODULE_NAMESPACE_DIRECTORIES))

        # make a list of names by enumerating the files
        self.MODULE_NAMES = [os.path.basename(x) for x in self.gitrepo.module_files]
        self.MODULE_NAMES = [x for x in self.MODULE_NAMES if x.endswith('.py') or x.endswith('.ps1')]
        self.MODULE_NAMES = [x.replace('.ps1', '').replace('.py', '') for x in self.MODULE_NAMES]
        self.MODULE_NAMES = [x for x in self.MODULE_NAMES if not x.startswith('__')]
        self.MODULE_NAMES = sorted(set(self.MODULE_NAMES))

        # make a list of names by calling ansible-doc
        checkoutdir = self.gitrepo.checkoutdir
        checkoutdir = os.path.abspath(checkoutdir)
        cmd = '. {}/hacking/env-setup; ansible-doc -t module -F'.format(checkoutdir)
        logging.debug(cmd)
        (rc, so, se) = run_command(cmd, cwd=checkoutdir)
        if rc:
            raise Exception("'ansible-doc' command failed (%s, %s, %s)" % (rc, so, se))
        lines = so.split('\n')
        for line in lines:

            parts = line.split()
            parts = [x.strip() for x in parts]

            if len(parts) != 2 or checkoutdir not in line:
                continue

            mname = parts[0]
            if mname not in self.MODULE_NAMES:
                self.MODULE_NAMES.append(mname)

            fpath = parts[1]
            fpath = fpath.replace(checkoutdir + '/', '')

            if fpath not in self.MODULES:
                self.MODULES[fpath] = {
                    'name': mname,
                    'repo_filename': fpath,
                    'filename': fpath
                }

        _modules = self.MODULES.copy()
        for k, v in _modules.items():
            kparts = os.path.splitext(k)
            if kparts[-1] == '.ps1':
                _k = kparts[0] + '.py'
                checkpath = os.path.join(checkoutdir, _k)
                if not os.path.isfile(checkpath):
                    _k = k
            else:
                _k = k
            ME = ModuleExtractor(os.path.join(checkoutdir, _k), email_cache=self.email_cache)
            if k not in self.BOTMETA['files']:
                self.BOTMETA['files'][k] = {
                    'deprecated': os.path.basename(k).startswith('_'),
                    'labels': os.path.dirname(k).split('/'),
                    'authors': ME.authors,
                    'maintainers': ME.authors,
                    'maintainers_keys': [],
                    'notified': ME.authors,
                    'ignored': [],
                    'support': ME.metadata.get('supported_by', 'community'),
                    'metadata': ME.metadata.copy()
                }
            else:
                bmeta = self.BOTMETA['files'][k].copy()
                bmeta['metadata'] = ME.metadata.copy()
                if 'notified' not in bmeta:
                    bmeta['notified'] = []
                if 'maintainers' not in bmeta:
                    bmeta['maintainers'] = []
                if not bmeta.get('supported_by'):
                    bmeta['supported_by'] = ME.metadata.get('supported_by', 'community')
                if 'authors' not in bmeta:
                    bmeta['authors'] = []
                for x in ME.authors:
                    if x not in bmeta['authors']:
                        bmeta['authors'].append(x)
                    if x not in bmeta['maintainers']:
                        bmeta['maintainers'].append(x)
                    if x not in bmeta['notified']:
                        bmeta['notified'].append(x)
                if not bmeta.get('labels'):
                    bmeta['labels'] = os.path.dirname(k).split('/')
                bmeta['deprecated'] = os.path.basename(k).startswith('_')
                self.BOTMETA['files'][k].update(bmeta)

            # clean out the ignorees
            if 'ignored' in self.BOTMETA['files'][k]:
                for ignoree in self.BOTMETA['files'][k]['ignored']:
                    for thiskey in ['maintainers', 'notified']:
                        while ignoree in self.BOTMETA['files'][k][thiskey]:
                            self.BOTMETA['files'][k][thiskey].remove(ignoree)

            # write back to the modules
            self.MODULES[k].update(self.BOTMETA['files'][k])
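    # For a module file with no explicit BOTMETA entry, index_files()
    # synthesizes one; the shape (illustrative values only) is:
    #
    #   self.BOTMETA['files']['lib/ansible/modules/files/copy.py'] = {
    #       'deprecated': False,     # basename does not start with '_'
    #       'labels': ['lib', 'ansible', 'modules', 'files'],
    #       'authors': [...],        # scraped by ModuleExtractor
    #       'maintainers': [...],    # defaults to the authors
    #       'maintainers_keys': [],
    #       'notified': [...],
    #       'ignored': [],
    #       'support': 'community',  # metadata supported_by, else community
    #       'metadata': {...},
    #   }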

    def load_meta(self):
        if self.botmetafile is not None:
            with open(self.botmetafile, 'rb') as f:
                rdata = f.read()
        else:
            fp = '.github/BOTMETA.yml'
            rdata = self.gitrepo.get_file_content(fp)
        self.BOTMETA = BotMetadataParser.parse_yaml(rdata)
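    # BOTMETA.yml shape (illustrative, heavily abridged); the parser renders
    # macros and fills defaults before the data is used:
    #
    #   files:
    #     lib/ansible/plugins/connection/winrm.py:
    #       maintainers: someuser
    #       keywords: winrm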

    def cache_keywords(self):
        for k, v in self.BOTMETA['files'].items():
            if not v.get('keywords'):
                continue
            for kw in v['keywords']:
                if kw not in self.KEYWORDS:
                    self.KEYWORDS[kw] = k
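    # Note that this is first-match-wins: if two BOTMETA entries declare the
    # same keyword, self.KEYWORDS keeps whichever file was iterated first.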

    def clean_body(self, body, internal=False):
        body = body.lower()
        body = body.strip()
        for SC in self.STOPCHARS:
            if body.startswith(SC):
                body = body.lstrip(SC)
                body = body.strip()
            if body.endswith(SC):
                body = body.rstrip(SC)
                body = body.strip()
            if internal and SC in body:
                body = body.replace(SC, '')
                body = body.strip()
        body = body.strip()
        return body

    def match(self, issuewrapper):
        iw = issuewrapper
        matchdata = self.match_components(
            iw.title,
            iw.body,
            iw.template_data.get('component_raw'),
            files=iw.files
        )
        return matchdata

    def match_components(self, title, body, component, files=None):
        """Make a list of matching files with metadata"""

        self.strategy = None
        self.strategies = []

        # No matching necessary for PRs, but should provide consistent api
        if files:
            matched_filenames = files[:]
        else:
            matched_filenames = []
            if component is None:
                return matched_filenames

            component = component.encode('ascii', 'ignore')
            logging.debug('match "{}"'.format(component))

            delimiters = ['\n', ',', ' + ', ' & ']
            delimited = False
            for delimiter in delimiters:
                if delimiter in component:
                    delimited = True
                    components = component.split(delimiter)
                    for _component in components:
                        _matches = self._match_component(title, body, _component)
                        self.strategies.append(self.strategy)

                        # bypass for blacklist
                        if None in _matches:
                            _matches = []

                        matched_filenames += _matches

                    # do not process any more delimiters
                    break

            if not delimited:
                matched_filenames += self._match_component(title, body, component)
                self.strategies.append(self.strategy)

                # bypass for blacklist
                if None in matched_filenames:
                    return []

            # reduce subpaths
            if matched_filenames:
                matched_filenames = self.reduce_filepaths(matched_filenames)

        # create metadata for each matched file
        component_matches = []
        matched_filenames = sorted(set(matched_filenames))
        for fn in matched_filenames:
            component_matches.append(self.get_meta_for_file(fn))

        return component_matches
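    # self.strategies records which search strategy produced each matched
    # component, so callers can audit how a match was made; e.g.
    # (hypothetical) 'copy module\nlineinfile module' could yield
    # self.strategies == ['search_by_module_name', 'search_by_module_name'].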

    def _match_component(self, title, body, component):
        """Find matches for a single line"""
        matched_filenames = []

        # context sets the path prefix to narrow the search window
        if 'module_util' in title.lower() or 'module_util' in component.lower():
            context = 'lib/ansible/module_utils'
        elif 'module util' in title.lower() or 'module util' in component.lower():
            context = 'lib/ansible/module_utils'
        elif 'module' in title.lower() or 'module' in component.lower():
            context = 'lib/ansible/modules'
        elif 'dynamic inventory' in title.lower() or 'dynamic inventory' in component.lower():
            context = 'contrib/inventory'
        elif 'inventory script' in title.lower() or 'inventory script' in component.lower():
            context = 'contrib/inventory'
        elif 'inventory plugin' in title.lower() or 'inventory plugin' in component.lower():
            context = 'lib/ansible/plugins/inventory'
        else:
            context = None

        if not component:
            return []

        if component not in self.STOPWORDS and component not in self.STOPCHARS:

            if not matched_filenames:
                matched_filenames += self.search_by_keywords(component, exact=True)
                if matched_filenames:
                    self.strategy = 'search_by_keywords'

            if not matched_filenames:
                matched_filenames += self.search_by_module_name(component)
                if matched_filenames:
                    self.strategy = 'search_by_module_name'

            if not matched_filenames:
                matched_filenames += self.search_by_regex_module_globs(component)
                if matched_filenames:
                    self.strategy = 'search_by_regex_module_globs'

            if not matched_filenames:
                matched_filenames += self.search_by_regex_modules(component)
                if matched_filenames:
                    self.strategy = 'search_by_regex_modules'

            if not matched_filenames:
                matched_filenames += self.search_by_regex_generic(component)
                if matched_filenames:
                    self.strategy = 'search_by_regex_generic'

            if not matched_filenames:
                matched_filenames += self.search_by_regex_urls(component)
                if matched_filenames:
                    self.strategy = 'search_by_regex_urls'

            if not matched_filenames:
                matched_filenames += self.search_by_tracebacks(component)
                if matched_filenames:
                    self.strategy = 'search_by_tracebacks'

            if not matched_filenames:
                matched_filenames += self.search_by_filepath(component, context=context)
                if matched_filenames:
                    self.strategy = 'search_by_filepath'
                if not matched_filenames:
                    matched_filenames += self.search_by_filepath(component, partial=True)
                    if matched_filenames:
                        self.strategy = 'search_by_filepath[partial]'

            if not matched_filenames:
                matched_filenames += self.search_by_keywords(component, exact=False)
                if matched_filenames:
                    self.strategy = 'search_by_keywords!exact'

            if matched_filenames:
                matched_filenames += self.include_modules_from_test_targets(matched_filenames)

        return matched_filenames
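    # The cascade above runs from cheapest/most precise to broadest:
    # keywords (exact) -> module name -> module globs -> module regexes ->
    # generic regexes -> urls -> tracebacks -> filepaths (exact, then
    # partial) -> keywords (inexact). The first strategy returning anything
    # wins, and test-target hits are expanded to their modules.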

    def search_by_module_name(self, component):
        matches = []

        component = self.clean_body(component)

        # docker-container vs. docker_container
        if component not in self.MODULE_NAMES:
            component = component.replace('-', '_')

        if component in self.MODULE_NAMES:
            mmatch = self.find_module_match(component)
            if mmatch:
                if isinstance(mmatch, list):
                    for x in mmatch:
                        matches.append(x['repo_filename'])
                else:
                    matches.append(mmatch['repo_filename'])

        return matches

    def search_by_keywords(self, component, exact=True):
        """Simple keyword search"""

        component = component.lower()
        matches = []
        if component in self.STOPWORDS:
            matches = [None]
        elif component in self.KEYWORDS:
            matches = [self.KEYWORDS[component]]
        elif not exact:
            # component is already lowercased above, so one check per form
            # is enough
            for k, v in self.KEYWORDS.items():
                if ' ' + k + ' ' in component:
                    logging.debug('keyword match: {}'.format(k))
                    matches.append(v)
                elif ' ' + k + ':' in component:
                    logging.debug('keyword match: {}'.format(k))
                    matches.append(v)
                elif component.endswith(' ' + k):
                    logging.debug('keyword match: {}'.format(k))
                    matches.append(v)
                elif k in component and k in self.BLACKLIST:
                    logging.debug('blacklist match: {}'.format(k))
                    matches.append(None)

        return matches
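    # Example (illustrative): with 'winrm' mapped to
    # 'lib/ansible/plugins/connection/winrm.py' in KEYWORDS,
    # search_by_keywords('winrm', exact=True) returns that path; a STOPWORDS
    # hit returns [None], which match_components treats as a blacklist
    # signal and discards.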

    def search_by_regex_urls(self, body):
        # http://docs.ansible.com/ansible/latest/copy_module.html
        # http://docs.ansible.com/ansible/latest/dev_guide/developing_modules.html
        # http://docs.ansible.com/ansible/latest/postgresql_db_module.html
        # [helm module](https//docs.ansible.com/ansible/2.4/helm_module.html)
        # Windows module: win_robocopy\nhttp://docs.ansible.com/ansible/latest/win_robocopy_module.html
        # Examples:\n* archive (https://docs.ansible.com/ansible/archive_module.html)\n* s3_sync (https://docs.ansible.com/ansible/s3_sync_module.html)
        # https//github.com/ansible/ansible/blob/devel/lib/ansible/modules/windows/win_dsc.ps1L228

        matches = []

        urls = re.findall(
            'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
            body
        )
        if urls:
            for url in urls:
                url = url.rstrip(')')
                if '/blob' in url and url.endswith('.py'):
                    parts = url.split('/')
                    bindex = parts.index('blob')
                    fn = '/'.join(parts[bindex+2:])
                    matches.append(fn)
                elif '_module.html' in url:
                    parts = url.split('/')
                    fn = parts[-1].replace('_module.html', '')
                    choices = [x for x in self.gitrepo.files if '/' + fn in x or '/_' + fn in x]
                    choices = [x for x in choices if 'lib/ansible/modules' in x]

                    if len(choices) > 1:
                        choices = [x for x in choices if '/' + fn + '.py' in x or '/' + fn + '.ps1' in x or '/_' + fn + '.py' in x]

                    if not choices:
                        pass
                    elif len(choices) == 1:
                        matches.append(choices[0])
                    else:
                        pass
                else:
                    pass

        return matches
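    # Example (illustrative): a body containing
    # 'https://docs.ansible.com/ansible/latest/postgresql_db_module.html'
    # strips '_module.html' and looks for a unique file under
    # 'lib/ansible/modules' whose basename matches 'postgresql_db'.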

    def search_by_regex_modules(self, body):
        # foo module
        # foo and bar modules
        # foo* modules
        # foo* module

        body = body.lower()
        logging.debug('attempt regex match on: {}'.format(body))

        # https://www.tutorialspoint.com/python/python_reg_expressions.htm
        patterns = [
            r'\:\n(\S+)\.py',
            r'(\S+)\.py',
            r'\-(\s+)(\S+)(\s+)module',
            r'\`ansible_module_(\S+)\.py\`',
            r'module(\s+)\-(\s+)(\S+)',
            r'module(\s+)(\S+)',
            r'\`(\S+)\`(\s+)module',
            r'(\S+)(\s+)module',
            r'the (\S+) command',
            r'(\S+) \(.*\)',
            r'(\S+)\-module',
            r'modules/(\S+)',
            r'module\:(\s+)\`(\S+)\`',
            r'module\: (\S+)',
            r'module (\S+)',
            r'module `(\S+)`',
            r'module: (\S+)',
            r'new (\S+) module',
            r'the (\S+) module',
            r'the \"(\S+)\" module',
            r':\n(\S+) module',
            r'(\S+) module',
            r'(\S+) core module',
            r'(\S+) extras module',
            r':\n\`(\S+)\` module',
            r'\`(\S+)\` module',
            r'`(\S+)` module',
            r'(\S+)\* modules',
            r'(\S+) and (\S+)',
            r'(\S+) or (\S+)',
            r'(\S+) \+ (\S+)',
            r'(\S+) \& (\S)',
            r'(\S+) and (\S+) modules',
            r'(\S+) or (\S+) module',
            r'(\S+)_module',
            r'action: (\S+)',
            r'action (\S+)',
            r'ansible_module_(\S+)\.py',
            r'ansible_module_(\S+)',
            r'ansible_modules_(\S+)\.py',
            r'ansible_modules_(\S+)',
            r'(\S+) task',
            r'(\s+)\((\S+)\)',
            r'(\S+)(\s+)(\S+)(\s+)modules',
            r'(\S+)(\s+)module\:(\s+)(\S+)',
            r'\-(\s+)(\S+)(\s+)module',
            r'\:(\s+)(\S+)(\s+)module',
            r'\-(\s+)ansible(\s+)(\S+)(\s+)(\S+)(\s+)module',
            r'.*(\s+)(\S+)(\s+)module.*'
        ]

        matches = []

        logging.debug('check patterns against: {}'.format(body))

        for pattern in patterns:
            mobj = re.match(pattern, body, re.M | re.I)

            if mobj:
                logging.debug('pattern {} matched on "{}"'.format(pattern, body))

                for x in range(0, mobj.lastindex+1):
                    try:
                        mname = mobj.group(x)
                        logging.debug('mname: {}'.format(mname))
                        if mname == body:
                            continue
                        mname = self.clean_body(mname)
                        if not mname.strip():
                            continue
                        mname = mname.strip().lower()
                        if ' ' in mname:
                            continue
                        if '/' in mname:
                            continue

                        mname = mname.replace('.py', '').replace('.ps1', '')
                        logging.debug('--> {}'.format(mname))

                        # attempt to match a module
                        module_match = self.find_module_match(mname)

                        if not module_match:
                            pass
                        elif isinstance(module_match, list):
                            for m in module_match:
                                matches.append(m['repo_filename'])
                        elif isinstance(module_match, dict):
                            matches.append(module_match['repo_filename'])
                    except Exception as e:
                        logging.error(e)

                if matches:
                    break

        return matches
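    # Example (illustrative): 'the copy module is broken' is caught by the
    # pattern r'the (\S+) module'; group 1 ('copy') is cleaned and resolved
    # via find_module_match(). Patterns are tried in order and the first
    # one that yields any match wins.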

    def search_by_regex_module_globs(self, body):
        # All AWS modules
        # BigIP modules
        # NXOS modules
        # azurerm modules

        matches = []
        body = self.clean_body(body)
        logging.debug('try globs on: {}'.format(body))

        keymap = {
            'all': None,
            'ec2': 'lib/ansible/modules/cloud/amazon',
            'ec2_*': 'lib/ansible/modules/cloud/amazon',
            'aws': 'lib/ansible/modules/cloud/amazon',
            'amazon': 'lib/ansible/modules/cloud/amazon',
            'google': 'lib/ansible/modules/cloud/google',
            'gce': 'lib/ansible/modules/cloud/google',
            'gcp': 'lib/ansible/modules/cloud/google',
            'bigip': 'lib/ansible/modules/network/f5',
            'nxos': 'lib/ansible/modules/network/nxos',
            'azure': 'lib/ansible/modules/cloud/azure',
            'azurerm': 'lib/ansible/modules/cloud/azure',
            'openstack': 'lib/ansible/modules/cloud/openstack',
            'ios': 'lib/ansible/modules/network/ios',
        }

        regexes = [
            r'(\S+) ansible modules',
            r'all (\S+) based modules',
            r'all (\S+) modules',
            r'.* all (\S+) modules.*',
            r'(\S+) modules',
            r'(\S+\*) modules',
            r'all cisco (\S+\*) modules',
        ]

        mobj = None
        for x in regexes:
            mobj = re.match(x, body)
            if mobj:
                logging.debug('matched glob: {}'.format(x))
                break

        if not mobj:
            logging.debug('no glob matches')

        if mobj:
            keyword = mobj.group(1)
            if not keyword.strip():
                pass
            elif keyword in keymap:
                if keymap[keyword]:
                    matches.append(keymap[keyword])
            else:

                if '*' in keyword:
                    keyword = keyword.replace('*', '')

                # check for directories first
                fns = [x for x in self.MODULE_NAMESPACE_DIRECTORIES if keyword in x]

                # check for files second
                if not fns:
                    fns = [x for x in self.gitrepo.module_files if 'lib/ansible/modules' in x and keyword in x]

                if fns:
                    matches += fns

        if matches:
            matches = sorted(set(matches))

        return matches
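    # Example (illustrative): 'all aws modules' matches r'all (\S+) modules'
    # and the keymap routes 'aws' to 'lib/ansible/modules/cloud/amazon'.
    # Unmapped keywords fall back to substring matches against namespace
    # directories first, then individual module files.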

    def search_by_regex_generic(self, body):
        # foo dynamic inventory script
        # foo filter

        # https://www.tutorialspoint.com/python/python_reg_expressions.htm
        patterns = [
            [r'(.*) action plugin', 'lib/ansible/plugins/action'],
            [r'(.*) inventory plugin', 'lib/ansible/plugins/inventory'],
            [r'(.*) dynamic inventory', 'contrib/inventory'],
            [r'(.*) dynamic inventory (script|file)', 'contrib/inventory'],
            [r'(.*) inventory script', 'contrib/inventory'],
            [r'(.*) filter', 'lib/ansible/plugins/filter'],
            [r'(.*) jinja filter', 'lib/ansible/plugins/filter'],
            [r'(.*) jinja2 filter', 'lib/ansible/plugins/filter'],
            [r'(.*) template filter', 'lib/ansible/plugins/filter'],
            [r'(.*) fact caching plugin', 'lib/ansible/plugins/cache'],
            [r'(.*) fact caching module', 'lib/ansible/plugins/cache'],
            [r'(.*) lookup plugin', 'lib/ansible/plugins/lookup'],
            [r'(.*) lookup', 'lib/ansible/plugins/lookup'],
            [r'(.*) callback plugin', 'lib/ansible/plugins/callback'],
            [r'(.*)\.py callback', 'lib/ansible/plugins/callback'],
            [r'callback plugin (.*)', 'lib/ansible/plugins/callback'],
            [r'(.*) stdout callback', 'lib/ansible/plugins/callback'],
            [r'stdout callback (.*)', 'lib/ansible/plugins/callback'],
            [r'stdout_callback (.*)', 'lib/ansible/plugins/callback'],
            [r'(.*) callback plugin', 'lib/ansible/plugins/callback'],
            [r'(.*) connection plugin', 'lib/ansible/plugins/connection'],
            [r'(.*) connection type', 'lib/ansible/plugins/connection'],
            [r'(.*) connection', 'lib/ansible/plugins/connection'],
            [r'(.*) transport', 'lib/ansible/plugins/connection'],
            [r'connection=(.*)', 'lib/ansible/plugins/connection'],
            [r'connection: (.*)', 'lib/ansible/plugins/connection'],
            [r'connection (.*)', 'lib/ansible/plugins/connection'],
            [r'strategy (.*)', 'lib/ansible/plugins/strategy'],
            [r'(.*) strategy plugin', 'lib/ansible/plugins/strategy'],
            [r'(.*) module util', 'lib/ansible/module_utils'],
            [r'ansible-galaxy (.*)', 'lib/ansible/galaxy'],
            [r'ansible-playbook (.*)', 'lib/ansible/playbook'],
            [r'ansible/module_utils/(.*)', 'lib/ansible/module_utils'],
            [r'module_utils/(.*)', 'lib/ansible/module_utils'],
            [r'lib/ansible/module_utils/(.*)', 'lib/ansible/module_utils'],
            [r'(\S+) documentation fragment', 'lib/ansible/utils/module_docs_fragments'],
        ]

        body = self.clean_body(body)

        matches = []

        for pattern in patterns:
            mobj = re.match(pattern[0], body, re.M | re.I)

            if mobj:
                logging.debug('pattern hit: {}'.format(pattern))
                fname = mobj.group(1)
                fname = fname.lower()

                fpath = os.path.join(pattern[1], fname)

                if fpath in self.gitrepo.files:
                    matches.append(fpath)
                elif os.path.join(pattern[1], fname + '.py') in self.gitrepo.files:
                    fname = os.path.join(pattern[1], fname + '.py')
                    matches.append(fname)
                else:
                    # fallback to the directory
                    matches.append(pattern[1])

        return matches
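    # Example (illustrative): 'local connection plugin' hits
    # [r'(.*) connection plugin', 'lib/ansible/plugins/connection'], so the
    # matcher tries 'lib/ansible/plugins/connection/local' and then
    # 'lib/ansible/plugins/connection/local.py' as real files before falling
    # back to the plugin directory itself.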

    def search_by_tracebacks(self, body):

        matches = []

        if 'Traceback (most recent call last)' in body:
            lines = body.split('\n')
            for line in lines:
                line = line.strip()
                if line.startswith('DistributionNotFound'):
                    matches = ['setup.py']
                    break
                elif line.startswith('File'):
                    fn = line.split()[1]
                    for SC in self.STOPCHARS:
                        fn = fn.replace(SC, '')
                    if 'ansible_module_' in fn:
                        fn = os.path.basename(fn)
                        fn = fn.replace('ansible_module_', '')
                        matches = [fn]
                    elif 'cli/playbook.py' in fn:
                        matches = ['lib/ansible/cli/playbook.py']
                    elif 'module_utils' in fn:
                        idx = fn.find('module_utils/')
                        matches = ['lib/ansible/' + fn[idx:]]
                    elif 'ansible/' in fn:
                        idx = fn.find('ansible/')
                        fn1 = fn[idx:]

                        if 'bin/' in fn1:
                            if not fn1.startswith('bin'):
                                idx = fn1.find('bin/')
                                fn1 = fn1[idx:]
                                if fn1.endswith('.py'):
                                    # strip the suffix; rstrip('.py') would
                                    # also eat trailing 'p'/'y' characters
                                    fn1 = fn1[:-3]

                        elif 'cli/' in fn1:
                            idx = fn1.find('cli/')
                            fn1 = 'lib/ansible/' + fn1[idx:]

                        elif 'lib' not in fn1:
                            fn1 = 'lib/' + fn1

                        if fn1 in self.gitrepo.files:
                            matches = [fn1]

        return matches

    def search_by_filepath(self, body, partial=False, context=None):
        """Find known filepaths in body"""

        matches = []
        body = self.clean_body(body)

        if not body:
            return []
        if body.lower() in self.STOPCHARS:
            return []
        if body.lower() in self.STOPWORDS:
            return []

        # 'inventory manager' vs. 'inventory/manager'
        if partial and ' ' in body:
            body = body.replace(' ', '/')

        if 'site-packages' in body:
            res = re.match('(.*)/site-packages/(.*)', body)
            if res:
                body = res.group(2)
        if 'modules/core/' in body:
            body = body.replace('modules/core/', 'modules/')
        if 'modules/extras/' in body:
            body = body.replace('modules/extras/', 'modules/')
        if 'ansible-modules-core/' in body:
            body = body.replace('ansible-modules-core/', '/')
        if 'ansible-modules-extras/' in body:
            body = body.replace('ansible-modules-extras/', '/')
        if body.startswith('ansible/lib/ansible'):
            body = body.replace('ansible/lib', 'lib')
        if body.startswith('ansible/') and not body.startswith('ansible/modules'):
            body = body.replace('ansible/', '', 1)
        if 'module/' in body:
            body = body.replace('module/', 'modules/')

        logging.debug('search filepath [{}] [{}]: {}'.format(context, partial, body))

        if len(body) < 2:
            return []

        if '/' in body:
            body_paths = body.split('/')
        elif ' ' in body:
            body_paths = body.split()
            body_paths = [x.strip() for x in body_paths if x.strip()]
        else:
            body_paths = [body]

        if 'networking' in body_paths:
            ix = body_paths.index('networking')
            body_paths[ix] = 'network'
        if 'plugin' in body_paths:
            ix = body_paths.index('plugin')
            body_paths[ix] = 'plugins'

        if not context or 'lib/ansible/modules' in context:
            mmatch = self.find_module_match(body)
            if mmatch:
                if isinstance(mmatch, list) and len(mmatch) > 1:

                    # only allow for exact prefix globbing here ...
                    if [x for x in mmatch if x['repo_filename'].startswith(body)]:
                        return [x['repo_filename'] for x in mmatch]

                elif isinstance(mmatch, list):
                    return [x['repo_filename'] for x in mmatch]
                else:
                    return [mmatch['repo_filename']]

        if body in self.gitrepo.files:
            matches = [body]
        else:
            for fn in self.gitrepo.files:

                # limit the search set if a context is given
                if context is not None and not fn.startswith(context):
                    continue

                if fn.endswith(body) or fn.endswith(body + '.py') or fn.endswith(body + '.ps1'):
                    # ios_config.py -> test_ios_config.py vs. ios_config.py
                    bn1 = os.path.basename(body)
                    bn2 = os.path.basename(fn)
                    if bn2.startswith(bn1):
                        matches = [fn]
                        break

                if partial:

                    # netapp_e_storagepool storage module
                    # lib/ansible/modules/storage/netapp/netapp_e_storagepool.py

                    # if all subpaths are in this filepath, it is a match
                    bp_total = 0
                    fn_paths = fn.split('/')
                    fn_paths.append(fn_paths[-1].replace('.py', '').replace('.ps1', ''))

                    for bp in body_paths:
                        if bp in fn_paths:
                            bp_total += 1

                    if bp_total == len(body_paths):
                        matches = [fn]
                        break

                    elif bp_total > 1:

                        if (float(bp_total) / float(len(body_paths))) >= (2.0 / 3.0):
                            if fn not in matches:
                                matches.append(fn)

        if matches:
            tr = []
            for match in matches[:]:
                # reduce to longest path
                for m in matches:
                    if match == m:
                        continue
                    if len(m) < len(match) and match.startswith(m):
                        tr.append(m)

            for r in tr:
                if r in matches:
                    logging.debug('trimming {}'.format(r))
                    matches.remove(r)

        matches = sorted(set(matches))
        logging.debug('return: {}'.format(matches))

        return matches

    def reduce_filepaths(self, matches):

        # unique
        _matches = []
        for _match in matches:
            if _match not in _matches:
                _matches.append(_match)
        matches = _matches[:]

        # squash to longest path
        if matches:
            tr = []
            for match in matches[:]:
                # reduce to longest path
                for m in matches:
                    if match == m:
                        continue
                    if m is None or match is None:
                        continue
                    if len(m) < len(match) and (match.startswith(m) or match.endswith(m)):
                        tr.append(m)

            for r in tr:
                if r in matches:
                    matches.remove(r)
        return matches
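    # Example (illustrative): reduce_filepaths(['lib/ansible/modules',
    # 'lib/ansible/modules/cloud/amazon/ec2.py']) drops the bare directory
    # prefix and keeps only the longer, more specific path.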

    def include_modules_from_test_targets(self, matches):
        """Map test targets to the module files"""
        new_matches = []
        for match in matches:
            if not match:
                continue
            # include modules from test targets
            if 'test/integration/targets' in match:
                paths = match.split('/')
                tindex = paths.index('targets')
                mname = paths[tindex+1]
                mrs = self.find_module_match(mname, exact=True)
                if mrs:
                    if not isinstance(mrs, list):
                        mrs = [mrs]
                    for mr in mrs:
                        new_matches.append(mr['repo_filename'])
        return new_matches

    def get_meta_for_file(self, filename):
        meta = {
            'repo_filename': filename,
            'name': os.path.basename(filename).split('.')[0],
            'notify': [],
            'assign': [],
            'authors': [],
            'committers': [],
            'maintainers': [],
            'labels': [],
            'ignore': [],
            'support': None,
            'supported_by': None,
            'deprecated': False,
            'topic': None,
            'subtopic': None,
            'namespace': None,
            'namespace_maintainers': []
        }

        populated = False
        filenames = [filename, os.path.splitext(filename)[0]]

        # powershell meta is in the python file
        if filename.endswith('.ps1'):
            pyfile = filename.replace('.ps1', '.py')
            if pyfile in self.BOTMETA['files']:
                filenames.append(pyfile)

        botmeta_entries = self.file_indexer._filenames_to_keys(filenames)

        for entry in botmeta_entries:
            fdata = self.BOTMETA['files'][entry].copy()

            if 'authors' in fdata:
                meta['authors'] = fdata['authors']
            if 'maintainers' in fdata:
                meta['notify'] += fdata['maintainers']
                meta['assign'] += fdata['maintainers']
                meta['maintainers'] += fdata['maintainers']
            if 'notified' in fdata:
                meta['notify'] += fdata['notified']
            if 'labels' in fdata:
                meta['labels'] += fdata['labels']
            if 'ignore' in fdata:
                meta['ignore'] += fdata['ignore']
            if 'ignored' in fdata:
                meta['ignore'] += fdata['ignored']
            if 'support' in fdata:
                if isinstance(fdata['support'], list):
                    meta['support'] = fdata['support'][0]
                else:
                    meta['support'] = fdata['support']
            elif 'supported_by' in fdata:
                if isinstance(fdata['supported_by'], list):
                    meta['support'] = fdata['supported_by'][0]
                else:
                    meta['support'] = fdata['supported_by']

            if 'deprecated' in fdata:
                meta['deprecated'] = fdata['deprecated']

            populated = True

        # walk up the tree for more meta
        paths = filename.split('/')
        for idx, x in enumerate(paths):
            thispath = '/'.join(paths[:(0-idx)])
            if thispath in self.BOTMETA['files']:
                fdata = self.BOTMETA['files'][thispath].copy()
                if 'support' in fdata and not meta['support']:
                    if isinstance(fdata['support'], list):
                        meta['support'] = fdata['support'][0]
                    else:
                        meta['support'] = fdata['support']
                if 'labels' in fdata:
                    meta['labels'] += fdata['labels']
                if 'maintainers' in fdata:
                    meta['notify'] += fdata['maintainers']
                    meta['assign'] += fdata['maintainers']
                    meta['maintainers'] += fdata['maintainers']
                if 'ignore' in fdata:
                    meta['ignore'] += fdata['ignore']
                if 'notified' in fdata:
                    meta['notify'] += fdata['notified']

        if 'lib/ansible/modules' in filename:
            topics = [x for x in paths if x not in ['lib', 'ansible', 'modules']]
            topics = [x for x in topics if x != os.path.basename(filename)]
            if len(topics) == 2:
                meta['topic'] = topics[0]
                meta['subtopic'] = topics[1]
            elif len(topics) == 1:
                meta['topic'] = topics[0]

            meta['namespace'] = '/'.join(topics)

        # set namespace maintainers (skip !modules for now)
        if filename.startswith('lib/ansible/modules'):
            ns = meta.get('namespace')
            keys = self.BOTMETA['files'].keys()
            keys = [x for x in keys if x.startswith(os.path.join('lib/ansible/modules', ns))]
            ignored = []

            for key in keys:
                meta['namespace_maintainers'] += self.BOTMETA['files'][key].get('maintainers', [])
                ignored += self.BOTMETA['files'][key].get('ignored', [])

            for ignoree in ignored:
                while ignoree in meta['namespace_maintainers']:
                    meta['namespace_maintainers'].remove(ignoree)

        # new modules should default to "community" support
        if filename.startswith('lib/ansible/modules') and filename not in self.gitrepo.files:
            meta['support'] = 'community'
            meta['supported_by'] = 'community'

        # test targets for modules should inherit from their modules
        if filename.startswith('test/integration/targets') and filename not in self.BOTMETA['files']:
            whitelist = [
                'labels',
                'ignore',
                'deprecated',
                'authors',
                'assign',
                'maintainers',
                'notify',
                'topic',
                'subtopic',
                'support'
            ]

            paths = filename.split('/')
            tindex = paths.index('targets')
            mname = paths[tindex+1]
            mmatch = self._find_module_match(mname, exact=True)
            if mmatch:
                mmeta = self.get_meta_for_file(mmatch[0]['repo_filename'])
                for k, v in mmeta.items():
                    if k in whitelist and v:
                        if isinstance(meta[k], list):
                            meta[k] = sorted(set(meta[k] + v))
                        elif not meta[k]:
                            meta[k] = v

            # make new test targets community by default
            if not meta['support'] and not meta['supported_by']:
                meta['support'] = 'community'

        # it's okay to remove things from legacy-files.txt
        if filename == 'test/sanity/pep8/legacy-files.txt' and not meta['support']:
            meta['support'] = 'community'

        # fallback to core support
        if not meta['support']:
            meta['support'] = 'core'

        # align support and supported_by
        if meta['support'] != meta['supported_by']:
            if meta['support'] and not meta['supported_by']:
                meta['supported_by'] = meta['support']
            elif not meta['support'] and meta['supported_by']:
                meta['support'] = meta['supported_by']

        # clean up the result
        _meta = meta.copy()
        for k, v in _meta.items():
            if isinstance(v, list):
                meta[k] = sorted(set(v))

        # walk up the botmeta tree looking for ignores to include
        if meta.get('repo_filename'):
            namespace_paths = os.path.dirname(meta['repo_filename'])
            namespace_paths = namespace_paths.split('/')
            for x in reversed(range(0, len(namespace_paths) + 1)):
                this_ns_path = '/'.join(namespace_paths[:x])
                if not this_ns_path:
                    continue
                logging.debug('check {}'.format(this_ns_path))
                if this_ns_path in self.BOTMETA['files']:
                    this_ignore = self.BOTMETA['files'][this_ns_path].get('ignore') or \
                        self.BOTMETA['files'][this_ns_path].get('ignored') or \
                        self.BOTMETA['files'][this_ns_path].get('ignores')
                    logging.debug('ignored: {}'.format(this_ignore))
                    if this_ignore:
                        for username in this_ignore:
                            if username not in meta['ignore']:
                                meta['ignore'].append(username)

        # process ignores AGAIN.
        if meta.get('ignore'):
            for k, v in meta.items():
                if k == 'ignore':
                    continue
                if not isinstance(v, list):
                    continue
                for ignoree in meta['ignore']:
                    if ignoree in v:
                        meta[k].remove(ignoree)

        return meta
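    # A sketch of the result for a module file (illustrative values):
    #
    #   {'repo_filename': 'lib/ansible/modules/files/copy.py',
    #    'name': 'copy',
    #    'topic': 'files', 'subtopic': None, 'namespace': 'files',
    #    'support': 'core', 'supported_by': 'core',
    #    'maintainers': [...], 'notify': [...], 'assign': [...],
    #    'labels': [...], 'ignore': [...], 'deprecated': False, ...}
    #
    # with 'support' falling back to 'core' when BOTMETA provides nothing.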

    def find_module_match(self, pattern, exact=False):
        '''Exact module name matching'''

        logging.debug('find_module_match for "{}"'.format(pattern))
        candidate = None

        BLACKLIST = [
            'module_utils',
            'callback',
            'network modules',
            'networking modules',
            'windows modules'
        ]

        if not pattern:
            return None

        # https://github.com/ansible/ansible/issues/19755
        if pattern == 'setup':
            pattern = 'lib/ansible/modules/system/setup.py'

        if '/facts.py' in pattern or ' facts.py' in pattern:
            pattern = 'lib/ansible/modules/system/setup.py'

        # https://github.com/ansible/ansible/issues/18527
        #   docker-container -> docker_container
        if '-' in pattern:
            pattern = pattern.replace('-', '_')

        if 'module_utils' in pattern:
            # https://github.com/ansible/ansible/issues/20368
            return None
        elif 'callback' in pattern:
            return None
        elif 'lookup' in pattern:
            return None
        elif 'contrib' in pattern and 'inventory' in pattern:
            return None
        elif pattern.lower() in BLACKLIST:
            return None

        candidate = self._find_module_match(pattern, exact=exact)

        if not candidate:
            candidate = self._find_module_match(os.path.basename(pattern))

        if not candidate and '/' in pattern and not pattern.startswith('lib/'):
            ppy = None
            ps1 = None
            if not pattern.endswith('.py') and not pattern.endswith('.ps1'):
                ppy = pattern + '.py'
                ps1 = pattern + '.ps1'
            for mf in self.gitrepo.module_files:
                if pattern in mf:
                    # guard the None suffixes for patterns that already
                    # carry an extension
                    if mf.endswith(pattern) or \
                            (ppy and mf.endswith(ppy)) or \
                            (ps1 and mf.endswith(ps1)):
                        candidate = mf
                        break

        return candidate

    def _find_module_match(self, pattern, exact=False):

        logging.debug('matching on {}'.format(pattern))

        matches = []

        if isinstance(pattern, unicode):
            pattern = pattern.encode('ascii', 'ignore')

        logging.debug('_find_module_match: {}'.format(pattern))

        noext = pattern.replace('.py', '').replace('.ps1', '')

        # exact is looking for a very precise name such as "vmware_guest"
        if exact:
            candidates = [pattern]
        else:
            candidates = [pattern, '_' + pattern, noext, '_' + noext]

        for k, v in self.MODULES.items():
            if v['name'] in candidates:
                logging.debug('match {} on name: {}'.format(k, v['name']))
                matches = [v]
                break

        if not matches:
            # search by key ... aka the filepath
            for k, v in self.MODULES.items():
                if k == pattern:
                    logging.debug('match {} on key: {}'.format(k, k))
                    matches = [v]
                    break

        # spellcheck
        if not exact and not matches and '/' not in pattern:
            _pattern = pattern
            if not isinstance(_pattern, unicode):
                _pattern = _pattern.decode('utf-8')
            candidates = []
            for k, v in self.MODULES.items():
                vname = v['name']
                if not isinstance(vname, unicode):
                    vname = vname.decode('utf-8')
                jw = jaro_winkler(vname, _pattern)
                if jw > .9:
                    candidates.append((jw, k))
            # closest names first
            for _jw, k in sorted(candidates, reverse=True):
                matches.append(self.MODULES[k])

        return matches
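    # The spellcheck pass is a fuzzy fallback: Jaro-Winkler similarity above
    # 0.9 is treated as a probable typo, so e.g. (hypothetical)
    # 'vmware_gest' would still resolve to the 'vmware_guest' module.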
Example #9
0
    def index_ecosystem(self):
        # index the ansible-collections org
        token = C.DEFAULT_GITHUB_TOKEN
        gh = Github(login_or_token=token)
        gw = GithubWrapper(gh, cachedir=self.cachedir)
        ac = gw.get_org('ansible-collections')

        cloneurls = set()
        for repo in ac.get_repos():
            #print(repo)
            cloneurls.add(repo.clone_url)
        cloneurls = [x.replace('.git', '') for x in cloneurls]

        for curl in cloneurls:
            if curl.endswith('/overview'):
                continue
            if curl.endswith('/collection_template'):
                continue
            if curl.endswith('/.github'):
                continue
            if curl.endswith('/hub'):
                continue
            grepo = GitRepoWrapper(cachedir=self.cachedir,
                                   repo=curl,
                                   rebase=False)

            # is there a galaxy.yml at the root level?
            if grepo.exists('galaxy.yml'):
                meta = yaml.safe_load(grepo.get_file_content('galaxy.yml'))
                fqcn = '%s.%s' % (meta['namespace'], meta['name'])
                self._gitrepos[fqcn] = grepo
            else:
                # multi-collection repos ... sigh.
                galaxyfns = grepo.find('galaxy.yml')

                if galaxyfns:
                    for gfn in galaxyfns:
                        meta = yaml.safe_load(grepo.get_file_content(gfn))
                        fqcn = '%s.%s' % (meta['namespace'], meta['name'])
                        _grepo = GitRepoWrapper(cachedir=self.cachedir,
                                                repo=curl,
                                                rebase=False,
                                                context=os.path.dirname(gfn))
                        self._gitrepos[fqcn] = _grepo
                else:

                    fqcn = None
                    bn = os.path.basename(curl)

                    # enumerate the url?
                    if '.' in bn:
                        fqcn = bn

                    # try the README?
                    if fqcn is None:
                        for fn in ['README.rst', 'README.md']:
                            if fqcn:
                                break
                            if not grepo.exists(fn):
                                continue
                            fdata = grepo.get_file_content(fn)
                            if '.' not in fdata:
                                continue
                            lines = fdata.split('\n')
                            for line in lines:
                                line = line.strip()
                                if line.lower().startswith(
                                        'ansible collection:'):
                                    fqcn = line.split(':')[-1].strip()
                                    break

                    # lame ...
                    if fqcn is None:
                        fqcn = bn + '._community'

                    self._gitrepos[fqcn] = grepo

        # scrape the galaxy collections api
        nexturl = self._baseurl + '/api/v2/collections/?page_size=1000'
        while nexturl:
            jdata = self._get_cached_url(nexturl)
            nexturl = jdata.get('next_link')
            if nexturl:
                nexturl = self._baseurl + nexturl

            for res in jdata.get('results', []):
                fqcn = '%s.%s' % (res['namespace']['name'], res['name'])
                if res.get('deprecated'):
                    continue
                if fqcn in self._gitrepos:
                    continue
                lv = res['latest_version']['href']
                lvdata = self._get_cached_url(lv)
                rurl = lvdata.get('metadata', {}).get('repository')
                if rurl is None:
                    rurl = lvdata['download_url']
                grepo = GitRepoWrapper(cachedir=self.cachedir,
                                       repo=rurl,
                                       rebase=False)
                self._gitrepos[fqcn] = grepo

        # reconcile all things ...
        self.GALAXY_FQCNS = sorted(set(self._gitrepos.keys()))
        self.GALAXY_FILES = {}
        for fqcn, gr in self._gitrepos.items():
            if fqcn.startswith('testing.'):
                continue
            for fn in gr.files:
                if fn not in self.GALAXY_FILES:
                    self.GALAXY_FILES[fn] = set()
                self.GALAXY_FILES[fn].add(fqcn)
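    # After indexing, self.GALAXY_FILES maps a relative file path to the set
    # of collections that ship it, e.g. (illustrative):
    #   self.GALAXY_FILES['plugins/modules/docker_container.py']
    #       -> {'community.docker', ...}
    # which is what allows issues filed against migrated modules to be
    # redirected to the right collection.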
Example #10
0
def main():

    tocheck = [
        #32226,
        #30361,
        #31006,
        #58674,
        #63611,
        #64320,
        #66891,
        #68784,
        69010,
    ]

    redirect = set()
    noredirect = set()
    nometa = set()

    cachedir = '/home/jtanner/.ansibullbot/cache'
    gitrepo = GitRepoWrapper(cachedir=cachedir,
                             repo='https://github.com/ansible/ansible',
                             commit=None,
                             rebase=False)
    rdata = gitrepo.get_file_content(u'.github/BOTMETA.yml')
    botmeta = BotMetadataParser.parse_yaml(rdata)
    cm = AnsibleComponentMatcher(cachedir=cachedir,
                                 gitrepo=gitrepo,
                                 botmeta=botmeta,
                                 botmetafile=None,
                                 email_cache=None,
                                 usecache=True,
                                 use_galaxy=True)
    '''
    mr = parse_match_results()
    for issue in sorted(mr.keys(), key=lambda x: int(x.split('/')[-1]), reverse=True):

        print(issue)
        number = int(issue.split('/')[-1])
        #if number != 68709:
        #    continue
        print(number)
        mfile = os.path.join('~/.ansibullbot/cache/ansible/ansible/issues/%s' % number, 'meta.json')
        mfile = os.path.expanduser(mfile)
        if os.path.exists(mfile):
            with open(mfile, 'r') as f:
                imeta = json.loads(f.read())
        else:
            nometa.add(issue)
            imeta = {}

        if imeta:

            iw = MockIssueWrapper(issue, meta=imeta)
            cfacts = get_collection_facts(iw, cm, imeta)
            #pprint(cfacts)

            if cfacts.get('needs_collection_redirect') == True:
                redirect.add(issue)
            else:
                noredirect.add(issue)

            #if not imeta['is_backport']:
            #    import epdb; epdb.st()
    '''

    mmap = {}

    #gmatches = cm.search_ecosystem('contrib/inventory/ec2.py')
    #import epdb; epdb.st()

    mfiles = get_issues()
    for mfile in mfiles:
        with open(mfile, 'r') as f:
            imeta = json.loads(f.read())
        print(imeta['html_url'])
        number = int(imeta['html_url'].split('/')[-1])
        if number not in tocheck:
            continue

        newmeta = copy.deepcopy(imeta)
        iw = MockIssueWrapper(imeta['html_url'], meta=newmeta, gitrepo=gitrepo)
        #cmatches = cm.match_components(iw.title, iw.body, iw.component)
        cmmeta = get_component_match_facts(iw, cm, [])
        newmeta.update(cmmeta)
        cfmeta = get_collection_facts(iw, cm, newmeta)

        # check api deltas ...
        #cm1 = cm.match(iw)
        #cm2 = cm.match_components(iw.title, iw.body, iw.component, files=iw.files)
        #import epdb; epdb.st()

        print('component: %s' % iw.component)
        print(cmmeta['component_filenames'])
        #pprint(cfmeta)
        cf2vals = [x for x in list(cfmeta['collection_filemap'].values()) if x]
        cf1vals = [x for x in list(imeta['collection_filemap'].values()) if x]
        '''
        if cf1vals or cf2vals:
            pprint(cf1vals)
            pprint(cf2vals)
            #import epdb; epdb.st()
        '''
        '''
        if cf2vals != cf1vals:
            pprint(cf1vals)
            pprint(cf2vals)
            import epdb; epdb.st()
        '''
        pprint(cfmeta)
        import epdb
        epdb.st()

    print('# %s total issues|PRs without meta' % len(list(nometa)))
    print('# %s total issues|PRs not redirected to collections' %
          len(list(noredirect)))
    print('# %s total issues|PRs redirected to collections' %
          len(list(redirect)))

    import epdb
    epdb.st()
Example #11
0
class ModuleIndexer(object):

    EMPTY_MODULE = {
        u'authors': [],
        u'name': None,
        u'namespaced_module': None,
        u'namespace_maintainers': [],
        u'deprecated': False,
        u'deprecated_filename': None,
        u'dirpath': None,
        u'filename': None,
        u'filepath': None,
        u'fulltopic': None,
        u'maintainers': [],
        u'_maintainers': [],
        u'maintainers_keys': None,
        u'metadata': {},
        u'repo_filename': None,
        u'repository': u'ansible',
        u'subtopic': None,
        u'topic': None,
        u'imports': []
    }

    def __init__(self, commits=True, blames=True, botmetafile=None, maintainers=None, gh_client=None, cachedir=u'~/.ansibullbot/cache', gitrepo=None):
        '''
        Maintainers: defaultdict(dict) where keys are filepath and values are dict
        gh_client: GraphQL GitHub client
        '''
        self.get_commits = commits
        self.get_blames = blames
        self.botmetafile = botmetafile
        self.botmeta = {}  # BOTMETA.yml file with minor updates (macro rendered, empty default values fixed)
        self.modules = {}  # keys: paths of files belonging to the repository
        self.maintainers = maintainers or {}
        self.importmap = {}
        self.scraper_cache = os.path.join(cachedir, u'ansible.modules.scraper')
        self.scraper_cache = os.path.expanduser(self.scraper_cache)
        self.gws = GithubWebScraper(cachedir=self.scraper_cache)
        self.gqlc = gh_client
        self.files = []

        if gitrepo:
            self.gitrepo = gitrepo
        else:
            self.gitrepo = GitRepoWrapper(cachedir=cachedir, repo=u'https://github.com/ansible/ansible')

        # sqlalchemy
        unc = os.path.join(cachedir, u'ansible_module_indexer.db')
        unc = os.path.expanduser(unc)
        unc = u'sqlite:///' + unc

        self.engine = create_engine(unc)
        self.Session = sessionmaker(bind=self.engine)
        self.session = self.Session()

        Email.metadata.create_all(self.engine)
        Blame.metadata.create_all(self.engine)

        # committers by module
        self.committers = {}
        # commits by module
        self.commits = {}
        # map of email to github login
        self.emails_cache = {}

        # load the bot meta
        self.update(force=True)

    def update(self, force=False):
        '''Reload everything if there are new commits'''
        changed = self.gitrepo.manage_checkout()
        if changed or force:
            self.get_files()
            self.parse_metadata()

    def get_files(self):
        '''Cache a list of filenames in the checkout'''
        cmd = u'cd {}; git ls-files'.format(self.gitrepo.checkoutdir)
        (rc, so, se) = run_command(cmd)
        files = to_text(so).split(u'\n')
        files = [x.strip() for x in files if x.strip()]
        self.files = files

    def parse_metadata(self):

        if self.botmetafile is not None:
            with open(self.botmetafile, 'rb') as f:
                rdata = f.read()
        else:
            fp = u'.github/BOTMETA.yml'
            rdata = self.get_file_content(fp)
        self.botmeta = BotMetadataParser.parse_yaml(rdata)

        # load the modules
        logging.info(u'loading modules')
        self.get_ansible_modules()

    def _find_match(self, pattern, exact=False):

        logging.debug(u'exact:{} matching on {}'.format(exact, pattern))

        matches = []

        if isinstance(pattern, six.text_type):
            pattern = to_text(to_bytes(pattern, 'ascii', 'ignore'), 'ascii')

        for k, v in six.iteritems(self.modules):
            if v[u'name'] == pattern:
                logging.debug(u'match {} on name: {}'.format(k, v[u'name']))
                matches = [v]
                break

        if not matches:
            # search by key ... aka the filepath
            for k, v in six.iteritems(self.modules):
                if k == pattern:
                    logging.debug(u'match {} on key: {}'.format(k, k))
                    matches = [v]
                    break

        if not matches and not exact:
            # search by properties
            for k, v in six.iteritems(self.modules):
                for subkey in v.keys():
                    if v[subkey] == pattern:
                        logging.debug(u'match {} on subkey: {}'.format(k, subkey))
                        matches.append(v)

        if not matches and not exact:
            # Levenshtein distance should workaround most typos
            distance_map = {}
            for k, v in six.iteritems(self.modules):
                mname = v.get(u'name')
                if not mname:
                    continue
                if isinstance(mname, six.text_type):
                    mname = to_text(to_bytes(mname, 'ascii', 'ignore'), 'ascii')
                try:
                    res = Levenshtein.distance(pattern, mname)
                except TypeError as e:
                    logging.error(e)
                    if C.DEFAULT_BREAKPOINTS:
                        logging.error(u'breakpoint!')
                        import epdb; epdb.st()
                distance_map[mname] = [res, k]
            res = sorted(distance_map.items(), key=lambda x: x[1][0], reverse=True)
            if res and len(pattern) > 3 and res[-1][1][0] < 3:
                logging.debug(u'levenshtein distance match: ({}) {} {}'.format(res[-1][1][1], res[-1][0], pattern))
                matches = [self.modules[res[-1][1][1]]]

        return matches

    def find_match(self, pattern, exact=False):
        '''Resolve a component string to module metadata (by name, path, or fuzzily)'''

        logging.debug(u'find_match for "{}"'.format(pattern))

        BLACKLIST = [
            u'module_utils',
            u'callback',
            u'network modules',
            u'networking modules',
            u'windows modules'
        ]

        if not pattern:
            return None

        if pattern.lower() == u'core':
            return None

        # https://github.com/ansible/ansible/issues/19755
        if pattern == u'setup':
            pattern = u'system/setup.py'

        if u'/facts.py' in pattern or u' facts.py' in pattern:
            pattern = u'system/setup.py'

        # https://github.com/ansible/ansible/issues/18527
        #   docker-container -> docker_container
        if u'-' in pattern:
            pattern = pattern.replace(u'-', u'_')

        if u'module_utils' in pattern:
            # https://github.com/ansible/ansible/issues/20368
            return None
        elif u'callback' in pattern:
            return None
        elif u'lookup' in pattern:
            return None
        elif u'contrib' in pattern and u'inventory' in pattern:
            return None
        elif pattern.lower() in BLACKLIST:
            return None
        elif u'/' in pattern and not self._find_match(pattern, exact=True):
            # https://github.com/ansible/ansible/issues/20520
            if not pattern.startswith(u'lib/'):
                keys = self.modules.keys()
                for k in keys:
                    if pattern in k:
                        ppy = pattern + u'.py'
                        if k.endswith(pattern) or k.endswith(ppy):
                            return self.modules[k]
        elif pattern.endswith(u'.py'):
            # https://github.com/ansible/ansible/issues/19889
            candidate = self._find_match(pattern, exact=False)

            if isinstance(candidate, list) and len(candidate) == 1:
                candidate = candidate[0]

            if isinstance(candidate, dict) and candidate[u'filename'] == pattern:
                return candidate

        match = self._find_match(pattern, exact=exact)
        if not match and not exact:
            # check for just the basename
            #   2617: ansible-s-extras/network/cloudflare_dns.py
            bname = os.path.basename(pattern)
            match = self._find_match(bname)

            if not match:
                # check for deprecated name
                #   _fireball -> fireball
                match = self._find_match(u'_' + bname)

        # unique the results
        if isinstance(match, list) and len(match) > 1:
            _match = []
            for m in match:
                if m not in _match:
                    _match.append(m)
            match = _match[:]

        return match

    def is_valid(self, mname):
        return bool(self.find_match(mname, exact=True))

    def get_repository_for_module(self, mname):
        match = self.find_match(mname, exact=True)
        if not match:
            return None
        if isinstance(match, list):
            match = match[0]
        return match[u'repository']

    def get_ansible_modules(self):
        """Make a list of known modules"""

        matches = []
        module_dir = os.path.join(self.gitrepo.checkoutdir, u'lib/ansible/modules')
        module_dir = os.path.expanduser(module_dir)
        for root, _, filenames in os.walk(module_dir):
            for filename in filenames:
                if filename != u'__init__.py':
                    matches.append(os.path.join(root, filename))

        matches = sorted(set(matches))

        self.populate_modules(matches)

        # custom fixes
        newitems = []
        for k, v in six.iteritems(self.modules):

            # include* is almost always an ansible/ansible issue
            # https://github.com/ansible/ansibullbot/issues/214
            if k.endswith(u'/include.py'):
                self.modules[k][u'repository'] = u'ansible'
            # https://github.com/ansible/ansibullbot/issues/214
            if k.endswith(u'/include_vars.py'):
                self.modules[k][u'repository'] = u'ansible'
            if k.endswith(u'/include_role.py'):
                self.modules[k][u'repository'] = u'ansible'

            # ansible maintains these
            if u'include' in k:
                self.modules[k][u'maintainers'] = [u'ansible']

            # deprecated modules are annoying
            if v[u'name'].startswith(u'_'):

                dkey = os.path.dirname(v[u'filepath'])
                dkey = os.path.join(dkey, v[u'filename'].replace(u'_', u'', 1))
                if dkey not in self.modules:
                    nd = v.copy()
                    nd[u'name'] = nd[u'name'].replace(u'_', u'', 1)
                    newitems.append((dkey, nd))

        for ni in newitems:
            self.modules[ni[0]] = ni[1]

        # parse metadata
        logging.debug(u'set module metadata')
        self.set_module_metadata()

        # parse imports
        logging.debug(u'set module imports')
        self.set_module_imports()

        # last modified
        if self.get_commits:
            logging.debug(u'set module commits')
            self.get_module_commits()

        # parse blame
        if self.get_blames and self.get_commits:
            logging.debug(u'set module blames')
            self.get_module_blames()

        # depends on metadata now ...
        logging.debug(u'set module maintainers')
        self.set_maintainers()

        return self.modules

    def populate_modules(self, matches):
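        '''Build a metadata dict for each module file found in the checkout'''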
        # figure out the names
        for match in matches:
            mdict = copy.deepcopy(self.EMPTY_MODULE)

            mdict[u'filename'] = os.path.basename(match)

            dirpath = os.path.dirname(match)
            dirpath = dirpath.replace(self.gitrepo.checkoutdir + u'/', u'')
            mdict[u'dirpath'] = dirpath

            filepath = match.replace(self.gitrepo.checkoutdir + u'/', u'')
            mdict[u'filepath'] = filepath

            mdict.update(
                self.split_topics_from_path(filepath)
            )

            mdict[u'repo_filename'] = mdict[u'filepath']\
                .replace(u'lib/ansible/modules/%s/' % mdict[u'repository'], u'')

            # clustering/consul
            mdict[u'namespaced_module'] = mdict[u'repo_filename']
            mdict[u'namespaced_module'] = \
                mdict[u'namespaced_module'].replace(u'.py', u'')
            mdict[u'namespaced_module'] = \
                mdict[u'namespaced_module'].replace(u'.ps1', u'')

            mname = os.path.basename(match)
            mname = mname.replace(u'.py', u'')
            mname = mname.replace(u'.ps1', u'')
            mdict[u'name'] = mname

            # deprecated modules
            if mname.startswith(u'_'):
                mdict[u'deprecated'] = True
                deprecated_filename = \
                    os.path.dirname(mdict[u'namespaced_module'])
                deprecated_filename = \
                    os.path.join(deprecated_filename, mname[1:] + u'.py')
                mdict[u'deprecated_filename'] = deprecated_filename
            else:
                mdict[u'deprecated_filename'] = mdict[u'repo_filename']

            self.modules[filepath] = mdict

        # meta is a special module
        self.modules[u'meta'] = copy.deepcopy(self.EMPTY_MODULE)
        self.modules[u'meta'][u'name'] = u'meta'
        self.modules[u'meta'][u'repo_filename'] = u'meta'

    def get_module_commits(self):
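        '''Parse "git log --follow" for each module; cache results keyed on file mtime'''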
        keys = self.modules.keys()
        keys = sorted(keys)
        for k in keys:
            self.commits[k] = []
            cpath = os.path.join(self.gitrepo.checkoutdir, k)
            if not os.path.isfile(cpath):
                continue

            mtime = os.path.getmtime(cpath)
            refresh = False
            pfile = os.path.join(
                self.scraper_cache,
                k.replace(u'/', u'_') + u'.commits.pickle'
            )

            if not os.path.isfile(pfile):
                refresh = True
            else:
                pickle_kwargs = {'encoding': 'bytes'} if six.PY3 else {}
                with open(pfile, 'rb') as f:
                    pdata = pickle_load(f, **pickle_kwargs)
                if pdata[0] == mtime:
                    self.commits[k] = pdata[1]
                else:
                    refresh = True

            if refresh:
                logging.info(u'refresh commit cache for %s' % k)
                cmd = u'cd %s; git log --follow %s' % (self.gitrepo.checkoutdir, k)
                (rc, so, se) = run_command(cmd)
                for line in to_text(so).split(u'\n'):
                    if line.startswith(u'commit '):
                        commit = {
                            u'name': None,
                            u'email': None,
                            u'login': None,
                            u'hash': line.split()[-1],
                            u'date': None
                        }

                    # Author: Matt Clay <*****@*****.**>
                    if line.startswith(u'Author: '):
                        line = line.replace(u'Author: ', u'')
                        line = line.replace(u'<', u'')
                        line = line.replace(u'>', u'')
                        lparts = line.split()

                        if u'@' in lparts[-1]:
                            commit[u'email'] = lparts[-1]
                            commit[u'name'] = u' '.join(lparts[:-1])

                        if commit[u'email'] and \
                                u'noreply.github.com' in commit[u'email']:
                            commit[u'login'] = commit[u'email'].split(u'@')[0]

                    # Date:   Sat Jan 28 23:28:53 2017 -0800
                    if line.startswith(u'Date:'):
                        dstr = line.split(u':', 1)[1].strip()
                        dstr = u' '.join(dstr.split(u' ')[:-1])
                        ds = datetime.datetime.strptime(
                            to_text(dstr),
                            u'%a %b %d %H:%M:%S %Y'
                        )
                        commit[u'date'] = ds
                        self.commits[k].append(commit)

                with open(pfile, 'wb') as f:
                    pickle_dump((mtime, self.commits[k]), f)

    def last_commit_for_file(self, filepath):
        if self.commits.get(filepath):
            return self.commits[filepath][0][u'hash']

        # git log --pretty=format:'%H' -1
        # lib/ansible/modules/cloud/amazon/ec2_metric_alarm.py
        cmd = u'cd %s; git log --pretty=format:\'%%H\' -1 %s' % \
            (self.gitrepo.checkoutdir, filepath)
        (rc, so, se) = run_command(cmd)
        return to_text(so).strip()

    def get_module_blames(self):
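        '''Sync blame-derived logins and emails into the sqlite cache via the GraphQL client'''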

        logging.debug(u'build email cache')
        emails_cache = self.session.query(Email)
        emails_cache = [(x.email, x.login) for x in emails_cache]
        self.emails_cache = dict(emails_cache)

        logging.debug(u'build blame cache')
        blame_cache = self.session.query(Blame).all()
        blame_cache = [x.file_commit for x in blame_cache]
        blame_cache = sorted(set(blame_cache))

        logging.debug(u'eval module hashes')
        changed = False
        keys = sorted(self.modules.keys())
        for k in keys:
            if k not in self.files:
                self.committers[k] = {}
                continue

            ghash = self.last_commit_for_file(k)

            if ghash in blame_cache:
                continue

            logging.debug(u'checking hash for {}'.format(k))
            res = self.session.query(Blame).filter_by(file_name=k, file_commit=ghash).all()
            hashes = [x.file_commit for x in res]

            if ghash not in hashes:

                logging.debug(u'hash {} not found for {}, updating blames'.format(ghash, k))

                scraper_args = [u'ansible', u'ansible', u'devel', k]
                uns, emailmap = self.gqlc.get_usernames_from_filename_blame(*scraper_args)

                # check the emails
                for email, login in emailmap.items():
                    if email in self.emails_cache:
                        continue
                    exists = self.session.query(Email).filter_by(email=email).first()
                    if not exists:
                        logging.debug(u'insert {}:{}'.format(login, email))
                        _email = Email(email=email, login=login)
                        self.session.add(_email)
                        changed = True

                # check the blames
                for login, commits in uns.items():
                    for commit in commits:
                        kwargs = {
                            u'file_name': k,
                            u'file_commit': ghash,
                            u'author_commit': commit,
                            u'author_login': login
                        }
                        exists = self.session.query(Blame).filter_by(**kwargs).first()
                        if not exists:
                            logging.debug(u'insert {}:{}:{}'.format(k, commit, login))
                            _blame = Blame(**kwargs)
                            self.session.add(_blame)
                            changed = True

        if changed:
            self.session.commit()
            logging.debug(u're-build email cache')
            emails_cache = self.session.query(Email)
            emails_cache = [(x.email, x.login) for x in emails_cache]
            self.emails_cache = dict(emails_cache)

        # fill in what we can ...
        logging.debug(u'fill in commit logins')
        for k in keys:
            for idc, commit in enumerate(self.commits[k][:]):
                if commit.get(u'login'):
                    continue
                email = commit.get(u'email')
                if not email:
                    continue
                login = self.emails_cache.get(email)
                if not login and u'@users.noreply.github.com' in email:
                    login = email.split(u'@')[0]
                    self.emails_cache[email] = login
                if not login:
                    logging.debug(u'unknown email: {}'.format(email))
                    continue
                self.commits[k][idc][u'login'] = login

    def get_emails_by_login(self, login):
        res = self.session.query(Email).filter_by(login=login).all()
        return [x.email for x in res]

    def _get_module_blames(self):
        ''' Scrape the blame page for each module and store it '''

        keys = sorted(self.modules.keys())

        # scrape the data
        for k in keys:

            cpath = os.path.join(self.gitrepo.checkoutdir, k)
            if not os.path.isfile(cpath):
                self.committers[k] = {}
                continue

            ghash = self.last_commit_for_file(k)
            pfile = os.path.join(
                self.scraper_cache,
                k.replace(u'/', u'_') + u'.blame.pickle'
            )
            sargs = [u'ansible', u'ansible', u'devel', k]

            refresh = False
            if not os.path.isfile(pfile):
                refresh = True
            else:
                logging.debug(u'load {}'.format(pfile))
                with open(pfile, 'rb') as f:
                    pdata = pickle_load(f)
                if pdata[0] == ghash:
                    self.committers[k] = pdata[1]
                    if len(pdata) == 3:
                        # use emailmap if available
                        emailmap = pdata[2]
                    else:
                        emailmap = {}
                else:
                    refresh = True

            if refresh:
                if self.gqlc:
                    logging.debug(u'graphql blame usernames {}'.format(pfile))
                    uns, emailmap = self.gqlc.get_usernames_from_filename_blame(*sargs)
                else:
                    emailmap = {}  # scraping: emails not available
                    logging.debug(u'www blame usernames {}'.format(pfile))
                    uns = self.gws.get_usernames_from_filename_blame(*sargs)
                self.committers[k] = uns
                with open(pfile, 'wb') as f:
                    pickle_dump((ghash, uns, emailmap), f)

            for email, github_id in emailmap.items():
                if email not in self.emails_cache:
                    self.emails_cache[email] = github_id

        # add scraped logins to the map
        for k in keys:
            for x in self.commits[k]:
                if x[u'email'] in [u'@']:
                    continue
                if x[u'email'] not in self.emails_cache:
                    self.emails_cache[x[u'email']] = None
                if x[u'login']:
                    self.emails_cache[x[u'email']] = x[u'login']
                    continue

                xhash = x[u'hash']
                for ck, cv in six.iteritems(self.committers[k]):
                    if xhash in cv:
                        self.emails_cache[x[u'email']] = ck
                        break

        # fill in what we can ...
        for k in keys:
            for idx, x in enumerate(self.commits[k]):
                if not x[u'login']:
                    if x[u'email'] in [u'@']:
                        continue
                    if self.emails_cache[x[u'email']]:
                        login = self.emails_cache[x[u'email']]
                        xhash = x[u'hash']
                        self.commits[k][idx][u'login'] = login
                        if login not in self.committers[k]:
                            self.committers[k][login] = []
                        if xhash not in self.committers[k][login]:
                            self.committers[k][login].append(xhash)

    def set_maintainers(self):
        '''Define the maintainers for each module'''

        # grep the authors:
        for k, v in six.iteritems(self.modules):
            if v[u'filepath'] is None:
                continue
            mfile = os.path.join(self.gitrepo.checkoutdir, v[u'filepath'])
            authors = self.get_module_authors(mfile)
            self.modules[k][u'authors'] = authors

            # authors are maintainers by -default-
            self.modules[k][u'maintainers'] += authors
            self.modules[k][u'maintainers'] = \
                sorted(set(self.modules[k][u'maintainers']))

        metadata = self.botmeta[u'files'].keys()
        for k, v in six.iteritems(self.modules):
            if k == u'meta':
                continue

            if k in self.botmeta[u'files']:
                # There are metadata in .github/BOTMETA.yml for this file
                # copy maintainers_keys
                self.modules[k][u'maintainers_keys'] = self.botmeta[u'files'][k][u'maintainers_keys'][:]

                if self.botmeta[u'files'][k]:
                    maintainers = self.botmeta[u'files'][k].get(u'maintainers', [])

                    for maintainer in maintainers:
                        if maintainer not in self.modules[k][u'maintainers']:
                            self.modules[k][u'maintainers'].append(maintainer)

                    # remove the people who want to be ignored
                    if u'ignored' in self.botmeta[u'files'][k]:
                        ignored = self.botmeta[u'files'][k][u'ignored']
                        for x in ignored:
                            if x in self.modules[k][u'maintainers']:
                                self.modules[k][u'maintainers'].remove(x)

            else:
                # There isn't metadata in .github/BOTMETA.yml for this file
                best_match = None
                for mkey in metadata:
                    if v[u'filepath'].startswith(mkey):
                        if not best_match:
                            best_match = mkey
                            continue
                        if len(mkey) > len(best_match):
                            best_match = mkey
                if best_match:
                    self.modules[k][u'maintainers_keys'] = [best_match]
                    for maintainer in self.botmeta[u'files'][best_match].get(u'maintainers', []):
                        if maintainer not in self.modules[k][u'maintainers']:
                            self.modules[k][u'maintainers'].append(maintainer)

                    # remove the people who want to be ignored
                    for ignored in self.botmeta[u'files'][best_match].get(u'ignored', []):
                        if ignored in self.modules[k][u'maintainers']:
                            self.modules[k][u'maintainers'].remove(ignored)

            # save a pristine copy so that higher level code can still use it
            self.modules[k][u'maintainers'] = sorted(set(self.modules[k][u'maintainers']))
            self.modules[k][u'_maintainers'] = \
                [x for x in self.modules[k][u'maintainers']]

        # set the namespace maintainers ...
        for k, v in six.iteritems(self.modules):
            if u'namespace_maintainers' not in self.modules[k]:
                self.modules[k][u'namespace_maintainers'] = []
            if v.get(u'namespace'):
                ns = v.get(u'namespace')
                nms = self.get_maintainers_for_namespace(ns)
                self.modules[k][u'namespace_maintainers'] = nms

    def split_topics_from_path(self, module_file):
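        '''Derive topic info from a module path, e.g. lib/ansible/modules/cloud/amazon/ec2.py
           -> topic=cloud, subtopic=amazon, namespace=cloud/amazon'''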
        subpath = module_file.replace(u'lib/ansible/modules/', u'')
        path_parts = subpath.split(u'/')
        topic = path_parts[0]

        if len(path_parts) > 2:
            subtopic = path_parts[1]
            fulltopic = u'/'.join(path_parts[0:2])
        else:
            subtopic = None
            fulltopic = path_parts[0]

        tdata = {
            u'fulltopic': fulltopic,
            u'namespace': fulltopic,
            u'topic': topic,
            u'subtopic': subtopic
        }

        return tdata

    def get_module_authors(self, module_file):
        """Grep the authors out of the module docstrings"""

        if not os.path.exists(module_file):
            return []

        documentation = b''
        inphase = False

        with io.open(module_file, 'rb') as f:
            for line in f:
                if b'DOCUMENTATION' in line:
                    inphase = True
                    continue
                if line.strip().endswith((b"'''", b'"""')):
                    break
                if inphase:
                    documentation += line

        if not documentation:
            return []

        # clean out any other yaml besides author to save time
        inphase = False
        author_lines = u''
        doc_lines = to_text(documentation).split(u'\n')
        for idx, x in enumerate(doc_lines):
            if x.startswith(u'author'):
                inphase = True
            if inphase and not x.strip().startswith((u'-', u'author')):
                inphase = False
                break
            if inphase:
                author_lines += x + u'\n'

        if not author_lines:
            return []

        ydata = {}
        try:
            ydata = yaml.load(author_lines, BotYAMLLoader)
        except Exception as e:
            logging.error(e)
            return []

        # quit early if the yaml was not valid
        if not ydata:
            return []

        # quit if the key was not found
        if u'author' not in ydata:
            return []

        if not isinstance(ydata[u'author'], list):
            ydata[u'author'] = [ydata[u'author']]

        authors = []
        for author in ydata[u'author']:
            github_ids = self.extract_github_id(author)
            if github_ids:
                authors.extend(github_ids)
        return authors

    def extract_github_id(self, author):
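        '''Extract GitHub logins from an author string: "@login", "github.com/login",
           "Name (login)", or an email already present in the cache'''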
        authors = set()

        if author is None:
            return []
        if u'ansible core team' in author.lower():
            authors.add(u'ansible')
        elif u'@' in author:
            # match github ids but not emails
            authors.update(re.findall(r'(?<!\w)@([\w-]+)(?![\w.])', author))
        elif u'github.com/' in author:
            # {'author': 'Henrique Rodrigues (github.com/Sodki)'}
            idx = author.find(u'github.com/')
            author = author[idx+11:]
            authors.add(author.replace(u')', u''))
        elif u'(' in author and len(author.split()) == 3:
            # Mathieu Bultel (matbu)
            idx = author.find(u'(')
            author = author[idx+1:]
            authors.add(author.replace(u')', u''))

        # search for emails
        for email in re.findall(r'[<(]([^@]+@[^)>]+)[)>]', author):
            github_id = self.emails_cache.get(email)
            if github_id:
                authors.add(github_id)

        return list(authors)

    def fuzzy_match(self, repo=None, title=None, component=None):
        '''Fuzzy matching for modules'''

        if not component:
            return None

        logging.debug(u'fuzzy match {}'.format(
            to_text(to_bytes(component, 'ascii', 'ignore'), 'ascii'))
        )

        if component.lower() == u'core':
            return None

        # https://github.com/ansible/ansible/issues/18179
        if u'validate-modules' in component:
            return None

        # https://github.com/ansible/ansible/issues/20368
        if u'module_utils' in component:
            return None

        if u'new module' in component:
            return None

        # authorized_keys vs. authorized_key
        if component and component.endswith(u's'):
            tm = self.find_match(component[:-1])
            if tm:
                if not isinstance(tm, list):
                    return tm[u'name']
                elif len(tm) == 1:
                    return tm[0][u'name']
                else:
                    if C.DEFAULT_BREAKPOINTS:
                        logging.error(u'breakpoint!')
                        import epdb; epdb.st()

        match = None
        known_modules = []

        for k, v in six.iteritems(self.modules):
            if v[u'name'] in [u'include']:
                continue
            known_modules.append(v[u'name'])

        title = (title or u'').lower()
        title = title.replace(u':', u'')
        title_matches = [x for x in known_modules if x + u' module' in title]

        if not title_matches:
            title_matches = [x for x in known_modules
                             if title.startswith(x + u' ')]
            if not title_matches:
                title_matches = \
                    [x for x in known_modules if u' ' + x + u' ' in title]

            if title_matches:
                title_matches = [x for x in title_matches if x != u'at']

        # don't do singular word matching in title for ansible/ansible
        cmatches = None
        if component:
            cmatches = [x for x in known_modules if x in component]
            cmatches = [x for x in cmatches if not u'_' + x in component]

        # globs
        if not cmatches and u'*' in component:
            fmatches = [x for x in known_modules if fnmatch.fnmatch(x, component)]
            if fmatches:
                cmatches = fmatches[:]

        if title_matches:
            # use title ... ?
            cmatches = [x for x in cmatches if x in title_matches and x not in [u'at']]

        if cmatches:
            if u'*' not in component and u'modules' not in component:
                match = cmatches[0]
            else:
                match = cmatches[:]
            logging.debug("module - component matches: %s" % cmatches)

        if not match:
            if len(title_matches) == 1:
                match = title_matches[0]
            else:
                logging.debug("module - title matches: %s" % title_matches)

        return match

    def is_multi(self, rawtext):
        '''Is the string a list or a glob of modules?'''
        if rawtext:
            lines = rawtext.split(u'\n')

            # clean up lines
            lines = [x.strip() for x in lines if x.strip()]
            lines = [x for x in lines if len(x) > 2]

            if len(lines) > 1:
                return True

            if lines:
                if lines[0].strip().endswith(u'*'):
                    return True

        return False

    # https://github.com/ansible/ansible-modules-core/issues/3831
    def multi_match(self, rawtext):
        '''Return a list of matches for a given glob or list of names'''
        matches = []
        lines = rawtext.split(u'\n')
        lines = [x.strip() for x in lines if x.strip()]
        for line in lines:
            # is it an exact name, a path, a globbed name, a globbed path?
            if line.endswith(u'*'):
                thiskey = line.replace(u'*', u'')
                keymatches = []
                for k in self.modules.keys():
                    if thiskey in k:
                        keymatches.append(k)
                for k in keymatches:
                    matches.append(self.modules[k].copy())
            else:
                match = self.find_match(line)
                if isinstance(match, list):
                    matches.extend(match)
                elif match:
                    matches.append(match)

        # unique the list, preserving order
        tmplist = []
        for x in matches:
            if x not in tmplist:
                tmplist.append(x)

        return tmplist

    def set_module_metadata(self):
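        '''Attach ANSIBLE_METADATA from each module's .py source'''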
        for k, v in six.iteritems(self.modules):
            if not v[u'filepath']:
                continue
            mfile = os.path.join(self.gitrepo.checkoutdir, v[u'filepath'])
            if not mfile.endswith(u'.py'):
                # metadata is only the .py files ...
                ext = mfile.split(u'.')[-1]
                mfile = mfile.replace(u'.' + ext, u'.py', 1)

            self.modules[k][u'metadata'].update(self.get_module_metadata(mfile))

    def get_module_metadata(self, module_file):
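        '''literal_eval the ANSIBLE_METADATA dict out of a module file'''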
        meta = {}

        if not os.path.isfile(module_file):
            return meta

        rawmeta = u''
        inphase = False
        with io.open(module_file, 'r', encoding='utf-8') as f:
            for line in f:
                if line.startswith(u'ANSIBLE_METADATA'):
                    inphase = True
                if line.startswith(u'DOCUMENTATION'):
                    break
                if inphase:
                    rawmeta += line
        rawmeta = rawmeta.replace(u'ANSIBLE_METADATA =', u'', 1)
        rawmeta = rawmeta.strip()
        try:
            meta = ast.literal_eval(rawmeta)
            tmp_meta = {}
            for k, v in meta.items():
                if isinstance(k, six.binary_type):
                    k = to_text(k)
                if isinstance(v, six.binary_type):
                    v = to_text(v)
                if isinstance(v, list):
                    tmp_list = []
                    for i in v:
                        if isinstance(i, six.binary_type):
                            i = to_text(i)
                        tmp_list.append(i)
                    v = tmp_list
                    del tmp_list
                tmp_meta[k] = v
            meta = tmp_meta
            del tmp_meta
        except SyntaxError:
            pass

        return meta

    def set_module_imports(self):
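        '''Record the import statements used by each module file'''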
        for k, v in six.iteritems(self.modules):
            if not v[u'filepath']:
                continue
            mfile = os.path.join(self.gitrepo.checkoutdir, v[u'filepath'])
            self.modules[k][u'imports'] = self.get_module_imports(mfile)

    def get_module_imports(self, module_file):
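        '''Naive line-based scan for "import x" / "from x import y" statements'''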
        mimports = []

        if not os.path.isfile(module_file):
            return mimports

        with open(module_file, 'rb') as f:
            for line in f:
                line = line.strip()
                line = line.replace(b',', b'')
                if line.startswith(b'import') or \
                        (b'import' in line and b'from' in line):
                    lparts = line.split()
                    if line.startswith(b'import '):
                        # 'import x' -> 'x'
                        mimports.append(lparts[1])
                    elif line.startswith(b'from '):
                        # 'from x.y import a b' (commas stripped) -> 'x.y.a', 'x.y.b'
                        mpath = lparts[1] + b'.'
                        for spath in lparts[3:]:
                            mimports.append(mpath + spath)

        return [to_text(m) for m in mimports]

    @property
    def all_maintainers(self):
        maintainers = set()
        for path, metadata in self.botmeta[u'files'].items():
            maintainers.update(metadata.get(u'maintainers', []))
        return maintainers

    @property
    def all_authors(self):
        authors = set()
        for key, metadata in self.modules.items():
            authors.update(metadata.get(u'authors', []))
        return authors

    def get_maintainers_for_namespace(self, namespace):
        maintainers = []
        for k, v in self.modules.items():
            if u'namespace' not in v or u'maintainers' not in v:
                continue
            if v[u'namespace'] == namespace:
                for m in v[u'maintainers']:
                    if m not in maintainers:
                        maintainers.append(m)
        maintainers = [x for x in maintainers if x.strip()]
        return maintainers

    @staticmethod
    def replace_ansible(maintainers, ansible_members, bots=None):
        '''Replace -ansible- with the -humans- in the org'''
        bots = bots or []
        newlist = []
        for m in maintainers:
            if m != u'ansible':
                newlist.append(m)
            else:
                newlist += ansible_members
        newlist = sorted(set(newlist))
        newlist = [x for x in newlist if x not in bots]
        return newlist

    def get_file_content(self, filepath):
        fpath = os.path.join(self.gitrepo.checkoutdir, filepath)
        if not os.path.isfile(fpath):
            return None
        with io.open(fpath, 'r', encoding='utf-8') as f:
            data = f.read()
        return data
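
A minimal usage sketch for the indexer above. This is illustrative only: the
class name (ModuleIndexer) and its import path are assumptions based on
ansibullbot's layout and are not shown verbatim in this example.

# hypothetical usage; ModuleIndexer and the import path are assumptions
import os
from ansibullbot.utils.moduletools import ModuleIndexer

CACHEDIR = os.path.expanduser('~/.ansibullbot/cache')

# skip the slow commit/blame passes; __init__ calls update(force=True),
# so the module map is usable immediately after construction
mi = ModuleIndexer(cachedir=CACHEDIR, commits=False, blames=False)

matches = mi.find_match('docker_container', exact=True)
if matches:
    print(matches[0]['filepath'], matches[0]['maintainers'])

With exact=False, _find_match falls back to property matching and Levenshtein
distance, which tolerates small typos in the component name.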
Example #12
    def _collect_repo(self, repo, issuenums=None):
        '''Collect issues for an individual repo'''
        logging.info('getting repo obj for %s' % repo)
        if repo not in self.repos:
            gitrepo = GitRepoWrapper(
                cachedir=self.cachedir_base,
                repo=f'https://github.com/{repo}',
                commit=self.args.ansible_commit,
            )
            self.repos[repo] = {
                'repo': self.ghw.get_repo(repo),
                'issues': [],
                'processed': [],
                'since': None,
                'stale': [],
                'loopcount': 0,
                'labels': self.ghw.get_valid_labels(repo),
                'gitrepo': gitrepo,
            }
        else:
            # force a clean repo object to limit caching problems
            logging.info('updating repo')
            self.repos[repo]['repo'] = self.ghw.get_repo(repo)
            logging.info('updating checkout')
            self.repos[repo]['gitrepo'].update()

            # clear the issues
            self.repos[repo]['issues'] = []
            # increment the loopcount
            self.repos[repo]['loopcount'] += 1

        logging.info('getting issue objs for %s' % repo)
        self.update_issue_summaries(repopath=repo, issuenums=issuenums)

        issuecache = {}
        numbers = {int(x) for x in self.issue_summaries[repo].keys()}
        if issuenums:
            numbers.intersection_update(issuenums)
        numbers = sorted(numbers)
        logging.info('%s known numbers' % len(numbers))

        if self.args.daemonize:
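            # first pass: remember the newest created/updated timestamp;
            # later passes: ask the API only for issues changed since then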

            if not self.repos[repo]['since']:
                ts = [
                    x[1]['updated_at']
                    for x in self.issue_summaries[repo].items()
                    if x[1]['updated_at']
                ]
                ts += [
                    x[1]['created_at']
                    for x in self.issue_summaries[repo].items()
                    if x[1]['created_at']
                ]
                ts = sorted(set(ts))
                if ts:
                    self.repos[repo]['since'] = ts[-1]
            else:
                since = strip_time_safely(self.repos[repo]['since'])
                api_since = self.repos[repo]['repo'].get_issues(since=since)

                numbers = []
                for x in api_since:
                    numbers.append(x.number)
                    issuecache[x.number] = x

                numbers = sorted({int(n) for n in numbers})
                logging.info('%s numbers after [api] since == %s' %
                             (len(numbers), since))

                for k, v in self.issue_summaries[repo].items():
                    if v['created_at'] is None:
                        # issue is closed and was never processed
                        continue

                    if v['created_at'] > self.repos[repo]['since']:
                        numbers.append(k)

                numbers = sorted({int(n) for n in numbers})
                logging.info('%s numbers after [www] since == %s' %
                             (len(numbers), since))

        if self.args.start_at and self.repos[repo]['loopcount'] == 0:
            numbers = [x for x in numbers if x <= self.args.start_at]
            logging.info('%s numbers after start-at' % len(numbers))

        # Get stale numbers if not targeting
        if self.args.daemonize and self.repos[repo]['loopcount'] > 0:
            logging.info('checking for stale numbers')
            stale = self.get_stale_numbers(repo)
            self.repos[repo]['stale'] = [int(x) for x in stale]
            numbers += [int(x) for x in stale]
            numbers = sorted(set(numbers))
            logging.info('%s numbers after stale check' % len(numbers))

        ################################################################
        # PRE-FILTERING TO PREVENT EXCESSIVE API CALLS
        ################################################################

        # filter just the open numbers
        if not self.args.only_closed and not self.args.ignore_state:
            numbers = [
                x for x in numbers
                if (to_text(x) in self.issue_summaries[repo] and
                    self.issue_summaries[repo][to_text(x)]['state'] == 'open')
            ]
            logging.info('%s numbers after checking state' % len(numbers))

        # filter by type
        if self.args.only_issues:
            numbers = [
                x for x in numbers
                if self.issue_summaries[repo][to_text(x)]['type'] == 'issue'
            ]
            logging.info('%s numbers after checking type' % len(numbers))
        elif self.args.only_prs:
            numbers = [
                x for x in numbers if self.issue_summaries[repo][to_text(x)]
                ['type'] == 'pullrequest'
            ]
            logging.info('%s numbers after checking type' % len(numbers))

        numbers = sorted({int(x) for x in numbers})
        if self.args.sort == 'desc':
            numbers = list(reversed(numbers))

        if self.args.last and len(numbers) > self.args.last:
            numbers = numbers[-self.args.last:]

        # Use iterator to avoid requesting all issues upfront
        self.repos[repo]['issues'] = RepoIssuesIterator(
            self.repos[repo]['repo'], numbers, issuecache=issuecache)

        logging.info('getting repo objs for %s complete' % repo)
Example #13
    def __init__(self, repo, cachedir='/tmp'):
        self.cachedir = cachedir
        if not os.path.isdir(self.cachedir):
            os.makedirs(self.cachedir)

        self.gitrepo = GitRepoWrapper(cachedir=self.cachedir, repo=repo)