Beispiel #1
0
    def get_authors(self,
                    mails=[],
                    repos=[],
                    fromdate=None,
                    todate=None,
                    merge_commit=None,
                    metadata=[],
                    mails_neg=False,
                    domains=None,
                    blacklisted_mails=None):
        """ Count the matching documents per unique author email.

        The filter produced by ``self.get_filter`` is narrowed to
        ``fromdate <= committer_date < todate`` and, when ``merge_commit``
        is not None, to documents whose ``merge_commit`` field equals it.

        Returns a ``(took, authors)`` tuple where ``took`` is the ``took``
        value of the search response and ``authors`` maps every
        ``author_email`` bucket key to its ``doc_count`` (the amount of
        commits for that email).
        """
        search_args = {'index': self.index, 'doc_type': self.dbname}

        qfilter = self.get_filter(mails, repos, metadata, mails_neg, domains,
                                  blacklisted_mails)
        # The extra constraints are added to the filter's own "must" list.
        must = qfilter["bool"]["must"]
        must.append({
            "range": {
                "committer_date": {"gte": fromdate, "lt": todate},
            }
        })
        if merge_commit is not None:
            must.append({"term": {"merge_commit": merge_commit}})

        authors_agg = {
            "terms": {
                "field": "author_email",
                "order": {"_count": "desc"},
                "size": 1000000,
            }
        }
        search_args.update(
            body={
                "query": {"bool": {"filter": qfilter}},
                "aggs": {"authors": authors_agg},
            },
            # Only the aggregation is read below, no hits are requested.
            size=0)

        response = self.es.search(**clean_empty(search_args))
        buckets = response["aggregations"]["authors"]["buckets"]
        return response['took'], {
            bucket['key']: bucket['doc_count'] for bucket in buckets
        }
Beispiel #2
0
    def get_tags(self, repos, fromdate=None, todate=None):
        """ Return the list of documents yielded by ``scanner`` for the
        given repos.

        One "should" clause is built per entry of ``repos`` (a term match
        on the ``repo`` field) next to a mandatory range on the ``date``
        field (``fromdate <= date < todate``). The query goes through
        ``clean_empty`` before being handed to ``scanner``.
        """
        per_repo = [
            {"bool": {"must": [{"term": {"repo": name}}]}} for name in repos
        ]
        date_window = {
            "range": {
                "date": {"gte": fromdate, "lt": todate},
            }
        }

        query = clean_empty({
            "query": {
                "bool": {
                    "filter": {
                        "bool": {
                            "must": [date_window],
                            "should": per_repo,
                        }
                    }
                }
            }
        })

        return list(
            scanner(
                self.es, query=query, index=self.index, doc_type=self.dbname))
Beispiel #3
0
    def get_field_stats(self,
                        field,
                        mails=[],
                        repos=[],
                        fromdate=None,
                        todate=None,
                        merge_commit=None,
                        metadata=[],
                        mails_neg=False,
                        domains=None,
                        blacklisted_mails=None):
        """ Return the "stats" aggregation of ``field`` for authors and/or
        repos.

        The filter from ``self.get_filter`` is restricted to
        ``fromdate <= committer_date < todate`` and, when ``merge_commit``
        is not None, to documents with that ``merge_commit`` value.

        Returns a ``(took, stats)`` tuple where ``stats`` is the content of
        the ``<field>_stats`` aggregation of the search response.
        """
        search_args = {'index': self.index, 'doc_type': self.dbname}
        agg_name = "%s_stats" % field

        qfilter = self.get_filter(mails, repos, metadata, mails_neg, domains,
                                  blacklisted_mails)
        must = qfilter["bool"]["must"]
        must.append({
            "range": {
                "committer_date": {"gte": fromdate, "lt": todate},
            }
        })
        if merge_commit is not None:
            must.append({"term": {"merge_commit": merge_commit}})

        search_args.update(
            body={
                "query": {"bool": {"filter": qfilter}},
                "aggs": {agg_name: {"stats": {"field": field}}},
            },
            # Only the aggregation is read below, no hits are requested.
            size=0)

        response = self.es.search(**clean_empty(search_args))
        return response['took'], response["aggregations"][agg_name]
Beispiel #4
0
 def subreq(mails):
     """ Run one multi-search with a single-hit lookup per email.

     For every entry of ``mails`` a header/body pair is queued: the body
     matches ``author_email`` on that entry, asks for one hit and limits
     ``_source`` to ``author_email`` and ``author_name``. ``self`` comes
     from the enclosing scope. Returns the ``msearch`` response as is.
     """
     payload = []
     for address in mails:
         # msearch expects alternating header and body entries.
         payload.append({'index': self.index, 'type': self.dbname})
         payload.append({
             'query': {'term': {'author_email': address}},
             'size': 1,
             '_source': ["author_email", "author_name"],
         })
     return self.es.msearch(body=clean_empty(payload))
Beispiel #5
0
    def get_commits_amount(self,
                           mails=[],
                           repos=[],
                           fromdate=None,
                           todate=None,
                           merge_commit=None,
                           metadata=[],
                           mails_neg=False,
                           domains=None,
                           blacklisted_mails=None):
        """ Return the number of commits for authors and/or repos.

        The filter from ``self.get_filter`` is restricted to
        ``fromdate <= committer_date < todate`` and, when ``merge_commit``
        is not None, to documents with that ``merge_commit`` value. The
        returned value is the ``count`` field of the count response.
        """
        count_args = {'index': self.index, 'doc_type': self.dbname}

        qfilter = self.get_filter(mails, repos, metadata, mails_neg, domains,
                                  blacklisted_mails)
        must = qfilter["bool"]["must"]
        must.append({
            "range": {
                "committer_date": {"gte": fromdate, "lt": todate},
            }
        })
        if merge_commit is not None:
            must.append({"term": {"merge_commit": merge_commit}})

        count_args['body'] = {"query": {"bool": {"filter": qfilter}}}
        response = self.es.count(**clean_empty(count_args))
        return response['count']
Beispiel #6
0
    def get_authors_histo(self,
                          mails=[],
                          repos=[],
                          fromdate=None,
                          todate=None,
                          merge_commit=None,
                          metadata=[],
                          mails_neg=False,
                          domains=None,
                          blacklisted_mails=None):
        """ Return the date histogram of distinct authors for authors
        and/or repos.

        The histogram interval is whatever ``self.set_histo_granularity``
        returns for the third element of the
        ``self.get_commits_time_delta`` result.

        Returns a ``(took, buckets)`` tuple. In every bucket the
        ``authors_email`` sub-aggregation is replaced by the plain list of
        its keys and ``doc_count`` is overwritten with the length of that
        list.
        """
        search_args = {'index': self.index, 'doc_type': self.dbname}

        qfilter = self.get_filter(mails, repos, metadata, mails_neg, domains,
                                  blacklisted_mails)
        time_delta = self.get_commits_time_delta(
            mails,
            repos,
            fromdate=fromdate,
            todate=todate,
            metadata=metadata,
            mails_neg=mails_neg,
            domains=domains,
            blacklisted_mails=blacklisted_mails)
        interval = self.set_histo_granularity(time_delta[2])

        must = qfilter["bool"]["must"]
        must.append({
            "range": {
                "committer_date": {"gte": fromdate, "lt": todate},
            }
        })
        if merge_commit is not None:
            must.append({"term": {"merge_commit": merge_commit}})

        per_period_authors = {
            "authors_email": {
                "terms": {
                    "field": "author_email",
                    "size": 1000000,
                },
            }
        }
        histogram = {
            "date_histogram": {
                "field": "committer_date",
                "interval": interval,
                "format": "yyyy-MM-dd",
            },
            "aggs": per_period_authors,
        }
        search_args.update(
            body={
                "query": {"bool": {"filter": qfilter}},
                "aggs": {"commits": histogram},
            },
            # Only the aggregation is read below, no hits are requested.
            size=0)

        response = self.es.search(**clean_empty(search_args))
        buckets = response["aggregations"]["commits"]["buckets"]
        for period in buckets:
            emails = [
                entry['key'] for entry in period['authors_email']['buckets']
            ]
            period['authors_email'] = emails
            # doc_count now holds the number of author keys of the period.
            period['doc_count'] = len(emails)
        return response['took'], buckets
Beispiel #7
0
    def get_top_field_by_lines(self,
                               field,
                               mails=[],
                               repos=[],
                               fromdate=None,
                               todate=None,
                               merge_commit=None,
                               metadata=[],
                               mails_neg=False,
                               domains=None,
                               blacklisted_mails=None):
        """ Return, per value of ``field``, the sum of ``line_modifieds``.

        The filter from ``self.get_filter`` is restricted to
        ``fromdate <= committer_date < todate`` and, when ``merge_commit``
        is not None, to documents with that ``merge_commit`` value.

        Returns a ``(took, ranking)`` tuple where ``ranking`` maps each
        bucket key of the terms aggregation on ``field`` to the value of
        its ``modified`` sum.
        """
        search_args = {'index': self.index, 'doc_type': self.dbname}

        qfilter = self.get_filter(mails, repos, metadata, mails_neg, domains,
                                  blacklisted_mails)
        must = qfilter["bool"]["must"]
        must.append({
            "range": {
                "committer_date": {"gte": fromdate, "lt": todate},
            }
        })
        if merge_commit is not None:
            must.append({"term": {"merge_commit": merge_commit}})

        modified_sum = {
            "modified": {
                "sum": {
                    "field": "line_modifieds",
                },
            }
        }
        by_field = {
            "terms": {
                "field": field,
                "size": 1000000,
            },
            "aggs": modified_sum,
        }
        search_args.update(
            body={
                "query": {"bool": {"filter": qfilter}},
                "aggs": {"top-field-by-modified": by_field},
            },
            # Only the aggregation is read below, no hits are requested.
            size=0)

        response = self.es.search(**clean_empty(search_args))
        buckets = response["aggregations"]["top-field-by-modified"]["buckets"]
        return response['took'], {
            bucket['key']: bucket['modified']['value'] for bucket in buckets
        }
Beispiel #8
0
    def get_commits(self,
                    mails=[],
                    repos=[],
                    fromdate=None,
                    todate=None,
                    start=0,
                    limit=100,
                    sort='desc',
                    scan=False,
                    merge_commit=None,
                    metadata=[],
                    mails_neg=False,
                    domains=None,
                    blacklisted_mails=None):
        """ Return the commits for authors and/or repos.

        With ``scan`` set, the result of ``scanner`` for the query is
        returned directly and ``start``/``limit``/``sort`` are not used.

        Otherwise a search is run from offset ``start`` with at most
        ``limit`` hits, sorted by ``committer_date`` then ``author_date``
        in the ``sort`` direction, and a ``(took, hits, commits)`` tuple is
        returned: ``hits`` is the total reported by the response and
        ``commits`` the list of ``_source`` of the returned hits.
        """
        search_args = {'index': self.index, 'doc_type': self.dbname}

        qfilter = self.get_filter(mails, repos, metadata, mails_neg, domains,
                                  blacklisted_mails)
        must = qfilter["bool"]["must"]

        # merge_commit=None keeps both kinds of commits; pass False to
        # skip merge commits.
        if merge_commit is not None:
            must.append({"term": {"merge_commit": merge_commit}})

        must.append({
            "range": {
                "committer_date": {"gte": fromdate, "lt": todate},
            }
        })

        body = {"query": {"bool": {"filter": qfilter}}}

        if scan:
            return scanner(self.es,
                           query=body,
                           index=self.index,
                           doc_type=self.dbname)

        search_args.update(
            body=body,
            size=limit,
            from_=start,
            sort="committer_date:%s,author_date:%s" % (sort, sort))
        response = self.es.search(**clean_empty(search_args))

        total = response['hits']['total']
        # The total may be reported as a dict carrying the count under
        # 'value' instead of a bare number.
        if isinstance(total, dict) and 'value' in total:
            total = total['value']
        sources = [hit['_source'] for hit in response['hits']['hits']]
        return response['took'], total, sources
Beispiel #9
0
    def get_filter(self,
                   mails,
                   repos,
                   metadata,
                   mails_neg=False,
                   domains=None,
                   blacklisted_mails=None):
        """ Compute the search filter.

        ``mails`` is either a list of emails or a mapping of email to an
        optional dict of date bounds (``begin-date``/``end-date`` keys).
        ``repos`` is either a list of repos or a mapping of repo to an
        optional collection of paths matched against ``files_list``.
        ``metadata`` is an iterable of ``(key, value)`` pairs; a None value
        only requires the key to exist.

        When ``mails_neg`` is set, the email and domain clauses exclude
        instead of select. ``blacklisted_mails`` are always excluded.

        The assembled structure is returned after a pass through
        ``clean_empty``.
        """
        if isinstance(mails, list):
            mails = dict.fromkeys(mails)
        if isinstance(repos, list):
            repos = dict.fromkeys(repos)
        domains = domains or []

        # Email and domain clauses all land in the same bucket: selected
        # ("should") by default, excluded ("must_not") when mails_neg.
        selected, rejected = [], []
        mail_bucket = rejected if mails_neg else selected

        for mail, date_bounces in mails.items():
            conditions = [{"term": {"author_email": mail}}]
            if date_bounces:
                conditions.append({
                    "range": {
                        "committer_date": {
                            "gte": date_bounces.get('begin-date'),
                            "lt": date_bounces.get('end-date'),
                        }
                    }
                })
            mail_bucket.append({"bool": {"must": conditions}})

        for domain in domains:
            mail_bucket.append({
                "bool": {
                    "must": [{"term": {"author_email_domain": domain}}],
                }
            })

        mail_clause = {"bool": {"should": selected, "must_not": rejected}}

        repo_choices = []
        for repo, paths in repos.items():
            repo_match = {
                "bool": {
                    "must": [{"term": {"repos": repo}}],
                    "should": [],
                    "filter": [],
                }
            }
            if paths:
                repo_match["bool"]["filter"].append(
                    {"terms": {"files_list": list(paths)}})
            repo_choices.append({"bool": {"must": [repo_match]}})

        metadata_choices = [
            {"exists": {"field": key}} if value is None
            else {"term": {key: value}}
            for key, value in metadata
        ]

        # Exclude commits from 1970-01-01
        bogus_date_clause = {
            "range": {
                "committer_date": {
                    "gte": 86401,
                }
            }
        }

        banned = []
        if blacklisted_mails:
            banned = [
                {"term": {"author_email": mail}} for mail in blacklisted_mails
            ]

        query_filter = {
            "bool": {
                "must": [
                    mail_clause,
                    {"bool": {"should": repo_choices}},
                    {"bool": {"should": metadata_choices}},
                    bogus_date_clause,
                ],
                "must_not": banned,
            }
        }
        return clean_empty(query_filter)