Code example #1
def __init_index(elastic_url, index, wait_time):
    mapping = Mapping

    while True:
        try:
            elastic = ElasticSearch(elastic_url, index, mappings=mapping)
            break
        except Exception as e:
            logging.info("Index %s not ready: %s", ARGS.index, str(e))
            time.sleep(wait_time)

    return elastic
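The loop above simply retries index creation until the Elasticsearch server is reachable and the index can be built. Below is a minimal, self-contained sketch of the same pattern; the URL, index name and wait time are made-up values, and GitOcean.mapping stands in for whatever Mapping object the surrounding module defines.

import logging
import time

from grimoire_elk.elastic import ElasticSearch
from grimoire_elk.raw.git import GitOcean


def wait_for_index(elastic_url, index, wait_time):
    """Retry until the index can be created on the ES server."""
    while True:
        try:
            return ElasticSearch(elastic_url, index, mappings=GitOcean.mapping)
        except Exception as error:
            logging.info("Index %s not ready: %s", index, str(error))
            time.sleep(wait_time)


# Hypothetical call: a local ES instance and a 5 second back-off between retries
# elastic = wait_for_index("http://localhost:9200", "git_raw", 5)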
Code example #2
    def test_get_last_date(self):
        """Test whether the last date is correctly returned"""

        items = json.loads(read_file('data/git.json'))
        elastic = ElasticSearch(self.es_con, self.target_index, GitOcean.mapping)
        new_items = elastic.bulk_upload(items, field_id="uuid")
        self.assertEqual(new_items, 11)

        # no filter
        last_date = elastic.get_last_date('updated_on')
        self.assertEqual(last_date.isoformat(), '2019-10-01T18:05:52+00:00')

        # filter including all items
        fltr = {
            'name': 'origin',
            'value': '/tmp/perceval_mc84igfc/gittest'
        }
        last_date = elastic.get_last_date('updated_on', filters_=[fltr])
        self.assertEqual(last_date.isoformat(), '2014-02-12T06:11:12+00:00')

        # filter including a sub-set of items
        fltr = {
            'name': 'perceval_version',
            'value': '0.9.11'
        }
        last_date = elastic.get_last_date('updated_on', filters_=[fltr])
        self.assertEqual(last_date.isoformat(), '2014-02-12T06:09:04+00:00')
Code example #3
    def test_get_last_offset(self):
        """Test whether the last offset is correctly returned"""

        items = json.loads(read_file('data/kitsune.json'))
        elastic = ElasticSearch(self.es_con, self.target_index, KitsuneOcean.mapping)
        new_items = elastic.bulk_upload(items, field_id="uuid")
        self.assertEqual(new_items, 4)

        # no filter
        last_offset = elastic.get_last_offset('offset')
        self.assertEqual(last_offset, 3)

        # filter including all items
        fltr = {
            'name': 'origin',
            'value': 'http://example.com'
        }
        last_offset = elastic.get_last_offset('offset', filters_=[fltr])
        self.assertEqual(last_offset, 3)

        # filter including a sub-set of items
        fltr = {
            'name': 'perceval_version',
            'value': '0.9.11'
        }
        last_offset = elastic.get_last_offset('offset', filters_=[fltr])
        self.assertEqual(last_offset, 1)
Code example #4
File: kidash.py Project: iht/grimoirelab-kidash
def create_index_pattern(elastic_url, dashboard, enrich_index, es_index=None):
    """ Create a index pattern using as template the index pattern
        in dashboard template vis

        :param elastic_url: URL for ElasticSearch (ES) server
        :param dashboard: kibana dashboard to be used as template
        :param enrich_index: ES enriched index used in the new dashboard

    """

    index_pattern = None
    if not es_index:
        es_index = ".kibana"
    elastic = ElasticSearch(elastic_url, es_index)

    dash_data = get_dashboard_json(elastic, dashboard)

    # First vis
    if "panelsJSON" not in dash_data:
        logger.error("Can not find vis in dashboard: %s", dashboard)
        raise RuntimeError("Can not find vis in dashboard: %s" % dashboard)

    # Get the index pattern from the first vis in the panel
    # that has index pattern data
    for panel in json.loads(dash_data["panelsJSON"]):
        panel_id = panel["id"]
        logger.debug("Checking index pattern in %s vis", panel_id)

        index_pattern = get_index_pattern_from_vis(elastic, panel_id)
        if index_pattern:
            break

    # And now time to create the index pattern found
    if not index_pattern:
        logger.error("Can't find index pattern for %s", dashboard)
        raise RuntimeError("Can't find index pattern for %s" % dashboard)

    logger.debug("Found %s template index pattern", index_pattern)

    new_index_pattern_json = get_index_pattern_json(elastic, index_pattern)

    new_index_pattern_json['title'] = enrich_index
    url = elastic.index_url + "/index-pattern/" + enrich_index
    headers = {"Content-Type": "application/json"}
    res = requests_ses.post(url,
                            data=json.dumps(new_index_pattern_json),
                            verify=False,
                            headers=headers)
    res.raise_for_status()
    logger.debug("New index pattern created: %s", url)

    return enrich_index
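A hypothetical invocation of the helper above, assuming a local Elasticsearch that backs Kibana, a stored dashboard named "git" and an enriched index named "git_enrich"; all three values are placeholders.

# Placeholder values; create_index_pattern is the function defined above.
pattern_title = create_index_pattern("http://localhost:9200", "git", "git_enrich")
print("Index pattern created with title:", pattern_title)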
Code example #5
File: test_elastic.py Project: alpgarcia/GrimoireELK
    def test_safe_put_bulk_errors(self):
        """Test whether an error message is logged when an item isn't inserted"""

        items = json.loads(read_file('data/git.json'))
        data_json = items[0]
        data_json['origin'] = ''.join(
            random.choice(string.ascii_letters) for x in range(66000))
        bulk_json = '{{"index" : {{"_id" : "{}" }} }}\n'.format(
            data_json['uuid'])
        bulk_json += json.dumps(data_json) + "\n"

        elastic = ElasticSearch(self.es_con, self.target_index,
                                GitOcean.mapping)
        bulk_url = elastic.get_bulk_url()

        with self.assertLogs(logger, level='ERROR') as cm:
            inserted_items = elastic.safe_put_bulk(bulk_url, bulk_json)
            self.assertRegex(
                cm.output[0],
                "ERROR:grimoire_elk.elastic:Failed to insert data to ES*")

        self.assertEqual(inserted_items, 0)
Code example #6
def export_items(elastic_url, in_index, out_index, elastic_url_out=None,
                 search_after=False, search_after_value=None, limit=None,
                 copy=False):
    """ Export items from in_index to out_index using the correct mapping """

    if not limit:
        limit = DEFAULT_LIMIT

    if search_after_value:
        search_after_value_timestamp = int(search_after_value[0])
        search_after_value_uuid = search_after_value[1]
        search_after_value = [search_after_value_timestamp, search_after_value_uuid]

    logging.info("Exporting items from %s/%s to %s", elastic_url, in_index, out_index)

    count_res = requests.get('%s/%s/_count' % (elastic_url, in_index))
    try:
        count_res.raise_for_status()
    except requests.exceptions.HTTPError:
        if count_res.status_code == 404:
            logging.error("The index does not exists: %s", in_index)
        else:
            logging.error(count_res.text)
        sys.exit(1)

    logging.info("Total items to copy: %i", count_res.json()['count'])

    # Time to upload the items with the correct mapping
    elastic_in = ElasticSearch(elastic_url, in_index)
    if not copy:
        # Create the correct mapping for the data sources detected from in_index
        ds_mapping = find_mapping(elastic_url, in_index)
    else:
        logging.debug('Using the input index mapping')
        ds_mapping = extract_mapping(elastic_url, in_index)

    if not elastic_url_out:
        elastic_out = ElasticSearch(elastic_url, out_index, mappings=ds_mapping)
    else:
        elastic_out = ElasticSearch(elastic_url_out, out_index, mappings=ds_mapping)

    # Time to just copy from in_index to out_index
    uid_field = find_uuid(elastic_url, in_index)
    backend = find_perceval_backend(elastic_url, in_index)
    if search_after:
        total = elastic_out.bulk_upload(fetch(elastic_in, backend, limit,
                                              search_after_value, scroll=False), uid_field)
    else:
        total = elastic_out.bulk_upload(fetch(elastic_in, backend, limit), uid_field)

    logging.info("Total items copied: %i", total)
Code example #7
    def test_init_duplicated_aliases(self):
        """Test whether duplicated aliases are ignored"""

        expected_aliases = {
            'A': {},
            'B': {}
        }

        elastic = ElasticSearch(self.es_con, self.target_index, GitOcean.mapping, aliases=["A", "B", "A"])
        self.assertEqual(elastic.url, self.es_con)
        self.assertEqual(elastic.index, self.target_index)
        self.assertEqual(elastic.index_url, self.es_con + "/" + self.target_index)

        r = elastic.requests.get(elastic.index_url + '/_alias')
        aliases = r.json()[self.target_index]['aliases']

        self.assertDictEqual(aliases, expected_aliases)
Code example #8
    def test_fetch_filter_raw(self):
        """Test whether the fetch with filter raw properly works"""

        perceval_backend = Git('/tmp/perceval_mc84igfc/gittest', '/tmp/foo')
        elastic = ElasticSearch(self.es_con, self.target_index, GitOcean.mapping)

        # Load items
        items = json.loads(read_file('data/git.json'))
        ocean = GitOcean(perceval_backend)
        ocean.elastic = elastic
        ocean.feed_items(items)

        eitems = ElasticItems(perceval_backend)
        eitems.set_filter_raw("data.commit:87783129c3f00d2c81a3a8e585eb86a47e39891a")
        eitems.elastic = elastic
        items = [ei for ei in eitems.fetch()]
        self.assertEqual(len(items), 1)
Code example #9
File: kidash.py Project: acs/grimoirelab-kidash
def feed_dashboard(dashboard,
                   elastic_url,
                   es_index=None,
                   data_sources=None,
                   add_vis_studies=False):
    """ Import a dashboard. If data_sources are defined, just include items
        for this data source.
    """

    if not es_index:
        es_index = ".kibana"

    elastic = ElasticSearch(elastic_url, es_index)

    if 'dashboard' in dashboard:
        import_item_json(elastic, "dashboard", dashboard['dashboard']['id'],
                         dashboard['dashboard']['value'], data_sources,
                         add_vis_studies)

    if 'searches' in dashboard:
        for search in dashboard['searches']:
            import_item_json(elastic, "search", search['id'], search['value'],
                             data_sources)

    if 'index_patterns' in dashboard:
        for index in dashboard['index_patterns']:
            if not data_sources or \
                    is_index_pattern_from_data_sources(index, data_sources):
                import_item_json(elastic, "index-pattern", index['id'],
                                 index['value'])
            else:
                logger.debug("Index pattern %s not for %s. Not included.",
                             index['id'], data_sources)

    if 'visualizations' in dashboard:
        for vis in dashboard['visualizations']:
            if not add_vis_studies and is_vis_study(vis):
                logger.debug("Vis %s is for an study. Not included.",
                             vis['id'])
            elif not data_sources or is_vis_from_data_sources(
                    vis, data_sources):
                import_item_json(elastic, "visualization", vis['id'],
                                 vis['value'])
            else:
                logger.debug("Vis %s not for %s. Not included.", vis['id'],
                             data_sources)
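A hypothetical call to feed_dashboard, assuming json is already imported in the module (as the other kidash snippets here do) and that "git-dashboard.json" is a previously exported dashboard containing the "dashboard", "searches", "index_patterns" and "visualizations" keys this function reads.

# Placeholder file name and URL
with open("git-dashboard.json") as f:
    dashboard = json.load(f)

feed_dashboard(dashboard, "http://localhost:9200",
               data_sources=["git"], add_vis_studies=False)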
Code example #10
    def test_fetch(self):
        """Test whether the fetch method properly works"""

        perceval_backend = Git('/tmp/perceval_mc84igfc/gittest', '/tmp/foo')
        elastic = ElasticSearch(self.es_con, self.target_index, GitOcean.mapping)

        # Load items
        items = json.loads(read_file('data/git.json'))
        ocean = GitOcean(perceval_backend)
        ocean.elastic = elastic
        ocean.feed_items(items)

        eitems = ElasticItems(perceval_backend)
        eitems.scroll_size = 2
        eitems.elastic = elastic

        items = [ei for ei in eitems.fetch()]
        self.assertEqual(len(items), 9)
Code example #11
    def get_rich_items(self, item):
        # The real data
        entry = item['data']

        enriched_items = []

        for file_analysis in entry["analysis"]:
            eitem = self.get_rich_item(file_analysis)

            for f in self.RAW_FIELDS_COPY:
                if f in item:
                    eitem[f] = item[f]
                else:
                    eitem[f] = None

            # common attributes
            eitem['commit_sha'] = entry['commit']
            eitem['author'] = entry['Author']
            eitem['committer'] = entry['Commit']
            eitem['message'] = entry.get('message', None)
            eitem['author_date'] = fix_field_date(entry['AuthorDate'])
            eitem['commit_date'] = fix_field_date(entry['CommitDate'])

            # Other enrichment
            eitem["repo_url"] = item["origin"]
            if eitem["repo_url"].startswith('http'):
                eitem["repo_url"] = ElasticSearch.anonymize_url(
                    eitem["repo_url"])

            if self.prjs_map:
                eitem.update(self.get_item_project(eitem))

            # uuid
            eitem['id'] = "{}_{}".format(eitem['commit_sha'],
                                         eitem['file_path'])

            eitem.update(self.get_grimoire_fields(entry["AuthorDate"], "file"))

            self.add_repository_labels(eitem)
            self.add_metadata_filter_raw(eitem)

            enriched_items.append(eitem)

        return enriched_items
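For reference, a minimal sketch of the raw item shape that get_rich_items expects, reconstructed from the field accesses above; every key and value is illustrative only.

# Illustrative raw item consumed by get_rich_items
raw_item = {
    "origin": "https://github.com/grimoirelab/perceval.git",
    "uuid": "dummy-uuid",          # copied through RAW_FIELDS_COPY if listed there
    "data": {
        "commit": "87783129c3f00d2c81a3a8e585eb86a47e39891a",
        "Author": "Jane Doe <jdoe@example.com>",
        "Commit": "Jane Doe <jdoe@example.com>",
        "message": "Fix typo",
        "AuthorDate": "Tue Feb 11 22:10:39 2014 -0800",
        "CommitDate": "Tue Feb 11 22:10:39 2014 -0800",
        "analysis": [
            {"file_path": "perceval/backend.py", "loc": 120, "comments": 30}
        ]
    }
}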
Code example #12
    def test_get_elastic_items(self):
        """Test whether the elastic method works properly"""

        perceval_backend = Git('/tmp/perceval_mc84igfc/gittest', '/tmp/foo')
        elastic = ElasticSearch(self.es_con, self.target_index, GitOcean.mapping)

        # Load items
        items = json.loads(read_file('data/git.json'))
        ocean = GitOcean(perceval_backend)
        ocean.elastic = elastic
        ocean.feed_items(items)

        eitems = ElasticItems(perceval_backend)
        eitems.elastic = elastic
        r_json = eitems.get_elastic_items()

        total = r_json['hits']['total']
        total = total['value'] if isinstance(total, dict) else total
        self.assertEqual(total, 9)
Code example #13
File: kidash.py Project: iht/grimoirelab-kidash
def search_dashboards(elastic_url, es_index=None):

    dashboards = []

    if not es_index:
        es_index = ".kibana"

    elastic = ElasticSearch(elastic_url, es_index)
    elastic_ver = find_elasticsearch_version(elastic)

    if elastic_ver < 6:
        dash_json_url = elastic.index_url + "/dashboard/_search?size=10000"
        res = requests_ses.get(dash_json_url, verify=False)
    else:
        items_json_url = elastic.index_url + "/_search?size=10000"
        query = '''
        {
            "query" : {
                "term" : { "type" : "dashboard"  }
             }
        }'''
        res = requests_ses.post(items_json_url,
                                data=query,
                                verify=False,
                                headers=HEADERS_JSON)
    res.raise_for_status()

    res_json = res.json()

    if "hits" not in res_json:
        logger.error("Can't find dashboards")
        raise RuntimeError("Can't find dashboards")

    for dash in res_json["hits"]["hits"]:
        if elastic_ver < 6:
            dash_json = dash["_source"]
        else:
            dash_json = dash["_source"]["dashboard"]

        dashboards.append({"_id": dash["_id"], "title": dash_json["title"]})

    return dashboards
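search_dashboards returns one dict per stored dashboard, with its Kibana "_id" and its "title". A hypothetical call against a local instance:

# Placeholder URL; search_dashboards is the function defined above.
for dash in search_dashboards("http://localhost:9200"):
    print(dash["_id"], "->", dash["title"])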
Code example #14
    def test_get_last_item_field(self):
        """Test whether the date/offset of the last item is correctly returned"""

        items = json.loads(read_file('data/git.json'))
        elastic = ElasticSearch(self.es_con, self.target_index, GitOcean.mapping)
        new_items = elastic.bulk_upload(items, field_id="uuid")
        self.assertEqual(new_items, 11)

        # no filter
        last_date = elastic.get_last_item_field('updated_on')
        self.assertEqual(last_date.isoformat(), '2019-10-01T18:05:52+00:00')

        # None filter
        last_date = elastic.get_last_item_field('updated_on', filters_=None)
        self.assertEqual(last_date.isoformat(), '2019-10-01T18:05:52+00:00')

        # Multiple filters
        fltrs = [
            {
                'name': 'origin',
                'value': '/tmp/perceval_mc84igfc/gittest'
            },
            {
                'name': 'perceval_version',
                'value': '0.9.11'
            }
        ]
        last_date = elastic.get_last_item_field('updated_on', filters_=fltrs)
        self.assertEqual(last_date.isoformat(), '2014-02-12T06:09:04+00:00')

        # Handle None filter
        fltrs = [
            {
                'name': 'origin',
                'value': '/tmp/perceval_mc84igfc/gittest'
            },
            {
                'name': 'perceval_version',
                'value': '0.9.11'
            },
            None
        ]
        last_date = elastic.get_last_item_field('updated_on', filters_=fltrs)
        self.assertEqual(last_date.isoformat(), '2014-02-12T06:09:04+00:00')
Code example #15
File: utils.py Project: bloriot97/grimoirelab-elk
def get_elastic(url, es_index, clean=None, backend=None, es_aliases=None, mapping=None):

    analyzers = None

    if backend:
        backend.set_elastic_url(url)
#        mapping = backend.get_elastic_mappings()
        mapping = backend.mapping
        analyzers = backend.get_elastic_analyzers()
    try:
        insecure = True
        elastic = ElasticSearch(url=url, index=es_index, mappings=mapping,
                                clean=clean, insecure=insecure,
                                analyzers=analyzers, aliases=es_aliases)

    except ElasticConnectException:
        logger.error("Can't connect to Elastic Search. Is it running?")
        sys.exit(1)

    return elastic
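A hypothetical use of get_elastic with an explicit mapping instead of a backend object, assuming GitOcean is importable from grimoire_elk.raw.git as in the test snippets above; the URL, index and alias are placeholders.

from grimoire_elk.raw.git import GitOcean

elastic = get_elastic("http://localhost:9200", "git_raw",
                      clean=False, es_aliases=["git"],
                      mapping=GitOcean.mapping)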
Code example #16
File: test_elastic.py Project: alpgarcia/GrimoireELK
    def test_check_instance_es_major_error(self):
        """Test whether an exception is thrown when the ElasticSearch version number is not retrieved"""

        body = """{
                "name" : "Amber Hunt",
                "cluster_name" : "jgbarah",
                "version" : {
                    "build_hash" : "e3126df",
                    "build_date" : "2016-04-26T12:08:58.960Z",
                    "build_snapshot" : false,
                    "lucene_version" : "6.0.0"
                },
                "tagline" : "You Know, for Search"
            }"""

        es_con = "http://es_err.com"
        httpretty.register_uri(httpretty.GET, es_con, body=body, status=200)

        with self.assertRaises(ElasticError):
            _, _ = ElasticSearch.check_instance(es_con, insecure=False)
Code example #17
    def test_fetch_no_results(self):
        """Test whether a message is logged when no results are found"""

        perceval_backend = Git('/tmp/perceval_mc84igfc/gittest-not_found', '/tmp/foo')
        elastic = ElasticSearch(self.es_con, self.target_index, GitOcean.mapping)

        # Load items
        items = json.loads(read_file('data/git.json'))
        ocean = GitOcean(perceval_backend)
        ocean.elastic = elastic
        ocean.feed_items(items)

        eitems = ElasticItems(perceval_backend)
        eitems.elastic = elastic

        with self.assertLogs(logger, level='DEBUG') as cm:
            items = [ei for ei in eitems.fetch()]
            self.assertEqual(len(items), 0)
            self.assertRegex(cm.output[-2], 'DEBUG:grimoire_elk.elastic_items:No results found.*')
            self.assertRegex(cm.output[-1], 'DEBUG:grimoire_elk.elastic_items:Releasing scroll_id=*')
Code example #18
File: test_elastic.py Project: alpgarcia/GrimoireELK
    def test_add_aliases_duplicated(self):
        """Test whether an alias isn't added when already present in a given index"""

        elastic = ElasticSearch(self.es_con,
                                self.target_index,
                                GitOcean.mapping,
                                aliases=['A', 'B', 'C'])

        expected_aliases = {'A': {}, 'B': {}, 'C': {}}
        aliases = elastic.list_aliases()
        self.assertDictEqual(aliases, expected_aliases)

        elastic.add_alias('C')
        aliases = elastic.list_aliases()
        self.assertDictEqual(aliases, expected_aliases)
Code example #19
    def test_check_instance_es_major_5(self):
        """Test whether the major version is correctly calculated for ElasticSearch 5.x"""

        body = """{
            "name" : "Amber Hunt",
            "cluster_name" : "jgbarah",
            "version" : {
                "number" : "5.0.0-alpha2",
                "build_hash" : "e3126df",
                "build_date" : "2016-04-26T12:08:58.960Z",
                "build_snapshot" : false,
                "lucene_version" : "6.0.0"
            },
            "tagline" : "You Know, for Search"
        }"""

        es_con = "http://es5.com"
        httpretty.register_uri(httpretty.GET, es_con, body=body, status=200)

        major = ElasticSearch.check_instance(es_con, insecure=False)
        self.assertEqual(major, '5')
Code example #20
File: kidash.py Project: iht/grimoirelab-kidash
def fetch_index_pattern(elastic_url, ip_id, es_index=None):
    """
    Fetch an index pattern JSON definition from Kibana and return it.

    :param elastic_url: Elasticsearch URL
    :param ip_id: index pattern identifier
    :param es_index: kibana index
    :return: a dict with index pattern data
    """

    logger.debug("Fetching index pattern %s", ip_id)
    if not es_index:
        es_index = ".kibana"

    elastic = ElasticSearch(elastic_url, es_index)

    ip_json = get_index_pattern_json(elastic, ip_id)

    index_pattern = {"id": ip_id, "value": ip_json}

    return index_pattern
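A hypothetical fetch of a single index pattern by identifier; the URL and the "git_enrich" id are placeholders.

ip = fetch_index_pattern("http://localhost:9200", "git_enrich")
print(ip["id"])           # the identifier that was requested
print(type(ip["value"]))  # the index pattern JSON returned by Kibana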
Code example #21
File: test_elastic.py Project: alpgarcia/GrimoireELK
    def test_add_aliases(self):
        """Test whether an alias is added to a given index"""

        elastic = ElasticSearch(self.es_con,
                                self.target_index,
                                GitOcean.mapping,
                                aliases=['A', 'B', 'C'])

        expected_aliases = {'A': {}, 'B': {}, 'C': {}}
        aliases = elastic.list_aliases()
        self.assertDictEqual(aliases, expected_aliases)

        expected_aliases = {'A': {}, 'B': {}, 'C': {}, 'D': {}}
        elastic.add_alias('D')
        aliases = elastic.list_aliases()
        self.assertDictEqual(aliases, expected_aliases)
Code example #22
    def test_delete_items_wrong_retention(self):
        """Test whether no items are deleted if retention isn't defined or negative"""

        items = json.loads(read_file('data/git.json'))
        for item in items:
            timestamp = unixtime_to_datetime(item['timestamp'])
            item['timestamp'] = timestamp.isoformat()

        elastic = ElasticSearch(self.es_con, self.target_index, GitOcean.mapping)
        new_items = elastic.bulk_upload(items, field_id="uuid")
        self.assertEqual(new_items, 11)

        url = self.es_con + '/' + self.target_index + '/_count'

        elastic.delete_items(retention_time=None, time_field='timestamp')
        left_items = elastic.requests.get(url).json()['count']
        self.assertEqual(left_items, 11)

        elastic.delete_items(retention_time=-1, time_field='timestamp')
        left_items = elastic.requests.get(url).json()['count']
        self.assertEqual(left_items, 11)
Code example #23
    def test_fetch_from_offset(self):
        """Test whether the fetch method with offset properly works"""

        perceval_backend = Kitsune('http://example.com')
        elastic = ElasticSearch(self.es_con, self.target_index, KitsuneOcean.mapping)

        # Load items
        items = json.loads(read_file('data/kitsune.json'))
        ocean = KitsuneOcean(perceval_backend)
        ocean.elastic = elastic
        ocean.feed_items(items)

        # Fetch total items
        eitems = ElasticItems(perceval_backend)
        eitems.elastic = elastic
        items = [ei for ei in eitems.fetch()]
        self.assertEqual(len(items), 4)

        # Fetch with offset
        eitems = ElasticItems(perceval_backend, offset=2)
        eitems.elastic = elastic
        items = [ei for ei in eitems.fetch()]
        self.assertEqual(len(items), 2)
Code example #24
    def test_delete_items(self):
        """Test whether items are correctly deleted"""

        items = json.loads(read_file('data/git.json'))
        for item in items:
            timestamp = unixtime_to_datetime(item['timestamp'])
            item['timestamp'] = timestamp.isoformat()

        elastic = ElasticSearch(self.es_con, self.target_index,
                                GitOcean.mapping)
        new_items = elastic.bulk_upload(items, field_id="uuid")
        self.assertEqual(new_items, 9)

        url = self.es_con + '/' + self.target_index + '/_count'

        elastic.delete_items(retention_time=90000000, time_field='timestamp')
        left_items = elastic.requests.get(url).json()['count']
        self.assertEqual(left_items, 9)

        elastic.delete_items(retention_time=1, time_field='timestamp')
        left_items = elastic.requests.get(url).json()['count']
        self.assertEqual(left_items, 0)
Code example #25
    def test_check_instance_es_major_6(self):
        """Test whether the major version is correctly calculated for ElasticSearch 6.x"""

        body = """{
            "name" : "44BPNNH",
            "cluster_name" : "elasticsearch",
            "cluster_uuid" : "fIa1j8AQRfSrmuhTwb9a0Q",
            "version" : {
                "number" : "6.1.0",
                "build_hash" : "c0c1ba0",
                "build_date" : "2017-12-12T12:32:54.550Z",
                "build_snapshot" : false,
                "lucene_version" : "7.1.0",
                "minimum_wire_compatibility_version" : "5.6.0",
                "minimum_index_compatibility_version" : "5.0.0"
            },
            "tagline" : "You Know, for Search"
        }"""

        es_con = "http://es6.com"
        httpretty.register_uri(httpretty.GET, es_con, body=body, status=200)

        major = ElasticSearch.check_instance(es_con, insecure=False)
        self.assertEqual(major, '6')
Code example #26
    def test_fetch_from_date(self):
        """Test whether the fetch method with from_date properly works"""

        perceval_backend = Git('/tmp/perceval_mc84igfc/gittest', '/tmp/foo')
        elastic = ElasticSearch(self.es_con, self.target_index, GitOcean.mapping)

        # Load items
        items = json.loads(read_file('data/git.json'))
        ocean = GitOcean(perceval_backend)
        ocean.elastic = elastic
        ocean.feed_items(items)

        # Fetch total items
        eitems = ElasticItems(perceval_backend)
        eitems.elastic = elastic
        items = [ei for ei in eitems.fetch()]
        self.assertEqual(len(items), 9)

        # Fetch with from date
        from_date = str_to_datetime("2018-02-09T08:33:22.699+00:00")
        eitems = ElasticItems(perceval_backend, from_date=from_date)
        eitems.elastic = elastic
        items = [ei for ei in eitems.fetch()]
        self.assertEqual(len(items), 2)
Code example #27
    def enrich_cocom_analysis(self,
                              ocean_backend,
                              enrich_backend,
                              no_incremental=False,
                              out_index="cocom_enrich_graal_repo",
                              interval_months=[3],
                              date_field="grimoire_creation_date"):

        logger.info("[cocom] study enrich-cocom-analysis start")

        es_in = ES([enrich_backend.elastic_url],
                   retry_on_timeout=True,
                   timeout=100,
                   verify_certs=self.elastic.requests.verify,
                   connection_class=RequestsHttpConnection)
        in_index = enrich_backend.elastic.index
        interval_months = list(map(int, interval_months))

        unique_repos = es_in.search(index=in_index,
                                    body=get_unique_repository())

        repositories = [
            repo['key']
            for repo in unique_repos['aggregations']['unique_repos'].get(
                'buckets', [])
        ]
        current_month = datetime_utcnow().replace(day=1,
                                                  hour=0,
                                                  minute=0,
                                                  second=0)

        logger.info(
            "[cocom] study enrich-cocom-analysis {} repositories to process".
            format(len(repositories)))
        es_out = ElasticSearch(enrich_backend.elastic.url,
                               out_index,
                               mappings=Mapping)
        es_out.add_alias("cocom_study")

        num_items = 0
        ins_items = 0

        for repository_url in repositories:
            logger.info(
                "[cocom] study enrich-cocom-analysis start analysis for {}".
                format(repository_url))
            evolution_items = []

            for interval in interval_months:

                to_month = get_to_date(es_in, in_index, out_index,
                                       repository_url, interval)
                to_month = to_month.replace(month=int(interval),
                                            day=1,
                                            hour=0,
                                            minute=0,
                                            second=0)

                while to_month < current_month:
                    files_at_time = es_in.search(
                        index=in_index,
                        body=get_files_at_time(repository_url,
                                               to_month.isoformat())
                    )['aggregations']['file_stats'].get("buckets", [])

                    if not len(files_at_time):
                        to_month = to_month + relativedelta(months=+interval)
                        continue

                    repository_name = repository_url.split("/")[-1]
                    evolution_item = {
                        "id": "{}_{}_{}".format(to_month.isoformat(),
                                                repository_name, interval),
                        "origin": repository_url,
                        "interval_months": interval,
                        "study_creation_date": to_month.isoformat(),
                        "total_files": len(files_at_time)
                    }

                    for file_ in files_at_time:
                        file_details = file_["1"]["hits"]["hits"][0]["_source"]

                        for metric in self.metrics:
                            total_metric = "total_" + metric
                            evolution_item[total_metric] = evolution_item.get(total_metric, 0)
                            if file_details[metric] is not None:
                                evolution_item[total_metric] += file_details[metric]

                    # TODO: Fix Logic: None rather than 1
                    evolution_item["total_comments_per_loc"] = round(
                        evolution_item["total_comments"] /
                        max(evolution_item["total_loc"], 1), 2)
                    evolution_item["total_blanks_per_loc"] = round(
                        evolution_item["total_blanks"] /
                        max(evolution_item["total_loc"], 1), 2)
                    evolution_item["total_loc_per_function"] = round(
                        evolution_item["total_loc"] /
                        max(evolution_item["total_num_funs"], 1), 2)

                    evolution_item.update(
                        self.get_grimoire_fields(
                            evolution_item["study_creation_date"], "stats"))
                    evolution_items.append(evolution_item)

                    if len(evolution_items) >= self.elastic.max_items_bulk:
                        num_items += len(evolution_items)
                        ins_items += es_out.bulk_upload(
                            evolution_items, self.get_field_unique_id())
                        evolution_items = []

                    to_month = to_month + relativedelta(months=+interval)

                if len(evolution_items) > 0:
                    num_items += len(evolution_items)
                    ins_items += es_out.bulk_upload(evolution_items,
                                                    self.get_field_unique_id())

                if num_items != ins_items:
                    missing = num_items - ins_items
                    logger.error(
                        "[cocom] study enrich-cocom-analysis {}/{} missing items for Graal CoCom Analysis "
                        "Study".format(missing, num_items))
                else:
                    logger.info(
                        "[cocom] study enrich-cocom-analysis {} items inserted for Graal CoCom Analysis "
                        "Study".format(num_items))

            logger.info(
                "[cocom] study enrich-cocom-analysis End analysis for {} with month interval"
                .format(repository_url))

        logger.info("[cocom] study enrich-cocom-analysis End")
Code example #28
    def __create_arthur_json(self, repo, backend_args):
        """ Create the JSON for configuring arthur to collect data

        https://github.com/grimoirelab/arthur#adding-tasks
        Sample for git:

        {
        "tasks": [
            {
                "task_id": "arthur.git",
                "backend": "git",
                "backend_args": {
                    "gitpath": "/tmp/arthur_git/",
                    "uri": "https://github.com/grimoirelab/arthur.git"
                },
                "category": "commit",
                "archive_args": {
                    "archive_path": '/tmp/test_archives',
                    "fetch_from_archive": false,
                    "archive_after": None
                },
                "scheduler_args": {
                    "delay": 10
                }
            }
        ]
        }
        """

        backend_args = self._compose_arthur_params(self.backend_section, repo)
        if self.backend_section == 'git':
            backend_args['gitpath'] = os.path.join(self.REPOSITORY_DIR, repo)
        backend_args['tag'] = self.backend_tag(repo)

        ajson = {"tasks": [{}]}
        # This is the perceval tag
        ajson["tasks"][0]['task_id'] = self.backend_tag(repo)
        ajson["tasks"][0]['backend'] = self.backend_section.split(":")[0]
        ajson["tasks"][0]['backend_args'] = backend_args
        ajson["tasks"][0]['category'] = backend_args['category']
        ajson["tasks"][0]['archive'] = {}
        ajson["tasks"][0]['scheduler'] = {"delay": self.ARTHUR_TASK_DELAY}
        # from-date or offset param must be added
        es_col_url = self._get_collection_url()
        es_index = self.conf[self.backend_section]['raw_index']
        # Get the last activity for the data source
        es = ElasticSearch(es_col_url, es_index)
        connector = get_connector_from_name(self.backend_section)

        klass = connector[0]  # Backend for the connector
        signature = inspect.signature(klass.fetch)

        last_activity = None
        filter_ = {"name": "tag", "value": backend_args['tag']}
        if 'from_date' in signature.parameters:
            last_activity = es.get_last_item_field('metadata__updated_on',
                                                   [filter_])
            if last_activity:
                ajson["tasks"][0]['backend_args'][
                    'from_date'] = last_activity.isoformat()
        elif 'offset' in signature.parameters:
            last_activity = es.get_last_item_field('offset', [filter_])
            if last_activity:
                ajson["tasks"][0]['backend_args']['offset'] = last_activity

        if last_activity:
            logging.info("Getting raw item with arthur since %s",
                         last_activity)

        return ajson
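For a git repository that already has data in the raw index, the JSON produced by the method above would look roughly as follows; every value is illustrative, and backend_args depends on what _compose_arthur_params returns for the configured data source.

{
    "tasks": [
        {
            "task_id": "https://github.com/grimoirelab/arthur.git",
            "backend": "git",
            "backend_args": {
                "gitpath": "/tmp/arthur_git/arthur.git",
                "uri": "https://github.com/grimoirelab/arthur.git",
                "category": "commit",
                "tag": "https://github.com/grimoirelab/arthur.git",
                "from_date": "2019-10-01T18:05:52+00:00"
            },
            "category": "commit",
            "archive": {},
            "scheduler": {"delay": 10}
        }
    ]
}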
Code example #29
    def enrich_backlog_analysis(self,
                                ocean_backend,
                                enrich_backend,
                                no_incremental=False,
                                out_index="github_enrich_backlog",
                                date_field="grimoire_creation_date",
                                interval_days=1,
                                reduced_labels=["bug"],
                                map_label=["others", "bugs"]):
        """
        The purpose of this study is to add an additional index that computes
        the chronological evolution of open issues and their average open time.

        For each repository and label, the study runs from the repository
        creation date until today, with a one-day interval by default. For each
        date, the number of open issues is obtained as the difference between
        the number of opened issues and the number of closed issues; in
        addition, the average open time of all issues still open at that date
        is computed.

        To differentiate by label, the evolution is computed for bugs and for
        all other labels (like "enhancement" or "good first issue") grouped
        together; these are the "reduced labels". Reduced labels are needed
        because computing the evolution for every combination of labels would
        be too expensive. In addition, the "bug" label can be renamed to "bugs"
        with map_label.

        Entry example in setup.cfg :

        [github]
        raw_index = github_issues_raw
        enriched_index = github_issues_enriched
        ...
        studies = [enrich_backlog_analysis]

        [enrich_backlog_analysis]
        out_index = github_enrich_backlog
        interval_days = 7
        reduced_labels = [bug,enhancement]
        map_label = [others, bugs, enhancements]

        """

        logger.info("[github] Start enrich_backlog_analysis study")

        # combine two lists to create the dict to map labels
        map_label = dict(zip([""] + reduced_labels, map_label))

        # connect to ES
        es_in = ES([enrich_backend.elastic_url],
                   retry_on_timeout=True,
                   timeout=100,
                   verify_certs=self.elastic.requests.verify,
                   connection_class=RequestsHttpConnection)
        in_index = enrich_backend.elastic.index

        # get all repositories
        unique_repos = es_in.search(
            index=in_index, body=get_unique_repository_with_project_name())
        repositories = [
            repo['key']
            for repo in unique_repos['aggregations']['unique_repos'].get(
                'buckets', [])
        ]

        logger.debug(
            "[enrich-backlog-analysis] {} repositories to process".format(
                len(repositories)))

        # create the index
        es_out = ElasticSearch(enrich_backend.elastic.url,
                               out_index,
                               mappings=Mapping)
        es_out.add_alias("backlog_study")

        # analysis for each repository
        num_items = 0
        ins_items = 0
        for repository in repositories:
            repository_url = repository["origin"]
            project = repository["project"]
            org_name = repository["organization"]
            repository_name = repository_url.split("/")[-1]

            logger.debug(
                "[enrich-backlog-analysis] Start analysis for {}".format(
                    repository_url))

            # get each day since repository creation
            dates = es_in.search(index=in_index,
                                 body=get_issues_dates(interval_days,
                                                       repository_url)
                                 )['aggregations']['created_per_interval'].get(
                                     "buckets", [])

            # for each selected label + others labels
            for label, other in [("", True)] + [(l, False)
                                                for l in reduced_labels]:
                # compute metrics for each day (ES request for each day)
                evolution_items = []
                for date in map(lambda i: i['key_as_string'], dates):
                    evolution_item = self.__create_backlog_item(
                        repository_url, repository_name, project, date,
                        org_name, interval_days, label, map_label,
                        self.__get_opened_issues(es_in, in_index,
                                                 repository_url, date,
                                                 interval_days, other, label,
                                                 reduced_labels))
                    evolution_items.append(evolution_item)

                # complete until today (no ES request needed, just extrapolate)
                today = datetime.now().replace(hour=0,
                                               minute=0,
                                               second=0,
                                               tzinfo=None)
                last_item = evolution_item
                last_date = str_to_datetime(
                    evolution_item['study_creation_date']).replace(tzinfo=None) \
                    + relativedelta(days=interval_days)
                average_opened_time = evolution_item['average_opened_time'] \
                    + float(interval_days)
                while last_date < today:
                    date = last_date.strftime('%Y-%m-%dT%H:%M:%S.000Z')
                    evolution_item = {}
                    evolution_item.update(last_item)
                    evolution_item.update({
                        "average_opened_time":
                        average_opened_time,
                        "study_creation_date":
                        date,
                        "uuid":
                        "{}_{}_{}".format(date, repository_name, label),
                    })
                    evolution_item.update(
                        self.get_grimoire_fields(date, "stats"))
                    evolution_items.append(evolution_item)
                    last_date = last_date + relativedelta(days=interval_days)
                    average_opened_time = average_opened_time + float(
                        interval_days)

                # upload items to ES
                if len(evolution_items) > 0:
                    num_items += len(evolution_items)
                    ins_items += es_out.bulk_upload(evolution_items,
                                                    self.get_field_unique_id())

                if num_items != ins_items:
                    missing = num_items - ins_items
                    logger.error(
                        "[enrich-backlog-analysis] %s/%s missing items "
                        "for Graal Backlog Analysis Study",
                        str(missing), str(num_items))
                else:
                    logger.debug(
                        "[enrich-backlog-analysis] %s items inserted "
                        "for Graal Backlog Analysis Study", str(num_items))

        logger.info("[github] End enrich_backlog_analysis study")
Code example #30
    connector = get_connector_from_name(backend_name, connectors)
    backend = connector[0](**vars(args))
    ocean_backend = connector[1](backend, **vars(args))
    enrich_backend = connector[2](backend, **vars(args))

    es_index = backend.get_name() + "_" + backend.get_id()

    clean = args.no_incremental

    if args.cache:
        clean = True

    try:
        # Ocean
        elastic_state = ElasticSearch(args.elastic_url, es_index,
                                      ocean_backend.get_elastic_mappings(),
                                      clean)

        # Enriched ocean
        enrich_index = es_index + "_enrich"
        elastic = ElasticSearch(args.elastic_url, enrich_index,
                                enrich_backend.get_elastic_mappings(), clean)

    except ElasticConnectException:
        logging.error("Can't connect to Elastic Search. Is it running?")
        sys.exit(1)

    ocean_backend.set_elastic(elastic_state)
    enrich_backend.set_elastic(elastic)

    try: