class ReportHelper:
    """Stack Analyses report helper functions."""

    def __init__(self):
        """Init method for the Report helper class."""
        self.s3 = S3Helper()
        self.pg = Postgres()
        self.conn = self.pg.conn
        self.cursor = self.pg.cursor
        self.unknown_deps_helper = UnknownDepsReportHelper()
        self.sentry_helper = SentryReportHelper()
        self.npm_model_bucket = os.getenv('NPM_MODEL_BUCKET')
        self.maven_model_bucket = os.getenv('MAVEN_MODEL_BUCKET')
        self.pypi_model_bucket = os.getenv('PYPI_MODEL_BUCKET')
        self.golang_model_bucket = os.getenv('GOLANG_MODEL_BUCKET')
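        # Model bucket names are read from the environment with no default;
        # when a variable is unset, os.getenv returns None and store_training_data
        # skips retraining for that ecosystem.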
        self.maven_training_repo = os.getenv(
            'MAVEN_TRAINING_REPO', 'https://github.com/fabric8-analytics/f8a-hpf-insights')
        self.npm_training_repo = os.getenv(
            'NPM_TRAINING_REPO',
            'https://github.com/fabric8-analytics/fabric8-analytics-npm-insights')
        self.golang_training_repo = os.getenv(
            'GOLANG_TRAINING_REPO', 'https://github.com/fabric8-analytics/f8a-golang-insights')
        self.pypi_training_repo = os.getenv(
            'PYPI_TRAINING_REPO', 'https://github.com/fabric8-analytics/f8a-pypi-insights')

        self.emr_api = os.getenv('EMR_API', 'http://f8a-emr-deployment:6006')

    def cleanup_db_tables(self):
        """Cleanup RDS data tables on a periodic basis."""
        try:
            # Number of days to retain the celery task_meta data
            num_days_metadata = os.environ.get('KEEP_DB_META_NUM_DAYS', '7')
            # query to delete the celery task_meta data
            query = sql.SQL('DELETE FROM celery_taskmeta '
                            'WHERE DATE_DONE <= NOW() - interval \'%s day\';')
            logger.info('Starting to clean up Celery Meta tables')
            # Execute the query
            self.cursor.execute(query.as_string(self.conn) % (num_days_metadata))
            # Log the message returned from db cursor
            logger.info('%r' % self.cursor.statusmessage)
            logger.info('Cleanup of Celery Meta tables complete')

            # Number of days to retain the celery worker_result data
            num_days_workerdata = os.environ.get('KEEP_WORKER_RESULT_NUM_DAYS', '60')
            # query to delete the worker_result data
            query = sql.SQL('DELETE FROM worker_results '
                            'WHERE ended_at <= NOW() - interval \'%s day\';')
            logger.info('Starting to clean up Worker Result data tables')
            # Execute the query
            self.cursor.execute(query.as_string(self.conn) % (num_days_workerdata))
            # Log the message returned from db cursor
            logger.info('%r' % self.cursor.statusmessage)
            logger.info('Cleanup of Worker Result data tables complete')
        except Exception as e:
            logger.error('CleanupDatabaseError: %r' % e)

    def validate_and_process_date(self, some_date):
        """Validate the date format and apply the format YYYY-MM-DDTHH:MI:SSZ."""
        try:
            dt.strptime(some_date, '%Y-%m-%d')
        except ValueError:
            raise ValueError("Incorrect data format, should be YYYY-MM-DD")
        return some_date

    def retrieve_stack_analyses_ids(self, start_date, end_date):
        """Retrieve results for stack analyses requests."""
        try:
            # start_date from which data is to be fetched
            start_date = self.validate_and_process_date(start_date)
            # end_date up to which data is to be fetched
            end_date = self.validate_and_process_date(end_date)
        except ValueError:
            # Invalid date format
            raise ValueError("Invalid date format")
        # Query to fetch Stack Analysis Ids from start_date to end_date
        query = sql.SQL('SELECT {} FROM {} WHERE {} BETWEEN \'%s\' AND \'%s\'').format(
            sql.Identifier('id'),
            sql.Identifier('stack_analyses_request'),
            sql.Identifier('submitTime')
        )
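        # sql.Identifier quotes the column and table names; the validated start
        # and end dates are interpolated into the rendered query string below.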
        # Executing Query
        self.cursor.execute(query.as_string(self.conn) % (start_date, end_date))
        # Fetching all results
        rows = self.cursor.fetchall()
        # Appending all the stack-ids in a list
        id_list = []
        for row in rows:
            for col in row:
                id_list.append(col)

        return id_list

    @staticmethod
    def get_time_delta(start_date, end_date):
        """Get Timedelta object."""
        return dt.strptime(end_date, '%Y-%m-%d') - dt.strptime(start_date, '%Y-%m-%d')

    def retrieve_stack_analyses_content(self, start_date, end_date):
        """Retrieve results for stack analyses requests."""
        try:
            # start_date from which data is to be fetched
            start_date = self.validate_and_process_date(start_date)
            # end_date up to which data is to be fetched
            end_date = self.validate_and_process_date(end_date)
        except ValueError:
            # Invalid date format
            raise ValueError("Invalid date format")

        # Query to fetch Stack Analysis manifests data from start_date to end_date
        query = sql.SQL('SELECT {} FROM {} WHERE {} BETWEEN \'%s\' AND \'%s\'').format(
            sql.Identifier('requestJson'), sql.Identifier('stack_analyses_request'),
            sql.Identifier('submitTime')
        )
        # Executing Query
        self.cursor.execute(query.as_string(self.conn) % (start_date, end_date))
        # Fetching all results
        return self.cursor.fetchall()

    def flatten_list(self, alist):
        """Convert a list of lists to a single list."""
        return list(itertools.chain.from_iterable(alist))

    def datediff_in_millisecs(self, start_date, end_date):
        """Return the difference of two datetime strings in milliseconds."""
        fmt = '%Y-%m-%dT%H:%M:%S.%f'
        return (dt.strptime(end_date, fmt) -
                dt.strptime(start_date, fmt)).total_seconds() * 1000

    def populate_key_count(self, in_list=None):
        """Generate a dict with the frequency of list elements."""
        out_dict = {}
        try:
            for item in in_list or []:
                if isinstance(item, dict):
                    logger.error('Unexpected key encountered %r' % item)
                    continue

                if item in out_dict:
                    out_dict[item] += 1
                else:
                    out_dict[item] = 1
        except (IndexError, KeyError, TypeError) as e:
            logger.exception('Error: %r' % e)
            return {}
        return out_dict

    def set_unique_stack_deps_count(self, unique_stacks_with_recurrence_count):
        """Set the dependencies count against the identified unique stacks."""
        out_dict = {}
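        # Each unique stack key is a comma-separated "package version" string,
        # so splitting on ',' gives the number of dependencies in that stack.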
        for eco, stacks in unique_stacks_with_recurrence_count.items():
            out_dict[eco] = {stack: len(stack.split(',')) for stack in stacks}
        return out_dict

    def normalize_deps_list(self, deps):
        """Flatten the dependencies dict into a list."""
        normalized_list = []
        for dep in deps:
            normalized_list.append('{package} {version}'.format(package=dep['package'],
                                                                version=dep['version']))
        return sorted(normalized_list)

    def collate_raw_data(self, unique_stacks_with_recurrence_count, frequency):
        """Collate previous raw data with this week/month data."""
        result = {}

        # Get collated user input data
        collated_user_input_obj_key = 'user-input-data/collated-{freq}.json'.format(freq=frequency)
        collated_user_input = self.s3.read_json_object(bucket_name=self.s3.report_bucket_name,
                                                       obj_key=collated_user_input_obj_key) or {}

        for eco in unique_stacks_with_recurrence_count.keys() | collated_user_input.keys():
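            # Counter addition merges this period's per-stack frequencies with the
            # previously collated counts for the same ecosystem.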
            result.update({eco: {
                "user_input_stack": dict(
                            Counter(unique_stacks_with_recurrence_count.get(eco)) +
                            Counter(collated_user_input.get(eco, {}).get('user_input_stack')))
            }})

        # Store user input collated data back to S3
        self.s3.store_json_content(content=result, bucket_name=self.s3.report_bucket_name,
                                   obj_key=collated_user_input_obj_key)

        # Get collated big query data
        collated_big_query_obj_key = 'big-query-data/collated.json'
        collated_big_query_data = self.s3.read_json_object(bucket_name=self.s3.report_bucket_name,
                                                           obj_key=collated_big_query_obj_key) or {}

        for eco in collated_big_query_data.keys():
            if result.get(eco):
                result[eco]["bigquery_data"] = collated_big_query_data.get(eco)
            else:
                result[eco] = {"bigquery_data": collated_big_query_data.get(eco)}
        return result

    def invoke_emr_api(self, bucket_name, ecosystem, data_version, github_repo):
        """Invoke EMR Retraining API to start the retraining process."""
        payload = {
            'bucket_name': bucket_name,
            'github_repo': github_repo,
            'ecosystem': ecosystem,
            'data_version': data_version
        }

        logger.info('bucket_name for {eco}: {buck}'.format(eco=ecosystem, buck=bucket_name))
        logger.info('github_repo for {eco}: {git}'.format(eco=ecosystem, git=github_repo))
        logger.info('data_version for {eco}: {data}'.format(eco=ecosystem, data=data_version))

        try:
            # Invoke EMR API to run the retraining
            resp = requests.post(url=self.emr_api + '/api/v1/runjob', json=payload)
            logger.info(resp.json())
            # Check for status code
            # If status is not success, log it as an error
            if resp.status_code == 200:
                logger.info('Successfully invoked EMR API for {eco} ecosystem \n {resp}'.format(
                    eco=ecosystem, resp=resp.json()))
            else:
                logger.error('Error received from EMR API for {eco} ecosystem \n {resp}'.format(
                    eco=ecosystem, resp=resp.json()))
        except Exception as e:
            logger.error('Failed to invoke EMR API for {eco}, error: {err!r}'.format(
                eco=ecosystem, err=e))

    def get_training_data_for_ecosystem(self, eco, stack_dict):
        """Get Training data for an ecosystem."""
        unique_stacks = {}
        package_dict_for_eco = {
            "user_input_stack": [],
            "bigquery_data": []
        }
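        # Deduplicate stacks by their concatenated package names; only the first
        # occurrence of a stack contributes to the training data.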
        for stack_type, stacks in stack_dict.items():
            for package_string in stacks:
                package_list = [x.strip().split(' ')[0]
                                for x in package_string.split(',')]
                stack_str = "".join(package_list)
                if stack_str not in unique_stacks:
                    unique_stacks[stack_str] = 1
                    package_dict_for_eco.get(stack_type).append(package_list)

        training_data = {
            'ecosystem': eco,
            'package_dict': package_dict_for_eco
        }

        return training_data

    def store_training_data(self, result):
        """Store Training Data for each ecosystem in their respective buckets."""
        model_version = dt.now().strftime('%Y-%m-%d')

        for eco, stack_dict in result.items():
            training_data = self.get_training_data_for_ecosystem(eco, stack_dict)
            obj_key = '{model_version}/data/manifest.json'.format(model_version=model_version)

            # Get the bucket name based on ecosystems to store user-input stacks for retraining
            if eco == 'maven':
                bucket_name = self.maven_model_bucket
                github_repo = self.maven_training_repo
                logger.info('maven bucket name is: {bucket}'.format(bucket=bucket_name))
            elif eco == 'pypi':
                bucket_name = self.pypi_model_bucket
                github_repo = self.pypi_training_repo
                logger.info('pypi bucket name is: {bucket}'.format(bucket=bucket_name))
            elif eco == 'go':
                bucket_name = self.golang_model_bucket
                github_repo = self.golang_training_repo
                logger.info('go bucket name is: {bucket}'.format(bucket=bucket_name))
            elif eco == 'npm':
                bucket_name = self.npm_model_bucket
                github_repo = self.npm_training_repo
                logger.info('npm bucket name is: {bucket}'.format(bucket=bucket_name))
            else:
                continue

            if bucket_name:
                logger.info('Storing user-input stacks for ecosystem {eco} at {dir}'.format(
                    eco=eco, dir=bucket_name + obj_key))
                try:
                    # Store the training content for each ecosystem
                    self.s3.store_json_content(content=training_data, bucket_name=bucket_name,
                                               obj_key=obj_key)
                    # Invoke the EMR API to kickstart retraining process
                    # This EMR invocation happens for all ecosystems almost at the same time.
                    # TODO - find an alternative if there is a need
                    self.invoke_emr_api(bucket_name, eco, model_version, github_repo)
                except Exception as e:
                    logger.error('Unable to invoke EMR API. Reason: %r' % e)

    def get_trending(self, mydict, top_trending_count=3):
        """Generate the top trending items list."""
        return dict(heapq.nlargest(top_trending_count, mydict.items(), key=itemgetter(1)))

    def get_ecosystem_summary(self, ecosystem, total_stack_requests, all_deps, all_unknown_deps,
                              unique_stacks_with_recurrence_count, unique_stacks_with_deps_count,
                              avg_response_time, unknown_deps_ingestion_report):
        """Generate ecosystem specific stack summary."""
        unique_dep_frequency = self.populate_key_count(self.flatten_list(all_deps[ecosystem]))
        unique_unknown_dep_frequency = self.populate_key_count(
            self.flatten_list(all_unknown_deps[ecosystem]))
        rectify_latest_version(unique_unknown_dep_frequency, ecosystem, True)
        return {
            'stack_requests_count': total_stack_requests[ecosystem],
            'unique_dependencies_with_frequency': unique_dep_frequency,
            'unique_unknown_dependencies_with_frequency': unique_unknown_dep_frequency,
            'unique_stacks_with_frequency': unique_stacks_with_recurrence_count[ecosystem],
            'unique_stacks_with_deps_count': unique_stacks_with_deps_count[ecosystem],
            'average_response_time': '{} ms'.format(avg_response_time[ecosystem]),
            'trending': {
                'top_stacks':
                    self.get_trending(unique_stacks_with_recurrence_count[ecosystem], 3),
                'top_deps': self.get_trending(unique_dep_frequency, 5),
            },
            'previously_unknown_dependencies': unknown_deps_ingestion_report[ecosystem]
        }

    def save_result(self, frequency, report_name, template):
        """Save result in S3 bucket."""
        try:
            obj_key = '{freq}/{report_name}.json'.format(
                freq=frequency, report_name=report_name
            )
            self.s3.store_json_content(content=template, obj_key=obj_key,
                                       bucket_name=self.s3.report_bucket_name)
        except Exception as e:
            logger.exception('Unable to store the report on S3. Reason: %r' % e)

    def get_report_name(self, frequency, end_date):
        """Create a report name."""
        if frequency == 'monthly':
            return dt.strptime(end_date, '%Y-%m-%d').strftime('%Y-%m')
        else:
            return dt.strptime(end_date, '%Y-%m-%d').strftime('%Y-%m-%d')

    def collate_and_retrain(self, unique_stacks, frequency='weekly'):
        """Append stacks to 'collated-weekly' and re-train models."""
        # Append last week's data to 'collated-weekly'; returns 'BQ+collated' data
        collated_data = self.collate_raw_data(unique_stacks, frequency)
        # Store ecosystem-specific manifest.json files and re-train the models
        self.store_training_data(collated_data)

    def create_venus_report(self, venus_input):
        """Create venus report."""
        # Retrieve input variables
        frequency = venus_input[0]
        report_name = venus_input[1]
        template = venus_input[2]

        self.save_result(frequency, report_name, template)
        return template

    def normalize_worker_data(self, start_date, end_date, stack_data, worker,
                              frequency='daily', retrain=False):
        """Normalize worker data for reporting."""
        total_stack_requests = {'all': 0, 'npm': 0, 'maven': 0, 'pypi': 0}

        report_name = self.get_report_name(frequency, end_date)

        # Prepare the template
        stack_data = json.loads(stack_data)
        template = {
            'report': {
                'from': start_date,
                'to': end_date,
                'generated_on': dt.now().isoformat('T')
            },
            'stacks_summary': {},
            'stacks_details': []
        }
        all_deps = {'npm': [], 'maven': [], 'pypi': []}
        all_unknown_deps = {'npm': [], 'maven': [], 'pypi': []}
        all_unknown_lic = []
        all_cve_list = []

        # Process the response
        total_response_time = {'all': 0.0, 'npm': 0.0, 'maven': 0.0, 'pypi': 0.0}
        if worker == 'stack_aggregator_v2':
            stacks_list = {'npm': [], 'maven': [], 'pypi': []}
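            # Each row holds one stack_aggregator_v2 task_result; its
            # user_stack_info section describes a single analysed stack.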
            for data in stack_data:
                stack_info_template = {
                    'ecosystem': '',
                    'stack': [],
                    'unknown_dependencies': [],
                    'license': {
                        'conflict': False,
                        'unknown': []
                    },
                    'security': {
                        'cve_list': [],
                    },
                    'response_time': ''
                }
                try:
                    user_stack_info = data[0]['stack_data'][0]['user_stack_info']
                    if len(user_stack_info['dependencies']) == 0:
                        continue

                    stack_info_template['ecosystem'] = user_stack_info['ecosystem']
                    total_stack_requests['all'] += 1
                    total_stack_requests[stack_info_template['ecosystem']] += 1

                    stack_info_template['stack'] = self.normalize_deps_list(
                        user_stack_info['dependencies'])
                    all_deps[user_stack_info['ecosystem']].append(stack_info_template['stack'])
                    stack_str = ','.join(stack_info_template['stack'])
                    stacks_list[user_stack_info['ecosystem']].append(stack_str)

                    unknown_dependencies = []
                    for dep in user_stack_info['unknown_dependencies']:
                        dep['package'] = dep.pop('name')
                        unknown_dependencies.append(dep)
                    stack_info_template['unknown_dependencies'] = self.normalize_deps_list(
                        unknown_dependencies)
                    all_unknown_deps[user_stack_info['ecosystem']].\
                        append(stack_info_template['unknown_dependencies'])

                    stack_info_template['license']['unknown'] = \
                        user_stack_info['license_analysis']['unknown_licenses']['really_unknown']
                    all_unknown_lic.append(stack_info_template['license']['unknown'])

                    for pkg in user_stack_info['analyzed_dependencies']:
                        for cve in pkg['security']:
                            stack_info_template['security']['cve_list'].append(cve)
                            all_cve_list.append('{cve}:{cvss}'.
                                                format(cve=cve['CVE'], cvss=cve['CVSS']))

                    ended_at, started_at = \
                        data[0]['_audit']['ended_at'], data[0]['_audit']['started_at']

                    response_time = self.datediff_in_millisecs(started_at, ended_at)
                    stack_info_template['response_time'] = '%f ms' % response_time
                    total_response_time['all'] += response_time
                    total_response_time[stack_info_template['ecosystem']] += response_time
                    template['stacks_details'].append(stack_info_template)
                except (IndexError, KeyError, TypeError) as e:
                    logger.exception('Error: %r' % e)
                    continue

            unique_stacks_with_recurrence_count = {
                'npm': self.populate_key_count(stacks_list['npm']),
                'maven': self.populate_key_count(stacks_list['maven']),
                'pypi': self.populate_key_count(stacks_list['pypi'])
            }

            unique_stacks_with_deps_count = \
                self.set_unique_stack_deps_count(unique_stacks_with_recurrence_count)

            avg_response_time = {}
            if total_stack_requests['npm'] > 0:
                avg_response_time['npm'] = total_response_time['npm'] / total_stack_requests['npm']
            else:
                avg_response_time['npm'] = 0

            if total_stack_requests['maven'] > 0:
                avg_response_time['maven'] = \
                    total_response_time['maven'] / total_stack_requests['maven']
            else:
                avg_response_time['maven'] = 0

            if total_stack_requests['pypi'] > 0:
                avg_response_time['pypi'] = \
                    total_response_time['pypi'] / total_stack_requests['pypi']
            else:
                avg_response_time['pypi'] = 0

            # Get a list of unknown licenses
            unknown_licenses = []
            for lic_dict in self.flatten_list(all_unknown_lic):
                if 'license' in lic_dict:
                    unknown_licenses.append(lic_dict['license'])

            unknown_deps_ingestion_report = self.unknown_deps_helper.get_current_ingestion_status()

            # generate aggregated data section
            template['stacks_summary'] = {
                'total_stack_requests_count': total_stack_requests['all'],
                'npm': self.get_ecosystem_summary('npm', total_stack_requests, all_deps,
                                                  all_unknown_deps,
                                                  unique_stacks_with_recurrence_count,
                                                  unique_stacks_with_deps_count,
                                                  avg_response_time,
                                                  unknown_deps_ingestion_report),
                'maven': self.get_ecosystem_summary('maven', total_stack_requests, all_deps,
                                                    all_unknown_deps,
                                                    unique_stacks_with_recurrence_count,
                                                    unique_stacks_with_deps_count,
                                                    avg_response_time,
                                                    unknown_deps_ingestion_report),
                'pypi': self.get_ecosystem_summary('pypi', total_stack_requests, all_deps,
                                                   all_unknown_deps,
                                                   unique_stacks_with_recurrence_count,
                                                   unique_stacks_with_deps_count,
                                                   avg_response_time,
                                                   unknown_deps_ingestion_report),
                'unique_unknown_licenses_with_frequency':
                    self.populate_key_count(unknown_licenses),
                'unique_cves':
                    self.populate_key_count(all_cve_list),
                'total_average_response_time':
                    '{} ms'.format(total_response_time['all'] / len(template['stacks_details'])),
                'cve_report': CVE().generate_cve_report(updated_on=start_date)
            }

            # monthly data collection on the 1st of every month
            if frequency == 'monthly':
                self.collate_raw_data(unique_stacks_with_recurrence_count, 'monthly')

            # return data to re-train models or generate venus report
            if retrain is True:
                return unique_stacks_with_recurrence_count
            else:
                venus_input = [frequency, report_name, template]
                return venus_input
        else:
            # todo: user feedback aggregation based on the recommendation task results
            return None

    def retrieve_worker_results(self, start_date, end_date, id_list=[], worker_list=[],
                                frequency='daily', retrain=False):
        """Retrieve results for selected worker from RDB."""
        result_interim = {}
        # convert the elements of the id_list to sql.Literal
        # so that the SQL query statement contains the IDs within quotes
        id_list = list(map(sql.Literal, id_list))
        ids = sql.SQL(', ').join(id_list).as_string(self.conn)

        for worker in worker_list:
            query = sql.SQL('SELECT {} FROM {} WHERE {} IN (%s) AND {} = \'%s\'').format(
                sql.Identifier('task_result'), sql.Identifier('worker_results'),
                sql.Identifier('external_request_id'), sql.Identifier('worker')
            )

            self.cursor.execute(query.as_string(self.conn) % (ids, worker))
            data = json.dumps(self.cursor.fetchall())
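            # When retraining, only the unique stacks are needed, so return the
            # normalized data for the first worker immediately.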

            if retrain is True:
                unique_stacks = self.normalize_worker_data(start_date, end_date,
                                                           data, worker, frequency, retrain)
                return unique_stacks
            else:
                # associate the retrieved data to the worker name
                result_interim[worker] = \
                    self.normalize_worker_data(start_date, end_date, data,
                                               worker, frequency, retrain)
        return result_interim

    def retrieve_ingestion_results(self, start_date, end_date, frequency='daily'):
        """Retrieve results for selected worker from RDB."""
        logger.info('Retrieve ingestion results.')
        result = {}

        # Query to fetch the EPVs that were ingested on a particular day

        query = sql.SQL('SELECT EC.NAME, PK.NAME, VR.IDENTIFIER FROM ANALYSES AN,'
                        ' PACKAGES PK, VERSIONS VR, ECOSYSTEMS EC WHERE'
                        ' AN.STARTED_AT >= \'%s\' AND AN.STARTED_AT < \'%s\''
                        ' AND AN.VERSION_ID = VR.ID AND VR.PACKAGE_ID = PK.ID'
                        ' AND PK.ECOSYSTEM_ID = EC.ID')

        self.cursor.execute(query.as_string(self.conn) % (start_date, end_date))
        data = json.dumps(self.cursor.fetchall())
        result['EPV_DATA'] = data
        return self.normalize_ingestion_data(start_date, end_date, result, frequency)

    def generate_results(self, epvs, template, pkg_output, ver_output):
        """Get package information from graph."""
        template['ingestion_summary']['incorrect_latest_version'] = {}
        template['ingestion_summary']['unknown_deps'] = {}
        count = {}
        latest_epvs = []
        checked_pkgs = []
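        # pkg_output and ver_output are keyed by "@DELIM@"-joined identifiers:
        # "<ecosystem>@DELIM@<package>" and "<ecosystem>@DELIM@<package>@DELIM@<version>".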
        for epv in epvs:
            eco = epv['ecosystem']
            pkg = epv['name']
            ver = epv['version']

            # Add parameters to count the different params
            if eco not in count:
                count[eco] = {
                    'incorrect_latest_versions': 0,
                    'correct_latest_versions': 0,
                    'ingested_in_graph': 0,
                    'not_ingested_in_graph': 0
                }
            if eco not in template['ingestion_summary']['incorrect_latest_version']:
                template['ingestion_summary']['incorrect_latest_version'][eco] = []
                template['ingestion_summary']['unknown_deps'][eco] = []
            pkg_data = pkg_output[eco + "@DELIM@" + pkg]
            ver_data = ver_output[eco + "@DELIM@" + pkg + "@DELIM@" + ver]
            actual_latest_ver = pkg_data['actual_latest_version']

            # check if the package is publicly available
            if actual_latest_ver:
                known_latest_ver = pkg_data['known_latest_version']
                if actual_latest_ver != known_latest_ver and (eco + "@DELIM@" + pkg
                                                              not in checked_pkgs):
                    checked_pkgs.append(eco + "@DELIM@" + pkg)
                    tmp = {
                        "package": pkg,
                        "actual_latest_version": actual_latest_ver,
                        "known_latest_version": known_latest_ver
                    }
                    template['ingestion_summary']['incorrect_latest_version'][eco].append(tmp)
                    count[eco]['incorrect_latest_versions'] += 1

                template['ingestion_details'][eco][pkg]['known_latest_version'] \
                    = known_latest_ver
                template['ingestion_details'][eco][pkg]['actual_latest_version'] \
                    = actual_latest_ver
                non_cve_version = pkg_data.get('non_cve_version', '')
                if non_cve_version:
                    template['ingestion_details'][eco][pkg]['non_cve_version'] \
                        = non_cve_version
                latest_json = {
                    "ecosystem": eco,
                    "name": pkg,
                    "version": actual_latest_ver
                }
                latest_epvs.append(latest_json)

                # Count the correct latest version EPVs
                if actual_latest_ver == known_latest_ver:
                    count[eco]['correct_latest_versions'] += 1

                # Add to report if the EPV exist in the graph or not
                template['ingestion_details'][eco][pkg][ver]['synced_to_graph'] = ver_data
                if ver_data == "false":
                    template['ingestion_summary']['unknown_deps'][eco].append(epv)
                    count[eco]['not_ingested_in_graph'] += 1
                else:
                    count[eco]['ingested_in_graph'] += 1
            else:
                # Mark the package as private as the information is not present publicly
                template['ingestion_details'][eco][pkg]['private_pkg'] = "true"

        # For each ecosystem, calculate the %age accuracy
        for eco in count:
            correct = count[eco]['correct_latest_versions']
            incorrect = count[eco]['incorrect_latest_versions']
            # Calculate the %age of latest version accuracy
            if correct != 0 or incorrect != 0:
                count[eco]['latest_version_accuracy'] = round(((correct * 100) /
                                                               (correct + incorrect)), 2)

            correct = count[eco]['ingested_in_graph']
            incorrect = count[eco]['not_ingested_in_graph']
            # Calculate the %age of successful ingestion
            if correct != 0 or incorrect != 0:
                count[eco]['ingestion_accuracy'] = round(((correct * 100) /
                                                          (correct + incorrect)), 2)

            # Rectify the latest versions only if present
            if count[eco]['incorrect_latest_versions'] != 0:
                summary = template['ingestion_summary']
                logger.info("Information related to incorrect latest version --")
                logger.info(summary['incorrect_latest_version'][eco])
                rectify_latest_version(summary['incorrect_latest_version'][eco], eco)
        template['ingestion_summary']['stats'] = count
        return template, latest_epvs

    def check_latest_node(self, latest_epvs, template):
        """Get if latest node is present in graph."""
        graph_output = generate_report_for_unknown_epvs(latest_epvs)
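        # graph_output is keyed by "<ecosystem>@DELIM@<package>@DELIM@<version>";
        # a value of "false" means the latest EPV node is missing from the graph.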
        missing_latest = {}
        for epv in latest_epvs:
            eco = epv['ecosystem']
            pkg = epv['name']
            ver = epv['version']
            output = graph_output[eco + "@DELIM@" + pkg + "@DELIM@" + ver]
            template['ingestion_details'][eco][pkg]['latest_node_in_graph'] = output

            # If the EPV is missing in graph, add it to the summary
            if output == "false":
                if eco not in missing_latest:
                    missing_latest[eco] = []
                tmp = {
                    "package": pkg,
                    "version": ver
                }
                missing_latest[eco].append(tmp)
        template['ingestion_summary']['missing_latest_node'] = missing_latest
        return template

    def populate_default_information(self, epv_data, template):
        """To populate the default information in the template."""
        epvs = []
        ing_details = {}
        for epv in epv_data:
            eco = epv[0]
            pkg = epv[1]
            ver = epv[2]
            epv_template = {
                'ecosystem': eco,
                'name': pkg,
                'version': ver
            }
            epvs.append(epv_template)

            # Add eco key in json if missing
            if eco not in ing_details:
                ing_details[eco] = {}

            # Add pkg key in json if missing
            if pkg not in ing_details[eco]:
                ing_details[eco][pkg] = {}

            # Add version key in json if missing
            if ver not in ing_details[eco][pkg]:
                ing_details[eco][pkg][ver] = {}

        # Add the EPV information to the template
        template['ingestion_details'] = ing_details
        return template, epvs

    def normalize_ingestion_data(self, start_date, end_date, ingestion_data, frequency='daily'):
        """Normalize worker data for reporting."""
        logger.info("Normalize Ingestion Data started")
        report_type = 'ingestion-data'
        if frequency == 'monthly':
            report_name = dt.strptime(end_date, '%Y-%m-%d').strftime('%Y-%m')
        else:
            report_name = dt.strptime(end_date, '%Y-%m-%d').strftime('%Y-%m-%d')

        template = {
            'report': {
                'from': start_date,
                'to': end_date,
                'generated_on': dt.now().isoformat('T')
            },
            'ingestion_summary': {},
            'ingestion_details': {}
        }

        epv_data = ingestion_data['EPV_DATA']
        epv_data = json.loads(epv_data)
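        # Each row is an (ecosystem, package, version) tuple, matching the
        # SELECT EC.NAME, PK.NAME, VR.IDENTIFIER columns of the ingestion query.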

        # Populate the default template with EPV info
        template, epvs = self.populate_default_information(epv_data, template)

        logger.info("Fetching details of the latest version for the epvs")
        today = dt.today()
        pkg_output = generate_report_for_latest_version(epvs, today)
        logger.info("Fetching details of the unknown packages for the epvs")
        ver_output = generate_report_for_unknown_epvs(epvs)

        # Call the function to add the package information to the template
        template, latest_epvs = self.generate_results(epvs, template, pkg_output, ver_output)

        # Call the function to get the availability of latest node
        logger.info("Checking if latest node exists in graph")
        template = self.check_latest_node(latest_epvs, template)

        # Saving the final report in the relevant S3 bucket
        try:
            obj_key = '{type}/epv/{report_name}.json'.format(
                type=report_type, report_name=report_name
            )
            self.s3.store_json_content(content=template, obj_key=obj_key,
                                       bucket_name=self.s3.report_bucket_name)
        except Exception as e:
            logger.exception('Unable to store the report on S3. Reason: %r' % e)
        return template

    def get_report(self, start_date, end_date, frequency='daily', retrain=False):
        """Generate the stacks report."""
        logger.info("Get Report Executed.")
        ids = self.retrieve_stack_analyses_ids(start_date, end_date)
        worker_list = ['stack_aggregator_v2']
        ingestion_results = False

        if frequency == 'daily':
            start = datetime.datetime.now()
            result = self.retrieve_ingestion_results(start_date, end_date)
            elapsed_seconds = (datetime.datetime.now() - start).total_seconds()
            logger.info(
                "It took {t} seconds to generate ingestion report.".format(
                    t=elapsed_seconds))
            if result['ingestion_details'] != {}:
                ingestion_results = True
            else:
                ingestion_results = False

            result = self.sentry_helper.retrieve_sentry_logs(start_date, end_date)
            if not result:
                logger.error('No Sentry Error Logs found in last 24 hours')
        if len(ids) > 0:
            logger.info('stack analyses data exists.')
            result_interim = self.retrieve_worker_results(
                start_date, end_date, ids, worker_list, frequency, retrain)

            # generate result for each worker
            worker_result = {}
            for worker in worker_list:
                if worker == 'stack_aggregator_v2':
                    worker_result[worker] = self.create_venus_report(result_interim[worker])
                # can add results for more workers later

            return worker_result, ingestion_results
        else:
            logger.error('No stack analyses found from {s} to {e} to generate an aggregated report'
                         .format(s=start_date, e=end_date))
            return False, ingestion_results

    def re_train(self, start_date, end_date, frequency='weekly', retrain=True):
        """Re-trains models for all ecosystems."""
        logger.info('retraining triggered.')
        ids = self.retrieve_stack_analyses_ids(start_date, end_date)
        if len(ids) > 0:
            unique_stacks = self.retrieve_worker_results(
                start_date, end_date, ids, ['stack_aggregator_v2'], frequency, retrain)
            # collate stacks and re-train models for all ecosystems
            self.collate_and_retrain(unique_stacks, frequency)

        else:
            logger.error('No stack analyses found from {s} to {e} to re-train models'
                         .format(s=start_date, e=end_date))