def run(self, since=None, until=None):
    """
    Run the dist-git scraper.

    :param str since: a datetime to start scraping data from
    :param str until: a datetime to scrape data until
    """
    log.info('Starting initial load of dist-git commits')
    # Fall back to the scraper's configured window when no bounds are given.
    start_date = self.default_since if since is None else timestamp_to_date(since)
    end_date = self.default_until if until is None else timestamp_to_date(until)
    results = self.get_distgit_data(start_date, end_date)
    total_results = len(results)
    log.info('Successfully fetched {0} results from Teiid'.format(
        total_results))
    # Replace the raw rows with their chunked form so only one copy is in RAM.
    results = list(self._get_result_chunks(results))
    # Push each chunk into Neo4j via short-lived worker processes. A Pool is
    # deliberately NOT used: letting every process exit returns its RAM to the
    # OS, which works around a memory leak in one of the libraries used that
    # couldn't be tracked down.
    concurrent_procs = 2
    procs = []
    for index, chunk in enumerate(results):
        # The limit can only be reached once at least that many have started.
        if index >= concurrent_procs:
            running = [worker for worker in procs if worker.is_alive()]
            if len(running) >= concurrent_procs:
                log.debug(
                    'There are already {0} processes running. Will wait until one of '
                    'them completes.'.format(len(running)))
                # Block on the oldest live worker before spawning another.
                running[0].join()
        worker = Process(
            target=self._update_neo4j,
            args=(neomodel_config.DATABASE_URL, total_results, chunk))
        worker.start()
        procs.append(worker)
    # Wait for all the processes to finish
    for worker in procs:
        worker.join()
    log.info('Initial load of dist-git commits complete!')
def run(self, since=None, until=None):
    """
    Run the Bugzilla scraper.

    :param str since: a datetime to start scraping data from
    :param str until: a datetime to scrape data until
    """
    log.info('Starting initial load of Bugzilla bugs')
    # Use the scraper's configured defaults when no explicit bounds are given.
    start_date = self.default_since if since is None else timestamp_to_date(since)
    end_date = self.default_until if until is None else timestamp_to_date(until)
    bugs = self.get_bugzilla_bugs(start_date, end_date)
    log.info('Successfully fetched {0} bugs from teiid'.format(len(bugs)))
    self.update_neo4j(bugs)
    log.info('Initial load of Bugzilla bugs complete!')
def run(self, since=None, until=None):
    """
    Run the Errata Tool scraper.

    :param str since: a datetime to start scraping data from
    :param str until: a datetime to scrape data until
    """
    log.info('Starting initial load of Errata advisories')
    # Use the scraper's configured defaults when no explicit bounds are given.
    start_date = self.default_since if since is None else timestamp_to_date(since)
    end_date = self.default_until if until is None else timestamp_to_date(until)
    advisories = self.get_advisories(start_date, end_date)
    log.info('Successfully fetched {0} advisories from Teiid'.format(
        len(advisories)))
    self.update_neo4j(advisories)
    log.info('Initial load of Errata advisories complete!')
def run(self, since=None, until=None):
    """
    Run the dist-git scraper.

    :param str since: a datetime to start scraping data from
    :param str until: a datetime to scrape data until
    """
    log.info('Starting initial load of dist-git commits and pushes')
    # Use the scraper's configured defaults when no explicit bounds are given.
    start_date = self.default_since if since is None else timestamp_to_date(since)
    end_date = self.default_until if until is None else timestamp_to_date(until)
    results = self.get_distgit_data(start_date, end_date)
    log.info('Successfully fetched {0} results from Teiid'.format(
        len(results)))
    self.update_neo4j(results)
    log.info('Initial load of dist-git commits and pushes complete!')
def run(self, since=None, until=None):
    """
    Run the Koji scraper.

    :param str since: a datetime to start scraping data from
    :param str until: a datetime to scrape data until
    """
    log.info('Starting initial load for Koji')
    # Initialize a start date from which all builds must be fetched
    # If no input is given by the user, fetch builds from the past two years
    start_date = self.default_since if since is None else utils.timestamp_to_date(since)
    end_date = self.default_until if until is None else utils.timestamp_to_date(until)
    builds = self.get_koji_builds(start_date, end_date)
    log.info('Successfully fetched {0} builds from teiid'.format(len(builds)))
    self.update_neo4j(builds)
    log.info('Initial load of Koji builds complete!')
def test_timestamp_to_date(input_date, expected_date):
    """Test that a timestamp can be converted to a date object."""
    converted = timestamp_to_date(input_date)
    assert converted == expected_date