def get_repositories_info(query, number_of_repos):
    """
    Sends search request by GitHub REST API. Processes the response and
    returns randomly sampled repositories from the search results.

    :param query: query string for the GitHub search API
    :param number_of_repos: max number of repositories to be downloaded (> 0)
    :return: list of Repository objects
    :raise: ValueError if number_of_repos is not positive
    :raise: ConnectionError if network problem
    :raise: TimeoutError if no response

    Usage:
    >>> r = RepositoryDownloader()
    >>> query = RESTUtils.create_search_query(searched_phrase='tetris', language='assembly')
    >>> r.get_repositories_info(query=query)
    """
    if number_of_repos <= 0:
        # raise instead of assert: asserts are stripped under `python -O`
        raise ValueError('number_of_repos must be positive')
    # By default GitHub returns 30 results per page; request at least that many.
    results_per_page = number_of_repos if number_of_repos > 30 else 30
    # From GitHub API: "Only the first 1000 search results are available"
    max_page = 1000 // results_per_page
    query_dict = {'q': query, 'per_page': results_per_page}
    # First request is only to get the total count of results.
    req = RESTUtils.make_get_request(url='https://api.github.com/search/repositories',
                                     params=query_dict)
    # Pick a random page. The last page is omitted since it may be incomplete;
    # page 0 is equal to page 1 in the GitHub API.
    pages = req.json()['total_count'] // results_per_page
    pages = max_page if pages > max_page else pages
    # Guard against fewer results than one full page: randint(1, 0) would raise.
    pages = max(pages, 1)
    page = random.randint(1, pages)
    query_dict['page'] = page  # add random page to query
    req = RESTUtils.make_get_request(url='https://api.github.com/search/repositories',
                                     params=query_dict)
    logging.info('Searching random repositories. '
                 'Request {0}'.format(req.url) +
                 'Page {0}, count {1}'.format(page, number_of_repos))
    items = req.json()['items']
    # Clamp in case the API returned fewer items than requested.
    number_of_repos = len(items) if number_of_repos > len(items) else number_of_repos
    items = random.sample(items, number_of_repos)  # random elements from items
    return [Repository.from_json(item) for item in items]
def save_file(self, file_json):
    """
    Downloads and saves file. Creates directory for file if needed.

    :param file_json: file information in JSON (uses 'path' and 'download_url' keys)
    :return:
    :raise: ConnectionError if network problem
    :raise: TimeoutError if no response
    """
    filepath = os.path.join(self._downloadDirectoryPath,
                            self._repositoryDirectoryName,
                            file_json['path'])
    os.makedirs(os.path.dirname(filepath), exist_ok=True)  # create dir if needed
    try:
        # Fetch the content BEFORE opening the output file, so a failed
        # request cannot leave an empty file behind on disk.
        req = RESTUtils.make_get_request(file_json['download_url'])
        with open(filepath, "wb") as out_file:
            out_file.write(req.content)
        logging.info('Saved file: ' + filepath)
        print(filepath)
    except TimeoutError as err:
        logging.warning('Timeout while downloading file {0}'.format(filepath) +
                        'Error message: {0}'.format(str(err)))
        # Remove any partially written file; it may not exist at all.
        try:
            os.remove(filepath)
            logging.warning('Deleting file {0}'.format(filepath))
        except FileNotFoundError:
            pass
        print('TimeoutError. {0} is skipped'.format(filepath))
    except ConnectionError as err:
        logging.error('ConnectionError while downloading file {0}'.format(filepath) +
                      'Error message: {0}'.format(str(err)))
        raise
def download_contents_from_url(self, url, number_of_files):
    """
    Downloads content from url.

    :param url: url to download from
    :param number_of_files: max number of files to be downloaded
    :return: number of downloaded files
    :raise: ConnectionError if network problem
    :raise: TimeoutError if no response
    """
    try:
        response = RESTUtils.make_get_request(url)
    except (ConnectionError, TimeoutError) as err:
        # Log the failure for diagnostics, then let the caller decide what to do.
        logging.error('Downloading content by REST request failed. '
                      'Error msg {0}'.format(str(err)))
        raise
    contents = response.json()
    return self.iterate_through_files_and_save_them(contents, number_of_files)
def download_repositories(self, language, search_phrase='', number_of_repositories=10,
                          number_of_files_for_repo=5):
    """
    Downloads found repositories, one worker thread per repository.

    :param language: repository language
    :param search_phrase: phrase to be searched. If no search_phrase is given,
        search is done with language only.
    :param number_of_repositories: max number of repositories to be found
    :param number_of_files_for_repo: max number of files for each repository
    """
    query = RESTUtils.create_search_query(searched_phrase=search_phrase, language=language)
    try:
        repos = RepositoryDownloader.get_repositories_info(query, number_of_repositories)
        logging.info('Repositories info downloaded.')
    except ConnectionError as err:
        logging.error('Network problem while downloading repositories info. '
                      'Error message: {0}'.format(str(err)))
        print('Network problem. Repositories cannot be downloaded.')
        sys.exit(1)
    except TimeoutError as err:
        logging.error('Timeout while downloading repositories info. '
                      'Error message: {0}'.format(str(err)))
        print('Timeout problem. Repository download time was exceeded.')
        sys.exit(1)
    except Exception as err:
        logging.error('Unknown exception while downloading repositories info. '
                      'Error message: {0}'.format(str(err)))
        print('Unknown exception while downloading repositories info.')
        print('Error message: {0}'.format(str(err)))
        sys.exit(1)

    # noinspection PyShadowingNames
    def thread_target(repo, download_directory_path, file_extensions, number_of_files_for_repo):
        # exception_bucket is a closure: worker threads report failures through it
        # so the main thread can re-raise them after join.
        try:
            repo.download_repository(download_directory_path, file_extensions,
                                     number_of_files_for_repo)
        except ConnectionError as err:
            logging.error('Network problem while downloading repository content. '
                          'Error message: {0}'.format(str(err)))
            exception_bucket.put(sys.exc_info())
        except TimeoutError as err:
            logging.error('TimeoutError while downloading repository content. '
                          'Error message: {0}'.format(str(err)))
            exception_bucket.put(sys.exc_info())
        except Exception as err:
            logging.error('Unknown exception while downloading repositories info. '
                          'Error message: {0}'.format(str(err)))
            exception_bucket.put(sys.exc_info())

    threads = []
    exception_bucket = queue.Queue()
    for repo in repos:
        t = threading.Thread(target=thread_target,
                             args=(repo, self.downloadDirectoryPath, self.fileExtensions,
                                   number_of_files_for_repo),
                             daemon=True)
        threads.append(t)
    for (t, repo) in zip(threads, repos):
        t.start()
        print('Thread for {name} {url} started.'.format(name=repo.full_name,
                                                        url=repo.html_url))
        logging.info('Thread for {name} {url} started.'.format(name=repo.full_name,
                                                               url=repo.html_url))
    print('All threads for downloading repositories started')
    logging.info('All threads for downloading repositories started')
    for (t, repo) in zip(threads, repos):
        t.join()
        try:
            exc = exception_bucket.get(block=False)
            exc_type, exc_obj, exc_trace = exc
            # Re-raise with the worker's traceback so logs point at the real failure.
            raise exc_obj.with_traceback(exc_trace)
        except queue.Empty:
            # No failure reported by this worker; carry on.
            pass
        except ConnectionError as err:
            logging.error('Network problem while downloading repository content. '
                          'Error message: {0}'.format(str(err)))
            print('Network problem. Repository content cannot be downloaded. Program ends')
            sys.exit(1)
        except TimeoutError as err:
            logging.error('TimeoutError while downloading repository content. '
                          'Error message: {0}'.format(str(err)))
            print('TimeoutError. Repository content cannot be downloaded. Program skips.')
        except Exception as err:
            logging.error('Unknown exception while downloading repositories info. '
                          'Error message: {0}'.format(str(err)))
            print('Unknown exception while downloading repositories info.')
            print('Error message: {0}'.format(str(err)))
            sys.exit(1)
        print('Thread for {name} {url} joined.'.format(name=repo.full_name,
                                                       url=repo.html_url))
        logging.info('Thread for {name} {url} joined.'.format(name=repo.full_name,
                                                              url=repo.html_url))
    print('All threads for downloading repositories joined')
    logging.info('All threads for downloading repositories joined')