class Fyle:
    def __init__(self):
        self.arguments = Arguments()
        # The output file is named after the organization being crawled
        self.filename = self.arguments.get_organization()

    def write(self, content):
        # Append mode lets successive writes accumulate results; the
        # context manager closes the file even if the write fails
        with open(f'{self.filename}.txt', 'a') as file:
            file.write(f'{content}\n')  # text mode converts \n to os.linesep
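For context, a hypothetical call site (the URL below is illustrative, not output from the crawler):

fyle = Fyle()
fyle.write('https://example.com/found-url')  # appends one line to <organization>.txt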
class GithubCrawl:
    def __init__(self):
        self.arguments = Arguments()
        self.organization = self.arguments.get_organization()
        self.query = self.arguments.get_search_query()
        self.fyle = Fyle()
        self.process_timer = ProcessTimer()
        self.github_search = GithubSearch(self.organization, self.query)

    def get_elapsed_minutes(self):
        return self.process_timer.get_elapsed_minutes()

    def start(self):
        self.total_pages = self.github_search.result_total_pages()
        self.url = self.github_search.get_url()
        print(f'\n\nTotal Pages: {self.total_pages}')
        # range() excludes its stop value, so add 1 to include the last page
        for page_number in range(START_PAGE_NUMBER, self.total_pages + 1):
            print(f'\nProcessing: {self.url}\nPage number: {page_number}\n')
            self.process_page(page_number)

    def process_page(self, page_number):
        results = self.github_search.page_result(page_number)
        if not results:
            return
        # Fan the page's results out across a pool of workers;
        # close() stops new submissions and join() waits for completion
        pool = Pool(MAX_THREADS)
        for result in results:
            pool.apply_async(self.search_content, (result,))
        pool.close()
        pool.join()

    def search_content(self, result):
        # Skip items whose repository has been archived
        if self.is_archived(result):
            return
        url = result.get_item_url()
        request_process = RequestProcess(url)
        response = request_process.get()
        github_response_parser = GithubResponseParser(response)
        content = github_response_parser.get_page_content()
        search_content = SearchContent(content)
        search_content.extract_urls()

    def is_archived(self, result):
        repository_url = result.get_repository_url()
        if not repository_url:
            return False
        request_process = RequestProcess(repository_url)
        response = request_process.get()
        github_response_parser = GithubResponseParser(response)
        return github_response_parser.get_archived_information()
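Wiring this together, a minimal entry point might look like the sketch below. It assumes GithubCrawl, its helper classes, and the START_PAGE_NUMBER and MAX_THREADS constants live in the same module, and that ProcessTimer starts counting when it is constructed (an assumption, since its implementation is not shown here):

if __name__ == '__main__':
    crawler = GithubCrawl()
    crawler.start()  # crawls every result page and spawns workers per page
    # get_elapsed_minutes() delegates to ProcessTimer (assumed to start on construction)
    print(f'Done in {crawler.get_elapsed_minutes()} minutes')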