from multiprocessing.dummy import Pool  # thread-based pool, matching the MAX_THREADS setting

# Arguments, Fyle, ProcessTimer, GithubSearch, RequestProcess, GithubResponseParser,
# SearchContent and the constants (START_PAGE_NUMBER, MAX_THREADS) are assumed to
# come from the project's own modules.


class GithubCrawl:
    def __init__(self):
        self.arguments = Arguments()
        self.organization = self.arguments.get_organization()
        self.query = self.arguments.get_search_query()
        self.fyle = Fyle()
        self.process_timer = ProcessTimer()
        self.github_search = GithubSearch(self.organization, self.query)

    def get_elapsed_minutes(self):
        return self.process_timer.get_elapsed_minutes()

    def start(self):
        self.total_pages = self.github_search.result_total_pages()
        self.url = self.github_search.get_url()
        print(f'\n\nTotal Pages: {self.total_pages}')
        # + 1 keeps the final results page in range (assuming 1-indexed pages,
        # the usual GitHub search convention).
        for page_number in range(START_PAGE_NUMBER, self.total_pages + 1):
            print(f'\nProcessing: {self.url}\nPage number: {page_number}\n')
            self.process_page(page_number)

    def process_page(self, page_number):
        results = self.github_search.page_result(page_number)
        if not results:
            return None
        # Fan one page of search results out across a pool of worker threads.
        pool = Pool(MAX_THREADS)
        for result in results:
            pool.apply_async(self.search_content, (result,))
        pool.close()
        pool.join()

    def search_content(self, result):
        # Skip archived repositories; their contents are stale.
        if self.is_archived(result):
            return None
        url = result.get_item_url()
        request_process = RequestProcess(url)
        response = request_process.get()
        github_response_parser = GithubResponseParser(response)
        content = github_response_parser.get_page_content()
        search_content = SearchContent(content)
        search_content.extract_urls()

    def is_archived(self, result):
        repository_url = result.get_repository_url()
        if not repository_url:
            return False
        request_process = RequestProcess(repository_url)
        response = request_process.get()
        github_response_parser = GithubResponseParser(response)
        return github_response_parser.get_archived_information()
import base64
import re

# Fyle, Arguments and the regex/protocol constants (ALL_URLS_SEARCH, URL_REGEX,
# URL_REGEX_START, URL_REGEX_END, DECODE_FORMAT, GIT_PROTOCOL, HTTPS_PROTOCOL,
# HTTP, RAW_GITHUB_URL, GITHUB_URL, DELIMITER) are assumed to come from the
# project's own modules.


class SearchContent:
    def __init__(self, content):
        self.fyle = Fyle()
        self.arguments = Arguments()
        self.search_query = self.arguments.get_search_query()
        self.regex = self.format_regex()
        self.content = self.decode_base64(content)
        self.matches = []

    def format_regex(self):
        # Either match every URL, or only URLs containing the search query.
        if ALL_URLS_SEARCH:
            return self.format_full_url_regex()
        return self.format_custom_url_regex()

    def format_custom_url_regex(self):
        return URL_REGEX_START + re.escape(self.search_query) + URL_REGEX_END

    def format_full_url_regex(self):
        return URL_REGEX

    def decode_base64(self, content):
        # The GitHub API returns file contents base64-encoded.
        try:
            return base64.b64decode(content).decode(DECODE_FORMAT)
        except Exception as e:
            print(e)
            return None

    def extract_urls(self):
        # decode_base64 returns None on failure; bail out instead of crashing.
        if not self.content:
            return self.matches
        if GIT_PROTOCOL in self.content:
            self.content = self.content.replace(GIT_PROTOCOL, HTTPS_PROTOCOL)
        for match in re.finditer(self.regex, self.content):
            # Normalise the URL first so the duplicate check compares
            # like with like against the formatted URLs already stored.
            url = self.format_url(match.group())
            if url not in self.matches:
                print(f'Found {url}')
                self.fyle.write(url)
                self.matches.append(url)
        return self.matches

    def get_matches_string(self):
        if len(self.matches) == 0:
            return None
        return DELIMITER.join(self.matches)

    def format_url(self, url):
        # Ensure a scheme, and map raw.githubusercontent.com links back to github.com.
        if not url.startswith(HTTP):
            url = f'{HTTPS_PROTOCOL}{url}'
        if url.startswith(RAW_GITHUB_URL):
            url = url.replace(RAW_GITHUB_URL, GITHUB_URL)
        return url
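# A minimal driver, sketching how the two classes above are meant to be wired
# together. The __main__ guard and the elapsed-time print are assumptions for
# illustration, not part of the original source.
if __name__ == '__main__':
    crawler = GithubCrawl()  # reads the organization and query from Arguments()
    crawler.start()          # walks every result page, extracting and saving URLs
    print(f'Finished in {crawler.get_elapsed_minutes()} minutes')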