Example 1
# Assumed imports for this example: Pool is taken to be the thread-backed pool
# from the standard library, given the MAX_THREADS constant. Arguments, Fyle,
# ProcessTimer, GithubSearch, RequestProcess, GithubResponseParser,
# SearchContent and the START_PAGE_NUMBER / MAX_THREADS constants are
# project-level helpers imported from elsewhere in the repository.
from multiprocessing.dummy import Pool


class GithubCrawl:
    def __init__(self):

        self.arguments = Arguments()
        self.organization = self.arguments.get_organization()
        self.query = self.arguments.get_search_query()

        self.fyle = Fyle()
        self.process_timer = ProcessTimer()
        self.github_search = GithubSearch(self.organization, self.query)

    def get_elapsed_minutes(self):

        return self.process_timer.get_elapsed_minutes()

    def start(self):

        self.total_pages = self.github_search.result_total_pages()
        self.url = self.github_search.get_url()

        print(f'\n\nTotal Pages: {self.total_pages}')

        # range() is half-open, so pages run from START_PAGE_NUMBER up to,
        # but not including, self.total_pages.
        for page_number in range(START_PAGE_NUMBER, self.total_pages):

            print(f'\nProcessing: {self.url}\nPage number: {page_number}\n')

            self.process_page(page_number)

    def process_page(self, page_number):

        results = self.github_search.page_result(page_number)
        if not results:
            return None

        # Process the page's results concurrently, one task per result,
        # with at most MAX_THREADS workers.
        pool = Pool(MAX_THREADS)

        for result in results:
            pool.apply_async(self.search_content, (result, ))

        # No further tasks will be submitted; wait for the outstanding ones.
        pool.close()
        pool.join()

    def search_content(self, result):

        # Skip results whose repository is archived.
        if self.is_archived(result):
            return None

        # Fetch the matched file and pull its content out of the response.
        url = result.get_item_url()
        request_process = RequestProcess(url)
        response = request_process.get()

        github_response_parser = GithubResponseParser(response)
        content = github_response_parser.get_page_content()

        # Scan the content for URLs matching the configured query.
        search_content = SearchContent(content)
        search_content.extract_urls()

    def is_archived(self, result):

        repository_url = result.get_repository_url()

        if not repository_url:
            return False

        request_process = RequestProcess(repository_url)

        response = request_process.get()
        github_response_parser = GithubResponseParser(response)

        return github_response_parser.get_archived_information()
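
A minimal driver sketch, assuming the project's Arguments class reads the organization and search query from the command line; the entry-point wiring below is hypothetical and not part of the original example.

# Hypothetical entry point: build the crawler, run it, report the elapsed time.
if __name__ == '__main__':
    crawler = GithubCrawl()
    crawler.start()
    print(f'Elapsed minutes: {crawler.get_elapsed_minutes()}')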
Example 2
# Standard-library imports used below. Fyle, Arguments and the upper-case
# configuration constants are project-level helpers defined elsewhere.
import base64
import re


class SearchContent:

    def __init__(self, content):

        self.fyle = Fyle()
        self.arguments = Arguments()

        self.search_query = self.arguments.get_search_query()
        self.regex = self.format_regex()

        self.content = self.decode_base64(content)
        self.matches = []


    def format_regex(self):
        
        if ALL_URLS_SEARCH:
            return self.format_full_url_regex()

        return self.format_custom_url_regex()
        

    def format_custom_url_regex(self):

        return URL_REGEX_START + re.escape(self.search_query) + URL_REGEX_END


    def format_full_url_regex(self):

        return URL_REGEX


    def decode_base64(self, content):

        # GitHub returns file content base64-encoded; decode it, or return
        # None if the payload cannot be decoded.
        try:
            return base64.b64decode(content).decode(DECODE_FORMAT)
        except Exception as e:
            print(e)
            return None


    def extract_urls(self):

        # Decoding may have failed and left content as None; nothing to scan.
        if not self.content:
            return self.matches

        if GIT_PROTOCOL in self.content:
            self.content = self.content.replace(GIT_PROTOCOL, HTTPS_PROTOCOL)

        for match in re.finditer(self.regex, self.content):

            # Normalise the URL before the duplicate check, since self.matches
            # stores the formatted form.
            url = self.format_url(match.group())

            if url not in self.matches:
                print(f'Found {url}')
                self.fyle.write(url)
                self.matches.append(url)

        return self.matches


    def get_matches_string(self):

        if len(self.matches) == 0:
            return None

        return DELIMITER.join(self.matches)

    
    def format_url(self, url):

        # Prefix scheme-less matches with HTTPS and rewrite raw GitHub URLs
        # to the regular GitHub form.
        if not url.startswith(HTTP):
            url = f'{HTTPS_PROTOCOL}{url}'

        if url.startswith(RAW_GITHUB_URL):
            url = url.replace(RAW_GITHUB_URL, GITHUB_URL)

        return url
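
A small usage sketch, assuming the project's configuration constants (ALL_URLS_SEARCH, the URL_* regex strings, DECODE_FORMAT, the protocol prefixes and DELIMITER) are defined and that Fyle and Arguments can be constructed standalone; the sample string is illustrative only.

# Hypothetical usage: SearchContent expects base64-encoded file content, so the
# sample text is encoded before being handed to the class.
encoded = base64.b64encode(b'docs at https://github.com/example/repo')
search_content = SearchContent(encoded)
urls = search_content.extract_urls()
print(search_content.get_matches_string())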