コード例 #1
0
ファイル: fyle.py プロジェクト: arshadkazmi42/gh-crawl
class Fyle:
    """Append lines of text to ``<organization>.txt``.

    The base file name is whatever ``Arguments().get_organization()`` returns
    at construction time.
    """

    def __init__(self):
        """Resolve the output file's base name from the parsed arguments."""
        self.arguments = Arguments()
        self.filename = self.arguments.get_organization()

    def write(self, content):
        """Append ``content`` followed by a newline to ``<filename>.txt``.

        The file is opened in append mode for each call and is always closed,
        even if the write itself raises.
        """
        with open(f'{self.filename}.txt', 'a') as handle:
            # Text mode translates '\n' to the platform line separator.
            handle.write(f'{content}\n')
コード例 #2
0
class GithubCrawl:
    """Walk GitHub search result pages and scan each result's content.

    The collaborators used here (``Arguments``, ``Fyle``, ``ProcessTimer``,
    ``GithubSearch``, ``RequestProcess``, ``GithubResponseParser``,
    ``SearchContent``, ``Pool``) and the constants ``START_PAGE_NUMBER`` and
    ``MAX_THREADS`` are defined outside this class.
    """

    def __init__(self):
        """Read the organization and search query and build the collaborators."""
        self.arguments = Arguments()
        self.organization = self.arguments.get_organization()
        self.query = self.arguments.get_search_query()

        self.fyle = Fyle()
        self.process_timer = ProcessTimer()
        self.github_search = GithubSearch(self.organization, self.query)

        # Filled in by start(); declared here so the attributes always exist.
        self.total_pages = None
        self.url = None

    def get_elapsed_minutes(self):
        """Return the elapsed minutes reported by the process timer."""
        return self.process_timer.get_elapsed_minutes()

    def start(self):
        """Fetch the page count and search URL, then process each page in turn."""
        self.total_pages = self.github_search.result_total_pages()
        self.url = self.github_search.get_url()

        print(f'\n\nTotal Pages: {self.total_pages}')

        # NOTE(review): range() excludes its stop value, so a page numbered
        # exactly self.total_pages is never processed -- confirm this matches
        # the page numbering used by GithubSearch.
        for page_number in range(START_PAGE_NUMBER, self.total_pages):
            print(f'\nProcessing: {self.url}\nPage number: {page_number}\n')
            self.process_page(page_number)

    def process_page(self, page_number):
        """Run ``search_content`` over every result of one page via a pool.

        Returns ``None``. Nothing is scheduled when the page has no results.
        A failure in one result is reported and does not stop the others.
        """
        results = self.github_search.page_result(page_number)
        if not results:
            return None

        # The context manager tears the pool down even if scheduling or
        # waiting is interrupted; close() + join() inside the block still
        # wait for every submitted task to finish on the normal path.
        with Pool(MAX_THREADS) as pool:
            for result in results:
                pool.apply_async(
                    self.search_content,
                    (result,),
                    # Without a callback, worker exceptions vanish silently
                    # because the AsyncResult is never inspected.
                    error_callback=self._report_failure,
                )
            pool.close()
            pool.join()

        return None

    @staticmethod
    def _report_failure(error):
        """Print an exception raised by a pooled ``search_content`` call."""
        print(f'\nFailed to process result: {error!r}\n')

    def search_content(self, result):
        """Fetch one result's page and extract URLs from its content.

        Results for which ``is_archived`` returns a truthy value are skipped.
        """
        if self.is_archived(result):
            return None

        item_url = result.get_item_url()
        response = RequestProcess(item_url).get()

        content = GithubResponseParser(response).get_page_content()

        # Named differently from this method to avoid confusing shadowing.
        content_search = SearchContent(content)
        content_search.extract_urls()
        return None

    def is_archived(self, result):
        """Return the parser's archived information for the result's repository.

        Returns ``False`` without making a request when the result has no
        repository URL.
        """
        repository_url = result.get_repository_url()
        if not repository_url:
            return False

        response = RequestProcess(repository_url).get()
        return GithubResponseParser(response).get_archived_information()