def parse_search_results(self, response):
    """Parse GitHub's search results."""
    repo_link_xpath = ('//ul[@class="repo-list js-repo-list"]'
                       '/li/h3/a')
    repo_urls = set(
        response.xpath('{0}/@href'.format(repo_link_xpath)).extract())
    for project in PROJECTS.values():
        if project.short_url not in repo_urls:
            self.logger.error(
                'Could not find {0} among the repository URLs! Skipping '
                'project `{1}`...'.format(project.short_url, project.name))
            continue
        crawled_infos = CrawledInfos(project_name=project.name)
        # Follow the search-result link whose href matches the project.
        link_xpath = '{0}[@href="{1}"]'.format(repo_link_xpath,
                                               project.short_url)
        next_url = list(extract_links(response, xpaths=link_xpath))[0]
        request = Request(next_url, callback=self.parse_project, meta={
            'crawled_infos': crawled_infos,
            'project': project,
        })
        self.requests.append(request)
        yield request
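# --- Hypothetical sketch, not part of the original source ---
# The `extract_links` helper called above is assumed to wrap Scrapy's
# LinkExtractor and to yield absolute URL strings, since the spider passes
# its first result straight to Request. A minimal version could look like:
from scrapy.linkextractors import LinkExtractor

def extract_links(response, xpaths):
    """Yield absolute URLs for the links matched by the given XPath(s)."""
    extractor = LinkExtractor(restrict_xpaths=xpaths)
    for link in extractor.extract_links(response):
        yield link.url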
def parse_directory(self, response):
    """Parse a project's given directory.

    :meta crawled_infos: currently crawled information
    :type crawled_infos: :class:`CrawledInfos`
    :meta project: currently crawled project
    :type project: :class:`Project`
    """
    self.responses.append(response)
    crawled_infos = response.meta['crawled_infos']
    project = response.meta['project']
    item_xpath = '//a[@class="js-directory-link js-navigation-open"]'
    dir_items = set(
        response.xpath('{0}/text()'.format(item_xpath)).extract())
    for filename in project.dirs[crawled_infos.current_dir]:
        if filename not in dir_items:
            self.logger.error(
                'Could not find file `{0}` in directory `{2}` of '
                '`{1}`! Skipping item...'.format(
                    filename, project.short_url,
                    crawled_infos.current_dir))
            continue
        # Carry the file name along in a copy of the crawl state.
        _crawled_infos = crawled_infos.clone_and_update(filename=filename)
        link_xpath = '{0}[text()="{1}"]'.format(item_xpath, filename)
        next_url = list(extract_links(response, xpaths=link_xpath))[0]
        request = Request(next_url, callback=self.parse_file, meta={
            'crawled_infos': _crawled_infos,
            'project': project,
        })
        self.requests.append(request)
        yield request
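# --- Hypothetical sketch, not part of the original source ---
# CrawledInfos is assumed, from its use above, to be a small state container
# passed through request meta; clone_and_update returns a modified copy so
# that sibling requests never share mutable state. One possible shape:
class CrawledInfos(object):

    def __init__(self, project_name=None, current_dir=None, filename=None):
        self.project_name = project_name
        self.current_dir = current_dir
        self.filename = filename

    def clone_and_update(self, **kwargs):
        """Return a copy of this instance with the given fields replaced."""
        attrs = {'project_name': self.project_name,
                 'current_dir': self.current_dir,
                 'filename': self.filename}
        attrs.update(kwargs)
        return CrawledInfos(**attrs)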