def execute(self, ignore_errors=True, use_progressbar=False):
    """Collect StackOverflow keywords.

    Download the archive at ``self._STACKOVERFLOW_URL``, locate the
    ``Tags.xml`` entry in it and add every tag, together with its usage
    count, to a fresh ``KeywordsSet``.

    :param ignore_errors: not used by this collector
    :param use_progressbar: not used by this collector
    :return: KeywordsSet filled with the tags found in ``Tags.xml``
    :raises RuntimeError: if the download fails or the archive does not
        contain a ``Tags.xml`` entry
    """
    keywords_set = KeywordsSet()

    _logger.debug("Fetching StackOverflow")
    response = requests.get(self._STACKOVERFLOW_URL)
    if response.ok is not True:
        raise RuntimeError(
            "Failed to fetch '%s', request ended with status code %s"
            % (self._STACKOVERFLOW_URL, response.status_code))

    tags = None
    _logger.debug("Unpacking StackOverflow's tags archive")
    with libarchive.memory_reader(response.content) as archive:
        for entry in archive:
            if entry.name == 'Tags.xml':
                tags = xmltodict.parse(b"".join(entry.get_blocks()))
                break

    # Without this guard a missing entry surfaces later as an opaque
    # "'NoneType' object is not subscriptable" TypeError.
    if tags is None:
        raise RuntimeError(
            "Failed to find 'Tags.xml' in the archive fetched from '%s'"
            % self._STACKOVERFLOW_URL)

    for tag in tags['tags']['row']:
        try:
            keywords_set.add(tag['@TagName'], int(tag['@Count']))
        except ValueError:
            # The tag name was read successfully; only the count is malformed.
            _logger.warning(
                "Failed to parse number of occurrences for tag %s",
                tag['@TagName'])
            continue
        except KeyError:
            _logger.exception("Missing tagname or tag count")
            continue

    return keywords_set
def test_union_method_and_occurrence_count_1():
    """Union two disjoint keyword sets and verify membership and counts."""
    target = KeywordsSet()
    target.add("keyword1")

    other = KeywordsSet()
    other.add("keyword2")

    # pre-operation checks
    assert "keyword1" in target.keywords
    assert "keyword2" not in target.keywords
    assert target.keywords["keyword1"]["occurrence_count"] == 1

    assert "keyword1" not in other.keywords
    assert "keyword2" in other.keywords
    assert other.keywords["keyword2"]["occurrence_count"] == 1

    target.union(other)

    # post-operation checks: the target holds both keywords with their
    # original counts
    assert "keyword1" in target.keywords
    assert "keyword2" in target.keywords
    assert target.keywords["keyword1"]["occurrence_count"] == 1
    assert target.keywords["keyword2"]["occurrence_count"] == 1

    # the set passed to union() still holds only its own keyword
    assert "keyword1" not in other.keywords
    assert "keyword2" in other.keywords
def test_union_method():
    """Union two disjoint keyword sets and verify sizes and membership."""
    target = KeywordsSet()
    target.add("keyword1")

    other = KeywordsSet()
    other.add("keyword2")

    # pre-operation checks
    assert len(target.keywords) == 1
    assert len(other.keywords) == 1

    assert "keyword1" in target.keywords
    assert "keyword2" not in target.keywords

    assert "keyword1" not in other.keywords
    assert "keyword2" in other.keywords

    target.union(other)

    # post-operation checks: only the target grows
    assert len(target.keywords) == 2
    assert len(other.keywords) == 1

    assert "keyword1" in target.keywords
    assert "keyword2" in target.keywords

    assert "keyword1" not in other.keywords
    assert "keyword2" in other.keywords
def execute(self, ignore_errors=True, use_progressbar=False):
    """Collect Maven keywords.

    Run the Maven index checker jar to obtain the list of packages, drop
    the version information so each groupId/artifactId pair is visited
    once, and collect the tags shown on each package's mvnrepository.com
    page.

    :param ignore_errors: when true, a failed package page request is
        logged and skipped; otherwise it raises RuntimeError
    :param use_progressbar: forwarded to progressbarize() for the
        per-package loop
    :return: KeywordsSet with the collected tags
    :raises InstallPrepareError: if the Maven index checker jar is missing
    :raises RuntimeError: if a package page cannot be retrieved and
        ignore_errors is false
    """
    keywords_set = KeywordsSet()

    _logger.debug("Fetching Maven and executing Maven index checker")
    maven_index_checker_dir = get_files_dir()
    maven_index_checker_jar = path.join(maven_index_checker_dir,
                                        "maven-index-checker.jar")
    if not path.isfile(maven_index_checker_jar):
        raise InstallPrepareError(
            "Maven index checker was not found in '%s', did you forget "
            "to run prepare()?" % maven_index_checker_jar)

    # The cleanup below guards everything from the index checker run
    # onwards, so a failure while parsing its output or while scraping
    # does not leave the unpacked index behind.
    try:
        with cwd(maven_index_checker_dir):
            # This requires at least 4GB of free space on /tmp partition
            packages = loads(
                check_output(['java', '-jar', maven_index_checker_jar, '-it']))

        # Versions are irrelevant for keyword lookup; dropping them lets the
        # set-based deduplication collapse all versions of one artifact.
        for package in packages:
            package.pop('version', None)
        packages = [
            dict(items)
            for items in {frozenset(package.items()) for package in packages}
        ]

        _logger.debug("started fetching data from mvnrepository.com")
        for package in progressbarize(packages, use_progressbar):
            package_name = package['groupId'] + '/' + package['artifactId']
            response = get(self._MVNREPOSITORY_URL + package_name)
            if response.ok is not True:
                error_msg = "Failed to retrieve package information for '{}', " \
                            "response status code: {}". \
                    format(package_name, response.status_code)
                if not ignore_errors:
                    raise RuntimeError(error_msg)
                _logger.error(error_msg)
            else:
                soup = BeautifulSoup(response.text, 'lxml')
                for i in soup.find_all(class_="b tag"):
                    keywords_set.add(i.text)

            # It seems that mvnrepository has limit for 2000 requests per hour
            # so sleeping 2 seconds should do the trick. Failed requests are
            # throttled too, since they count as requests as well.
            sleep(2)
    finally:
        # Clean unpacked maven index after executing. The directory may be
        # absent if the index checker failed early; skipping it then keeps
        # the cleanup from masking the original error.
        _logger.debug("Cleaning unpacked maven index")
        target_dir = path.join(maven_index_checker_dir, "target")
        if path.isdir(target_dir):
            rmtree(target_dir)

    return keywords_set
def execute(self, ignore_errors=True, use_progressbar=False):
    """Collect PyPI keywords.

    Fetch the index page at ``self._PYPI_SIMPLE_URL``, request the page of
    every package linked from it and add each distinct keyword found on
    that page to the result.

    :param ignore_errors: when true, a failed package page request is
        logged and skipped; otherwise it raises RuntimeError
    :param use_progressbar: forwarded to progressbarize() for the
        per-package loop
    :return: KeywordsSet with the collected keywords
    :raises RuntimeError: if the index page cannot be fetched, or a package
        page cannot be fetched and ignore_errors is false
    """
    collected = KeywordsSet()

    _logger.debug("Fetching PyPI")
    index_response = requests.get(self._PYPI_SIMPLE_URL)
    if index_response.status_code != 200:
        raise RuntimeError(
            "Failed to fetch '%s', request ended with status code %s"
            % (self._PYPI_SIMPLE_URL, index_response.status_code))

    index_soup = BeautifulSoup(index_response.text, 'lxml')
    for link in progressbarize(index_soup.find_all('a'), use_progressbar):
        package_name = link.text
        package_response = requests.get(
            urljoin(self._PACKAGE_BASE_URL, package_name))

        if package_response.status_code != 200:
            error_msg = (
                "Failed to retrieve package information for '{}', "
                "response status code: {}"
            ).format(package_name, package_response.status_code)
            if not ignore_errors:
                raise RuntimeError(error_msg)
            _logger.error(error_msg)
            continue

        package_soup = BeautifulSoup(package_response.text, 'lxml')
        tag_paragraphs = package_soup.find_all('p', attrs={'class': 'tags'})
        if not tag_paragraphs:
            _logger.warning(
                "Failed to parse and find keywords for '%s'" % package_name)
            continue

        # Only the first "tags" paragraph is inspected. Some packages have
        # commas hardcoded in a single keyword entry, so split on them too.
        found_keywords = []
        keyword_spans = tag_paragraphs[0].find_all(
            'span', attrs={'class': 'package-keyword'})
        for span in keyword_spans:
            for word in span.contents:
                for piece in word.split(','):
                    stripped = piece.strip()
                    if stripped != "":
                        found_keywords.append(stripped.lower())

        _logger.debug("Found keywords %s in '%s'", found_keywords, package_name)

        # Each keyword is counted at most once per package.
        for keyword in set(found_keywords):
            collected.add(keyword)

    return collected
def test_union_method_for_overlapping_data():
    """Check the method KeywordsSet.union() for overlapping data.

    Both sets contain the same keyword with different occurrence counts;
    after the union the receiving set is expected to keep its own count.
    """
    keywords_set_1 = KeywordsSet()
    keywords_set_1.add("keyword", 10)

    keywords_set_2 = KeywordsSet()
    keywords_set_2.add("keyword", 20)

    # pre-operation checks (the second membership check previously
    # re-tested the first set instead of the second one)
    assert "keyword" in keywords_set_1.keywords
    assert "keyword" in keywords_set_2.keywords
    assert keywords_set_1.keywords["keyword"]["occurrence_count"] == 10
    assert keywords_set_2.keywords["keyword"]["occurrence_count"] == 20

    keywords_set_1.union(keywords_set_2)

    # post-operation checks
    assert "keyword" in keywords_set_1.keywords
    assert keywords_set_1.keywords["keyword"]["occurrence_count"] == 10
def test_occurrence_counting():
    """Verify that repeated add() calls accumulate the occurrence count."""
    subject = KeywordsSet()

    # (arguments passed to add(), expected running total afterwards)
    steps = (
        (("keyword",), 1),
        (("keyword",), 2),
        (("keyword", 10), 12),
    )
    for add_args, expected_total in steps:
        subject.add(*add_args)
        assert subject.keywords["keyword"]["occurrence_count"] == expected_total
def test_add_method():
    """Exercise KeywordsSet.add() with and without an explicit count."""
    subject = KeywordsSet()
    assert subject
    assert subject.keywords == {}

    # first keyword, added without an explicit count
    subject.add("keyword")
    assert "keyword" in subject.keywords
    assert len(subject.keywords) == 1
    assert "occurrence_count" in subject.keywords["keyword"]
    assert subject.keywords["keyword"]["occurrence_count"] == 1

    # second keyword, added with an explicit count
    subject.add("keyword2", 42)

    # check the first and the second keyword as well
    assert "keyword2" in subject.keywords
    assert len(subject.keywords) == 2

    assert "occurrence_count" in subject.keywords["keyword"]
    assert subject.keywords["keyword"]["occurrence_count"] == 1

    assert "keyword2" in subject.keywords
    assert "occurrence_count" in subject.keywords["keyword2"]
    assert subject.keywords["keyword2"]["occurrence_count"] == 42