Beispiel #1
0
def test_union_method_and_occurrence_count_1():
    """Verify KeywordsSet.union() on disjoint sets, including occurrence counts."""
    target_set = KeywordsSet()
    target_set.add("keyword1")

    source_set = KeywordsSet()
    source_set.add("keyword2")

    # Before the union: each set holds only its own keyword, counted once.
    assert "keyword1" in target_set.keywords
    assert "keyword2" not in target_set.keywords
    assert target_set.keywords["keyword1"]["occurrence_count"] == 1

    assert "keyword1" not in source_set.keywords
    assert "keyword2" in source_set.keywords
    assert source_set.keywords["keyword2"]["occurrence_count"] == 1

    target_set.union(source_set)

    # After the union: the receiver holds both keywords, each counted once.
    assert "keyword1" in target_set.keywords
    assert "keyword2" in target_set.keywords
    assert target_set.keywords["keyword1"]["occurrence_count"] == 1
    assert target_set.keywords["keyword2"]["occurrence_count"] == 1

    # The argument of union() still holds only its own keyword.
    assert "keyword1" not in source_set.keywords
    assert "keyword2" in source_set.keywords
    def execute(self, ignore_errors=True, use_progressbar=False):
        """Collect StackOverflow keywords.

        Download the archive found at ``self._STACKOVERFLOW_URL``, parse its
        ``Tags.xml`` entry and register every tag together with its count.

        :param ignore_errors: unused in this method
        :param use_progressbar: unused in this method
        :return: KeywordsSet filled with the tags that could be parsed
        :raises RuntimeError: if the download fails or the archive contains
                              no ``Tags.xml`` entry
        """
        keywords_set = KeywordsSet()
        _logger.debug("Fetching StackOverflow")

        response = requests.get(self._STACKOVERFLOW_URL)
        if not response.ok:
            raise RuntimeError(
                "Failed to fetch '%s', request ended with status code %s" %
                (self._STACKOVERFLOW_URL, response.status_code))

        tags = None
        _logger.debug("Unpacking StackOverflow's tags archive")
        with libarchive.memory_reader(response.content) as archive:
            for entry in archive:
                if entry.name == 'Tags.xml':
                    tags = xmltodict.parse(b"".join(entry.get_blocks()))
                    break

        # Fail with a clear message instead of a TypeError on None below.
        if tags is None:
            raise RuntimeError(
                "No 'Tags.xml' entry found in the archive fetched from '%s'" %
                self._STACKOVERFLOW_URL)

        rows = tags['tags']['row']
        # xmltodict yields a single mapping (not a list) when there is exactly
        # one <row> element; normalize so the loop always sees row mappings.
        if isinstance(rows, dict):
            rows = [rows]

        for tag in rows:
            try:
                keywords_set.add(tag['@TagName'], int(tag['@Count']))
            except ValueError:
                # The tag name was read successfully; only the count is bad.
                _logger.warning(
                    "Failed to parse number of occurrences for tag %s",
                    tag['@TagName'])
                continue
            except KeyError:
                _logger.exception("Missing tagname or tag count")
                continue

        return keywords_set
Beispiel #3
0
def test_union_method():
    """Verify that KeywordsSet.union() merges the argument into the receiver."""
    target_set = KeywordsSet()
    target_set.add("keyword1")

    source_set = KeywordsSet()
    source_set.add("keyword2")

    # Before the union: one keyword per set, no overlap.
    assert len(target_set.keywords) == 1
    assert len(source_set.keywords) == 1

    assert "keyword1" in target_set.keywords
    assert "keyword2" not in target_set.keywords

    assert "keyword1" not in source_set.keywords
    assert "keyword2" in source_set.keywords

    target_set.union(source_set)

    # After the union: the receiver grew to two keywords, the argument did not.
    assert len(target_set.keywords) == 2
    assert len(source_set.keywords) == 1

    assert "keyword1" in target_set.keywords
    assert "keyword2" in target_set.keywords

    assert "keyword1" not in source_set.keywords
    assert "keyword2" in source_set.keywords
    def execute(self, ignore_errors=True, use_progressbar=False):
        """Collect Maven keywords.

        Run the Maven index checker jar, drop the version from every package
        it reports, de-duplicate the packages and then collect the text of
        every element with class ``b tag`` from each package page under
        ``self._MVNREPOSITORY_URL``.

        :param ignore_errors: if True, log a failed package request and carry
                              on; otherwise raise
        :param use_progressbar: forwarded to progressbarize()
        :return: KeywordsSet with the collected keywords
        :raises InstallPrepareError: if the index checker jar is missing
        :raises RuntimeError: if a package request fails and ignore_errors
                              is not set
        """
        collected = KeywordsSet()

        _logger.debug("Fetching Maven and executing Maven index checker")
        checker_dir = get_files_dir()
        checker_jar = path.join(checker_dir, "maven-index-checker.jar")

        if not path.isfile(checker_jar):
            raise InstallPrepareError(
                "Maven index checker was not found in '%s', did you forget "
                "to run prepare()?" % checker_jar)

        with cwd(checker_dir):
            # This requires at least 4GB of free space on the /tmp partition.
            checker_output = check_output(['java', '-jar', checker_jar, '-it'])
            packages = loads(checker_output)

        # Versions are irrelevant here; strip them first so that entries
        # differing only in version collapse into one during de-duplication.
        for entry in packages:
            del entry['version']
        unique_entries = set()
        for entry in packages:
            unique_entries.add(frozenset(entry.items()))
        packages = [dict(items) for items in unique_entries]

        _logger.debug("started fetching data from mvnrepository.com")
        try:
            for package in progressbarize(packages, use_progressbar):
                package_name = package['groupId'] + '/' + package['artifactId']
                response = get(self._MVNREPOSITORY_URL + package_name)
                if not response.ok:
                    error_msg = (
                        "Failed to retrieve package information for '{}', "
                        "response status code: {}"
                    ).format(package_name, response.status_code)
                    if not ignore_errors:
                        raise RuntimeError(error_msg)
                    _logger.error(error_msg)
                    continue

                page = BeautifulSoup(response.text, 'lxml')
                for tag_element in page.find_all(class_="b tag"):
                    collected.add(tag_element.text)

                # mvnrepository seems to limit clients to 2000 requests per
                # hour, so pausing 2 seconds between requests should do.
                sleep(2)
        finally:
            # Remove the unpacked Maven index whether or not fetching worked.
            _logger.debug("Cleaning unpacked maven index")
            rmtree(path.join(checker_dir, "target"))

        return collected
    def execute(self, ignore_errors=True, use_progressbar=False):
        """Collect PyPI keywords.

        Follow every link on the page at ``self._PYPI_SIMPLE_URL``, fetch the
        corresponding package page below ``self._PACKAGE_BASE_URL`` and gather
        the keywords listed in its ``<p class="tags">`` element.

        :param ignore_errors: if True, log a failed package request and carry
                              on; otherwise raise
        :param use_progressbar: forwarded to progressbarize()
        :return: KeywordsSet with the collected keywords
        :raises RuntimeError: if the index page cannot be fetched, or if a
                              package request fails and ignore_errors is not set
        """
        collected = KeywordsSet()

        _logger.debug("Fetching PyPI")
        index_response = requests.get(self._PYPI_SIMPLE_URL)
        if index_response.status_code != 200:
            raise RuntimeError(
                "Failed to fetch '%s', request ended with status code %s" %
                (self._PYPI_SIMPLE_URL, index_response.status_code))

        index_soup = BeautifulSoup(index_response.text, 'lxml')
        for anchor in progressbarize(index_soup.find_all('a'), use_progressbar):
            package_name = anchor.text
            package_url = urljoin(self._PACKAGE_BASE_URL, package_name)
            package_response = requests.get(package_url)
            if package_response.status_code != 200:
                error_msg = (
                    "Failed to retrieve package information for '{}', "
                    "response status code: {}"
                ).format(package_name, package_response.status_code)
                if not ignore_errors:
                    raise RuntimeError(error_msg)
                _logger.error(error_msg)
                continue

            package_soup = BeautifulSoup(package_response.text, 'lxml')
            tag_paragraphs = package_soup.find_all('p', attrs={'class': 'tags'})
            if not tag_paragraphs:
                _logger.warning(
                    "Failed to parse and find keywords for '%s'" % package_name)
                continue

            # Only the first tags paragraph is inspected. Some packages
            # hardcode commas inside a single keyword entry, so every piece
            # of text is split on commas as well.
            keyword_spans = tag_paragraphs[0].find_all(
                'span', attrs={'class': 'package-keyword'})
            found_keywords = [
                piece.strip().lower()
                for span in keyword_spans
                for fragment in span.contents
                for piece in fragment.split(',')
                if piece.strip()
            ]

            _logger.debug("Found keywords %s in '%s'", found_keywords,
                          package_name)

            # Each distinct keyword is added once per package.
            for keyword in set(found_keywords):
                collected.add(keyword)

        return collected
Beispiel #6
0
def test_union_method_for_empty_data():
    """Verify that KeywordsSet.union() of two empty sets leaves both empty."""
    target_set = KeywordsSet()
    source_set = KeywordsSet()

    # Before the union: nothing has been added to either set.
    assert target_set.keywords == {}
    assert source_set.keywords == {}

    target_set.union(source_set)

    # After the union: both sets are still empty.
    assert target_set.keywords == {}
    assert source_set.keywords == {}
Beispiel #7
0
def test_occurrence_counting():
    """Verify that repeated add() calls accumulate the occurrence count."""
    keywords_set = KeywordsSet()

    # (arguments passed to add(), occurrence count expected right afterwards)
    steps = (
        (("keyword",), 1),
        (("keyword",), 2),
        (("keyword", 10), 12),
    )
    for add_args, expected_count in steps:
        keywords_set.add(*add_args)
        assert keywords_set.keywords["keyword"]["occurrence_count"] == expected_count
Beispiel #8
0
def test_union_method_for_overlapping_data():
    """Check the method KeywordsSet.union() for overlapping data.

    Both sets contain the same keyword with different occurrence counts; the
    test expects the receiver's count to be unchanged by the union.
    """
    keywords_set_1 = KeywordsSet()
    keywords_set_1.add("keyword", 10)

    keywords_set_2 = KeywordsSet()
    keywords_set_2.add("keyword", 20)

    # Pre-operation checks: the keyword is present in BOTH sets (the second
    # assertion used to re-check the first set by mistake).
    assert "keyword" in keywords_set_1.keywords
    assert "keyword" in keywords_set_2.keywords
    assert keywords_set_1.keywords["keyword"]["occurrence_count"] == 10
    assert keywords_set_2.keywords["keyword"]["occurrence_count"] == 20

    keywords_set_1.union(keywords_set_2)

    # Post-operation checks: the receiver keeps its own count of 10.
    assert "keyword" in keywords_set_1.keywords
    assert keywords_set_1.keywords["keyword"]["occurrence_count"] == 10
def collect(collector=None, ignore_errors=False, use_progressbar=False):
    """Collect keywords from external resources.

    Each requested collector is instantiated and executed, and its result is
    merged into a single keywords set.

    :param collector: a list/tuple of collectors to be used; when empty or
                      None, all registered collectors are used
    :param ignore_errors: if True, ignore all errors, but report them
    :param use_progressbar: use progressbar if True
    :return: all collected keywords
    :raises Exception: whatever a collector raised, when ignore_errors is False
    """
    keywords_set = KeywordsSet()
    for col in collector or CollectorBase.get_registered_collectors():
        try:
            collector_instance = CollectorBase.get_collector_class(col)()
            keywords_set.union(collector_instance.execute(ignore_errors, use_progressbar))
        except Exception as exc:
            # Deliberately broad: one failing collector must not abort the
            # others when the caller asked for errors to be ignored.
            if not ignore_errors:
                raise
            # Let the logging framework interpolate the arguments.
            _logger.exception("Collection of keywords for '%s' failed: %s", col, exc)

    return keywords_set.keywords
Beispiel #10
0
def test_add_method():
    """Verify KeywordsSet.add() with and without an explicit count."""
    keywords_set = KeywordsSet()
    assert keywords_set
    assert keywords_set.keywords == {}

    # Add the first keyword without an explicit occurrence count.
    keywords_set.add("keyword")
    assert "keyword" in keywords_set.keywords
    assert len(keywords_set.keywords) == 1
    assert "occurrence_count" in keywords_set.keywords["keyword"]
    assert keywords_set.keywords["keyword"]["occurrence_count"] == 1

    # Add a second keyword with an explicit occurrence count.
    keywords_set.add("keyword2", 42)

    # Check the first and the second keyword as well.
    assert "keyword2" in keywords_set.keywords
    assert len(keywords_set.keywords) == 2
    assert "occurrence_count" in keywords_set.keywords["keyword"]
    assert keywords_set.keywords["keyword"]["occurrence_count"] == 1
    assert "keyword2" in keywords_set.keywords
    assert "occurrence_count" in keywords_set.keywords["keyword2"]
    assert keywords_set.keywords["keyword2"]["occurrence_count"] == 42
Beispiel #11
0
def test_initial_state():
    """Verify that a freshly created KeywordsSet is truthy and holds no keywords."""
    fresh_set = KeywordsSet()
    assert fresh_set
    assert fresh_set.keywords == {}