def test_progressbarize():
    """Test the progressbarize function."""
    x = range(10)
    y = progressbarize(x)
    assert x == y
    z = progressbarize(x, progress=True)
    assert z
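# For context: `progressbarize` is defined elsewhere in the repo. Below is a
# minimal sketch of its apparent contract, inferred from the test above and the
# call sites further down (the real implementation may differ, e.g. it may use
# the `progressbar` package); the name is suffixed to mark it as hypothetical.
import sys


def progressbarize_sketch(iterable, progress=False):
    """Return the iterable unchanged, or wrap it with simple progress reporting."""
    if not progress:
        # No progress requested: hand the iterable back as-is, which is why
        # the test above can assert `x == y`.
        return iterable

    def _reporting():
        for count, item in enumerate(iterable, start=1):
            sys.stderr.write("\rprocessed %d item(s)" % count)
            yield item
        sys.stderr.write("\n")

    # A generator object is truthy, satisfying `assert z` above.
    return _reporting()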
def aggregate(input_keywords_file, no_synonyms=None, use_progressbar=False):
    # pylint: disable=too-many-branches
    """Aggregate available topics.

    :param input_keywords_file: a list/tuple of input keywords files to process
    :param no_synonyms: do not compute synonyms for keywords
    :param use_progressbar: use progressbar to report progress
    :return: aggregated keywords with their metadata
    """
    if not input_keywords_file:
        raise ValueError('No input keywords files provided')

    all_keywords = {}
    for input_file in progressbarize(input_keywords_file, use_progressbar):
        input_content = anymarkup.parse_file(input_file)

        for keyword, value in input_content.items():
            keyword = str(keyword)
            if not KeywordsChief.matches_keyword_pattern(keyword):
                _logger.debug("Dropping keyword '%s' as it does not match keyword pattern.",
                              keyword)
                continue

            if keyword in all_keywords and value is not None \
                    and all_keywords[keyword] is not None:
                # Merge entries seen in multiple files: sum occurrence counts
                # and union list-valued fields such as synonyms.
                all_keywords[keyword]['occurrence_count'] = \
                    value.pop('occurrence_count', 0) + \
                    all_keywords[keyword].get('occurrence_count', 0)
                for conf, items in value.items():
                    all_keywords[keyword][str(conf)] = list(
                        set(items or []) | set(all_keywords[keyword][str(conf)] or []))
            else:
                all_keywords[keyword] = value if value is not None else {}

            if not no_synonyms:
                synonyms = list(set(all_keywords[keyword].get('synonyms') or [])
                                | set(KeywordsChief.compute_synonyms(keyword)))
                if synonyms:
                    all_keywords[keyword]['synonyms'] = synonyms

    # Filter out keywords with a low occurrence count.
    if defaults.OCCURRENCE_COUNT_FILTER > 1:
        result = {}
        for keyword, value in all_keywords.items():
            if value.get('occurrence_count', 1) > defaults.OCCURRENCE_COUNT_FILTER:
                result[keyword] = value
        return result

    return all_keywords
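# A hypothetical usage sketch of aggregate(); the file names and file contents
# are made up for illustration. Each input file is expected to be parseable by
# anymarkup into a mapping of keyword -> metadata dict, e.g. in YAML:
#
#   machine-learning:
#     occurrence_count: 3
#     synonyms: [ml]
#
# Entries appearing in multiple files have their occurrence_count summed and
# their list-valued fields (such as synonyms) unioned.
def _example_aggregate():
    return aggregate(('keywords/pypi.yaml', 'keywords/maven.yaml'),
                     no_synonyms=True, use_progressbar=False)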
def lookup_file(path, keywords_file=None, stopwords_file=None, ignore_errors=False,
                ngram_size=None, use_progressbar=False, lemmatize=False,
                stemmer=None, scorer=None):
    # pylint: disable=too-many-arguments,too-many-locals
    """Perform keywords lookup on a file or directory tree of files.

    :param path: path of directory tree or file on which the lookup should be done
    :param keywords_file: keywords file to be used
    :param stopwords_file: stopwords file to be used
    :param ignore_errors: True if errors should be reported but computation shouldn't be stopped
    :param ngram_size: size of ngrams; if None, ngram size is computed
    :param use_progressbar: True if progressbar should be shown
    :param lemmatize: use lemmatizer
    :type lemmatize: bool
    :param stemmer: stemmer to be used
    :type stemmer: str
    :param scorer: scorer to be used
    :type scorer: f8a_tagger.scoring.Scoring
    :return: found keywords, reported per project
    """
    ret = {}

    ngram_size, tokenizer, chief, core_parser = _prepare_lookup(
        keywords_file, stopwords_file, ngram_size, lemmatize, stemmer)

    for project, file in progressbarize(iter_files(path, ignore_errors),
                                        progress=use_progressbar):
        file_name = file
        if not isinstance(file, str):
            file_name = file.name
        _logger.info("Processing file '%s' for project '%s'", file_name, project)
        try:
            content = core_parser.parse_file(file_name)
            keywords = _perform_lookup(content, tokenizer, chief, scorer)
        except Exception as exc:  # pylint: disable=broad-except
            if not ignore_errors:
                raise
            _logger.exception("Failed to parse content in file '%s': %s", file_name, str(exc))
            continue
        finally:
            # Remove the temporary file here so the progressbar can be used safely.
            if not isinstance(file, str):
                _logger.debug("Removing temporary file '%s' for project '%s'",
                              file_name, project)
                os.remove(file_name)

        ret[project] = keywords

    return ret
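# A hypothetical invocation of lookup_file(); the paths are made up. Unlike
# lookup() below, the result is keyed by project name rather than by file,
# since iter_files() yields (project, file) pairs here.
def _example_lookup_file():
    return lookup_file('input/projects',
                       keywords_file='data/keywords.yaml',
                       stopwords_file='data/stopwords.txt',
                       ignore_errors=True,
                       use_progressbar=True)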
def execute(self, ignore_errors=True, use_progressbar=False):
    """Collect Maven keywords."""
    keywords_set = KeywordsSet()

    _logger.debug("Fetching Maven and executing Maven index checker")
    maven_index_checker_dir = get_files_dir()
    maven_index_checker_jar = path.join(maven_index_checker_dir, "maven-index-checker.jar")
    if not path.isfile(maven_index_checker_jar):
        raise InstallPrepareError("Maven index checker was not found in '%s', did you forget "
                                  "to run prepare()?" % maven_index_checker_jar)

    with cwd(maven_index_checker_dir):
        # This requires at least 4GB of free space on the /tmp partition.
        packages = loads(check_output(['java', '-jar', maven_index_checker_jar, '-it']))
        # Drop the version so that the same groupId/artifactId in different
        # versions collapses to one entry; dicts are unhashable, so deduplicate
        # via frozensets of their items.
        for package in packages:
            del package['version']
        packages = [dict(s) for s in set(frozenset(d.items()) for d in packages)]

        _logger.debug("Started fetching data from mvnrepository.com")
        try:
            for package in progressbarize(packages, use_progressbar):
                package_name = package['groupId'] + '/' + package['artifactId']
                response = get(self._MVNREPOSITORY_URL + package_name)
                if not response.ok:
                    error_msg = "Failed to retrieve package information for '{}', " \
                                "response status code: {}".format(
                                    package_name, response.status_code)
                    if ignore_errors:
                        _logger.error(error_msg)
                        continue
                    raise RuntimeError(error_msg)

                soup = BeautifulSoup(response.text, 'lxml')
                for i in soup.find_all(class_="b tag"):
                    keywords_set.add(i.text)
                # mvnrepository.com appears to limit clients to 2000 requests
                # per hour, so sleeping 2 seconds between requests should do
                # the trick.
                sleep(2)
        finally:
            # Clean the unpacked Maven index after executing.
            _logger.debug("Cleaning unpacked maven index")
            rmtree(path.join(maven_index_checker_dir, "target"))

    return keywords_set
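# A standalone illustration of the dict-deduplication idiom used above: dicts
# are unhashable, so each one is converted to a frozenset of its items,
# collected into a set (which drops duplicates), and turned back into a dict.
_packages = [
    {'groupId': 'org.apache.commons', 'artifactId': 'commons-lang3'},
    {'groupId': 'org.apache.commons', 'artifactId': 'commons-lang3'},  # duplicate once 'version' is gone
    {'groupId': 'junit', 'artifactId': 'junit'},
]
_unique = [dict(s) for s in set(frozenset(d.items()) for d in _packages)]
assert len(_unique) == 2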
def execute(self, ignore_errors=True, use_progressbar=False):
    """Collect PyPI keywords."""
    keywords_set = KeywordsSet()

    _logger.debug("Fetching PyPI")
    response = requests.get(self._PYPI_SIMPLE_URL)
    if response.status_code != 200:
        raise RuntimeError("Failed to fetch '%s', request ended with status code %s"
                           % (self._PYPI_SIMPLE_URL, response.status_code))

    soup = BeautifulSoup(response.text, 'lxml')
    for link in progressbarize(soup.find_all('a'), use_progressbar):
        package_name = link.text
        url = urljoin(self._PACKAGE_BASE_URL, package_name)

        response = requests.get(url)
        if response.status_code != 200:
            error_msg = "Failed to retrieve package information for '{}', " \
                        "response status code: {}".format(package_name, response.status_code)
            if ignore_errors:
                _logger.error(error_msg)
                continue
            raise RuntimeError(error_msg)

        package_soup = BeautifulSoup(response.text, 'lxml')
        # meta_keywords = package_soup.find_all('meta', attrs={'name': 'keywords'})
        meta_keywords = package_soup.find_all('p', attrs={'class': 'tags'})

        if len(meta_keywords) < 1:
            _logger.warning("Failed to parse and find keywords for '%s'", package_name)
            continue

        # Some packages hardcode commas inside the keyword list, so split on
        # commas there as well.
        found_keywords = []
        keywords_spans = meta_keywords[0].find_all('span', attrs={'class': 'package-keyword'})
        for span in keywords_spans:
            for word in span.contents:
                found_keywords += [k.strip().lower() for k in word.split(',')
                                   if k.strip() != ""]

        _logger.debug("Found keywords %s in '%s'", found_keywords, package_name)
        for keyword in set(found_keywords):
            keywords_set.add(keyword)

    return keywords_set
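# An isolated sketch of the keyword normalization applied to each span's
# contents above: split on hardcoded commas, strip whitespace, lowercase, and
# drop empty entries.
_word = "Machine Learning, NLP , ,Tagger"
_found = [k.strip().lower() for k in _word.split(',') if k.strip() != ""]
assert _found == ['machine learning', 'nlp', 'tagger']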
def lookup(path, keywords_file=None, stopwords_file=None, ignore_errors=False,
           ngram_size=None, use_progressbar=False, lemmatize=False, stemmer=None):
    # pylint: disable=too-many-arguments,too-many-locals
    """Perform keywords lookup.

    :param path: path of directory tree or file on which the lookup should be done
    :param keywords_file: keywords file to be used
    :param stopwords_file: stopwords file to be used
    :param ignore_errors: True if errors should be reported but computation shouldn't be stopped
    :param ngram_size: size of ngrams; if None, ngram size is computed
    :param use_progressbar: True if progressbar should be shown
    :param lemmatize: use lemmatizer
    :type lemmatize: bool
    :param stemmer: stemmer to be used
    :type stemmer: str
    :return: found keywords, reported per file
    """
    ret = {}

    stemmer_instance = Stemmer.get_stemmer(stemmer) if stemmer is not None else None
    lemmatizer_instance = Lemmatizer.get_lemmatizer() if lemmatize else None

    chief = KeywordsChief(keywords_file, lemmatizer=lemmatizer_instance,
                          stemmer=stemmer_instance)
    computed_ngram_size = chief.compute_ngram_size()
    if ngram_size is not None and computed_ngram_size > ngram_size:
        _logger.warning("Computed ngram size (%d) does not reflect supplied ngram size (%d), "
                        "some synonyms will be omitted", computed_ngram_size, ngram_size)
    elif ngram_size is None:
        ngram_size = computed_ngram_size

    tokenizer = Tokenizer(stopwords_file, ngram_size, lemmatizer=lemmatizer_instance,
                          stemmer=stemmer_instance)

    for file in progressbarize(iter_files(path, ignore_errors), progress=use_progressbar):
        _logger.info("Processing file '%s'", file)
        try:
            content = CoreParser().parse_file(file)
            tokens = tokenizer.tokenize(content)
            # No sentence-level analysis is performed for now, so flatten all
            # sentences into a single stream of tokens.
            tokens = chain(*tokens)
            keywords = chief.extract_keywords(tokens)
        except Exception as exc:  # pylint: disable=broad-except
            if not ignore_errors:
                raise
            _logger.exception("Failed to parse content in file '%s': %s", file, str(exc))
            continue

        ret[file] = keywords

    return ret
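# A hypothetical invocation of lookup(); the paths are made up. The result maps
# each processed file to the keywords extracted from it.
def _example_lookup():
    found = lookup('docs/', keywords_file='data/keywords.yaml',
                   stopwords_file='data/stopwords.txt',
                   ignore_errors=True, use_progressbar=True)
    for file_name, keywords in found.items():
        print(file_name, keywords)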