def get_all_licenses_and_keywords(self, conf_path, text_path):
    '''Get a list of licenses and the keywords to be searched.

    Licenses described by the configuration files found under
    ``conf_path`` are loaded first. Every file found under ``text_path``
    that none of those licenses lists in its ``files`` is then passed to
    ``_load_licenses_from_texts`` and the result is appended.

    :param conf_path: Location given to ``helper.get_files`` to find the
        license configuration files
    :param text_path: Location given to ``helper.get_files`` to find the
        license text files
    :returns: Tuple ``(licenses, keywords)``; ``keywords`` is a dictionary
        in form { 'keyword' : set of the cached files it occurs in }
    '''
    words_occurrences = defaultdict(set)
    words_frequencies = defaultdict(int)

    licenses = self._load_licenses_from_configs(helper.get_files(conf_path))

    # Text files that no configured license claims still have to be loaded.
    text_files = set(helper.get_files(text_path))
    configured_files = itertools.chain.from_iterable(
        lic.files for lic in licenses)
    unconfigured_licenses = list(text_files.difference(configured_files))
    licenses += self._load_licenses_from_texts(unconfigured_licenses)

    for lic in licenses:
        for f in lic.cachedfiles:
            words = SingleLicenseLoader.get_words_from_license(f)
            self._merge_frequency_dicts(words_frequencies, words)
            # Remember which cached file each word was seen in.
            for word in words:
                words_occurrences[word].add(f)

    selected = self._select_keywords(words_frequencies, words_occurrences)
    keywords = {word: words_occurrences[word] for word in selected}
    return licenses, keywords
def load_directory(cls, path):
    '''Collect everything loadable beneath a directory.

    Each entry reported by ``helper.get_files`` is handled in turn:
    directories are passed back into this method, archives go through
    ``cls.load_archive``, regular files go through ``cls.load_file``,
    and anything else is logged as an error and skipped.

    :param path: Path to the directory
    :returns: Set of the loaded items
    :raises IOError: If ``path`` is not a directory
    '''
    if not os.path.isdir(path):
        raise IOError("{} is not a directory!".format(path))

    loaded = set()
    for entry in helper.get_files(path):
        logger.debug('Loading {}'.format(entry))

        if os.path.isdir(entry):
            loaded |= cls.load_directory(entry)
            continue
        if cls.is_archive(entry):
            loaded |= cls.load_archive(entry)
            continue
        if os.path.isfile(entry):
            loaded.add(cls.load_file(entry))
            continue

        # Neither a directory, an archive nor a regular file.
        logger.error('File does not exist or format not supported: {}'.format(entry))
    return loaded