def get_corpus_info(categories, **kwargs):
    """Collect token and type statistics for each given corpus category.

    For every category the words are read with ``brown.words``, optionally
    filtered against ``FileOps.get_stopwords('stopwords.txt')``, and
    summarised as a ``(name, token_count, type_count, diversity)`` tuple.
    Each tuple is printed as soon as it is computed.

    Args:
        categories: Iterable of category names accepted by ``brown.words``.
        **kwargs: Only ``stopwords`` is recognised. Omit it (or pass
            ``None``) to count every token; pass ``'english'`` to drop
            tokens that appear in the stopword list.

    Returns:
        A list with one tuple per category, in input order. ``name`` is the
        category with underscores replaced by spaces; ``diversity`` is
        ``type_count / token_count`` formatted with three decimals
        (``"0.000"`` when the category yields no tokens).

    Raises:
        ValueError: If ``stopwords`` is given with an unsupported value.
    """
    stopwords_mode = kwargs.get('stopwords')
    if stopwords_mode not in (None, 'english'):
        # Previously this case left ``tokens`` unassigned (UnboundLocalError)
        # or silently reused the previous category's tokens.
        raise ValueError(
            "unsupported stopwords option: %r" % (stopwords_mode,)
        )

    filter_stopwords = stopwords_mode == 'english'
    stopwords = None
    if filter_stopwords:
        # Fetch the list once instead of once per token.
        # NOTE(review): assumes get_stopwords returns the same reusable
        # container on every call - confirm. If it is a list, wrapping it
        # in frozenset() would also make each lookup O(1).
        stopwords = FileOps.get_stopwords('stopwords.txt')

    corpus_info = []
    for category in categories:
        words = brown.words(categories=category)
        if filter_stopwords:
            tokens = [w for w in words if w not in stopwords]
        else:
            tokens = list(words)

        token_count = len(tokens)
        type_count = len(set(tokens))
        # Guard against categories with no (remaining) tokens.
        ratio = type_count / token_count if token_count else 0.0
        diversity = "%.3f" % ratio

        category_info = (
            category.replace('_', ' '),
            token_count,
            type_count,
            diversity,
        )
        print(category_info)
        corpus_info.append(category_info)
    return corpus_info
if __name__ == "__main__":
    # Script entry point: compute per-category statistics with and without
    # stopword filtering, write them out as a table, then print the summary.
    #
    # NOTE(review): an unresolved merge conflict sat here. It was resolved in
    # favour of HEAD, whose side was empty. The incoming side (a4d8d09) added
    #     plurals_info = get_plurals_info(brown.categories())
    #     FileOps.write_plurals_table(plurals_info)
    # Restore those two calls if both helpers exist on this branch.
    corpus_info = get_corpus_info(brown.categories())
    stopwords_info = get_corpus_info(brown.categories(), stopwords='english')
    FileOps.write_table(corpus_info, stopwords_info)
    print_corpus_info(brown.categories(), FileOps.get_stopwords('stopwords.txt'))