Exemple #1
0
def get_corpus_info(categories, **kwargs):
    """Collect token/type statistics for each requested Brown category.

    Each category's words are read with ``brown.words(categories=...)``,
    optionally filtered against a stopword list, and summarised. Every
    summary tuple is also printed as it is produced.

    Args:
        categories: Iterable of category names to pass to ``brown.words``.
        **kwargs: Optional settings. The only recognised key is
            ``stopwords``; when it equals ``'english'``, words contained in
            ``FileOps.get_stopwords('stopwords.txt')`` are dropped before
            counting.

    Returns:
        A list with one ``(category, token_count, type_count, diversity)``
        tuple per category. Underscores in the category name are replaced
        by spaces, and ``diversity`` is the type/token ratio formatted as a
        string with three decimals (``"0.000"`` when there are no tokens).

    Raises:
        ValueError: If ``stopwords`` is supplied with any value other than
            ``'english'``.
    """
    filter_stopwords = 'stopwords' in kwargs
    stopwords = None
    if filter_stopwords:
        if kwargs['stopwords'] != 'english':
            # Previously this path left ``tokens`` unassigned and failed
            # with an UnboundLocalError on the first category.
            raise ValueError(
                "unsupported stopwords option: %r (expected 'english')"
                % (kwargs['stopwords'],)
            )
        # Fetch the stopword collection once instead of once per word.
        # NOTE(review): if get_stopwords returns a list of words, wrapping
        # it in set() would make the membership test O(1) -- confirm the
        # return type before doing so.
        stopwords = FileOps.get_stopwords('stopwords.txt')

    corpus_info = []

    for category in categories:
        words = brown.words(categories=category)
        if filter_stopwords:
            tokens = [w for w in words if w not in stopwords]
        else:
            tokens = list(words)

        token_count = len(tokens)
        type_count = len(set(tokens))
        # Guard against an empty token list (e.g. everything filtered out).
        ratio = type_count / token_count if token_count else 0.0
        diversity = "%.3f" % ratio

        display_name = category.replace('_', ' ')
        category_info = (display_name, token_count, type_count, diversity)
        print(category_info)
        corpus_info.append(category_info)

    return corpus_info
Exemple #2
0
        if not 'stopwords' in kwargs:
            tokens = [w for w in brown.words(categories=category)]
        else:
            if kwargs['stopwords'] == 'english':
                tokens = [w for w in brown.words(categories=category) if w not in FileOps.get_stopwords('stopwords.txt')]
        token_count = len(tokens)
        type_count = len(set(tokens))
        diversity = "%.3f" % (type_count/token_count)
        tmp = category.split('_')
        category = ' '.join(tmp)
        category_info = (category, token_count, type_count, diversity)
        print(category_info)
        corpus_info.append(category_info)
        
    return corpus_info
    
if __name__ == "__main__":
    # Script entry point: build the plurals table, then the corpus
    # statistics tables with and without stopword filtering.

    # These two lines come from the incoming side of a merge conflict whose
    # HEAD side was empty; the conflict markers left in the file were a
    # syntax error and have been removed.
    # NOTE(review): assumes get_plurals_info and FileOps.write_plurals_table
    # exist after the merge -- confirm before running.
    plurals_info = get_plurals_info(brown.categories())
    FileOps.write_plurals_table(plurals_info)

    corpus_info = get_corpus_info(brown.categories())
    stopwords_info = get_corpus_info(brown.categories(), stopwords='english')
    FileOps.write_table(corpus_info, stopwords_info)
    print_corpus_info(brown.categories(), FileOps.get_stopwords('stopwords.txt'))