from collections import Counter
from nltk.tokenize import TreebankWordTokenizer
from nlpia.data.loaders import kite_text

tokenizer = TreebankWordTokenizer()

# tokenize the kite_text document and count every token
tokens = tokenizer.tokenize(kite_text.lower())
token_counts = Counter(tokens)
token_counts

# remove common stopwords and count again
import nltk
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
tokens = [x for x in tokens if x not in stopwords]
kite_counts = Counter(tokens)
kite_counts
kite_counts.most_common(10)
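As a small aside, the tokenize-then-filter step above can be packaged into a reusable helper. The snippet below is only a sketch: the function name count_content_words is my own, not part of the original listing, and it assumes the same NLTK English stopword list.

from collections import Counter

import nltk
from nltk.tokenize import TreebankWordTokenizer

nltk.download('stopwords')
_tokenizer = TreebankWordTokenizer()
_stopwords = set(nltk.corpus.stopwords.words('english'))

def count_content_words(text):
    # hypothetical helper: lowercase, tokenize, drop English stopwords,
    # and return a Counter of the remaining tokens
    tokens = _tokenizer.tokenize(text.lower())
    return Counter(tok for tok in tokens if tok not in _stopwords)

# e.g. count_content_words(kite_text).most_common(10)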
from nlpia.data.loaders import kite_text, kite_history
from nltk.tokenize import TreebankWordTokenizer
from collections import Counter

tokenizer = TreebankWordTokenizer()

# build a corpus of two documents about kites
kite_intro = kite_text.lower()
kite_history = kite_history.lower()

# tokenize both documents in the corpus
kite_intro_tokens = tokenizer.tokenize(kite_intro)
kite_history_tokens = tokenizer.tokenize(kite_history)

# count the occurrences of each word
intro_counts = Counter(kite_intro_tokens)
history_counts = Counter(kite_history_tokens)

# document lengths (total number of tokens in each document)
intro_tokens_total = len(kite_intro_tokens)
history_tokens_total = len(kite_history_tokens)

intro_tf = {}
history_tf = {}

# compute the TF of the word 'kite' in each document
intro_tf['kite'] = intro_counts['kite'] / intro_tokens_total
history_tf['kite'] = history_counts['kite'] / history_tokens_total

# compute the TF of the word 'and' in each document
intro_tf['and'] = intro_counts['and'] / intro_tokens_total
history_tf['and'] = history_counts['and'] / history_tokens_total
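The per-term arithmetic above (count of the term divided by the total number of tokens) can also be written as a small helper. This is a minimal sketch: the name term_frequency is my own, not from the original listing, and it assumes the token lists built above.

def term_frequency(term, tokens):
    # TF of `term` in one tokenized document: count(term) / total token count
    return tokens.count(term) / len(tokens)

# usage with the token lists above:
# term_frequency('kite', kite_intro_tokens)    # should equal intro_tf['kite']
# term_frequency('and', kite_history_tokens)   # should equal history_tf['and']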