import matplotlib.pyplot as plt
import nltk
from wordcloud import WordCloud


def word_cloud(words):
    all_words = []
    for line in list(words):
        # note: the original shadowed the `words` argument here
        for word in line.split():
            all_words.append(word.lower())

    # create a word frequency dictionary
    word_freq = nltk.Counter(all_words)

    # draw a word cloud from the word frequencies
    wordcloud = WordCloud(
        width=900,
        height=500,
        max_words=500,
        max_font_size=100,
        relative_scaling=0.5,
        colormap='Blues',
        normalize_plurals=True).generate_from_frequencies(word_freq)
    plt.figure(figsize=(17, 14))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.savefig('graphs/frequent_word_cloud.png')
    plt.show()
    plt.close('all')
def __count_parts_of_speech(self, docs):
    # https://medium.freecodecamp.org/an-introduction-to-part-of-speech-tagging-and-the-hidden-markov-model-953d45338f24
    tag_types = ['NN', 'VB', 'JJ', 'RB']
    pos_counts = []
    for text in docs:
        tokens = nltk.word_tokenize(str(text).lower())
        tagged = nltk.pos_tag(nltk.Text(tokens))
        counts = nltk.Counter(tag for word, tag in tagged)
        total = sum(counts.values())
        # relative frequency of each tag within this document
        tag_ratios = dict(
            (tag, float(count) / total) for tag, count in counts.items())
        # original bug: it tested `my_dict.get(x) in tag_types`, i.e. whether
        # the ratio itself was a tag name, so every ratio came out as 0.0
        tag_counts = []
        for x in tag_types:
            tag_counts.append(tag_ratios.get(x, 0.0))
        pos_counts.append(tag_counts)
    pos_counts_transpose = list(map(list, zip(*pos_counts)))
    self.__num_pos_count = len(pos_counts_transpose)
    return pos_counts_transpose
import re

import nltk
from nltk.tokenize import word_tokenize


def get_TTR(post):
    """Return the type-token ratio (distinct tokens / total tokens) of a post."""
    post = post.lower()
    post = re.sub(r'\W', ' ', post)
    tokens = word_tokenize(post)
    types = nltk.Counter(tokens)
    ttr = len(types) / len(tokens)
    return ttr
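# Quick sanity check of get_TTR (illustrative, not from the original source;
# requires NLTK's 'punkt' tokenizer data). "The cat sat on the mat." yields
# 6 tokens and 5 distinct types, so the expected ratio is 5/6.
print(get_TTR("The cat sat on the mat."))  # -> 0.8333...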
import nltk


def stem_tokens(tokens):
    # instantiate the lemmatizer once instead of once per token
    lemmatizer = nltk.WordNetLemmatizer()
    # stemming alternative: nltk.PorterStemmer().stem(item)
    stemmed = [lemmatizer.lemmatize(item) for item in tokens]
    count = nltk.Counter(stemmed)
    # print(count.most_common(10))
    return stemmed
import nltk


def TTR(chunklist):
    """Average type-token ratio (as a percentage) over a list of text chunks."""
    score = 0
    for chunk in chunklist:
        tokens = nltk.word_tokenize(chunk)
        types = nltk.Counter(tokens)
        score += (len(types) / len(tokens)) * 100
    return score / len(chunklist)
import nltk


def ttr_calc(paragraph):
    # preprocess_ttr is assumed to be defined elsewhere in the project
    tokens = preprocess_ttr(paragraph)
    types = nltk.Counter(tokens)
    try:
        ttr = (len(types) / len(tokens)) * 100
    except ZeroDivisionError:
        ttr = 0
    return ttr
import nltk


def feature_extractor(words):
    # cricket-domain keyword features, all initialised to zero counts
    # (the original dict literal listed 'scored' twice; duplicate removed)
    features = {
        'highest': 0, 'scored': 0, 'runs': 0, 'scorer': 0, 'score': 0,
        'match': 0, 'wickets': 0, 'boundary': 0, 'fours': 0, '4s': 0,
        'six': 0, 'sixes': 0, '6s': 0, '6': 0, 'hit': 0, 'four': 0,
        'aggregate': 0, 'total': 0, 'team': 0, 'lead': 0, 'leading': 0,
        'maximum': 0, 'max': 0, 'minimum': 0, 'min': 0, 'least': 0,
        'less': 0, '1st': 0, '2nd': 0, '3rd': 0, '4th': 0, '5th': 0,
        '6th': 0, '7th': 0, '8th': 0, '9th': 0, '10th': 0, 'dot': 0,
        'dots': 0, 'faced': 0, 'entire': 0, 'whole': 0, 'season': 0,
        'strike': 0, 'rate': 0, 'strikerate': 0
    }
    word_counts = nltk.Counter(words)
    for word in word_counts:
        if word in features:
            features[word] = word_counts[word]
    return features
def run(self):
    with open(self.xml, encoding="utf-8") as fd:
        tree = xmltodict.parse(fd.read(), xml_attribs=False, force_list=True)
    document = getFullText(tree)
    doc_id = search(tree, "id")[0]
    text = clean_text(document)
    words = text_to_word_sequence(text)
    filtered_doc = [
        w.lower() for w in words
        if w not in self.stop_words and w != '' and w.isalpha() and len(w) > 1
    ]
    self.corpus[doc_id] = dict(nltk.Counter(filtered_doc))
    self.corpusWcount[doc_id] = filtered_doc
import nltk


def count_noun(file_name):
    # read the whole file, then count each noun tag (NN, NNS, NNP, NNPS)
    with open(file_name, 'r', encoding='utf-8') as txt_file:
        long_line = txt_file.read()
    long_token = nltk.tokenize.word_tokenize(long_line)
    tags = nltk.pos_tag(long_token)
    count = nltk.Counter(tag for word, tag in tags if tag.startswith('NN'))
    print(count)
import re

import nltk as nlp


def get_TTR(data):
    # Remove all special characters using regex
    data = re.sub(r'[^\w]', ' ', data)
    # Convert data to lowercase
    data = data.lower()
    # Tokenize the data to get the word list
    tokens = nlp.word_tokenize(data)
    # Count each distinct token
    types = nlp.Counter(tokens)
    # Return the type-token ratio as a percentage
    return (len(types) / len(tokens)) * 100
import nltk
from nltk.corpus import wordnet


def lemmatisation(word):
    """Return the most common WordNet part of speech ('n', 'v', 'a' or 'r')
    for `word`."""
    w_syn = wordnet.synsets(word)
    position = nltk.Counter()
    position["n"] = len([item for item in w_syn if item.pos() == "n"])
    position["v"] = len([item for item in w_syn if item.pos() == "v"])
    position["a"] = len([item for item in w_syn if item.pos() == "a"])
    position["r"] = len([item for item in w_syn if item.pos() == "r"])
    return position.most_common(1)[0][0]
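# Illustrative call (not from the original source; requires the WordNet
# corpus to be downloaded). Most synsets of "dog" are nouns, so the expected
# tag is 'n'; for an out-of-vocabulary word all four counts are 0 and the
# winner among the ties is arbitrary.
print(lemmatisation("dog"))  # -> 'n'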
import nltk


def analyze_text(text):
    # collect every adjective (tag 'JJ') in the text, with counts
    adj_list = []
    for sentence in nltk.sent_tokenize(text):
        for word, pos in nltk.pos_tag(nltk.word_tokenize(sentence)):
            if pos == 'JJ':
                adj_list.append(word)
    count = nltk.Counter(adj_list)
    return adj_list, count
import datetime

import nltk
from nltk.corpus import stopwords


def etape2_get_freq_word_and_stop_word(data, nb_mots_max=100):
    '''Build a counter of the most frequent words and the stopword set.

    INPUT:
    ------
    - data

    OUTPUT:
    -------
    - freq_totale : counter of the most frequently cited words
    - sw : stopwords (English defaults + corpus-specific frequent words)

    PREREQUISITE:
    -------------
    - data must have the 'Counter_WORD1' column created in step 1 (etape1)

    EXAMPLE CALL:
    -------------
    freq_words, sw = etape2(df2, nb_mots_max=10)
    '''
    debut = datetime.datetime.now()
    print('Etape2 : DEB {}'.format(debut))

    freq_totale = nltk.Counter()
    data_text = data['Counter_WORD1']
    for m_id, token in data_text.items():  # .iteritems() was removed in pandas 2.0
        freq_totale += nltk.FreqDist(token)
    most_freq = list(zip(*freq_totale.most_common(nb_mots_max)))[0]

    # The final stopword set combines the nb_mots_max most frequent words of
    # the corpus with NLTK's default English stopword list.
    nltk.data.path.append(
        r"/Users/seb/Workspace/Dev/Formation-OC/LIBRAIRIES/nltk_data")
    sw = set()
    sw.update(stopwords.words('english'))
    sw.update(most_freq)

    fin = datetime.datetime.now()
    print('Etape2 : FIN : {}'.format(fin - debut))
    return freq_totale, sw
def count_ne(tagging):
    """
    Count the number of occurrences of each named entity in a given tagging.

    :param tagging: Given tagging of sentence.
    :return: Dictionary of counts, or None if the tagging is empty.
    """
    if not tagging:
        return None
    return dict(nltk.Counter(tagging))
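# A minimal sketch exercising count_ne (assumes `import nltk`; the tag list
# below is hypothetical and would in practice come from an NE tagger):
example_tags = ['PERSON', 'O', 'PERSON', 'GPE']
print(count_ne(example_tags))  # -> {'PERSON': 2, 'O': 1, 'GPE': 1}
print(count_ne([]))            # -> None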
import json
import string

import nltk


def get_tokens():
    corpus = ""
    with open("out/nyt_articles.json") as data_file:
        data = json.load(data_file)
    for article in data:
        corpus += article['text']
    lowers = corpus.lower()
    no_punctuation = lowers.translate(
        str.maketrans("", "", string.punctuation))
    toker = nltk.RegexpTokenizer(r'\w+')
    tokens = toker.tokenize(no_punctuation)
    count = nltk.Counter(tokens)
    print(count.most_common(10))
    return tokens
def index_pages():
    sites = [
        'e-prostor.gov.si', 'e-uprava.gov.si', 'evem.gov.si',
        'podatki.gov.si'
    ]
    data = []
    allTokens = []
    for site in sites:
        root = '../input-indexing/' + site + "/"
        (_, _, filenames) = next(walk(root))
        html_files = [f for f in filenames if f.endswith(".html")]
        for file in html_files:
            print("file = " + file)
            text = get_text(get_html_content(site, file))
            tokens = retrieve_tokens(text)
            allTokens += tokens
            # per-document term frequencies and positional indices
            # https://towardsdatascience.com/text-summarization-using-tf-idf-e64a0644ace3
            freq_table = {}
            indices = {}
            for i, token in enumerate(tokens):
                if token in freq_table:
                    freq_table[token] += 1
                    indices[token].append(str(i))
                else:
                    freq_table[token] = 1
                    indices[token] = [str(i)]
            for word, frequency in freq_table.items():
                data.append((word, site + '/' + file, frequency,
                             ','.join(indices[word])))
    text_counts = nltk.Counter(allTokens)
    dist_words = set(text_counts)
    return [(token, ) for token in dist_words], data
def count_ne_normalized(tagging):
    """
    Count the relative frequency of each named entity in a given tagging.

    :param tagging: Given tagging of sentence.
    :return: Dictionary of normalized counts, or None if the tagging is empty.
    """
    # original bug: len(tagging) was taken before the emptiness check, so a
    # None tagging raised TypeError instead of returning None
    if not tagging:
        return None
    length = len(tagging)
    ne_counts = nltk.Counter(tagging)
    return {key: float(count) / length for key, count in ne_counts.items()}
import string

import nltk
from nltk.corpus import stopwords


def get_tokens(text):
    lowers = text.lower()
    # remove the punctuation using the character deletion step of translate
    remove_punctuation_map = dict(
        (ord(char), None) for char in string.punctuation)
    no_punctuation = lowers.translate(remove_punctuation_map)
    tokens = nltk.word_tokenize(no_punctuation)
    stop_words = set(stopwords.words('english'))
    filtered = [w for w in tokens if w not in stop_words]
    postagged = nltk.pos_tag(filtered)
    # get_wordnet_pos (defined elsewhere) maps Penn tags to WordNet POS tags
    lemmatizer = nltk.WordNetLemmatizer()
    word_lemmatized = [
        lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        for word, tag in postagged
    ]
    count = nltk.Counter(word_lemmatized)
    return word_lemmatized, count
import re

import nltk
import spacy
from nltk.tokenize import sent_tokenize


def main():
    # validate_url, crawl_web, remove_stopwords and lematize are project helpers
    input_url = input("Enter the URL which you would like to analyze:")
    input_url += '/'
    if not validate_url(input_url):
        print('Entered URL is not valid, please enter the URL again')
        main()
        return

    web_text = crawl_web(input_url)
    web_text = re.sub('[!@#$():]', ' ', web_text)
    nlp = spacy.load('en_core_web_sm')
    parse_list = []
    for sent in sent_tokenize(web_text):
        doc = nlp(sent)
        for chunk in doc.noun_chunks:
            parse_list.append(chunk.text)

    # Removal of stopwords
    updated_parse = remove_stopwords(parse_list)
    # Lemmatization
    updated_parse_new = lematize(updated_parse)

    # Counter can count the lemmas directly; no need to build a dict by hand
    counter_dic = nltk.Counter(updated_parse_new)
    # print(counter_dic.most_common())

    # print more of the most common noun chunks as the vocabulary grows
    if len(counter_dic) < 500:
        top_n = 10
    elif len(counter_dic) < 750:
        top_n = 15
    else:
        top_n = 20
    for k, v in counter_dic.most_common(top_n):
        print(k)
# (fragment) fill prob_distArray with the unigram probability of each sample
for s in prob_dist_uni.samples():
    prob_distArray.append(prob_dist_uni.prob(s))
i = 0
for lim in freq_dist_uni.most_common(10):
    print(lim, prob_distArray[i])
    i += 1

# Expected-Likelihood-Estimate smoothing over the same frequency distribution
elep = ELEProbDist(freq_dist_uni)
prob_distArray = []  # reset; the original kept appending and re-printed the old values
for s in elep.samples():
    prob_distArray.append(elep.prob(s))
i = 0
for lim in freq_dist_uni.most_common(10):
    print(lim, prob_distArray[i], "\n")
    i += 1

uniqueWords = len(set(tokenized_text))
print("Unique Words: ", uniqueWords, "\n")

bigram_count = bigrams(tokenized_text)
counts = nltk.Counter(bigram_count)
print("Bigram Count: ", counts, "\n",
      "Most Common 10 bigram: ", counts.most_common(10), "\n",
      "Least Common 3 words: ", counts.most_common()[-3:], "\n")

word_mapping = dict(
    (w, w) if freq_dist_uni[w] > 1 else (w, 'UNK') for w in tokenized_text)
print("less frequent unigrams to UNK: ", word_mapping)

# original bug: `counts` is keyed by bigram tuples, so counts[w] for a single
# token is always 0 and every entry became 'UNK'; map over bigrams instead
word_mapping = dict(
    (b, b) if counts[b] > 1 else (b, 'UNK') for b in bigrams(tokenized_text))
print("less frequent bigrams to UNK: ", word_mapping)
''' Obtain words within all titles and bodytexts '''
words = ""
titles = dfs['title'].values
bodies = dfs['body_text'].values
for title in titles:
    if str(title) != 'nan':
        words += title
for body in bodies:
    if str(body) != 'nan':
        words += body

# remove stop words
text_tokens = word_tokenize(words.strip().lower())
tokens_without_sw = [word for word in text_tokens if word not in stopWords]
counter_words = nltk.Counter(tokens_without_sw)
tokens_without_sw_str = str(tokens_without_sw)

''' Word Cloud '''
data = WordCloud(background_color="white", max_words=200)
data.generate(tokens_without_sw_str)
data.to_file(CSV_FILE_DIR_HEAD + 'figure/wordcloud.eps')
# Cleaning the corpus
# def clean_data(text):
#     text = str(text).lower()
#     text = re.sub('\[.*?\]', '', text)
#     text = re.sub('https?://\S+|www\.\S+', '', text)
#     text = re.sub('<.*?>+', '', text)
#     text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
#     text = re.sub('\n', '', text)
#     text = re.sub('\w*\d\w*', '', text)
#     return text
#
# train['text'] = train['text'].apply(lambda x: clean_data(x))
# train['selected_text'] = train['selected_text'].apply(lambda x: clean_data(x))
# print(train.head())


# Visualise the most common words in the selected text
stop_words = set(stopwords.words('english'))


def remove_stopword(text):
    return [y for y in text if y not in stop_words]


train['temp_list'] = train['selected_text'].apply(lambda x: str(x).split())
train['temp_list'] = train['temp_list'].apply(remove_stopword)
top = nltk.Counter(
    [item for sublist in train['temp_list'] for item in sublist])
temp = pd.DataFrame(top.most_common(20))
temp.columns = ['Common_words', 'count']
temp.style.background_gradient(cmap='Blues')
columns_sel = [
    'abv', 'colour', 'country', 'description', 'grape_variety', 'name',
    'Body'
]
a_concat = pd.concat([a0[columns_sel], a1[columns_sel]]).reset_index()
grapes = a_concat['grape_variety'].tolist()

# in case we do not want to blend grapes
grape = [
    grape for x in grapes if len(x.split(',')) == 1
    for grape in x.split(', ')
]
fdist = nltk.FreqDist(grape)
result = a_concat['grape_variety']

# drop sparsely represented varieties (keep those with more than 40 rows)
counts = nltk.Counter(result)
varieties = [key for key in counts if counts[key] > 40]
data_input = a_concat[a_concat['grape_variety'].isin(varieties)].reset_index()

# split the data into train and test
combined_features = ['Body', 'description', 'grape_variety']
target = 'grape_variety'
X_train, X_test, y_train, y_test = train_test_split(
    data_input[combined_features], data_input[target],
    test_size=0.33, random_state=42)

# aggregate description by grape type
grouped = X_train[['grape_variety',
""" import nltk as nlp import re import matplotlib.pyplot as plt #open the plaintext file file = open("The_Gift_of_The_Magi.txt") # Read the file into a string type variable called 'text' text = file.read() #Using Regex (Regular Expressions) remove all the special characters in the text. #This is done by selecting all special characters and substituting them with a space character. text = re.sub(r'[^\w]', ' ', text) # Tokenize the text and store it in a list called tokens tokens = nlp.word_tokenize(text) # Using the Counter function, obtatin the frequency of each token and store it as a 'key - value' # Pair in a dictionary called types types = nlp.Counter(tokens) # For Debugging purposes print(tokens) print(types) # Extract all the 'keys' from the dictionary types and store it as a list in X X = list(types.keys()) # Declare an empty list Y Y = [] #For Debugging purposes print(X) # For each item of key in X, find the corresponding value in the dictionary types and store it in list Y for key in X: Y.append(types[key]) # For Debugging purposes print(Y)
def get_news_features(headline, text):
    nlp = es_core_news_md.load()

    ## headline ##
    headline = re.sub(r"http\S+", "", headline)
    headline = re.sub(r"http", "", headline)
    headline = re.sub(r"@\S+", "", headline)
    headline = re.sub("\n", " ", headline)
    headline = re.sub(r"(?<!\n)\n(?!\n)", " ", headline)
    headline = headline.replace(r"*NUMBER*", "número")
    headline = headline.replace(r"*PHONE*", "número")
    headline = headline.replace(r"*EMAIL*", "email")
    headline = headline.replace(r"*URL*", "url")
    headline_lower = headline.lower()
    doc_h = nlp(headline_lower)

    list_tokens_h = []
    for sentence_h in doc_h.sents:
        for token in sentence_h:
            list_tokens_h.append(token.text)

    fdist_h = FreqDist(list_tokens_h)
    syllables_h = get_nsyllables(headline)
    words_h = len(list_tokens_h)

    # headline complexity features
    avg_word_size_h = round(
        sum(len(word) for word in list_tokens_h) / words_h, 2)
    avg_syllables_word_h = round(syllables_h / words_h, 2)
    unique_words_h = round((len(fdist_h.hapaxes()) / words_h) * 100, 2)
    mltd_h = round(ld.mtld(list_tokens_h), 2)
    ttr_h = round(ld.ttr(list_tokens_h) * 100, 2)

    ## text content ##
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"http", "", text)
    text = re.sub("\n", " ", text)
    text = text.replace(r"*NUMBER*", "número")
    text = text.replace(r"*PHONE*", "número")
    text = text.replace(r"*EMAIL*", "email")
    text = text.replace(r"*URL*", "url")

    # to later calculate the upper-case letters ratio
    alph = list(filter(str.isalpha, text))
    text_lower = text.lower()
    doc = nlp(text_lower)

    list_tokens = []
    list_pos = []
    list_tag = []
    list_entities = []
    sents = 0
    for entity in doc.ents:
        list_entities.append(entity.label_)
    for sentence in doc.sents:
        sents += 1
        for token in sentence:
            list_tokens.append(token.text)
            list_pos.append(token.pos_)
            list_tag.append(token.tag_)

    # calculate entity, POS, tag, frequency, syllable, word and quote counts
    entities = len(list_entities)
    n_pos = nltk.Counter(list_pos)
    n_tag = nltk.Counter(list_tag)
    fdist = FreqDist(list_tokens)
    syllables = get_nsyllables(text)
    words = len(list_tokens)
    quotes = n_tag['PUNCT__PunctType=Quot']

    # complexity features
    avg_word_sentence = round(words / sents, 2)
    avg_word_size = round(sum(len(word) for word in list_tokens) / words, 2)
    avg_syllables_word = round(syllables / words, 2)
    unique_words = round((len(fdist.hapaxes()) / words) * 100, 2)
    ttr = round(ld.ttr(list_tokens) * 100, 2)

    # Spanish readability tests (Fernández Huerta and Szigriszt-Pazos)
    huerta_score = round(
        206.84 - (60 * avg_syllables_word) - (1.02 * avg_word_sentence), 2)
    szigriszt_score = round(
        206.835 - ((62.3 * syllables) / words) - (words / sents), 2)

    # stylometric features
    mltd = round(ld.mtld(list_tokens), 2)
    upper_case_ratio = round(sum(map(str.isupper, alph)) / len(alph) * 100, 2)
    entity_ratio = round((entities / words) * 100, 2)
    quotes_ratio = round((quotes / words) * 100, 2)
    propn_ratio = round((n_pos['PROPN'] / words) * 100, 2)
    noun_ratio = round((n_pos['NOUN'] / words) * 100, 2)
    pron_ratio = round((n_pos['PRON'] / words) * 100, 2)
    adp_ratio = round((n_pos['ADP'] / words) * 100, 2)
    det_ratio = round((n_pos['DET'] / words) * 100, 2)
    punct_ratio = round((n_pos['PUNCT'] / words) * 100, 2)
    verb_ratio = round((n_pos['VERB'] / words) * 100, 2)
    adv_ratio = round((n_pos['ADV'] / words) * 100, 2)
    sym_ratio = round((n_tag['SYM'] / words) * 100, 2)

    # create df_features
    df_features = pd.DataFrame({
        'text': text_lower,
        'headline': headline_lower,
        'words_h': words_h,
        'word_size_h': [avg_word_size_h],
        'avg_syllables_word_h': [avg_syllables_word_h],
        'unique_words_h': [unique_words_h],
        'ttr_h': ttr_h,
        'mltd_h': [mltd_h],
        'sents': sents,
        'words': words,
        'avg_words_sent': [avg_word_sentence],
        'avg_word_size': [avg_word_size],
        'avg_syllables_word': avg_syllables_word,
        'unique_words': [unique_words],
        'ttr': [ttr],
        'huerta_score': [huerta_score],
        'szigriszt_score': [szigriszt_score],
        'mltd': [mltd],
        'upper_case_ratio': [upper_case_ratio],
        'entity_ratio': [entity_ratio],
        'quotes': quotes,
        'quotes_ratio': [quotes_ratio],
        'propn_ratio': [propn_ratio],
        'noun_ratio': [noun_ratio],
        'pron_ratio': [pron_ratio],
        'adp_ratio': [adp_ratio],
        'det_ratio': [det_ratio],
        'punct_ratio': [punct_ratio],
        'verb_ratio': [verb_ratio],
        'adv_ratio': [adv_ratio],
        'sym_ratio': [sym_ratio]
    })
    return df_features
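# Worked example of the two readability formulas above (illustrative numbers,
# not from the original project): for a text averaging 1.8 syllables per word
# and 20 words per sentence, the Fernández Huerta score is
#   206.84 - 60 * 1.8 - 1.02 * 20 = 206.84 - 108.0 - 20.4 = 78.44
# and for 900 syllables over 500 words in 25 sentences, Szigriszt-Pazos gives
#   206.835 - (62.3 * 900) / 500 - 500 / 25 = 206.835 - 112.14 - 20 = 74.695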
def get_ngrams(tokens):
    # gram_target is a module-level list of n-gram orders, defined elsewhere
    n_grams = []
    for i in gram_target:
        items = nltk.ngrams(tokens, i)
        n_grams.append(nltk.Counter(items))
    return n_grams
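# Hedged usage sketch (assumes `import nltk`; the gram_target value below is
# hypothetical, set here only for demonstration):
gram_target = [1, 2]
print(get_ngrams(['a', 'b', 'a']))
# -> [Counter({('a',): 2, ('b',): 1}), Counter({('a', 'b'): 1, ('b', 'a'): 1})]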
def caltf_idf_Word_Counter(Sentences):
    word_Counts = []
    for s in Sentences:
        word_Counts.append(nltk.Counter(nltk.word_tokenize(s)))
    return word_Counts
def analyze(self, corpus, mode):
    """A 'corpus' here means a text file in either the keyword or country
    directory."""
    # Read the subcorpus, determining keywords and most common collocates
    with open(corpus, 'r', encoding='utf-8', errors='ignore') as f:
        subcorpus = json.load(f)
    name = os.path.basename(corpus)
    name = name.split('.')[0]

    subcorpus_keywords = defaultdict(int)
    subcorpus_adjectives = defaultdict(int)
    subcorpus_verbs = defaultdict(int)
    subcorpus_collocates = defaultdict(int)
    subcorpus_bigrams = defaultdict(int)
    subcorpus_hashtags = defaultdict(int)
    subcorpus_sentiment = defaultdict(int)
    clean_text = []

    for tweet in subcorpus:
        self.total_tweets['total'] += 1
        self.total_tweets[name] += 1
        tweet_text = tweet['text']
        for hashtag in tweet['hashtags']:
            subcorpus_hashtags[hashtag] += 1
            tweet_text = tweet_text.replace(f'#{hashtag}', '')
        keywords, adjectives, verbs = self.find_keywords(tweet_text, name)
        for keyword, occurrences in keywords.items():
            subcorpus_keywords[keyword] += occurrences
        for adj, occurrences in adjectives.items():
            subcorpus_adjectives[adj] += occurrences
        for verb, occurrences in verbs.items():
            subcorpus_verbs[verb] += occurrences
        if mode == 'country':
            bigrams = self.find_bigrams(tweet_text)
            for bigram, occurrences in bigrams.items():
                subcorpus_bigrams[bigram] += occurrences
        elif mode == 'term':
            collocates = self.find_collocates(tweet_text, name)
            for collocate, occurrences in collocates.items():
                subcorpus_collocates[collocate] += occurrences
        subcorpus_sentiment[tweet['sentiment']] += 1
        clean_text.append(tweet_text)
    print(self.total_tweets['total'])

    """Create a bar chart of the keywords"""
    keyword_counter = nltk.Counter(subcorpus_keywords)
    labels = []
    values = []
    for keyword, count in keyword_counter.most_common(20):
        labels.append(keyword)
        values.append(count)
    graph_data_bar(labels, values,
                   f'Most common keywords in the {name} subcorpus',
                   'Occurrences', 'Keyword')

    adj_counter = nltk.Counter(subcorpus_adjectives)
    labels = []
    values = []
    for adj, count in adj_counter.most_common(20):
        labels.append(adj)
        values.append(count)
    graph_data_bar(labels, values,
                   f'Most common adjectives in the {name} subcorpus',
                   'Occurrences', 'Adjective')

    verb_counter = nltk.Counter(subcorpus_verbs)
    labels = []
    values = []
    for verb, count in verb_counter.most_common(20):
        labels.append(verb)
        values.append(count)
    graph_data_bar(labels, values,
                   f'Most common verbs in the {name} subcorpus',
                   'Occurrences', 'Verb')

    if mode == 'country':
        """Create a list/csv of the most common bigrams"""
        bigram_counter = nltk.Counter(subcorpus_bigrams)
        outputpath = os.path.join('..', 'data', 'csv_files')
        os.makedirs(outputpath, exist_ok=True)
        file = os.path.join(outputpath, f'{name}_bigrams.csv')
        if os.path.exists(file):
            os.remove(file)
        print(f'\nMost common bigrams in the {name} corpus:')
        with open(file, mode='w') as bigram_list:
            writer = csv.writer(bigram_list, delimiter=',', quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
            for n, (bigram, occurrences) in enumerate(
                    bigram_counter.most_common(20)):
                print(f' {n + 1}. {bigram} ({occurrences} samples)')
                writer.writerow([bigram, occurrences])
        input('\n')
    elif mode == 'term':
        """Create a bar graph of the most common collocates"""
        collocate_counter = nltk.Counter(subcorpus_collocates)
        labels = []
        values = []
        pre_labels = []
        post_labels = []
        pre_values = []
        post_values = []
        for (w1, w2), count in collocate_counter.most_common(20):
            if len(name.split()) == 2:
                # this collocate is for a two-word term such as 'asylum seeker'
                if name.split()[1] == w1:
                    # post-word collocate
                    labels.append(w2)
                    values.append(count)
                    post_labels.append(w2)
                    post_values.append(count)
                elif name.split()[0] == w2:
                    # pre-word collocate
                    labels.append(w1)
                    values.append(count)
                    pre_labels.append(w1)
                    pre_values.append(count)
            elif name == w1:
                # post-word collocate: the search term is the first word in the bigram
                labels.append(w2)
                values.append(count)
                post_labels.append(w2)
                post_values.append(count)
            else:
                # pre-word collocate: the search term is the second word in the bigram
                labels.append(w1)
                values.append(count)
                pre_labels.append(w1)
                pre_values.append(count)
        graph_data_bar(labels, values, f'Most common collocates of {name}',
                       'Occurrences', 'Collocate')
        graph_data_bar(pre_labels, pre_values,
                       f'Most common pre-word collocates of {name}',
                       'Occurrences', 'Collocate')
        graph_data_bar(post_labels, post_values,
                       f'Most common post-word collocates of {name}',
                       'Occurrences', 'Collocate')

    """Create a list/csv of the hashtags"""
    hashtag_counter = nltk.Counter(subcorpus_hashtags)
    print(f'\nMost common hashtags in the {name} corpus:')
    outputpath = os.path.join('..', 'data', 'csv_files')
    os.makedirs(outputpath, exist_ok=True)
    file = os.path.join(outputpath, f'{name}_hashtags.csv')
    if os.path.exists(file):
        os.remove(file)
    with open(file, mode='w') as hashtag_list:
        writer = csv.writer(hashtag_list, delimiter=',', quotechar='"',
                            quoting=csv.QUOTE_MINIMAL)
        for n, (key, value) in enumerate(hashtag_counter.most_common(20)):
            print(f' {n + 1}. {key} ({value} samples)')
            writer.writerow([key, value])
    input('\n')

    """Create a pie chart of the sentiment distribution"""
    labels = ['positive', 'negative', 'neutral']
    values = [
        subcorpus_sentiment['positive'], subcorpus_sentiment['negative'],
        subcorpus_sentiment['neutral']
    ]
    graph_data_pie(labels, values,
                   f'Sentiment distribution in the {name} subcorpus')

    """Create a clean text file for AntConc"""
    path = os.path.join(corpus, '..', 'clean')
    os.makedirs(path, exist_ok=True)
    outname = os.path.join(path, f'{name}.txt')
    with open(outname, 'w', encoding='utf-8') as outfile:
        for clean_tweet in clean_text:
            outfile.write(clean_tweet)
variety = {}
for ii in range(len(result)):
    tmp = result.iloc[ii].lower()
    tmp = tmp.split(',')
    tmp = [re.sub(r'^ ', '', x) for x in tmp]
    tmp = [re.sub(r' $', '', x) for x in tmp]
    tmp = [shiraz_filter(samp) for samp in tmp]
    tmp = str(set(tmp)).replace("'", '').replace('{', '').replace('}', '')
    variety[ii] = tmp
result = pd.Series(variety)
# a['Variety'] = result

# drop sparsely represented varieties (keep those with more than 30 rows)
counts = nltk.Counter(result)
varieties = [key for key in counts if counts[key] > 30]
# data_input = a[a['Variety'].isin(varieties)]
data_input = a[a['grape_variety'].isin(varieties)].reset_index()

############################################
# alternative POS tag sets that were tried:
# defTags = ['CC', '.', ',', 'IN', ';', 'PRP', 'DT', 'MD', 'PDT', 'POS',
#            'TO', 'WDT', 'WP', 'WRB', 'NNP', 'RP']
# defTags = ['CC', '.', ',', 'IN', ';', 'DT', 'TO', 'CD']
defTags = ['NNS', 'NN', 'JJ', 'JJS', 'JJR']  # , 'RB', 'RBS', 'RBR', 'VBD', 'VBZ'
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+'


def clean_function(ll):
    list_to_remove = ["'", 's', '-']
def calculateTermFrequency(self):
    print("Calculating term frequency for all words. Please wait.")
    self.tf_list = nltk.Counter(self.word_list)