def adj_noun_phrase(text_description):
    """Extract adjective-noun bigrams (e.g. "great book") from the text.

    The phrase list, its total, the per-phrase frequencies, and the most
    frequent phrase are also written to CSV files via the create_csv* helpers.

    Args:
        text_description: raw text; coerced with str() and lower-cased.

    Returns:
        dict with keys 'adj_noun_phrases', 'adj_noun_phrase_count',
        'adj_noun_phrase_frequency' (a Counter), and
        'favorite_adj_noun_phrase' (None when no phrase was found).
    """
    # Bug fix: the original called nlp.pipe(...) here and discarded the
    # result; nlp.pipe returns a lazy generator, so the call was dead code.
    doc = nlp(str(text_description).lower())
    matcher = Matcher(nlp.vocab)
    # ADJ token immediately followed by a NOUN token.
    pattern = [{'POS': 'ADJ'}, {'POS': 'NOUN'}]
    # NOTE(review): (name, callback, pattern) is the spaCy 2.x add()
    # signature; spaCy 3 expects matcher.add(name, [pattern]) -- confirm the
    # installed version before upgrading.
    matcher.add('ADJ_NOUN_PATTERN', None, pattern)
    matches = matcher(doc)
    print("Total matches found", len(matches))
    adj_noun_phrases = [doc[start:end].text for _, start, end in matches]
    create_csv("total_adj_noun.csv", "Total_adj_noun", len(adj_noun_phrases))
    create_csv_list("adj_noun_phrase_list.csv", "Adj_noun_phrase_list", adj_noun_phrases)
    adj_noun_phrase_frequency = Counter(adj_noun_phrases)
    create_csv_dictionary("adj_noun_frequency.csv", "Adj_noun_phrases", "Frequency",
                          adj_noun_phrase_frequency)
    # Bug fix: max() on an empty Counter raises ValueError on text with no
    # adjective-noun pairs; fall back to None instead.
    favorite_adj_noun_phrase = (
        max(adj_noun_phrase_frequency, key=adj_noun_phrase_frequency.get)
        if adj_noun_phrase_frequency else None
    )
    create_csv("favorite_adj_noun.csv", "Favorite_adj_noun_phrase", favorite_adj_noun_phrase)
    return {'adj_noun_phrases': adj_noun_phrases,
            'adj_noun_phrase_count': len(adj_noun_phrases),
            'adj_noun_phrase_frequency': adj_noun_phrase_frequency,
            'favorite_adj_noun_phrase': favorite_adj_noun_phrase}
def noun_adj_phrase(text_description):
    """Extract noun-adjective bigrams (a NOUN token followed by an ADJ token).

    The phrase list, its total, the per-phrase frequencies, and the most
    frequent phrase are also written to CSV files via the create_csv* helpers.

    Args:
        text_description: raw text; coerced with str() and lower-cased.

    Returns:
        dict with keys 'noun_adj_phrases', 'noun_adj_phrase_count',
        'noun_adj_phrase_frequency' (a Counter), and
        'favorite_noun_adj_phrase' (None when no phrase was found).
    """
    # Bug fix: the original called nlp.pipe(...) and discarded the lazy
    # generator it returns -- dead code, removed.
    doc = nlp(str(text_description).lower())
    matcher = Matcher(nlp.vocab)
    pattern = [{'POS': 'NOUN'}, {'POS': 'ADJ'}]
    # NOTE(review): spaCy 2.x add() signature; spaCy 3 expects
    # matcher.add(name, [pattern]) -- confirm the installed version.
    matcher.add('NOUN_ADJ_PATTERN', None, pattern)
    matches = matcher(doc)
    print("Total matches found:", len(matches))
    noun_adj_phrases = [doc[start:end].text for _, start, end in matches]
    create_csv("total_noun_adj.csv", "Total_noun_adj", len(noun_adj_phrases))
    create_csv_list("noun_adj_phrase_list.csv", "Noun_adj_phrase_list", noun_adj_phrases)
    noun_adj_phrase_frequency = Counter(noun_adj_phrases)
    create_csv_dictionary("noun_adj_frequency.csv", "Noun_adj_phrases", "Frequency",
                          noun_adj_phrase_frequency)
    # Bug fix: the original wrapped the whole body in `except Exception: pass`,
    # silently returning None on any error.  The only realistic failure was
    # max() on an empty Counter, so guard that case explicitly instead.
    favorite_noun_adj_phrase = (
        max(noun_adj_phrase_frequency, key=noun_adj_phrase_frequency.get)
        if noun_adj_phrase_frequency else None
    )
    create_csv("favorite_noun_adj.csv", "Favorite_noun_adj_phrase", favorite_noun_adj_phrase)
    return {'noun_adj_phrases': noun_adj_phrases,
            'noun_adj_phrase_count': len(noun_adj_phrases),
            'noun_adj_phrase_frequency': noun_adj_phrase_frequency,
            'favorite_noun_adj_phrase': favorite_noun_adj_phrase}
def words_without_stopwords(text_description):
    """Return the document's tokens minus stop words, punctuation and spaces.

    The filtered word list is also written to text_without_stopwords.csv.

    Args:
        text_description: raw text; coerced with str() and lower-cased.

    Returns:
        dict with key 'text_without_stopwords' (possibly an empty list).
    """
    # Bug fix: the original called nlp.pipe(...) and discarded the lazy
    # generator it returns -- dead code, removed.
    doc = nlp(str(text_description).lower())
    # token.text (a plain str) is kept because spaCy Token objects are not
    # JSON serializable.
    text_without_stopwords = [
        token.text for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    # The original's broad try/except around this comprehension was
    # unreachable in practice (the attribute reads cannot fail) and has been
    # removed; an empty result is returned as an empty list.
    create_csv_list("text_without_stopwords.csv", "text_without_stopwords",
                    text_without_stopwords)
    return {'text_without_stopwords': text_without_stopwords}
def total_nouns(text_description):
    """Noun tokens, their total count, frequencies, and the most frequent noun.

    All results are also written to CSV files via the create_csv* helpers.

    Args:
        text_description: raw text; coerced with str() and lower-cased.

    Returns:
        dict with keys 'nouns', 'noun_count', 'noun_frequency' (a Counter),
        and 'favorite_noun' (None when the text contains no noun).
    """
    # Bug fix: the original called nlp.pipe(...) and discarded the lazy
    # generator it returns -- dead code, removed.
    doc = nlp(str(text_description).lower())
    nouns = [token.text for token in doc if token.pos_ == 'NOUN']
    create_csv_list("noun_list.csv", "Noun_list", nouns)
    create_csv("total_no_of_noun.csv", "Total_noun", len(nouns))
    noun_frequency = Counter(nouns)
    create_csv_dictionary("noun_frequency.csv", "nouns", "frequency", noun_frequency)
    # Bug fix: max() on an empty Counter raises ValueError; guard it.
    favorite_noun = max(noun_frequency, key=noun_frequency.get) if noun_frequency else None
    create_csv("favorite_noun.csv", "favorite_noun", favorite_noun)
    return {'nouns': nouns, 'noun_count': len(nouns),
            'noun_frequency': noun_frequency, 'favorite_noun': favorite_noun}
def total_verbs(text_description):
    """Verb tokens, their total count, frequencies, and the most frequent verb.

    All results are also written to CSV files via the create_csv* helpers.

    Args:
        text_description: raw text; coerced with str() and lower-cased.

    Returns:
        dict with keys 'verbs', 'verb_count', 'verb_frequency' (a Counter),
        and 'favorite_verb' (None when the text contains no verb).
    """
    # Bug fix: the original called nlp.pipe(...) and discarded the lazy
    # generator it returns -- dead code, removed.
    doc = nlp(str(text_description).lower())
    verbs = [token.text for token in doc if token.pos_ == 'VERB']
    create_csv("total_no_of_verbs.csv", "Total_no_of_verbs", len(verbs))
    create_csv_list("verb_list.csv", "Verb_list", verbs)
    verb_frequency = Counter(verbs)
    create_csv_dictionary("verb_frequency.csv", "Verbs", "Frequency", verb_frequency)
    # Bug fix: max() on an empty Counter raises ValueError; guard it.
    favorite_verb = max(verb_frequency, key=verb_frequency.get) if verb_frequency else None
    create_csv("favorite_verb.csv", "Favorite_verbs", favorite_verb)
    return {'verbs': verbs, 'verb_count': len(verbs),
            'verb_frequency': verb_frequency, 'favorite_verb': favorite_verb}
def noun_noun_phrase(text_description):
    """Noun phrases (spaCy noun chunks) from the text and their frequencies.

    All results are also written to CSV files via the create_csv* helpers.

    NOTE(review): despite the name, doc.noun_chunks yields general base noun
    phrases (determiners and modifiers included), not strictly noun-noun
    pairs -- confirm this matches the intended analysis.

    Args:
        text_description: raw text; coerced with str() and lower-cased.

    Returns:
        dict with keys 'noun_noun_phrases', 'noun_noun_phrase_count',
        'noun_noun_phrase_frequency' (a Counter), and
        'favorite_noun_noun_phrase' (None when no chunk was found).
    """
    # Bug fix: the original called nlp.pipe(...) and discarded the lazy
    # generator it returns -- dead code, removed.
    doc = nlp(str(text_description).lower())
    noun_noun_phrases = [chunk.text for chunk in doc.noun_chunks]
    create_csv("total_noun_noun_phrase.csv", "Noun_noun_phrases", len(noun_noun_phrases))
    create_csv_list("noun_noun_phrase_list.csv", "Noun_noun_phrase_list", noun_noun_phrases)
    noun_noun_phrase_frequency = Counter(noun_noun_phrases)
    create_csv_dictionary("noun_noun_phrase_frequency.csv", "Noun_Noun_phrase", "Frequency",
                          noun_noun_phrase_frequency)
    # Bug fix: max() on an empty Counter raises ValueError; guard it.
    favorite_noun_noun = (
        max(noun_noun_phrase_frequency, key=noun_noun_phrase_frequency.get)
        if noun_noun_phrase_frequency else None
    )
    create_csv("favorite_noun_noun.csv", "Favorite_noun_noun_phrase", favorite_noun_noun)
    return {'noun_noun_phrases': noun_noun_phrases,
            'noun_noun_phrase_count': len(noun_noun_phrases),
            'noun_noun_phrase_frequency': noun_noun_phrase_frequency,
            'favorite_noun_noun_phrase': favorite_noun_noun}
def person_names(text_description):
    """PERSON entities found in the text and their frequencies.

    Totals, the name list, and frequencies are also written to CSV files.

    Args:
        text_description: raw text; coerced with str().

    Returns:
        dict with keys 'person_names', 'person_name_frequency' (a Counter),
        and 'favorite_person_name'; or {'message': ...} when no PERSON
        entity was found.
    """
    # Bug fixes: the discarded nlp.pipe(...) call (a never-consumed lazy
    # generator) was removed, and the text is no longer lower-cased here --
    # the statistical NER model relies on capitalization cues, and
    # lower-casing made it miss most PERSON entities.
    doc = nlp(str(text_description))
    names = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
    create_csv("total_no_of_names.csv", "Total_no_of_person_names", len(names))
    create_csv_list("name_list.csv", "Person_name_list", names)
    name_frequency = Counter(names)
    create_csv_dictionary("name_frequency.csv", "Person_name", "Frequency", name_frequency)
    if not name_frequency:
        # max() on an empty Counter raises ValueError; the original caught
        # Exception broadly -- test the condition directly instead.
        return {'message': 'No favorite person name'}
    favorite_name = max(name_frequency, key=name_frequency.get)
    return {'person_names': names,
            'person_name_frequency': name_frequency,
            'favorite_person_name': favorite_name}
def _adjective_count(span):
    """Number of ADJ-tagged tokens in a spaCy Span (sentence or paragraph)."""
    return sum(1 for token in span if token.pos_ == "ADJ")


def _split_paragraphs(doc):
    """Split *doc* into paragraph Spans.

    A paragraph boundary is a whitespace token containing more than one
    newline (i.e. a blank line).
    """
    paragraphs = []
    start = 0
    for token in doc:
        if token.is_space and token.text.count("\n") > 1:
            paragraphs.append(doc[start:token.i])
            start = token.i
    # Bug fix: the original dropped everything after the last separator;
    # append the trailing paragraph as well.
    if start < len(doc):
        paragraphs.append(doc[start:])
    return paragraphs


def total_adjectives(text_description):
    """Adjectives in the text: list, count, frequencies, favorite, top ten,
    and the average number of adjectives per sentence and per paragraph.

    Intermediate results are also written to CSV files via the create_csv*
    helpers.

    Args:
        text_description: raw text; coerced with str() and lower-cased.

    Returns:
        dict with keys 'adjectives', 'adj_count', 'adj_frequency' (a Counter),
        'favorite_adjective' (None when no adjective was found),
        'top_ten_adjectives' (list of (word, count) pairs),
        'average_adj_in_sentences', and 'average_adj_in_paragraphs'.
    """
    # Bug fix: the original's discarded nlp.pipe(...) call (a never-consumed
    # lazy generator) was dead code and has been removed.
    print(nlp.pipe_names)
    doc = nlp(str(text_description).lower())

    adjectives = [token.text for token in doc if token.pos_ == 'ADJ']
    create_csv("total_no_of_adj.csv", "Total_no_of_adjectives", len(adjectives))
    create_csv_list("adjective_lists.csv", "Adjective_list", adjectives)

    adj_frequency = Counter(adjectives)
    create_csv_dictionary("adj_frequency.csv", "Adjectives", "Frequency", adj_frequency)
    # Bug fix: max() on an empty Counter raises ValueError; guard it.
    favorite_adjective = max(adj_frequency, key=adj_frequency.get) if adj_frequency else None
    create_csv("favorite_adjective.csv", "Favorite_adjective", favorite_adjective)

    # Counter.most_common replaces the hand-rolled sorted(...)[:10].
    top_ten_adjectives = adj_frequency.most_common(10)
    create_csv_list("top_10_adj.csv", "Top_ten_adjective_list", top_ten_adjectives)

    # Average number of adjectives per sentence, over sentences that contain
    # at least one.  (Bug fix: the original built a Counter keyed by the
    # sentence Spans themselves -- each appeared once, so every count was 1
    # and the "average" was always 1.0; it also divided by zero on
    # adjective-free text.)
    sentence_counts = [c for c in (_adjective_count(s) for s in doc.sents) if c > 0]
    avg_in_sentences = sum(sentence_counts) / len(sentence_counts) if sentence_counts else 0.0

    # Average number of adjectives per paragraph, over paragraphs that
    # contain at least one.  (Bug fixes: the original reused the sentence
    # accumulators without resetting them, overwrote the running sum instead
    # of adding to it, appended a paragraph once per adjective for the
    # counting step, and shadowed the builtins `sum` and `count`.)
    paragraph_counts = [c for c in (_adjective_count(p) for p in _split_paragraphs(doc))
                        if c > 0]
    avg_in_paragraphs = (sum(paragraph_counts) / len(paragraph_counts)
                         if paragraph_counts else 0.0)

    return {'adjectives': adjectives, 'adj_count': len(adjectives),
            'adj_frequency': adj_frequency, 'favorite_adjective': favorite_adjective,
            'top_ten_adjectives': top_ten_adjectives,
            'average_adj_in_sentences': avg_in_sentences,
            'average_adj_in_paragraphs': avg_in_paragraphs}
future_tense_sentences.append(sent) break return present_tense_sentences, past_tense_sentences, future_tense_sentences if __name__ == '__main__': stop_words = list(STOP_WORDS) text_data = open("Data/2600-0.txt", "r", encoding="utf-8").read().lower()[:90000] nlp = spacy.load("en_core_web_sm") # loading the model with all pipelines # text_doc = nlp(text_data) create_csv_list("clean_text.csv", "Words without stop_words, spaces, punctuations", clean_text(text_data)) noun_count, noun_list, noun_frequency, ten_noun_words = total_nouns( text_data) create_csv("noun_count.csv", "Total number of nouns", noun_count) create_csv_list("noun_list.csv", "list of nouns", noun_list) create_csv_list("top_ten_noun_frequency.csv", "Favorite Ten Nouns", ten_noun_words) create_csv_dictionary("noun_frequency.csv", "Noun Frequencies", "Nouns", "Frequencies", noun_frequency) adj_count, adj_list, adj_frequency, ten_adj_words, average_per_sentences, average_per_paragraphs, favorite_adjective = total_adjectives( text_data) create_csv("adj_count.csv", "Total number of adjectives", adj_count) create_csv("favorite_adj.csv", "Tolstoy's favorite adjective",