def get_transcription_data():
    """
    Return medical transcription data scraped from mtsamples.com.

    Data schema:
    - description: Short description of the transcription
    - medical_specialty: Medical specialty classification of the transcription
    - sample_name: Transcription title
    - transcription: Sample medical transcription text
    - keywords: Relevant keywords from the transcription

    Returns
    -------
    medical_df : pandas DataFrame with transcription data.
    """
    try:
        log.info("preparing transcription data")
        data_path = os.path.join(os.path.dirname(__file__), 'data')
        file_name = os.path.join(data_path, "mtsamples.csv")
        medical_df = pd.read_csv(file_name)
        # Drop rows with any missing fields.
        medical_df = medical_df.dropna(axis=0, how='any')
        return medical_df
    except Exception as e:
        # Exception objects have no .message attribute in Python 3; log the args.
        log.error(e.args)
        raise DataSourceError
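# Hedged usage sketch (hypothetical helper, not part of the module): shows how
# the loader above is meant to be called; assumes pandas is imported as pd and
# that data/mtsamples.csv is present.
def _demo_transcription_data():
    medical_df = get_transcription_data()
    # Columns follow the schema documented in the docstring above.
    log.info(medical_df[['medical_specialty', 'sample_name']].head())
    log.info('Rows after dropna: %d' % len(medical_df))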
def compute_coherence_values(self, limit=40, start=2, step=3):
    """
    Compute u_mass coherence for a range of topic counts, using the
    instance's Gensim dictionary, corpus and texts.

    Parameters
    ----------
    limit : Max number of topics
    start : Smallest number of topics to try
    step : Increment between topic counts

    Returns
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence value for each model, in the same order
    """
    coherence_values = []
    model_list = []
    # Accessing an item first forces Gensim to populate id2token.
    _ = self.dictionary[0]
    id2word = self.dictionary.id2token
    for num_topics in range(start, limit, step):
        model = LdaModel(corpus=self.corpus, id2word=id2word, num_topics=num_topics)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=self.docs,
                                        dictionary=self.dictionary, coherence='u_mass')
        coherence = coherencemodel.get_coherence()
        log.info(coherence)
        coherence_values.append(coherence)
    return model_list, coherence_values
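# Hedged usage sketch (hypothetical helper): one way to use
# compute_coherence_values to pick a topic count. Assumes `lda` is an instance
# of this class with corpus, dictionary and docs already populated; u_mass
# scores are negative, so max() selects the value closest to zero.
def _demo_pick_num_topics(lda, start=2, limit=40, step=3):
    model_list, coherence_values = lda.compute_coherence_values(limit=limit, start=start, step=step)
    best_idx = coherence_values.index(max(coherence_values))
    log.info('Best number of topics by u_mass: %d' % (start + best_idx * step))
    return model_list[best_idx]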
def download_tolstoy_novels(location='data/cache/'):
    """
    Download Tolstoy novels from Project Gutenberg into the cache folder:
    - Anna Karenina
    - Boyhood
    - Childhood
    - The Cossacks
    - The Kreutzer Sonata
    - War and Peace
    - Youth
    ==================
    Works we'd like to add to the collection:
    - Resurrection
    - The Death of Ivan Ilyich
    - Family Happiness
    - Hadji Murat
    :return:
    """
    if not os.path.exists(location):
        try:
            # makedirs creates intermediate directories (e.g. data/) as needed.
            os.makedirs(location)
        except OSError:
            log.info("Creation of the cache directory failed")
        else:
            log.info("Successfully created the cache directory")

    items = dict()
    items["AnnaKarenina"] = "https://www.gutenberg.org/files/1399/1399-0.txt"
    items["Boyhood"] = "https://www.gutenberg.org/files/2450/2450-0.txt"
    items["Childhood"] = "https://www.gutenberg.org/files/2142/2142-0.txt"
    items["TheCossacks"] = "https://www.gutenberg.org/ebooks/4761.txt.utf-8"
    items["TheKreutzerSonata"] = "https://www.gutenberg.org/files/689/689-0.txt"
    items["Youth"] = "https://www.gutenberg.org/files/2637/2637-0.txt"
    items["WarAndPeace"] = "https://www.gutenberg.org/files/2600/2600-0.txt"

    for item in items:
        out_name = os.path.join(location, item + '.txt')
        try:
            log.info(item)
            if not os.path.exists(out_name):
                filename = wget.download(items[item], out=out_name)
                log.info("Downloaded : " + filename)
            else:
                log.info("Found in cache : " + out_name)
        except Exception:
            log.error("problem with getting novel " + out_name)
            raise DataSourceError
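# Hedged usage sketch (hypothetical helper): download once, then read a novel
# back from the cache. Assumes the AnnaKarenina download above succeeded;
# errors='ignore' guards against stray non-UTF-8 bytes.
def _demo_read_novel(location='data/cache/'):
    download_tolstoy_novels(location)
    with open(os.path.join(location, 'AnnaKarenina.txt'), encoding='utf-8', errors='ignore') as f:
        text = f.read()
    log.info('Anna Karenina: %d characters' % len(text))
    return text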
def dependency_parse(self):
    """
    Dependency parser: analyzes the grammatical structure of a sentence,
    establishing relationships between "head" words and the words which
    modify those heads.
    :return: parsed spaCy Doc
    """
    # The loaded pipeline already includes a trained dependency parser;
    # constructing a bare DependencyParser(vocab) would give an untrained one,
    # and a Doc cannot be concatenated onto a log string directly.
    doc = self.nlp(self.text)
    for token in doc:
        log.info("Dependency Parsing : %s -%s-> %s" % (token.text, token.dep_, token.head.text))
    return doc
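# Hedged sketch (hypothetical helper): what the dependency arcs look like on a
# toy sentence, e.g. fox -nsubj-> jumps. Assumes the en_core_web_sm model is
# installed alongside spaCy.
def _demo_dependency_arcs():
    nlp = spacy.load('en_core_web_sm')
    doc = nlp('The quick brown fox jumps over the lazy dog.')
    for token in doc:
        log.info('%s -%s-> %s' % (token.text, token.dep_, token.head.text))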
def noun_chunks(self):
    """
    Looks for n-gram noun phrases.
    Think of noun chunks as a noun plus the words describing the noun,
    for example "the lavish green grass" or "the world's largest tech fund".
    :return: list of noun chunk strings
    """
    doc = self.nlp(self.text)
    result = list()
    for chunk in doc.noun_chunks:
        result.append(chunk.text)
        log.info(chunk.text + " " + chunk.root.text + " " + chunk.root.dep_ + " " + chunk.root.head.text)
    return result
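# Hedged sketch (hypothetical helper): noun chunks on the standard spaCy
# example sentence; each chunk is a noun plus its modifiers. Assumes the
# en_core_web_sm model is installed.
def _demo_noun_chunks():
    nlp = spacy.load('en_core_web_sm')
    doc = nlp('Autonomous cars shift insurance liability toward manufacturers.')
    return [chunk.text for chunk in doc.noun_chunks]
    # -> ['Autonomous cars', 'insurance liability', 'manufacturers']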
def naiveBayesSentimentPredict(self, text):
    """
    Predicts sentiment. Call naiveBayesSentimentFit on a corpus first,
    otherwise an AttributeError is raised.
    :param text: text to classify
    :return: predicted sentiment label
    """
    if not hasattr(self, "nb_classifier"):
        raise AttributeError("naiveBayesSentimentFit must be called before predict")
    # One boolean feature per vocabulary word: is it present in the input?
    test_data_features = {word.lower(): (word in word_tokenize(text.lower()))
                          for word in self.nb_dict}
    result = self.nb_classifier.classify(test_data_features)
    log.info("Sentiment NB predict : " + result)
    return result
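# Hedged sketch (hypothetical helper) of the feature encoding used above:
# every vocabulary word maps to a boolean "does it occur in the input". The
# vocabulary here is a stand-in for self.nb_dict.
def _demo_nb_features():
    vocabulary = ['great', 'slow', 'delicious']
    tokens = word_tokenize('The service was slow'.lower())
    features = {w: (w in tokens) for w in vocabulary}
    # -> {'great': False, 'slow': True, 'delicious': False}
    return features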
def fit(self):
    """Fit an NMF topic model on a TF-IDF matrix and log the top words per topic."""
    vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                 max_features=self.num_features,
                                 stop_words='english')
    tfidf = vectorizer.fit_transform(self.docs[:self.samples])
    nmf = NMF(n_components=self.num_topics, random_state=1).fit(tfidf)
    # get_feature_names() was removed in scikit-learn 1.2; use the _out variant.
    feature_names = vectorizer.get_feature_names_out()
    for topic_idx, topic in enumerate(nmf.components_):
        log.info("Topic #%d:" % topic_idx)
        log.info(" ".join([feature_names[i]
                           for i in topic.argsort()[:-self.top_words - 1:-1]]))
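# Hedged sketch (hypothetical helper) of the argsort slice above:
# [:-top_words - 1:-1] walks the indices of the largest weights in
# descending order.
def _demo_top_word_indices():
    import numpy as np
    topic = np.array([0.1, 0.7, 0.05, 0.4])
    top_words = 2
    return topic.argsort()[:-top_words - 1:-1]  # -> array([1, 3])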
def valenceSentiment(self, text):
    """
    Unsupervised sentiment analysis with VADER.
    :param text: iterable of sentences
    :return: list of polarity-score dicts, one per sentence
    """
    sid = SentimentIntensityAnalyzer()
    for sentence in text:
        log.info(sentence)
        ss = sid.polarity_scores(sentence)
        for k in ss:
            log.info('Logging Sentiment : {0}: {1}'.format(k, ss[k]))
        # Append the full score dict (neg/neu/pos/compound), not just the key name.
        self.vader_polarity_scores.append(ss)
    return self.vader_polarity_scores
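# Hedged sketch (hypothetical helper): a common convention from the VADER
# documentation labels a sentence by its compound score: >= 0.05 pos,
# <= -0.05 neg, otherwise neu.
def _demo_vader_label(sentence):
    sid = SentimentIntensityAnalyzer()
    compound = sid.polarity_scores(sentence)['compound']
    if compound >= 0.05:
        return 'pos'
    if compound <= -0.05:
        return 'neg'
    return 'neu'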
def topic_coherence(self):
    """Compute c_v and u_mass coherence scores for the fitted LDA model."""
    if self.lda_model is None:
        self.fit()

    # Compute coherence score using the c_v measure.
    coherence_model_lda = CoherenceModel(model=self.lda_model, texts=self.docs,
                                         dictionary=self.dictionary, coherence='c_v')
    coherence_lda_CV = coherence_model_lda.get_coherence()
    log.info('Coherence Score (c_v): %s', coherence_lda_CV)

    # Compute coherence score using the u_mass measure.
    coherence_model_lda = CoherenceModel(model=self.lda_model, texts=self.docs,
                                         dictionary=self.dictionary, coherence="u_mass")
    coherence_lda_umass = coherence_model_lda.get_coherence()
    log.info('Coherence Score (u_mass): %s', coherence_lda_umass)

    return coherence_lda_CV, coherence_lda_umass
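# Hedged note (hypothetical helper) on reading the two scores: c_v lies roughly
# in [0, 1] (higher is better) while u_mass is negative (closer to 0 is
# better), so the two numbers are not directly comparable. Assumes
# `lda_analysis` is a fitted instance of this class.
def _demo_report_coherence(lda_analysis):
    cv, umass = lda_analysis.topic_coherence()
    log.info('c_v (higher is better): %.3f' % cv)
    log.info('u_mass (closer to 0 is better): %.3f' % umass)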
def fit(self, dictionary_filter_extremes_no_below=10, dictionary_filter_extremes_no_above=0.2):
    # Preprocess the documents.
    self.docs_preprocessor()

    # Create bigram and trigram models and add the phrases to the docs.
    # min_count=10 keeps only phrases that appear 10 times or more.
    bigram = Phrases(self.docs, min_count=10)
    trigram = Phrases(bigram[self.docs])

    for idx in range(len(self.docs)):
        for token in bigram[self.docs[idx]]:
            if '_' in token:
                # Token is a bigram, add to document.
                self.docs[idx].append(token)
        for token in trigram[self.docs[idx]]:
            if '_' in token:
                # Token is a trigram, add to document.
                self.docs[idx].append(token)

    # Remove rare and common tokens.
    # Create a dictionary representation of the documents.
    self.dictionary = Dictionary(self.docs)
    self.dictionary.filter_extremes(
        no_below=dictionary_filter_extremes_no_below,
        no_above=dictionary_filter_extremes_no_above)

    # Create the bag-of-words corpus required for topic modeling.
    self.corpus = [self.dictionary.doc2bow(doc) for doc in self.docs]
    log.info('Number of unique tokens: %d' % len(self.dictionary))
    log.info('Number of documents: %d' % len(self.corpus))
    log.info(self.corpus[:1])

    # Make an index-to-word dictionary.
    temp = self.dictionary[0]  # only to "load" the dictionary.
    id2word = self.dictionary.id2token

    lda_model = LdaModel(corpus=self.corpus, id2word=id2word,
                         chunksize=self.chunksize,
                         alpha='auto', eta='auto',
                         iterations=self.iterations, num_topics=self.num_topics,
                         passes=self.passes, eval_every=self.eval_every)

    # Log the keywords for each topic.
    log.info(lda_model.print_topics())
    self.lda_model = lda_model
    return self.lda_model
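# Hedged sketch (hypothetical helper) of the Phrases step above: frequent word
# pairs are joined with an underscore. min_count and threshold are lowered here
# only because the toy corpus is tiny.
def _demo_bigrams():
    sentences = [['new', 'york', 'city'], ['new', 'york', 'times'],
                 ['new', 'york', 'mets'], ['new', 'york', 'knicks'],
                 ['new', 'york', 'life'], ['boston', 'globe']]
    bigram = Phrases(sentences, min_count=2, threshold=1)
    return bigram[sentences[0]]  # -> ['new_york', 'city']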
def test_tokenizer(self):
    # This is the unit test.
    test_text = """Hello Mr. Smith, how are you doing today? The weather is great, and city is awesome.
    The sky is pinkish-blue. You shouldn't eat cardboard"""
    regex_tokens = Tokenizer.regexTokenize(test_text)
    log.info(regex_tokens)
    t = Tokenizer(test_text)
    log.info(t.to_words())
    log.info(t.to_sentences())
    log.info(t.freqs())
    t.plot_freqs()
    # assertEqual(True, True) tells us nothing; check real output instead.
    self.assertTrue(len(t.to_words()) > 0)
def test_Lemmatize(self):
    # Get the nltk data we need
    datasets = ['stopwords', 'punkt', 'averaged_perceptron_tagger', 'wordnet']
    Utility.nltk_init(datasets)
    test_text = "This is the test text. Documents made up of words and/or phrases. \
    The model consists of two tables; the first table is the probability of selecting \
    a particular word in the corpus when sampling from a particular topic, and the second \
    table is the probability of selecting a particular topic when sampling from a particular document."
    lem = Lemmatize()
    result = lem.lemmatize_nltk_with_POS(test_text)
    log.info(result)
    result = lem.lemmatize_spacy(test_text)
    log.info(result)
    result = lem.porter_stemmer(test_text)
    log.info(result)
    # Porter stems are truncations, so "This is the" stems to "thi is the".
    self.assertEqual(result[0:10], 'thi is the')
def test_sentiment(self):
    test_text_supervised = [
        ("Great place to be when you are in Bangalore.", "pos"),
        ("The place was being renovated when I visited so the seating was limited.", "neg"),
        ("Loved the ambience, loved the food", "pos"),
        ("The food is delicious but not over the top.", "neg"),
        ("Service - Little slow, probably because too many people.", "neg"),
        ("The place is not easy to locate", "neg"),
        ("Mushroom fried rice was spicy", "pos"),
    ]
    test_text_unsupervised = [
        "Great place to be when you are in Bangalore.",
        "The place was being renovated when I visited so the seating was limited.",
        "Loved the ambience, loved the food",
        "The food is delicious but not over the top.",
        "Service - Little slow, probably because too many people.",
        "The place is not easy to locate",
        "Mushroom fried rice was tasty"
    ]
    sent = Sentiment()
    sent.naiveBayesSentimentFit(test_text_supervised)
    test_data = "Manchurian was hot and spicy"
    nb_sentiment = sent.naiveBayesSentimentPredict(test_data)
    self.assertEqual(nb_sentiment, 'pos')
    log.info("Logging NB sentiment " + nb_sentiment)
    polarity_scores = sent.valenceSentiment(test_text_unsupervised)
    # One score dict per input sentence; the first sentence is clearly positive.
    self.assertEqual(len(polarity_scores), len(test_text_unsupervised))
    self.assertGreater(polarity_scores[0]['compound'], 0)
    log.info("Logging polarity scores " + str(polarity_scores))
    log.info("done")
items["Childhood"] = "https://www.gutenberg.org/files/2142/2142-0.txt" items["TheCossacks"] = "https://www.gutenberg.org/ebooks/4761.txt.utf-8" items["TheKreutzerSonata"] = "https://www.gutenberg.org/files/689/689-0.txt" items["Youth"] = "https://www.gutenberg.org/files/2637/2637-0.txt" items["WarAndPeace"] = "https://www.gutenberg.org/files/2600/2600-0.txt" for item in items: try: log.info(item) out_name = location + item + '.txt' if not os.path.exists(out_name): filename = wget.download(items[item], out =out_name) log.info("Dowloaded : " + filename) else: log.info("Found in cache : " + out_name) except Exception as e: log.error("problem with getting novel " + out_name) raise DataSourceError if __name__ == '__main__': from pytma.tests.test_DataSources import test_novel_data, test_medical_data test_novel_data() test_medical_data() log.info("done")
    Chart parse
    :return:
    """
    nltk.parse.chart.demo(2, print_times=False, trace=1, sent=self.text, numparses=1)


if __name__ == '__main__':
    # This will be the unit test
    test_text = u"It was now reading the sign that said Privet Drive — no, looking at the sign; " \
                "cats couldn't read maps or signs. He didn't see the owls swooping past in broad daylight, " \
                "though people down in the street did; they pointed and gazed open-mouthed as owl after " \
                "owl sped overhead"
    pt = ParseText(test_text)
    noun_chunks = pt.noun_chunks()
    parsed = pt.dependency_parse()
    # parsed is a spaCy Doc, so convert it before concatenating.
    log.info("parse " + str(parsed))
    # TODO Broken
    # pt.chart_parse()
    # Can't be run in unit test
    pt.display_parse()
if __name__ == '__main__':
    # This will be the unit test
    test_text_supervised = [
        ("Great place to be when you are in Bangalore.", "pos"),
        ("The place was being renovated when I visited so the seating was limited.", "neg"),
        ("Loved the ambience, loved the food", "pos"),
        ("The food is delicious but not over the top.", "neg"),
        ("Service - Little slow, probably because too many people.", "neg"),
        ("The place is not easy to locate", "neg"),
        ("Mushroom fried rice was spicy", "pos"),
    ]
    test_text_unsupervised = [
        "Great place to be when you are in Bangalore.",
        "The place was being renovated when I visited so the seating was limited.",
        "Loved the ambience, loved the food",
        "The food is delicious but not over the top.",
        "Service - Little slow, probably because too many people.",
        "The place is not easy to locate",
        "Mushroom fried rice was tasty"
    ]
    sent = Sentiment()
    sent.naiveBayesSentimentFit(test_text_supervised)
    test_data = "Manchurian was hot and spicy"
    nb_sentiment = sent.naiveBayesSentimentPredict(test_data)
    log.info("Logging NB sentiment " + nb_sentiment)
    polarity_scores = sent.valenceSentiment(test_text_unsupervised)
    log.info("Logging polarity scores " + str(polarity_scores))
    log.info("done")
def test_Featureize():
    # This will be the unit test
    test_text = "He received multiple nominations for Nobel Prize in Literature every year from 1902 to 1906, \
    and nominations for Nobel Peace Prize in 1901, 1902 and 1910, and his miss of the prize is a major Nobel \
    prize controversy.[3][4][5][6] Born to an aristocratic Russian family in 1828,[2] he is best known for \
    the novels War and Peace (1869) and Anna Karenina (1877),[7] often cited as pinnacles of realist fiction. \
    [2] He first achieved literary acclaim in his twenties with his semi-autobiographical trilogy, \
    Childhood, Boyhood, and Youth (1852–1856), and Sevastopol Sketches (1855), based upon his experiences \
    in the Crimean War."

    sw = StopWord(test_text.split())
    swList = StopWord.SwLibEnum.spacy_sw
    text = sw.remove(swList)
    log.info(text)

    text = " ".join(text)
    feat = Featurize(text)

    text_tf = feat.tf()
    log.info(text_tf)
    log.info(len(test_text.split()))
    log.info(len(set(test_text.split())))

    text_tf_idf = feat.tf_idf()
    log.info(text_tf_idf)

    text_vectors = feat.wtv_spacy()
    # feat.pca_wv(text_vectors)
    log.info("done")
    return stopword_processed


if __name__ == '__main__':
    # This will be the unit test
    test_text = "He received multiple nominations for Nobel Prize in Literature every year from 1902 to 1906, \
    and nominations for Nobel Peace Prize in 1901, 1902 and 1910, and his miss of the prize is a major Nobel \
    prize controversy.[3][4][5][6] Born to an aristocratic Russian family in 1828,[2] he is best known for \
    the novels War and Peace (1869) and Anna Karenina (1877),[7] often cited as pinnacles of realist fiction. \
    [2] He first achieved literary acclaim in his twenties with his semi-autobiographical trilogy, \
    Childhood, Boyhood, and Youth (1852–1856), and Sevastopol Sketches (1855), based upon his experiences \
    in the Crimean War."

    token = nltk.RegexpTokenizer(r'[a-zA-Z]+')
    word_tokens = token.tokenize(test_text)

    sw = StopWord(word_tokens)

    swList = StopWord.SwLibEnum.spacy_sw
    test_text_sw_removed_spacy = sw.remove(swList)
    log.info(test_text_sw_removed_spacy)

    swList = StopWord.SwLibEnum.scikit_sw
    test_text_sw_removed_scikit = sw.remove(swList)
    log.info(test_text_sw_removed_scikit)

    swList = StopWord.SwLibEnum.nltk_sw
    test_text_sw_removed_nltk = sw.remove(swList)
    log.info(test_text_sw_removed_nltk)

    log.info("done")
    ignore_index=True)
print(results)

rfc = RandomForestClassifier()

p, r, f = run_classifier(pd.DataFrame(unigram_corpus_array), rfc, 5)
# DataFrame.append was removed in pandas 2.0; build a one-row frame and concat.
results = pd.concat([results, pd.DataFrame([{
    'Classifier': 'RandomForestClassifier',
    'Corpus type': 'Bag of words - Unigram',
    'mean_Precision': p,
    'mean_Recall': r,
    'mean_F1-score': f
}])], ignore_index=True)

p, r, f = run_classifier(lda_corpus, rfc, 5)
results = pd.concat([results, pd.DataFrame([{
    'Classifier': 'RandomForestClassifier',
    'Corpus type': 'LDA Term Topics',
    'mean_Precision': p,
    'mean_Recall': r,
    'mean_F1-score': f
}])], ignore_index=True)

log.info(results)
results.to_csv('doc_classification_results.txt', sep='\t')
print('done')
def porter_stemmer(self, text):
    """Stem each whitespace-separated word with the Porter algorithm."""
    stem = PorterStemmer()
    split_text = text.split()
    stemmed_words = list()
    for word in split_text:
        stemmed = stem.stem(word)
        stemmed_words.append(stemmed)
    stemmed_text = " ".join(stemmed_words)
    return stemmed_text


if __name__ == '__main__':
    # This will be the unit test
    test_text = "This is the test text. Documents made up of words and/or phrases. \
    The model consists of two tables; the first table is the probability of selecting \
    a particular word in the corpus when sampling from a particular topic, and the second \
    table is the probability of selecting a particular topic when sampling from a particular document."
    lem = Lemmatize()
    result = lem.lemmatize_nltk_with_POS(test_text)
    log.info(result)
    result = lem.lemmatize_spacy(test_text)
    log.info(result)
    result = lem.porter_stemmer(test_text)
    log.info(result)
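# Hedged sketch (hypothetical helper): Porter stemming is rule-based suffix
# stripping, so stems need not be dictionary words ("this" -> "thi"), which is
# exactly what the unit test above checks.
def _demo_porter():
    stem = PorterStemmer()
    return [stem.stem(w) for w in ['running', 'flies', 'this']]
    # -> ['run', 'fli', 'thi']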
        # Use LDA to preprocess - later make a base class and refactor.
        lda = LDAAnalysis(docs)
        lda.docs_preprocessor()
        docs = lda.docs
        # Context manager ensures the pickle file handle is closed.
        with open("data/cache/LDAAnalysisPreprocessed.pkl", "wb") as pickle_LDAAnalysis:
            pickle.dump(lda, pickle_LDAAnalysis)
    else:
        with open("data/cache/LDAAnalysisPreprocessed.pkl", 'rb') as pickle_file:
            lda = pickle.load(pickle_file)
            docs = lda.docs
        # The fully fitted analysis, if cached, supersedes the preprocessed one.
        with open("data/cache/LDAAnalysis.pkl", 'rb') as pickle_file:
            lda = pickle.load(pickle_file)
            docs = lda.docs

    dictionary = Dictionary(docs)
    dictionary.filter_extremes(no_below=10, no_above=0.2)
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    log.info('Number of unique tokens: %d' % len(dictionary))
    log.info('Number of documents: %d' % len(corpus))
    log.info(corpus[:1])

    temp = dictionary[0]  # only to "load" the dictionary.
    id2word = dictionary.id2token
    corpus = lda.corpus

    ctm = CTMModel(corpus, num_topics=25, id2word=id2word)
    print("done")