def get_transcription_data():
    """
    Return medical transcription data scraped from mtsamples.com.

    Data schema:
        - description: Short description of the transcription
        - medical_specialty: Medical specialty classification of the transcription
        - sample_name: Transcription title
        - transcription: Sample medical transcription text
        - keywords: Relevant keywords from the transcription

    Parameters
    ----------
    None

    Returns
    -------
    medical_df : pandas DataFrame with the transcription data.
    """
    try:
        log.info("preparing transcription data")
        data_path = os.path.join(os.path.dirname(__file__), 'data')
        file_name = os.path.join(data_path, "mtsamples.csv")
        medical_df = pd.read_csv(file_name)
        # Drop rows with any missing values.
        medical_df = medical_df.dropna(axis=0, how='any')
        return medical_df
    except Exception as e:
        # Exception objects have no .message attribute in Python 3; log the exception itself.
        log.error("%s %s", e, e.args)
        raise DataSourceError
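# Usage sketch (illustrative, not part of the original source): load the data and inspect
# the columns listed in the docstring above. Assumes `log` is the module logger configured
# elsewhere in this file.
sample_df = get_transcription_data()
log.info("Transcriptions loaded: %d rows, %d columns", sample_df.shape[0], sample_df.shape[1])
log.info(sample_df[['medical_specialty', 'sample_name', 'keywords']].head())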
    def compute_coherence_values(self, limit=40, start=2, step=3):
        """
        Compute UMass coherence for a range of topic counts.

        Uses self.corpus, self.dictionary and self.docs set up elsewhere on the class.

        Parameters
        ----------
        limit : Maximum number of topics (exclusive)
        start : Smallest number of topics to try
        step : Step between successive topic counts

        Returns
        -------
        model_list : List of LDA topic models, one per topic count
        coherence_values : Coherence value for each model in model_list
        """

        coherence_values = []
        model_list = []
        _ = self.dictionary[0]  # access the dictionary once so that id2token is populated
        id2word = self.dictionary.id2token
        for num_topics in range(start, limit, step):
            model = LdaModel(corpus=self.corpus,
                             id2word=id2word,
                             num_topics=num_topics)
            model_list.append(model)
            coherencemodel = CoherenceModel(model=model,
                                            texts=self.docs,
                                            dictionary=self.dictionary,
                                            coherence='u_mass')
            log.info(coherencemodel.get_coherence())
            coherence_values.append(coherencemodel.get_coherence())

        return model_list, coherence_values
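# Illustrative follow-up (not from the original source): pick the topic count whose model
# scored best. `analyzer` is a hypothetical instance of the class above; `start`/`step`
# mirror the method defaults. With u_mass coherence, higher (less negative) values are
# usually read as better.
start, step = 2, 3
model_list, coherence_values = analyzer.compute_coherence_values(limit=40, start=start, step=step)
best_index = max(range(len(coherence_values)), key=lambda i: coherence_values[i])
log.info("Best coherence %.4f at %d topics", coherence_values[best_index], start + best_index * step)
best_model = model_list[best_index]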
def download_tolstoy_novels(location='data/cache/'):
    """
    Download Tolstoy novels from Project Gutenberg and place them in the cache folder.

        - Anna Karenina
        - Boyhood
        - Childhood
        - The Cossacks
        - The Kreutzer Sonata
        - War and Peace
        - Youth

    ==================

    Works we'd like to add to the collection:

        - Resurrection
        - The Death of Ivan Ilyich
        - Family Happiness
        - Hadji Murat

    :return: None
    """
    if not os.path.exists(location):
        try:
            # makedirs creates intermediate directories (e.g. 'data/') as well.
            os.makedirs(location)
        except OSError:
            log.info("Creation of the cache directory failed")
        else:
            log.info("Successfully created the cache directory")

    items = {
        "AnnaKarenina": "https://www.gutenberg.org/files/1399/1399-0.txt",
        "Boyhood": "https://www.gutenberg.org/files/2450/2450-0.txt",
        "Childhood": "https://www.gutenberg.org/files/2142/2142-0.txt",
        "TheCossacks": "https://www.gutenberg.org/ebooks/4761.txt.utf-8",
        "TheKreutzerSonata": "https://www.gutenberg.org/files/689/689-0.txt",
        "Youth": "https://www.gutenberg.org/files/2637/2637-0.txt",
        "WarAndPeace": "https://www.gutenberg.org/files/2600/2600-0.txt",
    }

    for item in items:
        try:
            log.info(item)
            out_name = location + item + '.txt'
            if not os.path.exists(out_name):
                filename = wget.download(items[item], out=out_name)
                log.info("Downloaded : " + filename)
            else:
                log.info("Found in cache : " + out_name)
        except Exception as e:
            log.error("Problem downloading novel %s: %s", out_name, e)
            raise DataSourceError
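# Usage sketch (illustrative, not part of the original source): fill the cache and read
# one novel back; file names follow the dictionary keys above.
download_tolstoy_novels()
with open('data/cache/AnnaKarenina.txt', encoding='utf-8') as fh:
    anna_karenina = fh.read()
log.info("Anna Karenina: %d characters", len(anna_karenina))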
Example #4
    def dependency_parse(self):
        """
        Dependency parser: analyzes the grammatical structure of a sentence,
        establishing relationships between "head" words and the words which modify those heads.
        :return: the parsed doc
        """
        spacy.load("en_core_web_sm")  # ensures the model is installed; self.nlp holds the pipeline used below
        parser = DependencyParser(self.nlp.vocab)
        doc = self.nlp(self.text)
        processed = parser(doc)
        log.info("Dependency Parsing : " + str(processed))
        return processed
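# Standalone sketch (not part of the original class) of reading dependency relations
# straight off a stock spaCy pipeline; requires the "en_core_web_sm" model to be installed.
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("The quick brown fox jumps over the lazy dog.")
for token in doc:
    # token.dep_ is the relation label; token.head is the word this token attaches to.
    print(token.text, token.dep_, token.head.text)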
Example #5
    def noun_chunks(self):
        """
        Looks for n-gram noun phrases.
        Think of noun chunks as a noun plus the words describing the noun –
        for example, “the lavish green grass” or “the world’s largest tech fund”
        :return: list of noun chunk strings
        """
        doc = self.nlp(self.text)
        result = list()
        for chunk in doc.noun_chunks:
            result.append(chunk.text)
            log.info(chunk.text + " " + chunk.root.text + " " +
                     chunk.root.dep_ + " " + chunk.root.head.text)
        return result
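# Standalone sketch (not part of the original class) showing what doc.noun_chunks yields
# for a short sentence, again with a stock spaCy pipeline.
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("The world's largest tech fund backed a small startup in Berlin.")
for chunk in doc.noun_chunks:
    # chunk.root is the head noun of the phrase; root.head is what the phrase attaches to.
    print(chunk.text, "|", chunk.root.text, chunk.root.dep_, chunk.root.head.text)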
Example #6
    def naiveBayesSentimentPredict(self, text):
        """
        Predict sentiment. Call naiveBayesSentimentFit on a corpus first,
        otherwise an AttributeError is raised.

        :param text: text to classify
        :return: predicted sentiment label
        """
        if not hasattr(self, "nb_classifier"):
            raise AttributeError("naiveBayesSentimentFit must be called before predicting")

        test_data_features = {word.lower(): (word in word_tokenize(text.lower())) for word in self.nb_dict}
        result = self.nb_classifier.classify(test_data_features)
        log.info("Sentiment NB predict : " + result)

        return result
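# Standalone sketch (not from the original source) of the feature format used above:
# nltk's NaiveBayesClassifier takes one {feature_name: value} dict per example, so a fit
# method can prepare training data the same way before calling train(). Requires the
# 'punkt' nltk data for word_tokenize. The toy vocabulary stands in for self.nb_dict.
from nltk import NaiveBayesClassifier
from nltk.tokenize import word_tokenize

vocabulary = {"great", "loved", "slow", "limited"}
labeled = [("Great place, loved the food", "pos"),
           ("Service was slow and seating was limited", "neg")]
train_set = [({w: (w in word_tokenize(text.lower())) for w in vocabulary}, label)
             for text, label in labeled]
classifier = NaiveBayesClassifier.train(train_set)
print(classifier.classify({w: (w in word_tokenize("loved it, great place")) for w in vocabulary}))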
    def fit(self):
        # Build a TF-IDF representation of the documents, fit an NMF topic model,
        # then log the top words per topic.
        vectorizer = TfidfVectorizer(max_df=0.95,
                                     min_df=2,
                                     max_features=self.num_features,
                                     stop_words='english')
        tfidf = vectorizer.fit_transform(self.docs[:self.samples])

        nmf = NMF(n_components=self.num_topics, random_state=1).fit(tfidf)

        # get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
        feature_names = vectorizer.get_feature_names_out()

        for topic_idx, topic in enumerate(nmf.components_):
            log.info("Topic #%d:" % topic_idx)
            log.info(" ".join([
                feature_names[i]
                for i in topic.argsort()[:-self.top_words - 1:-1]
            ]))
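# Small numeric illustration (not from the original source) of the slice used above:
# argsort()[:-top_words - 1:-1] yields the indices of the largest weights, in descending order.
import numpy as np

topic_weights = np.array([0.1, 0.7, 0.05, 0.4, 0.2])
top_words = 3
print(topic_weights.argsort()[:-top_words - 1:-1])   # [1 3 4] -> the three largest weights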
Example #8
    def valenceSentiment(self, text):
        """
        Unsupervised (lexicon-based) sentiment analysis using VADER.

        :param text: iterable of sentences
        :return: the accumulated list of polarity score labels (self.vader_polarity_scores)
        """
        sid = SentimentIntensityAnalyzer()
        for sentence in text:
            log.info(sentence)
            ss = sid.polarity_scores(sentence)
            for k in ss:
                log.info('Logging Sentiment : {0}: {1}'.format(k, ss[k]))
                self.vader_polarity_scores.append(k)
        return self.vader_polarity_scores
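# Standalone sketch (not from the original source) of VADER's raw output: polarity_scores
# returns a dict with 'neg', 'neu', 'pos' and 'compound' keys, where compound in [-1, 1] is
# the usual single-number summary. Requires the 'vader_lexicon' nltk data.
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()
scores = sid.polarity_scores("Loved the ambience, loved the food")
print(scores)   # e.g. {'neg': 0.0, 'neu': ..., 'pos': ..., 'compound': ...}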
    def topic_coherence(self):
        if self.lda_model is None:
            self.fit()

        # Compute coherence score using the c_v method
        coherence_model_lda = CoherenceModel(model=self.lda_model,
                                             texts=self.docs,
                                             dictionary=self.dictionary,
                                             coherence='c_v')
        coherence_lda_CV = coherence_model_lda.get_coherence()
        log.info('Coherence Score (c_v): %s', coherence_lda_CV)

        # Compute coherence score using the UMass method
        coherence_model_lda = CoherenceModel(model=self.lda_model,
                                             texts=self.docs,
                                             dictionary=self.dictionary,
                                             coherence="u_mass")
        coherence_lda_umass = coherence_model_lda.get_coherence()
        log.info('Coherence Score (u_mass): %s', coherence_lda_umass)

        return coherence_lda_CV, coherence_lda_umass
    def fit(self,
            dictionary_filter_extremes_no_below=10,
            dictionary_filter_extremes_no_above=0.2):
        # Preprocess the documents.
        self.docs_preprocessor()

        # Create bigram & trigram models and add the detected phrases to the docs;
        # min_count=10 keeps only phrases that appear 10 times or more.
        bigram = Phrases(self.docs, min_count=10)
        trigram = Phrases(bigram[self.docs])

        for idx in range(len(self.docs)):
            for token in bigram[self.docs[idx]]:
                if '_' in token:
                    # Token is a bigram, add to document.
                    self.docs[idx].append(token)
            for token in trigram[self.docs[idx]]:
                if '_' in token:
                    # Token is a trigram, add to document.
                    self.docs[idx].append(token)
        # Remove rare & common tokens
        # Create a dictionary representation of the documents.
        self.dictionary = Dictionary(self.docs)
        self.dictionary.filter_extremes(
            no_below=dictionary_filter_extremes_no_below,
            no_above=dictionary_filter_extremes_no_above)
        # Create dictionary and corpus required for Topic Modeling
        self.corpus = [self.dictionary.doc2bow(doc) for doc in self.docs]
        log.info('Number of unique tokens: %d' % len(self.dictionary))
        log.info('Number of documents: %d' % len(self.corpus))
        log.info(self.corpus[:1])

        # Make an index-to-word mapping.
        temp = self.dictionary[0]  # only to "load" the dictionary so that id2token is populated.

        id2word = self.dictionary.id2token

        lda_model = LdaModel(corpus=self.corpus, id2word=id2word, chunksize=self.chunksize, \
                             alpha='auto', eta='auto', \
                             iterations=self.iterations, num_topics=self.num_topics, \
                             passes=self.passes, eval_every=self.eval_every)
        # Log the keywords for each topic.
        log.info(lda_model.print_topics())
        self.lda_model = lda_model
        return self.lda_model
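# Toy illustration (not from the original source) of how gensim Phrases emits '_'-joined
# tokens, which the loop above appends back onto each document. min_count and threshold are
# set low here only so this tiny corpus triggers a phrase.
from gensim.models.phrases import Phrases

toy_docs = [["new", "york", "is", "big"],
            ["i", "love", "new", "york"],
            ["new", "york", "pizza"],
            ["old", "town"]] * 3
toy_bigram = Phrases(toy_docs, min_count=3, threshold=1)
print([t for t in toy_bigram[toy_docs[0]] if '_' in t])   # expected: ['new_york']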
    def test_tokenizer(self):
        # This will be the unit test

        test_text = """Hello Mr. Smith, how are you doing today? The weather is great, and city is awesome.
            The sky is pinkish-blue. You shouldn't eat cardboard"""

        text = Tokenizer.regexTokenize(test_text)
        log.info(text)

        t = Tokenizer(test_text)
        log.info(t.to_words())
        log.info(t.to_sentences())
        log.info(t.freqs())
        t.plot_freqs()

        self.assertEqual(True, True)
    def test_Lemmatize(self):
        # Get the nltk data we need
        datasets = [
            'stopwords', 'punkt', 'averaged_perceptron_tagger', 'wordnet'
        ]
        Utility.nltk_init(datasets)

        test_text = "This is the test text. Documents made up of words and/or phrases. \
            The model consists of two tables; the first table is the probability of selecting  \
            a particular word in the corpus when sampling from a particular topic, and the second \
            table is the probability of selecting a particular topic when sampling from a particular document."

        lem = Lemmatize()

        result = lem.lemmatize_nltk_with_POS(test_text)
        log.info(result)

        result = lem.lemmatize_spacy(test_text)
        log.info(result)

        result = lem.porter_stemmer(test_text)
        log.info(result)

        self.assertEqual(result[0:10], 'thi is the')
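# Side-by-side sketch (not from the original source) of why the assertion above expects
# "thi is the": the Porter stemmer clips suffixes blindly, while WordNet lemmatization maps
# words to dictionary forms. Requires the 'wordnet' nltk data. Note that lemmatize()
# defaults to the noun POS unless a POS tag is supplied.
from nltk.stem import PorterStemmer, WordNetLemmatizer

stemmer, lemmatizer = PorterStemmer(), WordNetLemmatizer()
for word in ["This", "tables", "selecting", "documents"]:
    print(word, "->", stemmer.stem(word), "/", lemmatizer.lemmatize(word.lower()))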
    def test_sentiment(self):
        test_text_supervised = [
            ("Great place to be when you are in Bangalore.", "pos"),
            ("The place was being renovated when I visited so the seating was limited.",
             "neg"),
            ("Loved the ambience, loved the food", "pos"),
            ("The food is delicious but not over the top.", "neg"),
            ("Service - Little slow, probably because too many people.",
             "neg"),
            ("The place is not easy to locate", "neg"),
            ("Mushroom fried rice was spicy", "pos"),
        ]

        test_text_unsupervised = [
            "Great place to be when you are in Bangalore.",
            "The place was being renovated when I visited so the seating was limited.",
            "Loved the ambience, loved the food",
            "The food is delicious but not over the top.",
            "Service - Little slow, probably because too many people.",
            "The place is not easy to locate", "Mushroom fried rice was tasty"
        ]

        sent = Sentiment()

        sent.naiveBayesSentimentFit(test_text_supervised)

        test_data = "Manchurian was hot and spicy"
        nb_sentiment = sent.naiveBayesSentimentPredict(test_data)
        self.assertEqual(nb_sentiment, 'pos')
        log.info("Logging NB sentiment " + nb_sentiment)

        polarity_scores = sent.valenceSentiment(test_text_unsupervised)
        self.assertEqual(polarity_scores[0], 'neg')

        log.info("Logging polarity scores " + " ".join(polarity_scores))
        log.info("done")
    items["Childhood"] = "https://www.gutenberg.org/files/2142/2142-0.txt"
    items["TheCossacks"] = "https://www.gutenberg.org/ebooks/4761.txt.utf-8"
    items["TheKreutzerSonata"] = "https://www.gutenberg.org/files/689/689-0.txt"
    items["Youth"] = "https://www.gutenberg.org/files/2637/2637-0.txt"
    items["WarAndPeace"] = "https://www.gutenberg.org/files/2600/2600-0.txt"

    for item in items:
        try:
            log.info(item)
            out_name = location + item + '.txt'
            if not os.path.exists(out_name):
                filename = wget.download(items[item], out =out_name)
                log.info("Dowloaded : " + filename)

            else:
                log.info("Found in cache : " + out_name)

        except Exception as e:
            log.error("problem with getting novel " + out_name)
            raise DataSourceError

if __name__ == '__main__':

    from pytma.tests.test_DataSources import test_novel_data, test_medical_data

    test_novel_data()

    test_medical_data()

    log.info("done")
Example #15
    def chart_parse(self):
        """
        Chart parse
        :return:
        """
        nltk.parse.chart.demo(2,
                              print_times=False,
                              trace=1,
                              sent=self.text,
                              numparses=1)


if __name__ == '__main__':
    # This will be the unit test

    test_text = u"It was now reading the sign that said Privet Drive — no, looking at the sign; " \
              "cats couldn't read maps or signs.He didn't see the owls swooping past in broad daylight, " \
              "though people down in the street did; they pointed and gazed open-mouthed as owl after " \
              "owl sped overhead"

    pt = ParseText(test_text)

    noun_chunks = pt.noun_chunks()

    parsed = pt.dependency_parse()
    log.info("parse " + parsed)

    # TODO Broken
    # pt.chart_parse()

    # Can't be run in a unit test
    pt.display_parse()
Example #16
if __name__ == '__main__':
    #This will be the unit test


    test_text_supervised = [("Great place to be when you are in Bangalore.", "pos"),
                            ("The place was being renovated when I visited so the seating was limited.", "neg"),
                            ("Loved the ambience, loved the food", "pos"),
                            ("The food is delicious but not over the top.", "neg"),
                            ("Service - Little slow, probably because too many people.", "neg"),
                            ("The place is not easy to locate", "neg"),
                            ("Mushroom fried rice was spicy", "pos"),
                            ]

    test_text_unsupervised = ["Great place to be when you are in Bangalore.",
                              "The place was being renovated when I visited so the seating was limited.",
                              "Loved the ambience, loved the food", "The food is delicious but not over the top.",
                              "Service - Little slow, probably because too many people.",
                              "The place is not easy to locate", "Mushroom fried rice was tasty"]

    sent = Sentiment()

    sent.naiveBayesSentimentFit(test_text_supervised)

    test_data = "Manchurian was hot and spicy"
    nb_sentiment = sent.naiveBayesSentimentPredict(test_data)

    polarity_scores = sent.valenceSentiment(test_text_unsupervised)

    log.info("Logging polarity scores " + " ".join(polarity_scores))
    log.info("done")
def test_Featureize():
    # This will be the unit test
    test_text = "He received multiple nominations for Nobel Prize in Literature every year from 1902 to 1906, \
         and nominations for Nobel Peace Prize in 1901, 1902 and 1910, and his miss of the prize is a major Nobel \
         prize controversy.[3][4][5][6] Born to an aristocratic Russian family in 1828,[2] he is best known for \
          the novels War and Peace (1869) and Anna Karenina (1877),[7] often cited as pinnacles of realist fiction. \
        [2] He first achieved literary acclaim in his twenties with his semi-autobiographical trilogy,  \
        Childhood, Boyhood, and Youth (1852–1856), and Sevastopol Sketches (1855), based upon his experiences \
        in the Crimean War."

    sw = StopWord(test_text.split())
    swList = StopWord.SwLibEnum.spacy_sw

    text = sw.remove(swList)

    log.info(text)

    text = " ".join(text)

    feat = Featurize(text)

    text_tf = feat.tf()

    log.info(text_tf)

    log.info(len(test_text.split()))

    log.info(len(set(test_text.split())))

    text_tf_idf = feat.tf_idf()

    log.info(text_tf_idf)

    text_vectors = feat.wtv_spacy()

    #feat.pca_wv(text_vectors)

    log.info("done")
Example #18
        return stopword_processed


if __name__ == '__main__':

    # This will be the unit test
    test_text = "He received multiple nominations for Nobel Prize in Literature every year from 1902 to 1906, \
     and nominations for Nobel Peace Prize in 1901, 1902 and 1910, and his miss of the prize is a major Nobel \
     prize controversy.[3][4][5][6] Born to an aristocratic Russian family in 1828,[2] he is best known for \
      the novels War and Peace (1869) and Anna Karenina (1877),[7] often cited as pinnacles of realist fiction. \
    [2] He first achieved literary acclaim in his twenties with his semi-autobiographical trilogy,  \
    Childhood, Boyhood, and Youth (1852–1856), and Sevastopol Sketches (1855), based upon his experiences \
    in the Crimean War."

    token = nltk.RegexpTokenizer(r'[a-zA-Z]+')
    word_tokens = token.tokenize(test_text)
    sw = StopWord(word_tokens)

    swList = StopWord.SwLibEnum.spacy_sw
    test_text_sw_removed_spacy = sw.remove(swList)
    log.info(test_text_sw_removed_spacy)

    swList = StopWord.SwLibEnum.scikit_sw
    test_text_sw_removed_scikit = sw.remove(swList)
    log.info(test_text_sw_removed_scikit)

    swList = StopWord.SwLibEnum.nltk_sw
    test_text_sw_removed_nltk = sw.remove(swList)
    log.info(test_text_sw_removed_nltk)

    log.info("done")
    ignore_index=True)
print(results)

rfc = RandomForestClassifier()
p, r, f = run_classifier(pd.DataFrame(unigram_corpus_array), rfc, 5)
results = results.append(
    {
        'Classifier': 'RandomForestClassifier',
        'Corpus type': 'Bag of words - Unigram',
        'mean_Precision': p,
        'mean_Recall': r,
        'mean_F1-score': f
    },
    ignore_index=True)

p, r, f = run_classifier(lda_corpus, rfc, 5)
results = results.append(
    {
        'Classifier': 'RandomForestClassifier',
        'Corpus type': 'LDA Term Topics',
        'mean_Precision': p,
        'mean_Recall': r,
        'mean_F1-score': f
    },
    ignore_index=True)
log.info(results)

results.to_csv('doc_classification_results.txt', sep='\t')

print('done')
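# Hedged sketch of what a helper like run_classifier above might look like (its real
# implementation is not shown in this excerpt): k-fold cross-validation returning mean
# precision, recall and F1. Names and scoring choices here are assumptions.
from sklearn.model_selection import cross_validate

def run_classifier_sketch(features, labels, classifier, folds):
    scores = cross_validate(classifier, features, labels, cv=folds,
                            scoring=('precision_weighted', 'recall_weighted', 'f1_weighted'))
    return (scores['test_precision_weighted'].mean(),
            scores['test_recall_weighted'].mean(),
            scores['test_f1_weighted'].mean())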
Example #20
    def porter_stemmer(self, text):
        stem = PorterStemmer()
        split_text = text.split()
        stemmed_words = list()
        for word in split_text:
            stemmed = stem.stem(word)
            stemmed_words.append(stemmed)
        stemmed_text = " ".join(stemmed_words)
        return stemmed_text


if __name__ == '__main__':
    # This will be the unit test

    test_text = "This is the test text. Documents made up of words and/or phrases. \
    The model consists of two tables; the first table is the probability of selecting  \
    a particular word in the corpus when sampling from a particular topic, and the second \
    table is the probability of selecting a particular topic when sampling from a particular document."

    lem = Lemmatize()

    result = lem.lemmatize_nltk_with_POS(test_text)
    log.info(result)

    result = lem.lemmatize_spacy(test_text)
    log.info(result)

    result = lem.porter_stemmer(test_text)
    log.info(result)
Example #21
        # Use LDA to preprocess - later make a base class and refactor.
        lda = LDAAnalysis(docs)
        lda.docs_preprocessor()
        docs = lda.docs
        with open("data/cache/LDAAnalysisPreprocessed.pkl", "wb") as pickle_LDAAnalysis:
            pickle.dump(lda, pickle_LDAAnalysis)
    else:
        with open("data/cache/LDAAnalysisPreprocessed.pkl",
                  'rb') as pickle_file:
            lda = pickle.load(pickle_file)
            docs = lda.docs

    with open("data/cache/LDAAnalysis.pkl", 'rb') as pickle_file:
        lda = pickle.load(pickle_file)
        docs = lda.docs

    dictionary = Dictionary(docs)
    dictionary.filter_extremes(no_below=10, no_above=0.2)
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    log.info('Number of unique tokens: %d' % len(dictionary))
    log.info('Number of documents: %d' % len(corpus))
    log.info(corpus[:1])
    temp = dictionary[0]  # only to "load" the dictionary.
    id2word = dictionary.id2token

    corpus = lda.corpus
    ctm = CTMModel(corpus, num_topics=25, id2word=id2word)

    print("done")