Example #1
	def test_get_term_idx_and_x(self):
		docs = [whitespace_nlp('aa aa bb.'),
		        whitespace_nlp('bb aa a.')]
		df = pd.DataFrame({'category': ['a', 'b'],
		                   'parsed': docs})
		#corpus_fact = CorpusFromParsedDocuments(df, 'category', 'parsed')
		corpus_fact = CorpusFromParsedDocuments(df, category_col='category', parsed_col='parsed')
		corpus = corpus_fact.build()

		kvs = list(corpus_fact._term_idx_store.items())
		keys = [k for k, v in kvs]
		values = [v for k, v in kvs]
		self.assertEqual(sorted(keys), list(range(7)))
		self.assertEqual(sorted(values),
		                 ['a', 'aa', 'aa a', 'aa aa', 'aa bb', 'bb', 'bb aa'])

		def assert_word_in_doc_cnt(doc, word, count):
			self.assertEqual(corpus_fact._X[doc, corpus_fact._term_idx_store.getidx(word)], count)

		assert_word_in_doc_cnt(0, 'aa', 2)
		assert_word_in_doc_cnt(0, 'bb', 1)
		assert_word_in_doc_cnt(0, 'aa aa', 1)
		assert_word_in_doc_cnt(0, 'aa bb', 1)
		assert_word_in_doc_cnt(0, 'bb aa', 0)
		assert_word_in_doc_cnt(1, 'bb', 1)
		assert_word_in_doc_cnt(1, 'aa', 1)
		assert_word_in_doc_cnt(1, 'a', 1)
		assert_word_in_doc_cnt(1, 'bb aa', 1)
		assert_word_in_doc_cnt(1, 'aa aa', 0)
		assert_word_in_doc_cnt(1, 'aa a', 1)
		self.assertTrue(isinstance(corpus, ParsedCorpus))
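For reference, a minimal sketch of the same pipeline outside the test harness (import paths are assumed to match the scattertext package layout these tests use):

import pandas as pd
from scattertext import CorpusFromParsedDocuments
from scattertext.WhitespaceNLP import whitespace_nlp

# Two toy documents, one per category; whitespace_nlp produces a lightweight spaCy-like parse.
df = pd.DataFrame({'category': ['a', 'b'],
                   'parsed': [whitespace_nlp('aa aa bb.'),
                              whitespace_nlp('bb aa a.')]})
corpus = CorpusFromParsedDocuments(df, category_col='category', parsed_col='parsed').build()
print(corpus.get_term_freq_df())  # unigram and bigram counts per category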
    def test_hamlet(self):
        raw_docs = get_hamlet_docs()
        categories = [
            get_hamlet_snippet_binary_category(doc) for doc in raw_docs
        ]
        docs = [whitespace_nlp(doc) for doc in raw_docs]
        df = pd.DataFrame({'category': categories, 'parsed': docs})
        corpus_fact = CorpusFromParsedDocuments(df, 'category', 'parsed')
        corpus = corpus_fact.build()
        tdf = corpus.get_term_freq_df()
        self.assertEqual(list(tdf.loc['play']), [37, 5])
        self.assertFalse(
            any(
                corpus.search('play').apply(
                    lambda x: 'plfay' in str(x['parsed']), axis=1)))
        self.assertTrue(
            all(
                corpus.search('play').apply(
                    lambda x: 'play' in str(x['parsed']), axis=1)))

        # TODO: verify the term-document matrix
        play_term_idx = corpus_fact._term_idx_store.getidx('play')
        play_X = corpus_fact._X.todok()[:, play_term_idx]

        self.assertEqual(play_X.sum(), 37 + 5)
Example #4
	def setUp(cls):
		cls.categories, cls.documents = get_docs_categories()
		cls.parsed_docs = []
		for doc in cls.documents:
			cls.parsed_docs.append(whitespace_nlp(doc))
		cls.df = pd.DataFrame({'category': cls.categories,
		                       'parsed': cls.parsed_docs})
		cls.corpus_fact = CorpusFromParsedDocuments(cls.df, 'category', 'parsed')
	def test_hamlet(self):
		raw_docs = get_hamlet_docs()
		categories = [get_hamlet_snippet_binary_category(doc) for doc in raw_docs]
		docs = [whitespace_nlp(doc) for doc in raw_docs]
		df = pd.DataFrame({'category': categories,
		                   'parsed': docs})
		corpus_fact = CorpusFromParsedDocuments(df, 'category', 'parsed')
		corpus = corpus_fact.build()
		tdf = corpus.get_term_freq_df()
		self.assertEqual(list(tdf.loc['play']), [37, 5])
		self.assertFalse(any(corpus.search('play').apply(lambda x: 'plfay' in str(x['parsed']), axis=1)))
		self.assertTrue(all(corpus.search('play').apply(lambda x: 'play' in str(x['parsed']), axis=1)))

		# TODO: verify the term-document matrix
		play_term_idx = corpus_fact._term_idx_store.getidx('play')
		play_X = corpus_fact._X.todok()[:, play_term_idx]

		self.assertEqual(play_X.sum(), 37 + 5)
Example #6
def main():
    # convention_df = SampleCorpora.ConventionData2012.get_data()
    feat_builder = FeatsFromOnlyEmpath()
    # corpus = CorpusFromParsedDocuments(convention_df,
    #                                    category_col='party',
    #                                    parsed_col='text',
    #                                    feats_from_spacy_doc=feat_builder).build()
    # html = produce_scattertext_explorer(corpus,
    #                                     category='democrat',
    #                                     category_name='Democratic',
    #                                     not_category_name='Republican',
    #                                     width_in_pixels=1000,
    #                                     metadata=convention_df['speaker'],
    #                                     use_non_text_features=True,
    #                                     use_full_doc=True,
    #                                     topic_model_term_lists=feat_builder.get_top_model_term_lists())

    # ================================================================================
    all_satisfaction_score_comment_in_all_conds = utils_data.get_all_satisfaction_score_comment_in_all_conds(
    )
    # print("all_satisfaction_score_comment_in_all_conds",all_satisfaction_score_comment_in_all_conds)
    # [['negative', 'Satisfaction', 'after a week----mouth ulccers,cudnt talk,eat,drink for 5 days....whole body burnt,headache, fatigue....quit---am slowly getting better, wudnt give to my worst

    # print("all_satisfaction_score_comment_in_all_conds",len(all_satisfaction_score_comment_in_all_conds))
    # 1402

    # ================================================================================
    columns = ['senti_on_Metfor_oral', 'feature', 'review']
    all_satisfaction_score_comment_in_all_conds_df = pd.DataFrame(
        all_satisfaction_score_comment_in_all_conds,
        index=None,
        columns=columns)

    # ================================================================================
    corpus = CorpusFromParsedDocuments(
        all_satisfaction_score_comment_in_all_conds_df,
        category_col='senti_on_Metfor_oral',
        parsed_col='review',
        feats_from_spacy_doc=feat_builder).build()

    # ================================================================================
    html = produce_scattertext_explorer(
        corpus,
        category='negative',
        category_name='Negative',
        not_category_name='Positive',
        width_in_pixels=1000,
        metadata=all_satisfaction_score_comment_in_all_conds_df['feature'],
        use_non_text_features=True,
        use_full_doc=True,
        topic_model_term_lists=feat_builder.get_top_model_term_lists())

    # ================================================================================
    open(
        '/mnt/1T-5e7/mycodehtml/Data_mining/Visualization/Scattertext/Convention-Visualization-Empath.html',
        'wb').write(html.encode('utf-8'))
    print('Open ./Convention-Visualization-Empath.html in Chrome or Firefox.')
Example #7
	def setUp(cls):
		cls.categories, cls.documents = get_docs_categories()
		cls.parsed_docs = []
		for doc in cls.documents:
			cls.parsed_docs.append(whitespace_nlp(doc))
		cls.df = pd.DataFrame({'category': cls.categories,
		                       'author': ['a', 'a', 'c', 'c', 'c',
		                                  'c', 'd', 'd', 'e', 'e'],
		                       'parsed': cls.parsed_docs,
		                       'document_lengths': [len(doc) for doc in cls.documents]})
		cls.corpus = CorpusFromParsedDocuments(cls.df, 'category', 'parsed').build()
 def _make_political_corpus(self):
     clean = clean_function_factory()
     data = []
     for party, speech in iter_party_speech_pairs():
         cleaned_speech = clean(speech)
         if cleaned_speech and cleaned_speech != '':
             parsed_speech = whitespace_nlp(cleaned_speech)
             data.append({'party': party, 'text': parsed_speech})
     corpus = CorpusFromParsedDocuments(pd.DataFrame(data),
                                        category_col='party',
                                        parsed_col='text').build()
     return corpus
Example #9
def build_hamlet_jz_corpus():
    # type: () -> Corpus
    categories, documents = get_docs_categories()
    clean_function = lambda text: '' if text.startswith('[') else text
    df = pd.DataFrame({
        'category':
        categories,
        'parsed': [whitespace_nlp(clean_function(doc)) for doc in documents]
    })
    df = df[df['parsed'].apply(lambda x: len(str(x).strip()) > 0)]
    return CorpusFromParsedDocuments(df=df,
                                     category_col='category',
                                     parsed_col='parsed').build()
Example #10
def main():
	df = pd.read_csv('https://cdn.rawgit.com/JasonKessler/scattertext/e508bf32/scattertext/data/chinese.csv')
	df['text'] = df['text'].apply(chinese_nlp)
	corpus = CorpusFromParsedDocuments(df,
	                                   category_col='novel',
	                                   parsed_col='text').build()
	html = produce_scattertext_explorer(corpus,
	                                    category='Tale of Two Cities',
	                                    category_name='Tale of Two Cities',
	                                    not_category_name='Ulysses',
	                                    width_in_pixels=1000,
	                                    metadata=df['novel'],
	                                    chinese_mode=True)
	open('./demo_chinese.html', 'w').write(html)
	print('Open ./demo_chinese.html in Chrome or Firefox.')
Example #11
 def setUp(cls):
     cls.categories, cls.documents = get_docs_categories()
     cls.parsed_docs = []
     for doc in cls.documents:
         cls.parsed_docs.append(whitespace_nlp(doc))
     cls.df = pd.DataFrame({
         'category': cls.categories,
         'parsed': cls.parsed_docs,
         'orig': [d.upper() for d in cls.documents]
     })
     cls.parsed_corpus = CorpusFromParsedDocuments(cls.df, 'category',
                                                   'parsed').build()
     cls.corpus = CorpusFromPandas(cls.df,
                                   'category',
                                   'orig',
                                   nlp=whitespace_nlp).build()
Example #12
def main():
    convention_df = SampleCorpora.ConventionData2012.get_data()

    corpus = CorpusFromParsedDocuments(
        convention_df,
        category_col='party',
        parsed_col='text',
        feats_from_spacy_doc=FeatsFromOnlyEmpath()).build()
    html = produce_scattertext_explorer(corpus,
                                        category='democrat',
                                        category_name='Democratic',
                                        not_category_name='Republican',
                                        width_in_pixels=1000,
                                        metadata=convention_df['speaker'],
                                        use_non_text_features=True,
                                        use_full_doc=True)
    open('./Convention-Visualization-Empath.html',
         'wb').write(html.encode('utf-8'))
    print('Open ./Convention-Visualization-Empath.html in Chrome or Firefox.')
Example #13
def build_hamlet_jz_corpus_with_meta():
    # type: () -> Corpus
    def empath_mock(doc, **kwargs):
        toks = doc.split()
        num_toks = min(3, len(toks))
        return {
            'cat' + str(len(tok)): val
            for val, tok in enumerate(toks[:num_toks])
        }

    categories, documents = get_docs_categories()
    clean_function = lambda text: '' if text.startswith('[') else text
    df = pd.DataFrame({
        'category':
        categories,
        'parsed': [whitespace_nlp(clean_function(doc)) for doc in documents]
    })
    df = df[df['parsed'].apply(lambda x: len(str(x).strip()) > 0)]
    return CorpusFromParsedDocuments(
        df=df,
        category_col='category',
        parsed_col='parsed',
        feats_from_spacy_doc=FeatsFromSpacyDocAndEmpath(
            empath_analyze_function=empath_mock)).build()
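As a hedged follow-up (it assumes the get_metadata_freq_df accessor used in the PyTextRank example further down), the mocked Empath categories become non-text metadata features that can be inspected per category:

# Sketch: inspect the mocked 'cat<N>' metadata features on the resulting corpus.
corpus = build_hamlet_jz_corpus_with_meta()
print(corpus.get_metadata_freq_df(''))  # empty suffix keeps the raw category column names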
                              0: "parsed_text"
                          }).reset_index(drop=True)

original_data = df.reset_index(drop=True)

df = pd.concat([original_data, cleaned_texts], axis=1)

df['parsed_text'] = df['parsed_text'].apply(chinese_nlp)

for i in np.arange(len(df['text'])):
    df['text'][i] = re.sub(pattern, '', df['text'][i])

df['text'] = df['text'].apply(chinese_nlp)

corpus = CorpusFromParsedDocuments(df,
                                   category_col='file_name',
                                   parsed_col='parsed_text').build()

html = produce_scattertext_explorer(corpus,
                                    category='安利蛋白粉评论.txt',
                                    category_name='安利蛋白粉评论.txt',
                                    not_category_name='汤臣倍健蛋白粉评论.txt',
                                    width_in_pixels=1000,
                                    metadata=df['file_name'],
                                    asian_mode=True,
                                    alternative_text_field="text")
open(
    'C:/Users/CNU074VP/Desktop/Chinese Topic Model/protein_review_compare.html',
    'w',
    encoding='utf-8').write(html)
print(
def get_scattertext_html():
    file_names = os.listdir(
        'C:/Users/CNU074VP/Desktop/Chinese Topic Model/protein review')

    # Create Dictionary for File Name and Text
    file_name_and_text = {}
    for file in file_names:
        with open(
                'C:/Users/CNU074VP/Desktop/Chinese Topic Model/protein review/'
                + file,
                "r",
                encoding="UTF-8") as target_file:
            file_name_and_text[file] = target_file.read()
    file_data = (pd.DataFrame.from_dict(file_name_and_text,
                                        orient='index').reset_index().rename(
                                            index=str,
                                            columns={
                                                'index': 'file_name',
                                                0: 'text'
                                            }))

    df = file_data

    for i in np.arange(len(df)):
        df['text'][i] = "\n".join(
            list(dict.fromkeys(df['text'][i].split("\n"))))  #Remove duplicates

    comment = df.text.values.tolist()

    # load user-defined dictionary
    jieba.load_userdict('C:/users/CNU074VP/dict_out.csv')

    #word segmentation with jieba
    comment_s = []
    # pattern = re.compile(r'[\u4e00-\u9fa5]+')   # drop non-Chinese characters; would need to keep the newline character \n

    pattern = re.compile(r'<.*?>')  # regex to strip the contents of <...> tags in Taobao reviews
    for line in comment:
        line = line.replace(' ', '')  # drop spaces inside the review text
        # line = ''.join(re.findall(pattern, line))
        line = re.sub(pattern, '', line)  # strip the <...> markup matched by `pattern`
        comment_cut = jieba.lcut(line)
        comment_s.append(comment_cut)

    # load user-defined stop words list
    stopwords = pd.read_excel(
        'C:/users/CNU074VP/PycharmProjects/tmall_spider/stopwords.xlsx')
    stopwords = stopwords.stopword.values.tolist()

    # get rid of stop words
    comment_clean = []
    for line in comment_s:
        line_clean = []
        for word in line:
            if word not in stopwords:
                line_clean.append(word)
        comment_clean.append(line_clean)

    comment_doc = []

    def get_single_doc(num):
        # Join the cleaned tokens of document `num` into one space-separated string.
        return ' '.join(str(item) for item in comment_clean[num])

    l_series = []
    for i in np.arange(len(df)):
        l_series.append(pd.Series(get_single_doc(i)))

    cleaned_texts = pd.concat(
        l_series, ignore_index=True).to_frame().rename(columns={
            0: "parsed_text"
        }).reset_index(drop=True)

    original_data = df.reset_index(drop=True)

    df = pd.concat([original_data, cleaned_texts], axis=1)

    df['parsed_text'] = df['parsed_text'].apply(chinese_nlp)

    for i in np.arange(len(df['text'])):
        df['text'][i] = re.sub(pattern, '', df['text'][i])

    df['text'] = df['text'].apply(chinese_nlp)

    corpus = CorpusFromParsedDocuments(df,
                                       category_col='file_name',
                                       parsed_col='parsed_text').build()

    html = produce_scattertext_explorer(corpus,
                                        category='安利蛋白粉评论.txt',
                                        category_name='安利蛋白粉评论.txt',
                                        not_category_name='汤臣倍健蛋白粉评论.txt',
                                        width_in_pixels=1000,
                                        metadata=df['file_name'],
                                        asian_mode=True,
                                        alternative_text_field="text")
    result = open('D:/scattertext/protein_review_compare.html',
                  'w',
                  encoding='utf-8').write(html)

    return result
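As an aside, a hedged sketch of a more idiomatic replacement for the element-wise df['text'][i] assignments above (same df and compiled pattern assumed); chained indexing like that triggers pandas SettingWithCopy warnings:

# Vectorized equivalents of the per-row loops (sketch only).
df['text'] = df['text'].apply(lambda t: "\n".join(dict.fromkeys(t.split("\n"))))  # drop duplicate lines
df['text'] = df['text'].apply(lambda t: re.sub(pattern, '', t))                   # strip <...> markup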
 def setUp(cls):
     categories, documents = get_docs_categories()
     cls.df = pd.DataFrame({'category': categories, 'text': documents})
     cls.df['parsed'] = cls.df.text.apply(whitespace_nlp)
     cls.corpus = CorpusFromParsedDocuments(cls.df, 'category',
                                            'parsed').build()
Example #17
import spacy
import numpy as np
import pytextrank

nlp = spacy.load('en')

convention_df = SampleCorpora.ConventionData2012.get_data().assign(
    parse=lambda df: df.text.apply(nlp),
    party=lambda df: df.party.apply({
        'democrat': 'Democratic',
        'republican': 'Republican'
    }.get))

corpus = CorpusFromParsedDocuments(
    convention_df,
    category_col='party',
    parsed_col='parse',
    feats_from_spacy_doc=PyTextRankPhrases()).build().compact(
        AssociationCompactor(2000, use_non_text_features=True))

print('Aggregate PyTextRank phrase scores')
term_category_scores = corpus.get_metadata_freq_df('')
print(term_category_scores)

term_ranks = np.argsort(np.argsort(-term_category_scores, axis=0), axis=0) + 1

metadata_descriptions = {
    term: '<br/>' +
    '<br/>'.join('<b>%s</b> TextRank score rank: %s/%s' %
                 (cat, term_ranks.loc[term, cat], corpus.get_num_metadata())
                 for cat in corpus.get_categories())
    for term in corpus.get_metadata()
}
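The snippet is truncated at this point. As a hedged sketch (the exact keyword arguments are an assumption based on scattertext's PyTextRank demo, not part of this example), the metadata_descriptions dict would typically be handed to produce_scattertext_explorer together with use_non_text_features=True:

# Hypothetical continuation; argument names assumed from the scattertext PyTextRank demo.
html = produce_scattertext_explorer(corpus,
                                    category='Democratic',
                                    not_category_name='Republican',
                                    use_non_text_features=True,
                                    use_full_doc=True,
                                    metadata=convention_df['speaker'],
                                    metadata_descriptions=metadata_descriptions,
                                    width_in_pixels=1000)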
def build_hamlet_jz_corpus_with_alt_text():
    # type: () -> Corpus
    df = build_hamlet_jz_df_with_alt_text()
    return CorpusFromParsedDocuments(df=df,
                                     category_col='category',
                                     parsed_col='parsed').build()