def main():
    """Build the gensim word-similarity demo and write it to an HTML file.

    Loads the ``SampleCorpora.ConventionData2012`` data frame, parses each
    ``text`` entry with the spaCy ``'en'`` pipeline, and builds a unigram
    corpus categorized by the ``party`` column. The corpus, together with
    the result of ``Word2VecFromParsedCorpus(corpus, model).train()``, is
    handed to ``word_similarity_explorer_gensim`` with ``'jobs'`` as the
    target term. The returned HTML is UTF-8 encoded and written to
    ``./demo_gensim_similarity.html``.
    """
    nlp = spacy.load('en')
    # nlp = whitespace_nlp_with_sentences
    convention_df = SampleCorpora.ConventionData2012.get_data()
    convention_df['parsed'] = convention_df.text.apply(nlp)
    corpus = (CorpusFromParsedDocuments(convention_df,
                                        category_col='party',
                                        parsed_col='parsed')
              .build()
              .get_unigram_corpus())

    # Model configuration only; the fitting itself is triggered by the
    # Word2VecFromParsedCorpus(...).train() call below.
    # Per gensim's Word2Vec parameter docs, sg=1 selects skip-gram and
    # hs=1 with negative=0 selects hierarchical softmax without negative
    # sampling.
    model = word2vec.Word2Vec(size=100,
                              alpha=0.025,
                              window=5,
                              min_count=5,
                              max_vocab_size=None,
                              sample=0,
                              seed=1,
                              workers=1,
                              min_alpha=0.0001,
                              sg=1,
                              hs=1,
                              negative=0,
                              cbow_mean=0,
                              iter=10,
                              null_word=0,
                              trim_rule=None,
                              sorted_vocab=1)
    trained_word2vec = Word2VecFromParsedCorpus(corpus, model).train()

    html = word_similarity_explorer_gensim(
        corpus,
        category='democrat',
        target_term='jobs',
        category_name='Democratic',
        not_category_name='Republican',
        minimum_term_frequency=5,
        width_in_pixels=1000,
        metadata=convention_df['speaker'],
        word2vec=trained_word2vec,
        term_significance=ScaledFScoreSignificance(),
        max_p_val=0.05,
        save_svg_button=True,
        d3_url='scattertext/data/viz/scripts/d3.min.js',
        d3_scale_chromatic_url='scattertext/data/viz/scripts/d3-scale-chromatic.v1.min.js')

    # Use a context manager so the file handle is flushed and closed
    # deterministically instead of relying on garbage collection.
    with open('./demo_gensim_similarity.html', 'wb') as out_file:
        out_file.write(html.encode('utf-8'))
    print('Open ./demo_gensim_similarity.html in Chrome or Firefox.')
# Example no. 2
# 0
def main():
    """Build the gensim word-similarity demo and write it to an HTML file.

    Loads the ``SampleCorpora.ConventionData2012`` data frame, parses each
    ``text`` entry with ``spacy.en.English()``, and builds a corpus
    categorized by the ``party`` column. The corpus, together with the
    result of ``Word2VecFromParsedCorpus(corpus, model).train()``, is handed
    to ``word_similarity_explorer_gensim`` with ``'jobs'`` as the target
    term. The returned HTML is UTF-8 encoded and written to
    ``./demo_gensim_similarity.html``.
    """
    nlp = spacy.en.English()
    convention_df = SampleCorpora.ConventionData2012.get_data()
    convention_df['parsed'] = convention_df.text.apply(nlp)
    corpus = CorpusFromParsedDocuments(convention_df,
                                       category_col='party',
                                       parsed_col='parsed').build()

    # Model configuration only; the fitting itself is triggered by the
    # Word2VecFromParsedCorpus(...).train() call below.
    # Per gensim's Word2Vec parameter docs, sg=1 selects skip-gram and
    # hs=1 with negative=0 selects hierarchical softmax without negative
    # sampling.
    model = word2vec.Word2Vec(size=300,
                              alpha=0.025,
                              window=5,
                              min_count=5,
                              max_vocab_size=None,
                              sample=0,
                              seed=1,
                              workers=1,
                              min_alpha=0.0001,
                              sg=1,
                              hs=1,
                              negative=0,
                              cbow_mean=0,
                              iter=1,
                              null_word=0,
                              trim_rule=None,
                              sorted_vocab=1)
    trained_word2vec = Word2VecFromParsedCorpus(corpus, model).train()

    # NOTE(review): the keyword `pmi_filter_thresold` is kept exactly as
    # written because it is forwarded to the library; confirm the spelling
    # against the installed scattertext version.
    html = word_similarity_explorer_gensim(
        corpus,
        category='democrat',
        category_name='Democratic',
        not_category_name='Republican',
        target_term='jobs',
        minimum_term_frequency=5,
        pmi_filter_thresold=4,
        width_in_pixels=1000,
        metadata=convention_df['speaker'],
        word2vec=trained_word2vec,
        max_p_val=0.1,
        save_svg_button=True)

    # Use a context manager so the file handle is flushed and closed
    # deterministically instead of relying on garbage collection.
    with open('./demo_gensim_similarity.html', 'wb') as out_file:
        out_file.write(html.encode('utf-8'))
    print('Open ./demo_gensim_similarity.html in Chrome or Firefox.')
# Build a corpus from `df_2` (defined outside this snippet), categorized by
# the 'author' column and using the pre-parsed documents in the 'parsed'
# column.
build_corpus = CorpusFromParsedDocuments(df_2,
                                         category_col='author',
                                         parsed_col='parsed').build()

# Model configuration only; the fitting itself is triggered by the
# Word2VecFromParsedCorpus(...).train() call below.
# Per gensim's Word2Vec parameter docs, sg=1 selects skip-gram and hs=1 with
# negative=0 selects hierarchical softmax without negative sampling.
build_model = word2vec.Word2Vec(size=300,
                                alpha=0.025,
                                window=5,
                                min_count=5,
                                max_vocab_size=None,
                                sample=0,
                                seed=1,
                                workers=1,
                                min_alpha=0.0001,
                                sg=1,
                                hs=1,
                                negative=0,
                                cbow_mean=0,
                                iter=1,
                                null_word=0,
                                trim_rule=None,
                                sorted_vocab=1)

# Render the similarity explorer for the target term 'obama', contrasting
# the 'GLOBE EDITORIAL' category with the one labelled 'Jeffrey Simpson'.
html = word_similarity_explorer_gensim(
    build_corpus,
    category='GLOBE EDITORIAL',
    category_name='GLOBE EDITORIAL',
    not_category_name='Jeffrey Simpson',
    target_term='obama',
    minimum_term_frequency=100,
    pmi_threshold_coefficient=4,
    width_in_pixels=1000,
    metadata=df_2['author'],
    word2vec=Word2VecFromParsedCorpus(build_corpus, build_model).train(),
    max_p_val=0.05,
    save_svg_button=True)

# Use a context manager so the file handle is flushed and closed
# deterministically instead of relying on garbage collection.
with open('../output/gensim_similarity_top_2_authors.html', 'wb') as out_file:
    out_file.write(html.encode('utf-8'))