def testCase():
    """
    Run the topic / soft-cosine-similarity pipeline end to end.

    Steps, in the order they are executed:
      1. load all feeds from file and reduce them with smallDict(..., 500)
      2. build the document list using the custom stop words
      3. derive topics (maxNum=20) from 3- and 4-grams
      4. pass topics and entries to updateDictionaryByFuzzyRelevanceofTopics
         with threshold=60 and remove=True
      5. derive the soft-cosine similarity matrix and save it with saveDFPickle
      6. hand the entries and the matrix to do3DPlotOfCosineSimilarity

    Returns None.
    """
    feeds = loadAllFeedsFromFile()
    sample = smallDict(feeds, 500)

    stop_words = getCustomStopWords()
    doc_list = getDocList(sample, limit=None, reloaddocs=False,
                          stop_list=stop_words)

    topic_maps = deriveTopicMaps(doc_list, maxNum=20, ngram_range=(3, 4))
    updateDictionaryByFuzzyRelevanceofTopics(topic_maps, sample, limit=None,
                                             threshold=60, remove=True)

    similarity_df = deriveSoftCosineSimilarityMatrix(sample)
    saveDFPickle(similarity_df)
    do3DPlotOfCosineSimilarity(sample, None, similarity_df)
def preparePyLDAvisData(allDict, limit=None, numTopics=30):
    """
    Train an LDA model on the documents of allDict and return the three
    objects that are later passed on for visualisation.

    NOTE(review): a second definition of preparePyLDAvisData appears later
    in this file; Python binds the name to whichever definition is executed
    last, so this one is overridden at import time.

    Parameters:
        allDict:   feed dictionary handed to getDocList
        limit:     forwarded to getDocList as its second positional argument
        numTopics: number of topics for the LDA model

    Returns:
        (ldamodel, sentences, dictionary) where sentences is the list of
        bag-of-words vectors and dictionary the gensim Dictionary.
    """
    docs_with_ids = getDocList(allDict, limit,
                               stop_list=getCustomStopWords(), with_ids=True)

    # Only the document text is needed here; the ids are not used.
    # Tokenise every document exactly once and reuse the tokens below.
    tokenised = [simple_preprocess(text) for _doc_id, text in docs_with_ids]

    # gensim Dictionary of unique ids of all words in all documents
    # (pyLDAvis param "d")
    dictionary = corpora.Dictionary(tokenised)

    # Bag-of-words vector per document (pyLDAvis param "c")
    sentences = [dictionary.doc2bow(tokens) for tokens in tokenised]

    ldamodel = models.ldamodel.LdaModel(sentences,
                                        num_topics=numTopics,
                                        id2word=dictionary,
                                        passes=50)
    return (ldamodel, sentences, dictionary)
def preparePyLDAvisData(allDict, limit=None, numTopics=30):
    """
    Train an LDA model on the documents of allDict and return the three
    objects that are later passed on for visualisation.

    NOTE(review): an earlier definition with the same name exists in this
    file; being the later one, this definition is the one that takes effect.

    No word-embedding model or term-similarity matrix is built here: the
    returned values depend only on the bag-of-words corpus.

    Parameters:
        allDict:   feed dictionary handed to getDocList
        limit:     forwarded to getDocList as its second positional argument
        numTopics: number of topics for the LDA model

    Returns:
        (ldamodel, sentences, dictionary) where sentences is the list of
        bag-of-words vectors and dictionary the gensim Dictionary.
    """
    docs_with_ids = getDocList(allDict, limit,
                               stop_list=getCustomStopWords(), with_ids=True)

    # Only the document text is needed here; the ids are not used.
    # Tokenise every document exactly once and reuse the tokens below.
    tokenised = [simple_preprocess(text) for _doc_id, text in docs_with_ids]

    # gensim Dictionary of unique ids of all words in all documents
    # (pyLDAvis param "d")
    dictionary = corpora.Dictionary(tokenised)

    # Bag-of-words vector per document (pyLDAvis param "c")
    sentences = [dictionary.doc2bow(tokens) for tokens in tokenised]

    ldamodel = models.ldamodel.LdaModel(sentences,
                                        num_topics=numTopics,
                                        id2word=dictionary,
                                        passes=50)
    return (ldamodel, sentences, dictionary)
def deriveSoftCosineSimilarityMatrix(allDict, limit=None, weName="glove-wiki-gigaword-50", simThreshold=0.3):
    """
    Compute the pairwise soft-cosine similarity of all documents in allDict.

    Parameters:
        allDict:      feed dictionary handed to getDocList
        limit:        forwarded to getDocList as its second positional argument
        weName:       name passed to getWordEmbeddingModel
        simThreshold: threshold passed to the embedding model's
                      similarity_matrix call

    Returns:
        pandas DataFrame with the document ids as both index and columns;
        cell [a, b] is softcossim(doc a, doc b) rounded to 2 decimals.
    """
    docs_with_ids = getDocList(allDict, limit,
                               stop_list=getCustomStopWords(), with_ids=True)

    # Split the (id, text) pairs, tokenising each text exactly once.
    ids = []
    tokenised = []
    for doc_id, text in docs_with_ids:
        ids.append(doc_id)
        tokenised.append(simple_preprocess(text))

    model = getWordEmbeddingModel(weName=weName)

    # gensim Dictionary of unique ids of all words in all documents
    dictionary = corpora.Dictionary(tokenised)

    # Bag-of-words vector per document
    sentences = [dictionary.doc2bow(tokens) for tokens in tokenised]

    # TF-IDF model over the bag-of-words corpus; it is only used as input
    # for the term similarity matrix below.
    tf_idf = models.TfidfModel(sentences)

    # Term similarity matrix derived from the word embeddings
    similarity_matrix = model.similarity_matrix(dictionary,
                                                tfidf=tf_idf,
                                                threshold=simThreshold,
                                                exponent=2.0,
                                                nonzero_limit=100)

    # Row r / column c holds the similarity of document r and document c.
    theMatrix = [
        [round(softcossim(row_doc, col_doc, similarity_matrix), 2)
         for col_doc in sentences]
        for row_doc in sentences
    ]

    cossim_mat = pd.DataFrame(theMatrix, index=ids, columns=ids)
    return cossim_mat
def testDisplayTopics(numArticles=None, numTopics=30, dict=None):
    """
    Derive topics for a set of feed entries and pass them to displayTopics.

    Parameters:
        numArticles: if truthy, the entries are first reduced with
                     smallDict(..., numArticles); otherwise all are used
        numTopics:   forwarded to deriveTopicMaps as maxNum
        dict:        feed dictionary; when falsy (None or empty) the feeds
                     are loaded with loadAllFeedsFromFile()

    Returns None.
    """
    # The parameter name shadows the builtin but is part of the public
    # signature, so it is only read once here.
    feeds = dict if dict else loadAllFeedsFromFile()
    subset = smallDict(feeds, numArticles) if numArticles else feeds

    stop_words = getCustomStopWords()
    doc_list = getDocList(subset, reloaddocs=False, stop_list=stop_words)

    topic_maps = deriveTopicMaps(doc_list, maxNum=numTopics,
                                 ngram_range=(3, 3))
    updateDictionaryByFuzzyRelevanceofTopics(topic_maps, subset, limit=30)
    displayTopics(topic_maps)
def runSentiment(allDict=None, sm=None):
    """
    Run sentiment analysis and topic assignment on a set of feed entries.

    Parameters:
        allDict: full feed dictionary; only consulted when sm is None.
                 Loaded with loadAllFeedsFromFile() when also None.
        sm:      the entries to analyse. When None it is built with
                 smallDict(allDict, 200).

    The entries in sm are handed to conductSentimentAnalysis and to
    updateDictionaryByFuzzyRelevanceofTopics (threshold=20, remove=True).

    Returns None.
    """
    # Honour the caller's arguments; fall back to loading from file only
    # for whatever was not supplied.
    if sm is None:
        if allDict is None:
            allDict = loadAllFeedsFromFile()
        sm = smallDict(allDict, 200)

    conductSentimentAnalysis(sm)

    docl = getDocList(sm, reloaddocs=False, stop_list=getCustomStopWords())
    topics = deriveTopicMaps(docl, maxNum=30, ngram_range=(3, 3))
    updateDictionaryByFuzzyRelevanceofTopics(topics, sm, limit=None,
                                             threshold=20, remove=True)

    # Result is currently unused; the call is kept in case it has side
    # effects -- TODO confirm and remove if it is pure.
    getTopicIdDict(sm)

    # TODO: the per-topic plotting step (getSentimentsForTopic3 /
    # plotSentiment3D) is currently disabled.
    return
def softCosineSimilarityTest(numtestdocs=20, weName="glove-wiki-gigaword-50"):
    """
    Compute a labelled soft-cosine similarity matrix for a few documents.

    Parameters:
        numtestdocs: passed to getDocList as limit
        weName:      name passed to getWordEmbeddingModel

    Returns:
        pandas DataFrame whose index and columns are labels of the form
        "<position> <first 15 characters of the document>\\t"; cell [a, b]
        is softcossim(doc a, doc b) rounded to 2 decimals.
    """
    # NOTE(review): unlike the other getDocList calls in this file, no feed
    # dictionary is passed here -- presumably getDocList has a default for
    # it; verify.
    documents = getDocList(limit=numtestdocs)

    model = getWordEmbeddingModel(weName=weName)

    # Tokenise every document exactly once and reuse the tokens below.
    tokenised = [simple_preprocess(doc) for doc in documents]

    # gensim Dictionary of unique ids of all words in all documents
    dictionary = corpora.Dictionary(tokenised)

    # Term similarity matrix derived from the word embeddings
    # (no TF-IDF weighting, no threshold)
    similarity_matrix = model.similarity_matrix(dictionary,
                                                tfidf=None,
                                                threshold=0.0,
                                                exponent=2.0,
                                                nonzero_limit=100)

    # Bag-of-words vector per document
    sentences = [dictionary.doc2bow(tokens) for tokens in tokenised]

    # Row r / column c holds the similarity of document r and document c.
    theMatrix = [
        [round(softcossim(row_doc, col_doc, similarity_matrix), 2)
         for col_doc in sentences]
        for row_doc in sentences
    ]

    # Labels identifying rows and columns
    names = [str(idx) + " " + doc[:15] + "\t"
             for idx, doc in enumerate(documents)]

    cossim_mat = pd.DataFrame(theMatrix, index=names, columns=names)
    return cossim_mat