def test_get_df_for_words_with_no_results(words_no_results):
    """
    This test case fails: Palmetto can not handle underscores in words.
    """
    palmetto = Palmetto()
    doc_ids = palmetto.get_df_for_words(words_no_results)
    for i in range(0, len(words_no_results)):
        assert (doc_ids[i][0] == words_no_results[i])
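# A minimal workaround sketch (not part of the original tests): since Palmetto
# cannot handle underscores, the tokens could be cleaned before querying
# document frequencies. The helper name and behaviour here are assumptions,
# not part of the palmettopy API.
def get_df_for_words_without_underscores(palmetto, words):
    # Palmetto chokes on tokens like "cr_eam"; drop the underscores first.
    cleaned = [word.replace("_", "") for word in words]
    return palmetto.get_df_for_words(cleaned)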
def build_weighted_graph(synset_packs):
    """
    Build a weighted graph out of synset packs.

    Return a list of lists of weighted pairs, such as:
    [
        [((item_a, item_b), 1.534), ((item_a, item_c), 1.1234)],
        [((item_c, item_d), 1.34), ((item_c, item_f), 1.24)],
        ...
    ]
    """
    # get all document frequencies
    # collect all terms
    words = []
    for i in range(0, len(synset_packs)):
        for j in range(0, len(synset_packs[i][1])):
            synset_packs[i][1][j] = synset_packs[i][1][j].name().split(".")[0]
            words.append(synset_packs[i][1][j])
        if len(synset_packs[i][1]) == 0:
            synset_packs[i][1].append(synset_packs[i][0])
            words.append(synset_packs[i][0])

    palmetto = Palmetto()
    doc_id_tuples = palmetto.get_df_for_words(words)
    doc_id_tuples_dict = dict(doc_id_tuples)

    edges = []
    for i in range(0, len(synset_packs)):
        for j in range(i + 1, len(synset_packs)):
            edge = []
            for that_word in synset_packs[j][1]:
                for this_word in synset_packs[i][1]:
                    edge_item = (
                        (this_word, that_word),
                        calculate_coherence(this_word, that_word,
                                            doc_id_tuples_dict)
                    )
                    edge.append(edge_item)
            edges.append(sorted(edge, key=lambda x: x[1], reverse=True))
    return edges
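# calculate_coherence is referenced above but not shown in this listing.
# A minimal sketch of what such a helper could look like, assuming
# doc_id_tuples_dict maps each word to a collection of document ids and that a
# simple UMass-style score is wanted; the original project's implementation
# may differ.
from math import log

def calculate_coherence(this_word, that_word, doc_id_tuples_dict):
    # Documents containing each word, and documents containing both.
    this_docs = set(doc_id_tuples_dict.get(this_word, ()))
    that_docs = set(doc_id_tuples_dict.get(that_word, ()))
    both = this_docs & that_docs
    if not this_docs:
        return 0.0
    # log P(w1, w2) / P(w1), with add-one smoothing on the joint count.
    return log((len(both) + 1) / float(len(this_docs)))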
def cluster_header_random(header):
    """
    Cluster synsets using palmetto.

    Randomly select a permutation of the header and calculate its coherence.
    Repeat until the algorithm converges.
    """
    palmetto = Palmetto()
    synsets_pack = get_header_synsets(header)
    window_size = 3
    window = []
    maximum_coherence = 0
    index = 0
    no_change = 0
    best_permutation = []
    while True:
        random_permutation = _pick_random_synset_permutation(synsets_pack)
        coherence = palmetto.get_coherence(random_permutation)
        window.append((random_permutation, coherence))
        if index % window_size == 0:
            (local_best_permutation, local_maximum_coherence) = max(
                window, key=lambda x: x[1])
            if local_maximum_coherence > maximum_coherence:
                maximum_coherence = local_maximum_coherence
                best_permutation = local_best_permutation
            else:
                no_change = no_change + 1
            window = []
        if no_change > 2:
            break
        index = index + 1
    return best_permutation
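# get_header_synsets and _pick_random_synset_permutation are referenced above
# but not shown. A rough sketch of the permutation picker, assuming
# synsets_pack is a list of (term, [candidate lemma names]) pairs as in
# build_weighted_graph; the original helper may be implemented differently.
import random

def _pick_random_synset_permutation(synsets_pack):
    permutation = []
    for term, candidates in synsets_pack:
        # Fall back to the raw term when WordNet offered no candidates.
        permutation.append(random.choice(candidates) if candidates else term)
    return permutation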
def coherence_v(keyphrases):
    top_words = []
    for phrase in keyphrases:
        clean_phrase = preprocess_text(phrase)
        words = clean_phrase.split(' ')
        top_words.extend(words)
    top_words = list(set(top_words))

    # check topic coherence per phrase
    top_words = keyphrases
    #print(top_words)
    #print('n before : ', len(top_words))
    flag = 0
    while (flag == 0):
        try:
            palmetto = Palmetto()
            score = palmetto.get_coherence(top_words)
            flag = 1
            #print ('n after : ', len(top_words))
        except EndpointDown:
            top_words = top_words[:len(top_words) - 1]
    return score
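# preprocess_text is referenced above but not included in this listing.
# A minimal sketch of a plausible cleaner (lowercase, keep only letters and
# spaces); the original project's version may do more, e.g. stemming or
# stopword removal.
import re

def preprocess_text(phrase):
    phrase = phrase.lower()
    return re.sub(r'[^a-z ]+', ' ', phrase).strip()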
def writeCoherence(topicWords4Epochs, path, mixedFile='', command=[False, False], info=''):
    file = os.path.join(path, '{}.txt'.format(info))
    if command[0]:
        CohWriterCV = SummaryWriter(os.path.join(path, 'runs/coh{}_cv'.format(info)))
        corpus, text, id2word = buildCorpusDict(mixedFile)
    if command[1]:
        CohWriterUmass = SummaryWriter(os.path.join(path, 'runs/coh{}_umass'.format(info)))
        pmt = Palmetto()
    widgets = ['writing {}: '.format(info), Percentage(), ' ', Bar(), ' ', ETA()]
    pbar = ProgressBar(widgets=widgets, maxval=(len(topicWords4Epochs)))
    pbar.start()
    for i, topicWords in enumerate(topicWords4Epochs):
        if command[0]:
            try:
                cm = CoherenceModel(topics=topicWords, corpus=corpus, texts=text,
                                    dictionary=id2word, coherence='c_v')
                coherence = cm.get_coherence()
                CohWriterCV.add_scalar('coherenceCV', coherence, i)
                coherences = cm.get_coherence_per_topic()
            except:
                coherence = 0
                coherences = 0
        else:
            coherence = 0
            coherences = [0 for i in range(len(topicWords))]
        if command[1]:
            try:
                coherences2 = []
                for topic in topicWords:
                    # score each topic's top words with Palmetto (UMass coherence)
                    topic_coherence = pmt.get_coherence(topic, coherence_type='umass')
                    coherences2.append(topic_coherence)
                coherence2 = sum(coherences2) / len(coherences2)
                CohWriterUmass.add_scalar('coherenceUMASS', coherence2, i)
                coherenceList = [coherences, coherence, coherences2, coherence2]
            except:
                coherence2 = 0
                coherences2 = [0 for i in range(len(topicWords))]
        else:
            coherence2 = 0
            coherences2 = 0
        coherenceList = [coherences, coherence, coherences2, coherence2]
        writeTWC(topicWords, coherenceList, file, 'article', i, command)
        pbar.update(i + 1)
    pbar.finish()
    return
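# buildCorpusDict and writeTWC are project helpers that are not part of this
# listing. A rough sketch of what buildCorpusDict likely does, assuming
# mixedFile holds one whitespace-tokenised document per line and that gensim
# is available; the real helper may differ.
from gensim.corpora import Dictionary

def buildCorpusDict(mixedFile):
    with open(mixedFile) as f:
        texts = [line.split() for line in f if line.strip()]
    id2word = Dictionary(texts)
    corpus = [id2word.doc2bow(text) for text in texts]
    return corpus, texts, id2word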
def test_wrong_coherence_type(words):
    palmetto = Palmetto()
    with pytest.raises(CoherenceTypeNotAvailable):
        coherence = palmetto.get_coherence(words, coherence_type="asdf")
import sys

import palmettopy.exceptions
from palmettopy.palmetto import Palmetto

words = [
    "cherry", "pie", "cr_eam", "apple", "orange", "banana", "pineapple",
    "plum", "pig", "cra_cker", "so_und", "kit"
]
palmetto = Palmetto()
try:
    result = palmetto.get_df_for_words(words)
    sys.exit(0)
except palmettopy.exceptions.EndpointDown:
    sys.exit(1)
class topicvecDir: def __init__(self, **kwargs): print kwargs self.palmetto = Palmetto() self.output_folder_path = kwargs.get('output_folder_path', "output/") self.output_file_name = kwargs.get('output_file_name', "results") self.vocab_file = kwargs.get('vocab_file', "top1grams-wiki.txt") self.word_vec_file = kwargs.get('word_vec_file', "25000-500-EM.vec") self.topic_vec_file = kwargs.get('topic_vec_file', None) self.W = kwargs.get('load_embedding_word_count', -1) K = kwargs.get('K', 30) self.max_l = kwargs.get('max_l', 5) self.init_l = kwargs.get('init_l', 1) self.max_grad_norm = kwargs.get('max_grad_norm', 1.0) self.max_grad_norm_fraction = kwargs.get('max_grad_norm_fraction', 0.2) self.grad_scale_Em_base = kwargs.get('grad_scale_Em_base', 0) # number of top words to output into logfile self.topW = kwargs.get('topW', 12) # output the first 'topDim' dimensions of T, for debugging self.topDim = kwargs.get('topDim', 10) self.topTopicMassFracPrintThres = kwargs.get( 'topTopicMassFracPrintThres', 1) # Dirichlet parameter for the null topic self.alpha0 = kwargs.get('alpha0', 5) # Dirichlet parameter for all other topics self.alpha1 = kwargs.get('alpha1', 1) # initial learning rate self.delta = self.iniDelta = kwargs.get('iniDelta', 0.1) self.MAX_EM_ITERS = kwargs.get('MAX_EM_ITERS', 200) self.topicDiff_tolerance = kwargs.get('topicDiff_tolerance', 1e-2) # whether fix topic 0 to null topic self.zero_topic0 = kwargs.get('zero_topic0', True) self.appendLogfile = kwargs.get('appendLogfile', False) self.customStopwords = kwargs.get('customStopwords', "") self.remove_stop = kwargs.get('remove_stop', True) self.seed = kwargs.get('seed', 0) self.verbose = kwargs.get('verbose', 1) # print topics every so many iters self.printTopics_iterNum = kwargs.get('printTopics_iterNum', 20) # compute sum_pi_v is slow. Approximate it by calculating it every few iters to speed up self.calcSum_pi_v_iterNum = kwargs.get('calcSum_pi_v_iterNum', 1) # do V-step every few M-steps to speed up. Default: 1 (each M-step) self.VStep_iterNum = kwargs.get('VStep_iterNum', 1) self.calcLike_iterNum = kwargs.get('calcLike_iterNum', 1) self.useDrdtApprox = kwargs.get('useDrdtApprox', False) self.Mstep_sample_topwords = kwargs.get('Mstep_sample_topwords', 0) self.normalize_vecs = kwargs.get('normalize_vecs', False) self.rebase_vecs = kwargs.get('rebase_vecs', False) self.rebase_norm_thres = kwargs.get('rebase_norm_thres', 0) self.evalKmeans = kwargs.get('evalKmeans', False) self.D = 0 self.docsName = "Uninitialized" self.vocab_dict = loadVocabFile(self.vocab_file) embedding_file_name = self.word_vec_file.rsplit('/', 1)[1] embedding_npyfile = self.output_folder_path + embedding_file_name + '.npy' self.V, self.vocab, self.word2ID, skippedWords_whatever = load_embeddings( self.word_vec_file, self.W) embedding_arrays = np.array( [self.V, self.vocab, self.word2ID, skippedWords_whatever]) np.save(embedding_npyfile, embedding_arrays) # map of word -> id of all words with embeddings vocab_dict2 = {} if self.normalize_vecs: self.V = normalizeF(self.V) # dimensionality of topic/word embeddings self.N0 = self.V.shape[1] # number of all words self.vocab_size = self.V.shape[0] # set unigram probs u2 = [] oovcount = 0 unigram_oov_prior = 0.000001 for wid, w in enumerate(self.vocab): if w not in self.vocab_dict: oovcount += 1 u2.append(unigram_oov_prior) else: # u2.append( self.vocab_dict[w][2] ) u2.append(unigram_oov_prior) vocab_dict2[w] = wid if oovcount > 0: print "%d words in '%s' but not in '%s'. 
Unigram prob set to oov prior %.3g" % ( oovcount, self.word_vec_file, self.vocab_file, unigram_oov_prior) u2 = np.array(u2) self.u = normalize(u2) # structure of vocab_dict changed here. Original vocab_dict is w->[id, freq, unigram_prob] # now vocab_dict is only w->id self.vocab_dict = vocab_dict2 # u2 is the top "Mstep_sample_topwords" words of u, # used for a sampling inference (i.e. only the most # important "Mstep_sample_topwords" words are used) in the M-step # if Mstep_sample_topwords == 0, sampling is disabled if self.Mstep_sample_topwords == 0: self.Mstep_sample_topwords = self.vocab_size self.u2 = self.u self.V2 = self.V else: self.u2 = self.u[:self.Mstep_sample_topwords] self.u2 = normalize(self.u2) self.V2 = self.V[:self.Mstep_sample_topwords] customStopwordList = re.split(r"\s+", self.customStopwords) for stop_w in customStopwordList: stopwordDict[stop_w] = 1 print "Custom stopwords: %s" % (", ".join(customStopwordList)) if 'fileLogger' not in kwargs: self.logfilename = kwargs.get('logfilename', "topicvecDir") self.fileLogger = initFileLogger(self.logfilename, self.appendLogfile) else: self.fileLogger = kwargs['fileLogger'] self.fileLogger.debug("topicvecDir() init at %s", time.ctime()) self.precompute() self.setK(K) self.docs_name = [] self.docs_idx = [] self.docs_wids = [] self.wid2freq = [] self.wids_freq = [] self.expVT = None self.T = self.r = self.sum_pi_v = None self.docs_L = [] self.docs_Pi = [] self.docs_theta = [] self.totalL = 0 self.kmeans_xtoc = self.kmeans_distances = None # current iteration number self.it = 0 def setK(self, K): self.K = K self.alpha = np.array([self.alpha1] * self.K) if self.zero_topic0: self.alpha[0] = self.alpha0 # K rows of Ev # EV: K x N0 if self.useDrdtApprox: self.EV = np.tile(self.Ev, (self.K, 1)) def precompute(self): print "Precompute matrix u_V" # each elem of u multiplies each row of V # Pw_V: Mstep_sample_topwords x N0 self.Pw_V = self.u2[:, None] * self.V2 if self.useDrdtApprox: print "Precompute vector Ev" self.Ev = np.dot(self.u, self.V) print "Precompute matrix Evv...", self.Evv = np.zeros((self.N0, self.N0)) for wid in xrange(self.vocab_size): self.Evv += self.u[wid] * np.outer(self.V[wid], self.V[wid]) print "Done." 
def calcEm(self, docs_Pi): Em = np.zeros(self.K) for d in xrange(len(docs_Pi)): Em += np.sum(docs_Pi[d], axis=0) return Em # this actually computes the variational lowerbound, as an approximation of the (intractable) data log-likelihood def calcLoglikelihood(self): totalLoglike = 0 for d in xrange(self.D): theta = self.docs_theta[d] Pi = self.docs_Pi[d] theta0 = np.sum(theta) entropy = np.sum(gammaln(theta)) - gammaln(theta0) entropy += (theta0 - self.K) * psi(theta0) - np.sum( (theta - 1) * psi(theta)) entropy -= np.sum(Pi * np.log(Pi)) # this Em is not the total Em calculated by calcEm() # Em[k] = sum_j Pi[j][k] Em = np.sum(Pi, axis=0) Em_Ephi = (Em + self.alpha - 1) * (psi(theta) - psi(theta0)) sum_r_pi = np.dot(Em, self.r) loglike = entropy + np.sum(Em_Ephi) + np.trace( np.dot(self.T, self.sum_pi_v.T)) + sum_r_pi totalLoglike += loglike return totalLoglike def updateTheta(self): for d in xrange(self.D): self.docs_theta[d] = np.sum(self.docs_Pi[d], axis=0) + self.alpha def updatePi(self, docs_theta): docs_Pi = [] psiDocs_theta = psi(docs_theta) for d in xrange(self.D): if d % 50 == 49 or d == self.D - 1: print "\r%d" % (d + 1), wids = self.docs_wids[d] L = self.docs_L[d] # faster computation, more memory if L <= 20000: # Vd: L x N0 Vd = self.V[wids] TV = np.dot(Vd, self.T.T) Pi = np.exp(psiDocs_theta[d] + TV + self.r) # slower but avoids using up memory else: Pi = np.zeros((L, self.K)) for i, wid in enumerate(wids): v = self.V[wid] Tv = np.dot(self.T, v) Pi[i] = np.exp(psiDocs_theta[d] + Tv + self.r) Pi = normalize(Pi) docs_Pi.append(Pi) return docs_Pi # T is fed as an argument to provide more flexibility def calcTopicResiduals(self, T): # VT_{i,j} = v_wi' t_j VT = np.dot(self.V2, T.T) # expVT_{i,j} = exp(v_wi' t_j) # used in the computation of drdt # expVT: Mstep_sample_topwords x K self.expVT = np.exp(VT) r = -np.log(np.dot(self.u2, self.expVT)) return r def updateTopicEmbeddings(self): Em = self.calcEm(self.docs_Pi) if self.grad_scale_Em_base > 0 and np.sum( Em) > self.grad_scale_Em_base: grad_scale = self.grad_scale_Em_base / np.sum(Em) else: grad_scale = 1 # Em: 1 x K vector # r: 1 x K vector # Em_exp_r: 1 x K vector Em_exp_r = Em * np.exp(self.r) # d_EwVT_dT: K x N0 d_EwVT_dT = np.dot(self.expVT.T, self.Pw_V) # Em_drdT_exact: N0 x K Em_drdT_exact = d_EwVT_dT.T * Em_exp_r # Em_drdT: K x N0 Em_drdT = Em_drdT_exact.T # dLdT, gradT: K x N0 dLdT = self.sum_pi_v - Em_drdT gradT = dLdT * self.delta * grad_scale gradTNorms = np.linalg.norm(gradT, axis=1) TNorms = np.linalg.norm(self.T, axis=1) TNorms[TNorms < 1e-2] = 1.0 gradTScale = np.ones(self.K) gradFractions = gradTNorms / TNorms for k, fraction in enumerate(gradFractions): if self.max_grad_norm_fraction > 0 and fraction > self.max_grad_norm_fraction: gradTScale[k] = self.max_grad_norm_fraction / fraction if self.max_grad_norm > 0 and TNorms[k] > self.max_grad_norm: gradTScale[k] = min(gradTScale[k], self.max_grad_norm / TNorms[k]) gradT *= gradTScale[:, None] T2 = self.T + gradT maxTStep = np.max(np.linalg.norm(gradT, axis=1)) # self.max_l == 0: do not do normalization if self.max_l > 0: for k in xrange(self.K): # do normalization only if the magnitude > self.max_l if np.linalg.norm(T2[k]) > self.max_l: T2[k] = self.max_l * normalizeF(T2[k]) if self.zero_topic0: T2[0] = np.zeros(self.N0) r2 = self.calcTopicResiduals(T2) topicDiffNorm = np.linalg.norm(self.T - T2) return T2, r2, topicDiffNorm, maxTStep # Pi: L x K # sum_pi_v: K x N0 def calcSum_pi_v(self): self.sum_pi_v = np.zeros((self.K, self.N0)) for d in xrange(self.D): Pi = 
self.docs_Pi[d] wids = self.docs_wids[d] #L = self.docs_L[d] #for i in xrange(L): # self.sum_pi_v += np.outer( Pi[i], self.V[ wids[i] ] ) self.sum_pi_v += np.dot(Pi.T, self.V[wids]) # the returned outputter always output to the log file # screenVerboseThres controls when the generated outputter will output to screen # when self.verbose >= screenVerboseThres, screen output is enabled # in the batch mode for multiple files, typically self.verbose == 0 # then by default no screen output anyway # in the single file mode, typically self.verbose == 1 # then in printTopWordsInTopics(), # outputToScreen == True => screenVerboseThres == 1 # with screen output # outputToScreen == False => screenVerboseThres == 2 # no screen output # in other places by default screenVerboseThres==1, with screen output def genOutputter(self, screenVerboseThres=1): def screen_log_output(s): self.fileLogger.debug(s) if self.verbose >= screenVerboseThres: print s return screen_log_output def genProgressor(self): def screen_log_progress(s): self.fileLogger.debug(s) if self.verbose == 0: print "\r%s \r" % s, else: print s return screen_log_progress # topTopicMassFracPrintThres: when a topic's fraction Em[k]/L > topTopicMassFracPrintThres/K, print it def printTopWordsInTopics(self, docs_theta, outputToScreen=False): wids2 = self.wid2freq.keys() wids_topics_sim = np.dot(normalizeF(self.V[wids2]), normalizeF(self.T).T) wids_topics_dot = np.dot(self.V[wids2], self.T.T) # row ID: de-duplicated id, also the row idx in the # matrices wids_topics_sim and wids_topics_dot wid2rowID = {} for i, wid in enumerate(wids2): wid2rowID[wid] = i # the topic prop of each word, indexed by the row ID row_topicsProp = np.zeros(wids_topics_sim.shape) # word occurrences, indexed bythe row ID row_wordOccur = np.array(self.wid2freq.values()) if self.evalKmeans: Em = np.bincount(self.kmeans_xtoc) else: docs_Pi = self.updatePi(docs_theta) Em = self.calcEm(docs_Pi) # tids is sorted topic IDs from most frequent to least frequent tids = sorted(range(self.K), key=lambda k: Em[k], reverse=True) for i, k in enumerate(tids): # below the average proportion * topTopicMassFracPrintThres if Em[k] < self.topTopicMassFracPrintThres * self.totalL / self.K: break # cut_i is the cut point of tids: tids[:cut_i] will be printed # if i==0, no topic has enough proportion to be printed. # this may happen when topicThres is too big. 
in this case, print the principal topic if i == 0: cut_i = 1 else: cut_i = i for d in xrange(self.D): for i in xrange(self.docs_L[d]): wid = self.docs_wids[d][i] rowID = wid2rowID[wid] if self.evalKmeans: k = self.kmeans_xtoc[rowID] row_topicsProp[rowID][k] += 1 else: row_topicsProp[rowID] += docs_Pi[d][i] # the topic prop of each word, indexed by the row ID # take account of the word freq, but dampen it with sqrt # so that more similar, less frequent words have chance to be selected # doing average does not consider freq, not good either row_topicsDampedProp = row_topicsProp / np.sqrt(row_wordOccur)[:, None] W = len(self.vocab) # number of unique words in the docs W2 = len(wids2) if outputToScreen: out = self.genOutputter(1) else: out = self.genOutputter(2) out("") out("Em:\n%s\n" % Em) out("Topic magnitudes:") topicMagnitudes = np.linalg.norm(self.T, axis=1) out(topicMagnitudes) out("") # selected tids to output selTids = tids[:cut_i] selTids = np.array(selTids) # always output topic 0 # if topic 0 is not in selTids, append it if len(np.where(selTids == 0)[0]) == 0: selTids = np.append(selTids, 0) for k in selTids: out("Topic %d (%.2f): %.1f%%" % (k, np.linalg.norm(self.T[k]), 100 * Em[k] / self.totalL)) rowID_sorted = sorted( range(W2), key=lambda rowID: row_topicsDampedProp[rowID, k], reverse=True) out("Most relevant words:") most_relevant_words = [] line = "" for rowID in rowID_sorted[:self.topW]: wid = wids2[rowID] topicDampedProp = row_topicsDampedProp[rowID, k] topicProp = row_topicsProp[rowID, k] sim = wids_topics_sim[rowID, k] dotprod = wids_topics_dot[rowID, k] most_relevant_words.append(self.vocab[wid]) line += "%s (%d,%d): %.2f/%.2f/%.2f/%.2f " % ( self.vocab[wid], wid, self.wid2freq[wid], topicDampedProp, topicProp, sim, dotprod) out(line) #calculate and print palmetto scores out("topic coherence for relevant words") all_coherence_types = ["ca", "cp", "cv", "npmi", "uci", "umass"] coherence_string = "" # convert sentences into a single bagofwords lmtzr = WordNetLemmatizer() # words = [ word.split('-') for word in most_relevant_words] most_relevant_words = [ lmtzr.lemmatize(w) for w in most_relevant_words if len(w) > 1 if w not in stopwords.words('english') ] print(most_relevant_words) for score_type in all_coherence_types: try: print('trying coherence') coherence = self.palmetto.get_coherence( most_relevant_words, coherence_type=score_type) print('coherence for ' + score_type) print(coherence) except: print("coherence didn't work") coherence = 0 else: coherence_string += "(%s : %.3f) , " % (score_type, coherence) out(coherence_string) if np.linalg.norm(self.T[k]) == 0: continue V_topic_dot = np.dot(self.V2, self.T[k]) V_topic_sim = V_topic_dot / np.linalg.norm( self.V2, axis=1) / np.linalg.norm(self.T[k]) wid_sorted = sorted(xrange(self.Mstep_sample_topwords), key=lambda wid: V_topic_sim[wid], reverse=True) out("Most similar words in vocab:") line = "" for wid in wid_sorted[:self.topW]: sim = V_topic_sim[wid] dotprod = V_topic_dot[wid] line += "%s: %.2f/%.2f " % (self.vocab[wid], sim, dotprod) out(line) out("") def docSentences2wids(self, docs_wordsInSentences): docs_wids = [] docs_idx = [] countedWC = 0 outvocWC = 0 stopwordWC = 0 wid2freq = {} wids_freq = np.zeros(self.vocab_size) for d, wordsInSentences in enumerate(docs_wordsInSentences): wids = [] for sentence in wordsInSentences: for w in sentence: if self.remove_stop and w in stopwordDict: stopwordWC += 1 continue if w in self.vocab_dict: wid = self.vocab_dict[w] wids.append(wid) wids_freq[wid] += 1 if wid not in 
wid2freq: wid2freq[wid] = 1 else: wid2freq[wid] += 1 countedWC += 1 else: outvocWC += 1 # skip empty documents if len(wids) > 0: docs_wids.append(wids) docs_idx.append(d) # out0 prints both to screen and to log file, regardless of the verbose level out0 = self.genOutputter(0) out1 = self.genOutputter(1) out0( "%d docs scanned, %d kept. %d words kept, %d unique. %d stop words, %d out voc" % (len(docs_wordsInSentences), len(docs_idx), countedWC, len(wid2freq), stopwordWC, outvocWC)) wid_freqs = sorted(wid2freq.items(), key=lambda kv: kv[1], reverse=True) out1("Top words:") line = "" for wid, freq in wid_freqs[:30]: line += "%s(%d): %d " % (self.vocab[wid], wid, freq) out1(line) return docs_idx, docs_wids, wid2freq, wids_freq def setDocs(self, docs_wordsInSentences, docs_name): self.totalL = 0 self.docs_L = [] self.docs_name = [] self.docs_idx, self.docs_wids, self.wid2freq, self.wids_freq = \ self.docSentences2wids(docs_wordsInSentences) for doc_idx in self.docs_idx: self.docs_name.append(docs_name[doc_idx]) for wids in self.docs_wids: self.docs_L.append(len(wids)) self.totalL = sum(self.docs_L) avgV = np.zeros(self.N0) sum_freq = 0 for wid, freq in self.wid2freq.iteritems(): avgV += self.V[wid] * freq sum_freq += freq avgV /= sum_freq norm_avgV = np.linalg.norm(avgV) print "Norm of avg vector: %.2f" % norm_avgV if self.rebase_vecs and norm_avgV >= self.rebase_norm_thres: self.V -= avgV # update the precomputed matrices/vectors self.precompute() # if self.useLocalU: # self.local_u = self.wids_freq / self.totalL # assert abs( np.sum(self.local_u) - 1 ) < 1e-5, \ # "Local unigram empirical prob vector local_u wrongly normalized: sum=%.3f != 1" %np.sum(self.local_u) self.D = len(self.docs_name) if self.D == 0: print "WARN: Document set is empty after preprocessing." if self.D == 1: self.docsName = "'%s'" % (docs_name[0]) else: self.docsName = "'%s'...(%d docs)" % (docs_name[0], self.D) return self.docs_idx def kmeans(self, maxiter=10): """ centers, Xtocentre, distances = topicvec.kmeans( ... ) in: X: M x N0 centers K x N0: initial centers, e.g. random.sample( X, K ) iterate until the change of the average distance to centers is within topicDiff_tolerance of the previous average distance maxiter metric: cosine self.verbose: 0 silent, 2 prints running distances out: centers, K x N0 Xtocentre: each X -> its nearest center, ints M -> K distances, M """ wids2 = self.wid2freq.keys() weights = np.array(self.wid2freq.values()) X = normalizeF(self.V[wids2]) centers = randomsample(X, self.K) if self.verbose: print "kmeans: X %s centers %s tolerance=%.2g maxiter=%d" % ( X.shape, centers.shape, self.topicDiff_tolerance, maxiter) M = X.shape[0] allx = np.arange(M) prevdist = 0 for jiter in range(1, maxiter + 1): D = cdist(X, centers, metric='cosine') # |X| x |centers| xtoc = D.argmin(axis=1) # X -> nearest center distances = D[allx, xtoc] #avdist = distances.mean() # median ? 
avdist = (distances * weights).sum() / weights.sum() if self.verbose >= 2: print "kmeans: av |X - nearest center| = %.4g" % avdist if (1 - self.topicDiff_tolerance) * prevdist <= avdist <= prevdist \ or jiter == maxiter: break prevdist = avdist for jc in range(self.K): # (1 pass in C) c = np.where(xtoc == jc)[0] if len(c) > 0: centers[jc] = (X[c] * weights[c, None]).mean(axis=0) if self.verbose: print "kmeans: %d iterations cluster sizes:" % jiter, np.bincount( xtoc) if self.verbose >= 2: r50 = np.zeros(self.K) r90 = np.zeros(self.K) for j in range(self.K): dist = distances[xtoc == j] if len(dist) > 0: r50[j], r90[j] = np.percentile(dist, (50, 90)) print "kmeans: cluster 50% radius", r50.astype(int) print "kmeans: cluster 90% radius", r90.astype(int) self.T = centers self.kmeans_xtoc = xtoc self.kmeans_distances = distances def inferTopicProps(self, T, MAX_ITERS=5): self.T = T self.r = self.calcTopicResiduals(T) # uniform prior self.docs_theta = np.ones((self.D, self.K)) loglike = 0 for i in xrange(MAX_ITERS): iterStartTime = time.time() print('1') docs_Pi2 = self.docs_Pi self.docs_Pi = self.updatePi(self.docs_theta) self.updateTheta() self.calcSum_pi_v() if i > 0: docs_Pi_diff = np.zeros(self.D) for d in xrange(self.D): docs_Pi_diff[d] = np.linalg.norm(self.docs_Pi[d] - docs_Pi2[d]) max_Pi_diff = np.max(docs_Pi_diff) total_Pi_diff = np.sum(docs_Pi_diff) else: max_Pi_diff = 0 total_Pi_diff = 0 print('2') iterDur = time.time() - iterStartTime loglike = self.calcLoglikelihood() print('3') print "Iter %d loglike %.2f, Pi diff total %.3f, max %.3f. %.1fs" % ( i, loglike, total_Pi_diff, max_Pi_diff, iterDur) docs_Em = np.zeros((self.D, self.K)) for d, Pi in enumerate(self.docs_Pi): docs_Em[d] = np.sum(Pi, axis=0) return docs_Em, self.docs_Pi def inference(self): if self.D == 0: print "document set is empty or uninitialized" return None, None, None, None startTime = time.time() startTimeStr = timeToStr(startTime) # out0 prints both to screen and to log file, regardless of the verbose level out0 = self.genOutputter(0) out1 = self.genOutputter(1) out0("%d topics." 
% (self.K)) out0("%s inference starts at %s" % (self.docsName, startTimeStr)) self.T = np.zeros((self.K, self.N0)) if self.seed != 0: np.random.seed(self.seed) out0("Seed: %d" % self.seed) for k in xrange(0, self.K): self.T[k] = np.random.randn(self.N0) if self.init_l > 0: self.T[k] = self.init_l * normalizeF(self.T[k]) if self.zero_topic0: self.T[0] = np.zeros(self.N0) # sum_v = np.zeros(N0) # for wid in wids: # sum_v += V[wid] # # T[0] = self.max_l * normalizeF(sum_v) #self.fileLogger.debug("avg_v:") #self.fileLogger.debug(T[0]) self.r = self.calcTopicResiduals(self.T) # initialized as uniform over topics self.docs_theta = np.ones((self.D, self.K)) lastIterEndTime = time.time() print "Initial learning rate: %.2f" % (self.iniDelta) self.docs_Pi = self.updatePi(self.docs_theta) self.updateTheta() self.calcSum_pi_v() loglike = self.calcLoglikelihood() self.it = 0 iterDur = time.time() - lastIterEndTime lastIterEndTime = time.time() print "Iter %d: loglike %.2f, %.1fs" % (self.it, loglike, iterDur) # an arbitrary number to satisfy pylint topicDiffNorm = 100000 unif_docs_theta = np.ones((self.D, self.K)) Ts_loglikes = [] while self.it == 0 or (self.it < self.MAX_EM_ITERS and topicDiffNorm > self.topicDiff_tolerance): self.it += 1 self.fileLogger.debug("EM Iter %d:", self.it) self.delta = self.iniDelta / (self.it + 1) # T, r not updated inside updateTopicEmbeddings() # because sometimes we want to keep the original T, r self.T, self.r, topicDiffNorm, maxTStep = self.updateTopicEmbeddings( ) if self.it % self.VStep_iterNum == 0: # does it matter to swap updatePi() & updateTheta()? self.docs_Pi = self.updatePi(self.docs_theta) self.updateTheta() # calcSum_pi_v() takes a long time on a large corpus # so it can be done once every a few iters, with slight loss of performance # on 20news and reuters, calcSum_pi_v() is fast enough and this acceleration is unnecessary if self.it <= 5 or self.it == self.MAX_EM_ITERS or self.it % self.calcSum_pi_v_iterNum == 0: self.calcSum_pi_v() loglike = self.calcLoglikelihood() iterDur = time.time() - lastIterEndTime lastIterEndTime = time.time() iterStatusMsg = "Iter %d: loglike %.2f, topicDiffNorm %.4f, maxTStep %.3f, %.1fs" % ( self.it, loglike, topicDiffNorm, maxTStep, iterDur) if self.it % self.printTopics_iterNum == 0: out0(iterStatusMsg) if self.verbose >= 2: self.fileLogger.debug("T[:,%d]:", self.topDim) self.fileLogger.debug(self.T[:, :self.topDim]) self.fileLogger.debug("r:") self.fileLogger.debug(self.r) self.printTopWordsInTopics(self.docs_theta, False) else: # not using out0 because the "\r" in the console output shouldn't be in the log file print "%s \r" % iterStatusMsg, self.fileLogger.debug(iterStatusMsg) Em = self.calcEm(self.docs_Pi) self.fileLogger.debug("Em:\n%s\n", Em) Ts_loglikes.append([self.it, self.T, loglike]) if self.verbose >= 1: # if == 0, topics has just been printed in the while loop if self.it % self.printTopics_iterNum != 0: #self.printTopWordsInTopics(unif_docs_theta, False) self.printTopWordsInTopics(self.docs_theta, False) endTime = time.time() endTimeStr = timeToStr(endTime) inferDur = int(endTime - startTime) print out0("%s inference ends at %s. %d iters, %d seconds." 
% (self.docsName, endTimeStr, self.it, inferDur)) # Em: the global (all documents) distribution of topic mass Em = self.calcEm(self.docs_Pi) # docs_Em: the document-wise distribution of topic mass docs_Em = np.zeros((self.D, self.K)) for d, Pi in enumerate(self.docs_Pi): docs_Em[d] = np.sum(Pi, axis=0) # sort according to loglike Ts_loglikes_sorted = sorted(Ts_loglikes, key=lambda T_loglike: T_loglike[2], reverse=True) # best T could be the last T. # In that case, the two elements in best_last_Ts are the same best_last_Ts = [Ts_loglikes_sorted[0], Ts_loglikes[-1]] return best_last_Ts, Em, docs_Em, self.docs_Pi
def __init__(self, models, dirname, num_topics, threaded, email_network, order, A): QtWidgets.QMainWindow.__init__(self) self.setupUi(self) if email_network.frequent_filer: self.i = -9 elif email_network.temporally_sound: self.i = -6 else: self.i = -3 self.all_models = models self.dirname = dirname self.num_topics = num_topics self.threaded = threaded self.email_network = email_network self.order = order self.wordspertopic = 10 self.states = [0] * 3 self.user_phase = True self.mapping = [0, 1, 2] self.selected_items = [[]] * 3 self.headings = [""] * 3 self.added_words = [""] * 3 for model in self.all_models: semantic.umass_coherence(dirname, model.topic_tuples, model, A=A, numwords=10) orders = self.select_topics() # rep1, rep2, rep3 = self.topics_order() # self.rep_order = [rep1, rep2, rep3] self.rep_order = orders for idx, model in enumerate(self.all_models): # noinspection PyUnresolvedReferences model.representative_topic_tuples = [model.topic_tuples[index] for index in self.rep_order[idx]] model.representative_topics_umass_pmi = [model.representative_topics_umass_pmi[index] for index in self.rep_order[idx]] # semantic.umass_coherence(dirname, model.representative_topic_tuples, model, numwords=10) semantic.w2v_coherences(model, email_network.wordvec_dict, num_topics) palmetto = Palmetto() semantic.other_coherences(palmetto, self.all_models, self.num_topics, numwords=10) self.listWidgets = [self.listWidget_1, self.listWidget_2, self.listWidget_3] self.showNext10PushButtons = [self.showNext10PushButton_1, self.showNext10PushButton_2, self.showNext10PushButton_3] self.showPrevious10PushButtons = [self.showPrevious10PushButton_1, self.showPrevious10PushButton_2, self.showPrevious10PushButton_3] self.buttonGroups = [self.buttonGroup_1, self.buttonGroup_2, self.buttonGroup_3] self.buttonGroupsThematic = [self.buttonGroup_4, self.buttonGroup_5, self.buttonGroup_6] self.allButtonGroups = self.buttonGroups + self.buttonGroupsThematic self.comboBoxes = [self.comboBox_1, self.comboBox_2, self.comboBox_3] path = os.path.dirname(os.path.abspath(__file__)) + "/resources/icon.png" self.setWindowIcon(QtGui.QIcon(path)) self.fontSpinBox.valueChanged.connect(self.change_font_size) path = os.path.dirname(os.path.abspath(__file__)) + "/resources/bkdimage.jpg" try: self.bkdLabel.setPixmap(QtGui.QPixmap(path)) self.bkdLabel.setScaledContents(True) except AttributeError: pass self.nextCommandLinkButton.clicked.connect(self.show_next_topic) # TODO: Fix this, using lambda in for loop is not working as it takes the final value of i for every button self.showNext10PushButtons[0].clicked.connect(lambda: self.show_next_10(0)) self.showNext10PushButtons[1].clicked.connect(lambda: self.show_next_10(1)) self.showNext10PushButtons[2].clicked.connect(lambda: self.show_next_10(2)) self.showPrevious10PushButtons[0].clicked.connect(lambda: self.show_previous_10(0)) self.showPrevious10PushButtons[1].clicked.connect(lambda: self.show_previous_10(1)) self.showPrevious10PushButtons[2].clicked.connect(lambda: self.show_previous_10(2)) # for buttonGroup in self.buttonGroups: # buttonGroup.buttonClicked.connect(self.groupbutton_clicked) # for buttonGroup in self.buttonGroupsThematic: # buttonGroup.buttonClicked.connect(self.groupbutton_clicked) for buttonGroup in self.allButtonGroups: buttonGroup.buttonClicked.connect(self.groupbutton_clicked) for i in range(len(self.states)): self.update_gui(i)
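# semantic.other_coherences is called above with a Palmetto instance but is
# not part of this listing. A rough sketch of what such a helper might do,
# assuming each model exposes representative_topic_tuples as lists of
# (word, weight) pairs; the attribute used to store the scores is invented
# here and the actual project code may differ.
def other_coherences(palmetto, all_models, num_topics, numwords=10):
    for model in all_models:
        scores = []
        for topic in model.representative_topic_tuples[:num_topics]:
            top_words = [word for word, _ in topic[:numwords]]
            # "cv" is one of Palmetto's supported coherence types.
            scores.append(palmetto.get_coherence(top_words, coherence_type="cv"))
        model.representative_topics_palmetto = scores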
from palmettopy.palmetto import Palmetto
import numpy as np

# palmetto.get_coherence(words, coherence_type="cv")
# The available coherence types are "ca", "cp", "cv", "npmi", "uci", and "umass".
palmetto = Palmetto()

print('ARX LDA Okapi LV')
topics = [[
    'flow', 'simulation', 'velocity', 'fluid', 'dynamic', 'numerical',
    'force', 'particle', 'model', 'plasma'
], [
    'collision', 'production', 'section', '0', 'cross', 'energy', 'decay',
    'gev', 'experiment', 'proton'
], [
    'network', 'model', 'social', 'biological', 'individual', 'population',
    'dynamic', 'human', 'cell', 'activity'
], [
    'algorithm', 'method', 'propose', 'learn', 'performance', 'task', 'image',
    'art', 'network', 'base'
], [
    'equation', 'function', 'solution', 'operator', 'matrix', 'prove',
    'obtain', 'integral', 'space', 'condition'
], [
from palmettopy.palmetto import Palmetto

words = [
    "cherry", "pie", "cr_eam", "apple", "orange", "banana", "pineapple",
    "plum", "pig"
]
#words = ['label', 'type', 'character', 'subject', 'discipline', 'topic', 'national', 'home_page', 'foundation', 'basis', 'foundation_garment', 'initiation']
#words = ['label', 'type', 'character', 'subject', 'discipline', 'topic', 'national', 'home_page', 'foundation', 'basis', 'foundation_garment']
#words = ['label', 'type', 'character', 'subject', 'discipline', 'topic', 'national', 'home_page', 'foundation', 'basis']
palmetto = Palmetto()
#palmetto.get_df_for_words(words)
#print(palmetto.get_coherence_fast(words))
print(palmetto.get_coherence(words))
def test_get_coherence(capsys, words):
    palmetto = Palmetto()
    coherence = palmetto.get_coherence(words)
    assert (coherence == 0.5678879445677241)
def test_get_coherence_fast(capsys, words):
    palmetto = Palmetto()
    coherence = palmetto.get_coherence_fast(words)
    assert (coherence == 1779.6591356383024)
def test_wrong_endpoint(words):
    palmetto = Palmetto("http://example.com/nothinghere/")
    with pytest.raises(EndpointDown):
        coherence = palmetto.get_coherence(words)
    #print(top_words)
    #print('n before : ', len(top_words))
    flag = 0
    while (flag == 0):
        try:
            palmetto = Palmetto()
            score = palmetto.get_coherence(top_words)
            flag = 1
            #print ('n after : ', len(top_words))
        except EndpointDown:
            top_words = top_words[:len(top_words) - 1]
    return score


if __name__ == "__main__":
    palmetto = Palmetto()
    #words = [u'real', u'task', u'algorithm', u'tasks', u'dynamic', u'periodic', u'systems', u'time', u'scheduling', u'problem', u'model']
    words = [u'real', u'task', u'algorithm', u'tasks', u'dynamic']
    words = [
        u'real', u'task', u'algorithm', u'tasks', u'dynamic', u'periodic',
        u'systems'
    ]
    words = [
        u'real', u'task', u'algorithm', u'tasks', u'dynamic', u'periodic',
        u'systems', u'time'
    ]
    words = [
        u'real', u'task', u'algorithm', u'tasks', u'dynamic', u'periodic',
        u'systems', u'time', u'scheduling'
    ]
    words = [
def test_all_coherence_types(words):
    palmetto = Palmetto()
    for coherence_type in palmetto.all_coherence_types:
        palmetto.get_coherence(words, coherence_type=coherence_type)
from palmettopy.palmetto import Palmetto
import numpy as np

# palmetto.get_coherence(words, coherence_type="cv")
# The available coherence types are "ca", "cp", "cv", "npmi", "uci", and "umass".
palmetto = Palmetto()

# LDA TFIDF LV
topics = [[
    'problem', 'algorithm', 'bound', 'graph', 'time', '1', '2',
    'approximation', 'show', 'log'
], [
    'classification', 'learn', 'data', 'learning', 'rule', 'classifier',
    'neural', 'base', 'network', 'discovery'
], [
    'data', 'query', 'database', 'system', 'propose', 'paper', 'performance',
    'base', 'present', 'show'
], [
    'image', 'method', 'model', 'surface', 'base', '3d', 'motion', 'object',
    'present', 'shape'
], [
    'research', 'system', 'information', 'paper', 'technology', 'data',
    'medical', 'application', 'knowledge', 'web'
]]

for words in topics:
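    # The loop body is cut off in this excerpt. A plausible completion,
    # assuming each topic list is simply scored and printed; the coherence
    # type the original script used is not shown here.
    coherence = palmetto.get_coherence(words, coherence_type="cv")
    print(words, coherence)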
def test_wrong_content_type(words):
    palmetto = Palmetto()
    with pytest.raises(WrongContentType):
        palmetto._request_by_service(words, "cv", "bla")
def test_all_content_types(words):
    palmetto = Palmetto()
    for content_type in ["text", "bytes"]:
        palmetto._request_by_service(words, "umass", content_type)
def test_get_df_for_words(words):
    palmetto = Palmetto()
    doc_ids = palmetto.get_df_for_words(words)
    for i in range(0, len(words)):
        assert (doc_ids[i][0] == words[i])
import sys
import pdb
import os
import getopt

from corpusLoader import *
from utils import *
from topicvecDir import topicvecDir
import yaml
from palmettopy.palmetto import Palmetto

palmetto = Palmetto()


def usage():
    print """Usage: modify or create your own yml file to change configurations
example usage >> python topicExp.py config.yml"""


try:
    opts, args = getopt.getopt(sys.argv[1:], "i:t:wso")
    if len(args) == 0:
        raise getopt.GetoptError("Not enough free arguments")
    if len(args) > 1:
        raise getopt.GetoptError("Too many free arguments")
    yml_file_path = args[0]
except getopt.GetoptError, e:
from palmettopy.palmetto import Palmetto

words = [
    "cherry", "pie", "cr_eam", "apple", "orange", "banana", "pineapple",
    "plum", "pig", "cra_cker", "so_und", "kit"
]
#words = ['label', 'type', 'character', 'subject', 'discipline', 'topic', 'national', 'home_page', 'foundation', 'basis', 'foundation_garment', 'initiation']
#words = ['label', 'type', 'character', 'subject', 'discipline', 'topic', 'national', 'home_page', 'foundation', 'basis', 'foundation_garment']
#words = ['label', 'type', 'character', 'subject', 'discipline', 'topic', 'national', 'home_page', 'foundation', 'basis']
palmetto = Palmetto()
palmetto.get_df_for_words(words)
#print(palmetto.get_coherence_fast(words))
#print(palmetto.get_coherence(words))