def test_get_df_for_words_with_no_results(words_no_results):
    """
    Each document-frequency entry must lead with the word it was asked for.

    Known to fail: the original author reports that this test fails and
    attributes it to Palmetto not being able to handle underscores.
    """
    client = Palmetto()
    df_entries = client.get_df_for_words(words_no_results)
    for position, expected_word in enumerate(words_no_results):
        assert df_entries[position][0] == expected_word
def writeCoherence(topicWords4Epochs, path, mixedFile='', command=None, info=''):
    """Compute topic coherence for every epoch and write it out.

    For each entry of ``topicWords4Epochs`` the enabled coherence measures
    are computed, logged as scalars through ``SummaryWriter`` and handed to
    ``writeTWC`` together with the topic words.

    Args:
        topicWords4Epochs: sequence with one entry per epoch; each entry is
            a collection of topics.
        path: directory receiving ``<info>.txt`` and the ``runs/`` folders.
        mixedFile: forwarded to ``buildCorpusDict`` (only when C_V is on).
        command: pair of flags ``[use_c_v, use_umass]``. Defaults to
            ``[False, False]``; a fresh list is created per call so the
            default is never shared between calls.
        info: tag used in the output file names and the progress label.

    Returns:
        None.
    """
    if command is None:
        command = [False, False]

    report_path = os.path.join(path, '{}.txt'.format(info))
    if command[0]:
        CohWriterCV = SummaryWriter(os.path.join(path, 'runs/coh{}_cv'.format(info)))
        corpus, text, id2word = buildCorpusDict(mixedFile)
    if command[1]:
        CohWriterUmass = SummaryWriter(os.path.join(path, 'runs/coh{}_umass'.format(info)))
        pmt = Palmetto()

    widgets = ['writing {}: '.format(info), Percentage(), ' ', Bar(), ' ', ETA()]
    pbar = ProgressBar(widgets=widgets, maxval=len(topicWords4Epochs))
    pbar.start()

    for epoch, topicWords in enumerate(topicWords4Epochs):
        # --- C_V coherence ---
        if command[0]:
            try:
                cm = CoherenceModel(topics=topicWords, corpus=corpus, texts=text,
                                    dictionary=id2word, coherence='c_v')
                coherence = cm.get_coherence()
                CohWriterCV.add_scalar('coherenceCV', coherence, epoch)
                coherences = cm.get_coherence_per_topic()
            except Exception:
                # Best effort: a failed epoch is reported as zero.
                coherence = 0
                coherences = 0
        else:
            coherence = 0
            coherences = [0] * len(topicWords)

        # --- UMass coherence ---
        if command[1]:
            try:
                coherences2 = []
                for topic in topicWords:
                    # A dedicated name keeps the C_V average in `coherence`
                    # from being overwritten by the last per-topic value.
                    # NOTE(review): this calls the Palmetto instance itself;
                    # confirm the class in scope is callable, otherwise the
                    # handler below silently reports zeros.
                    topic_coherence = pmt(topic, coherence_type='umass')
                    coherences2.append(topic_coherence)
                coherence2 = sum(coherences2) / len(coherences2)
                CohWriterUmass.add_scalar('coherenceUMASS', coherence2, epoch)
            except Exception:
                # Best effort: a failed epoch is reported as zeros.
                coherence2 = 0
                coherences2 = [0] * len(topicWords)
        else:
            coherence2 = 0
            coherences2 = 0

        coherenceList = [coherences, coherence, coherences2, coherence2]
        writeTWC(topicWords, coherenceList, report_path, 'article', epoch, command)
        pbar.update(epoch + 1)

    pbar.finish()
    return
def build_weighted_graph(synset_packs):
    """
    Turn synset packs into lists of weighted word pairs.

    Each synset in ``pack[1]`` is replaced IN PLACE by the part of its
    ``name()`` before the first dot. A pack without synsets receives its
    own ``pack[0]`` as the only candidate. Document frequencies for all
    collected words are fetched from Palmetto in one call and passed to
    ``calculate_coherence`` to weight every cross-pack word pair.

    Returns one list per pair of packs, sorted by descending weight:
    [
        [(("item_a", "item_b"), 1.534), (("item_a", "item_c"), 1.1234)],
        [(("item_c", "item_d"), 1.34), (("item_c", "item_f"), 1.24)],
        ...
    ]
    """
    # Collect every term while rewriting the packs in place.
    vocabulary = []
    for pack in synset_packs:
        for slot, synset in enumerate(pack[1]):
            lemma = synset.name().split(".")[0]
            pack[1][slot] = lemma
            vocabulary.append(lemma)
        if len(pack[1]) == 0:
            pack[1].append(pack[0])
            vocabulary.append(pack[0])

    # One request covers the document frequencies of all terms.
    doc_frequencies = dict(Palmetto().get_df_for_words(vocabulary))

    graph = []
    pack_count = len(synset_packs)
    for left in range(pack_count):
        for right in range(left + 1, pack_count):
            weighted_pairs = [
                ((this_word, that_word),
                 calculate_coherence(this_word, that_word, doc_frequencies))
                for that_word in synset_packs[right][1]
                for this_word in synset_packs[left][1]
            ]
            weighted_pairs.sort(key=lambda pair: pair[1], reverse=True)
            graph.append(weighted_pairs)
    return graph
def coherence_v(keyphrases):
    """Return the Palmetto coherence score for ``keyphrases``.

    Coherence is checked per phrase: the phrases are sent to Palmetto as
    they are. (The earlier per-word preprocessing was always discarded
    before the request and has been removed.)

    Whenever the request raises ``EndpointDown`` the last phrase is dropped
    and the request is retried with the shorter list.

    Args:
        keyphrases: sliceable sequence of phrases.

    Returns:
        The value returned by ``Palmetto.get_coherence``.

    Raises:
        EndpointDown: if the request still fails once no phrase is left.
            Previously this case looped forever.
    """
    top_words = keyphrases
    palmetto = Palmetto()
    while True:
        try:
            return palmetto.get_coherence(top_words)
        except EndpointDown:
            if len(top_words) == 0:
                # Nothing left to drop: the endpoint itself is unreachable.
                raise
            top_words = top_words[:-1]
def cluster_header_random(header):
    """
    Search for a coherent synset permutation of the header by random sampling.

    A random permutation is drawn and scored with Palmetto on every round.
    Whenever the round counter is a multiple of the window size, the best
    sample gathered since the last check is compared with the best score
    so far. The search stops after more than two checks without
    improvement, and the best permutation found is returned (an empty list
    if no sample ever scored above 0).
    """
    palmetto = Palmetto()
    synsets_pack = get_header_synsets(header)

    window_size = 3
    samples = []
    top_score = 0
    top_permutation = []
    stalled_checks = 0
    round_no = 0

    while stalled_checks <= 2:
        candidate = _pick_random_synset_permutation(synsets_pack)
        score = palmetto.get_coherence(candidate)
        samples.append((candidate, score))

        if round_no % window_size == 0:
            window_permutation, window_score = max(samples, key=lambda pair: pair[1])
            if window_score > top_score:
                top_score = window_score
                top_permutation = window_permutation
            else:
                stalled_checks += 1
            samples = []

        round_no += 1

    return top_permutation
import sys import pdb import os import getopt from corpusLoader import * from utils import * from topicvecDir import topicvecDir import yaml from palmettopy.palmetto import Palmetto palmetto = Palmetto() def usage(): print """Usage: modify or create your own yml file to change configurations example usage >> python topicExp.py config.yml """ try: opts, args = getopt.getopt(sys.argv[1:], "i:t:wso") if len(args) == 0: raise getopt.GetoptError("Not enough free arguments") if len(args) > 1: raise getopt.GetoptError("Too many free arguments") yml_file_path = args[0] except getopt.GetoptError, e:
def test_get_df_for_words(words):
    """Each document-frequency entry must lead with the word it was asked for."""
    client = Palmetto()
    df_entries = client.get_df_for_words(words)
    for position, expected_word in enumerate(words):
        assert df_entries[position][0] == expected_word
def test_wrong_content_type(words):
    """Requesting the unknown content type "bla" must raise WrongContentType."""
    client = Palmetto()
    with pytest.raises(WrongContentType):
        client._request_by_service(words, "cv", "bla")
def test_all_content_types(words):
    """Smoke test: the "text" and "bytes" content types are both accepted."""
    client = Palmetto()
    for accepted_type in ("text", "bytes"):
        client._request_by_service(words, "umass", accepted_type)
def test_wrong_coherence_type(words):
    """The unknown coherence type "asdf" must raise CoherenceTypeNotAvailable."""
    client = Palmetto()
    with pytest.raises(CoherenceTypeNotAvailable):
        client.get_coherence(words, coherence_type="asdf")
def test_all_coherence_types(words):
    """Smoke test: every type listed in ``all_coherence_types`` can be requested."""
    client = Palmetto()
    for measure in client.all_coherence_types:
        client.get_coherence(words, coherence_type=measure)
def test_wrong_endpoint(words):
    """A client pointed at a bogus URL must raise EndpointDown."""
    client = Palmetto("http://example.com/nothinghere/")
    with pytest.raises(EndpointDown):
        client.get_coherence(words)
def test_get_coherence_fast(capsys, words):
    """Check the value returned by ``get_coherence_fast`` for the ``words`` fixture.

    The score is a float computed from service data, so it is compared
    with ``pytest.approx`` rather than exact equality; a difference in the
    last bits must not fail the test.
    """
    palmetto = Palmetto()
    coherence = palmetto.get_coherence_fast(words)
    assert coherence == pytest.approx(1779.6591356383024)
def test_get_coherence(capsys, words):
    """Check the value returned by ``get_coherence`` for the ``words`` fixture.

    The score is a float returned by the service, so it is compared with
    ``pytest.approx`` rather than exact equality; a difference in the last
    bits must not fail the test.
    """
    palmetto = Palmetto()
    coherence = palmetto.get_coherence(words)
    assert coherence == pytest.approx(0.5678879445677241)
def __init__(self, models, dirname, num_topics, threaded, email_network, order, A):
    """Build the main window, score the models' topics and wire up the widgets.

    Args:
        models: topic models to present; stored as ``self.all_models``.
        dirname: forwarded to ``semantic.umass_coherence``.
        num_topics: forwarded to the ``semantic`` coherence helpers.
        threaded: stored as ``self.threaded``.
        email_network: provides ``frequent_filer``, ``temporally_sound``
            and ``wordvec_dict``.
        order: stored as ``self.order``.
        A: forwarded to ``semantic.umass_coherence``.
    """
    QtWidgets.QMainWindow.__init__(self)
    self.setupUi(self)

    if email_network.frequent_filer:
        self.i = -9
    elif email_network.temporally_sound:
        self.i = -6
    else:
        self.i = -3

    self.all_models = models
    self.dirname = dirname
    self.num_topics = num_topics
    self.threaded = threaded
    self.email_network = email_network
    self.order = order
    self.wordspertopic = 10
    self.states = [0] * 3
    self.user_phase = True
    self.mapping = [0, 1, 2]
    # One independent list per panel. `[[]] * 3` would store the same list
    # object three times, so an append for one panel would show up in all.
    self.selected_items = [[] for _ in range(3)]
    self.headings = [""] * 3
    self.added_words = [""] * 3

    for model in self.all_models:
        semantic.umass_coherence(dirname, model.topic_tuples, model, A=A, numwords=10)
    orders = self.select_topics()
    # rep1, rep2, rep3 = self.topics_order()
    # self.rep_order = [rep1, rep2, rep3]
    self.rep_order = orders
    for idx, model in enumerate(self.all_models):
        # noinspection PyUnresolvedReferences
        model.representative_topic_tuples = [model.topic_tuples[index]
                                             for index in self.rep_order[idx]]
        model.representative_topics_umass_pmi = [model.representative_topics_umass_pmi[index]
                                                 for index in self.rep_order[idx]]
        # semantic.umass_coherence(dirname, model.representative_topic_tuples, model, numwords=10)
        semantic.w2v_coherences(model, email_network.wordvec_dict, num_topics)
    palmetto = Palmetto()
    semantic.other_coherences(palmetto, self.all_models, self.num_topics, numwords=10)

    # Group the three per-panel widgets so they can be addressed by index.
    self.listWidgets = [self.listWidget_1, self.listWidget_2, self.listWidget_3]
    self.showNext10PushButtons = [self.showNext10PushButton_1, self.showNext10PushButton_2,
                                  self.showNext10PushButton_3]
    self.showPrevious10PushButtons = [self.showPrevious10PushButton_1,
                                      self.showPrevious10PushButton_2,
                                      self.showPrevious10PushButton_3]
    self.buttonGroups = [self.buttonGroup_1, self.buttonGroup_2, self.buttonGroup_3]
    self.buttonGroupsThematic = [self.buttonGroup_4, self.buttonGroup_5, self.buttonGroup_6]
    self.allButtonGroups = self.buttonGroups + self.buttonGroupsThematic
    self.comboBoxes = [self.comboBox_1, self.comboBox_2, self.comboBox_3]

    base_dir = os.path.dirname(os.path.abspath(__file__))
    path = base_dir + "/resources/icon.png"
    self.setWindowIcon(QtGui.QIcon(path))
    self.fontSpinBox.valueChanged.connect(self.change_font_size)
    path = base_dir + "/resources/bkdimage.jpg"
    try:
        self.bkdLabel.setPixmap(QtGui.QPixmap(path))
        self.bkdLabel.setScaledContents(True)
    except AttributeError:
        # Tolerated: setting the background image is optional.
        pass

    self.nextCommandLinkButton.clicked.connect(self.show_next_topic)
    # TODO: Fix this, using lambda in for loop is not working as it takes
    # the final value of i for every button
    self.showNext10PushButtons[0].clicked.connect(lambda: self.show_next_10(0))
    self.showNext10PushButtons[1].clicked.connect(lambda: self.show_next_10(1))
    self.showNext10PushButtons[2].clicked.connect(lambda: self.show_next_10(2))
    self.showPrevious10PushButtons[0].clicked.connect(lambda: self.show_previous_10(0))
    self.showPrevious10PushButtons[1].clicked.connect(lambda: self.show_previous_10(1))
    self.showPrevious10PushButtons[2].clicked.connect(lambda: self.show_previous_10(2))

    # for buttonGroup in self.buttonGroups:
    #     buttonGroup.buttonClicked.connect(self.groupbutton_clicked)
    # for buttonGroup in self.buttonGroupsThematic:
    #     buttonGroup.buttonClicked.connect(self.groupbutton_clicked)
    for buttonGroup in self.allButtonGroups:
        buttonGroup.buttonClicked.connect(self.groupbutton_clicked)

    for i in range(len(self.states)):
        self.update_gui(i)
def __init__(self, **kwargs):
    """Configure the model from keyword options and load vocabulary and embeddings.

    Every setting is an optional keyword argument read with ``kwargs.get``;
    the defaults are the second arguments of the calls below. After the
    options are read, the vocabulary file and the word embeddings are
    loaded, the embeddings are saved as ``.npy`` in the output folder,
    unigram probabilities are set up, custom stopwords are registered, a
    file logger is prepared, and ``precompute()`` and ``setK(K)`` are called.

    ``print`` is written with parentheses around a single argument so the
    statements behave the same on Python 2 and parse on Python 3.
    """
    print(kwargs)
    self.palmetto = Palmetto()
    self.output_folder_path = kwargs.get('output_folder_path', "output/")
    self.output_file_name = kwargs.get('output_file_name', "results")
    self.vocab_file = kwargs.get('vocab_file', "top1grams-wiki.txt")
    self.word_vec_file = kwargs.get('word_vec_file', "25000-500-EM.vec")
    self.topic_vec_file = kwargs.get('topic_vec_file', None)
    self.W = kwargs.get('load_embedding_word_count', -1)
    K = kwargs.get('K', 30)

    self.max_l = kwargs.get('max_l', 5)
    self.init_l = kwargs.get('init_l', 1)
    self.max_grad_norm = kwargs.get('max_grad_norm', 1.0)
    self.max_grad_norm_fraction = kwargs.get('max_grad_norm_fraction', 0.2)
    self.grad_scale_Em_base = kwargs.get('grad_scale_Em_base', 0)
    # number of top words to output into logfile
    self.topW = kwargs.get('topW', 12)
    # output the first 'topDim' dimensions of T, for debugging
    self.topDim = kwargs.get('topDim', 10)
    self.topTopicMassFracPrintThres = kwargs.get(
        'topTopicMassFracPrintThres', 1)
    # Dirichlet parameter for the null topic
    self.alpha0 = kwargs.get('alpha0', 5)
    # Dirichlet parameter for all other topics
    self.alpha1 = kwargs.get('alpha1', 1)
    # initial learning rate
    self.delta = self.iniDelta = kwargs.get('iniDelta', 0.1)
    self.MAX_EM_ITERS = kwargs.get('MAX_EM_ITERS', 200)
    self.topicDiff_tolerance = kwargs.get('topicDiff_tolerance', 1e-2)
    # whether fix topic 0 to null topic
    self.zero_topic0 = kwargs.get('zero_topic0', True)
    self.appendLogfile = kwargs.get('appendLogfile', False)
    self.customStopwords = kwargs.get('customStopwords', "")
    self.remove_stop = kwargs.get('remove_stop', True)
    self.seed = kwargs.get('seed', 0)
    self.verbose = kwargs.get('verbose', 1)
    # print topics every so many iters
    self.printTopics_iterNum = kwargs.get('printTopics_iterNum', 20)
    # compute sum_pi_v is slow. Approximate it by calculating it every
    # few iters to speed up
    self.calcSum_pi_v_iterNum = kwargs.get('calcSum_pi_v_iterNum', 1)
    # do V-step every few M-steps to speed up. Default: 1 (each M-step)
    self.VStep_iterNum = kwargs.get('VStep_iterNum', 1)
    self.calcLike_iterNum = kwargs.get('calcLike_iterNum', 1)
    self.useDrdtApprox = kwargs.get('useDrdtApprox', False)
    self.Mstep_sample_topwords = kwargs.get('Mstep_sample_topwords', 0)
    self.normalize_vecs = kwargs.get('normalize_vecs', False)
    self.rebase_vecs = kwargs.get('rebase_vecs', False)
    self.rebase_norm_thres = kwargs.get('rebase_norm_thres', 0)
    self.evalKmeans = kwargs.get('evalKmeans', False)

    self.D = 0
    self.docsName = "Uninitialized"
    self.vocab_dict = loadVocabFile(self.vocab_file)

    # Take the last path component. `[-1]` also handles a path without
    # any '/', such as the default "25000-500-EM.vec", for which `[1]`
    # raised IndexError.
    embedding_file_name = self.word_vec_file.rsplit('/', 1)[-1]
    embedding_npyfile = self.output_folder_path + embedding_file_name + '.npy'
    self.V, self.vocab, self.word2ID, skippedWords_whatever = load_embeddings(
        self.word_vec_file, self.W)
    embedding_arrays = np.array(
        [self.V, self.vocab, self.word2ID, skippedWords_whatever])
    np.save(embedding_npyfile, embedding_arrays)

    # map of word -> id of all words with embeddings
    vocab_dict2 = {}

    if self.normalize_vecs:
        self.V = normalizeF(self.V)

    # dimensionality of topic/word embeddings
    self.N0 = self.V.shape[1]
    # number of all words
    self.vocab_size = self.V.shape[0]

    # set unigram probs
    u2 = []
    oovcount = 0
    unigram_oov_prior = 0.000001
    for wid, w in enumerate(self.vocab):
        if w not in self.vocab_dict:
            oovcount += 1
            u2.append(unigram_oov_prior)
        else:
            # u2.append( self.vocab_dict[w][2] )
            u2.append(unigram_oov_prior)
        vocab_dict2[w] = wid

    if oovcount > 0:
        print("%d words in '%s' but not in '%s'. Unigram prob set to oov prior %.3g" % (
            oovcount, self.word_vec_file, self.vocab_file, unigram_oov_prior))

    u2 = np.array(u2)
    self.u = normalize(u2)
    # structure of vocab_dict changed here. Original vocab_dict is
    # w->[id, freq, unigram_prob]; now vocab_dict is only w->id
    self.vocab_dict = vocab_dict2

    # u2 is the top "Mstep_sample_topwords" words of u,
    # used for a sampling inference (i.e. only the most
    # important "Mstep_sample_topwords" words are used) in the M-step
    # if Mstep_sample_topwords == 0, sampling is disabled
    if self.Mstep_sample_topwords == 0:
        self.Mstep_sample_topwords = self.vocab_size
        self.u2 = self.u
        self.V2 = self.V
    else:
        self.u2 = self.u[:self.Mstep_sample_topwords]
        self.u2 = normalize(self.u2)
        self.V2 = self.V[:self.Mstep_sample_topwords]

    customStopwordList = re.split(r"\s+", self.customStopwords)
    for stop_w in customStopwordList:
        stopwordDict[stop_w] = 1
    print("Custom stopwords: %s" % (", ".join(customStopwordList)))

    # Use the caller's logger when one is supplied, else create one.
    if 'fileLogger' not in kwargs:
        self.logfilename = kwargs.get('logfilename', "topicvecDir")
        self.fileLogger = initFileLogger(self.logfilename, self.appendLogfile)
    else:
        self.fileLogger = kwargs['fileLogger']

    self.fileLogger.debug("topicvecDir() init at %s", time.ctime())
    self.precompute()
    self.setK(K)

    self.docs_name = []
    self.docs_idx = []
    self.docs_wids = []
    self.wid2freq = []
    self.wids_freq = []
    self.expVT = None
    self.T = self.r = self.sum_pi_v = None
    self.docs_L = []
    self.docs_Pi = []
    self.docs_theta = []
    self.totalL = 0
    self.kmeans_xtoc = self.kmeans_distances = None
    # current iteration number
    self.it = 0