Example #1
def test_get_df_for_words_with_no_results(words_no_results):
    """
        This test case fails for some unknown reason

        Fails. Palmetto can not handle underscores.
    """
    palmetto = Palmetto()
    doc_ids = palmetto.get_df_for_words(words_no_results)
    for i, word in enumerate(words_no_results):
        assert doc_ids[i][0] == word
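A minimal workaround sketch for the limitation noted above, assuming the endpoint simply rejects words containing underscores; the helper name and the space substitution are illustrative, not part of palmettopy:

def get_df_for_words_sanitized(palmetto, words):
    # hypothetical helper: map "new_york" -> "new york" before the request
    cleaned = [word.replace("_", " ") for word in words]
    return palmetto.get_df_for_words(cleaned)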
Example #2
def writeCoherence(topicWords4Epochs, path, mixedFile='', command=[False, False], info=''):
    file = os.path.join(path, '{}.txt'.format(info))
    if command[0]:  # c_v coherence via gensim
        CohWriterCV = SummaryWriter(os.path.join(path, 'runs/coh{}_cv'.format(info)))
        corpus, text, id2word = buildCorpusDict(mixedFile)
    if command[1]:  # u_mass coherence via Palmetto
        CohWriterUmass = SummaryWriter(os.path.join(path, 'runs/coh{}_umass'.format(info)))
        pmt = Palmetto()
    widgets = ['writing {}: '.format(info), Percentage(), ' ', Bar(), ' ', ETA()]
    pbar = ProgressBar(widgets=widgets, maxval=len(topicWords4Epochs))
    pbar.start()

    for i, topicWords in enumerate(topicWords4Epochs):
        if command[0]:
            try:
                cm = CoherenceModel(topics=topicWords, corpus=corpus, texts=text,
                                    dictionary=id2word, coherence='c_v')
                coherence = cm.get_coherence()
                CohWriterCV.add_scalar('coherenceCV', coherence, i)
                coherences = cm.get_coherence_per_topic()
            except Exception:
                coherence = 0
                coherences = [0 for _ in range(len(topicWords))]
        else:
            coherence = 0
            coherences = [0 for _ in range(len(topicWords))]
        if command[1]:
            try:
                # score each topic separately, then log the epoch average
                coherences2 = [pmt.get_coherence(topic, coherence_type='umass')
                               for topic in topicWords]
                coherence2 = sum(coherences2) / len(coherences2)
                CohWriterUmass.add_scalar('coherenceUMASS', coherence2, i)
            except Exception:
                coherence2 = 0
                coherences2 = [0 for _ in range(len(topicWords))]
        else:
            coherence2 = 0
            coherences2 = [0 for _ in range(len(topicWords))]
        coherenceList = [coherences, coherence, coherences2, coherence2]
        writeTWC(topicWords, coherenceList, file, 'article', i, command)
        pbar.update(i + 1)
    pbar.finish()
    return
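A hedged usage sketch for writeCoherence: the topic lists, output directory, and corpus file below are made up, and buildCorpusDict/writeTWC are helpers from the same module.

# score two topics over two epochs with both c_v (gensim) and u_mass (Palmetto)
topicWords4Epochs = [
    [['cat', 'dog', 'pet'], ['car', 'road', 'wheel']],   # epoch 0
    [['cat', 'dog', 'vet'], ['car', 'road', 'driver']],  # epoch 1
]
writeCoherence(topicWords4Epochs, 'out', mixedFile='corpus.txt',
               command=[True, True], info='demo')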
Example #3
def build_weighted_graph(synset_packs):
    """
    Build a weighted graph out of synset packs.

    Return a list of edge lists, each sorted by weight, such as:
    [
        [((item_a, item_b), 1.534), ((item_a, item_c), 1.1234)],
        [((item_c, item_d), 1.34), ((item_c, item_f), 1.24)],
        ...
    ]
    """
    # collect all terms, replacing each synset with its lemma name;
    # fall back to the original word when a pack has no synsets
    words = []
    for i in range(0, len(synset_packs)):
        for j in range(0, len(synset_packs[i][1])):
            synset_packs[i][1][j] = synset_packs[i][1][j].name().split(".")[0]
            words.append(synset_packs[i][1][j])
        if len(synset_packs[i][1]) == 0:
            synset_packs[i][1].append(synset_packs[i][0])
            words.append(synset_packs[i][0])

    # get all document frequencies in one request
    palmetto = Palmetto()
    doc_id_tuples = palmetto.get_df_for_words(words)
    doc_id_tuples_dict = dict(doc_id_tuples)

    # weight every cross-pack word pair by its coherence
    edges = []
    for i in range(0, len(synset_packs)):
        for j in range(i + 1, len(synset_packs)):
            edge = []
            for that_word in synset_packs[j][1]:
                for this_word in synset_packs[i][1]:
                    edge_item = ((this_word, that_word),
                                 calculate_coherence(this_word, that_word,
                                                     doc_id_tuples_dict))
                    edge.append(edge_item)
            edges.append(sorted(edge, key=lambda x: x[1], reverse=True))
    return edges
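A sketch of how build_weighted_graph might be called, assuming each pack pairs a header word with its WordNet synsets (the nltk import and pack layout are inferred from the loop above; calculate_coherence comes from the same module):

from nltk.corpus import wordnet as wn

# [word, synsets] packs; note the function mutates the packs in place,
# replacing each synset with its lemma name
synset_packs = [["dog", wn.synsets("dog")[:2]],
                ["cat", wn.synsets("cat")[:2]]]
edges = build_weighted_graph(synset_packs)
# edges[0]: the (pair, weight) tuples for the dog/cat packs, sorted by weight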
Example #4
def coherence_v(keyphrases):
    top_words = []
    for phrase in keyphrases:
        clean_phrase = preprocess_text(phrase)
        words = clean_phrase.split(' ')
        top_words.extend(words)

    top_words = list(set(top_words))

    # check topic coherence per phrase:
    # override the token set and score the raw keyphrases instead
    top_words = keyphrases

    # if the endpoint rejects the request, retry with one fewer word
    flag = 0
    while flag == 0:
        try:
            palmetto = Palmetto()
            score = palmetto.get_coherence(top_words)
            flag = 1
        except EndpointDown:
            top_words = top_words[:len(top_words) - 1]
    return score
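A hedged call example; preprocess_text is a helper from the same module and the phrases are invented:

keyphrases = ["machine learning", "neural network", "deep learning"]
score = coherence_v(keyphrases)  # C_V coherence of the phrase set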
Example #5
def cluster_header_random(header):
    """
    Cluster synsets using palmetto.

    Randomly select a permutation of the header and
    calculate its coherence. Repeat until the search converges.
    """
    palmetto = Palmetto()
    synsets_pack = get_header_synsets(header)

    window_size = 3
    window = []
    maximum_coherence = 0
    index = 0
    no_change = 0
    best_permutation = []
    while True:
        random_permutation = _pick_random_synset_permutation(synsets_pack)
        coherence = palmetto.get_coherence(random_permutation)
        window.append((random_permutation, coherence))

        if index % window_size == 0:
            (local_best_permutation,
             local_maximum_coherence) = max(window, key=lambda x: x[1])
            if local_maximum_coherence > maximum_coherence:
                maximum_coherence = local_maximum_coherence
                best_permutation = local_best_permutation
            else:
                no_change = no_change + 1
            window = []

        if no_change > 2:
            break

        index = index + 1
    return best_permutation
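A usage sketch, assuming get_header_synsets and _pick_random_synset_permutation are module-level helpers and the header is a list of column names:

header = ["country", "population", "area"]
best = cluster_header_random(header)
# best: the sampled permutation with the highest coherence observed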
Example #6
import sys
import pdb
import os
import getopt
from corpusLoader import *
from utils import *
from topicvecDir import topicvecDir
import yaml


from palmettopy.palmetto import Palmetto
palmetto = Palmetto()


def usage():
    print("""Usage: modify or create your own yml file to change configurations
        example usage >> python topicExp.py config.yml
    """)



try:
    opts, args = getopt.getopt(sys.argv[1:], "i:t:wso")

    if len(args) == 0:
        raise getopt.GetoptError("Not enough free arguments")
    if len(args) > 1:
        raise getopt.GetoptError("Too many free arguments")
    yml_file_path = args[0]

except getopt.GetoptError as e:
Example #7
def test_get_df_for_words(words):
    palmetto = Palmetto()
    doc_ids = palmetto.get_df_for_words(words)
    for i, word in enumerate(words):
        assert doc_ids[i][0] == word
Example #8
def test_wrong_content_type(words):
    palmetto = Palmetto()
    with pytest.raises(WrongContentType):
        palmetto._request_by_service(words, "cv", "bla")
Example #9
def test_all_content_types(words):
    palmetto = Palmetto()
    for content_type in ["text", "bytes"]:
        palmetto._request_by_service(words, "umass", content_type)
Example #10
def test_wrong_coherence_type(words):
    palmetto = Palmetto()
    with pytest.raises(CoherenceTypeNotAvailable):
        coherence = palmetto.get_coherence(words, coherence_type="asdf")
Example #11
def test_all_coherence_types(words):
    palmetto = Palmetto()
    for coherence_type in palmetto.all_coherence_types:
        palmetto.get_coherence(words, coherence_type=coherence_type)
Example #12
def test_wrong_endpoint(words):
    palmetto = Palmetto("http://example.com/nothinghere/")
    with pytest.raises(EndpointDown):
        coherence = palmetto.get_coherence(words)
Example #13
def test_get_coherence_fast(capsys, words):
    palmetto = Palmetto()
    coherence = palmetto.get_coherence_fast(words)
    assert (coherence == 1779.6591356383024)
Example #14
def test_get_coherence(capsys, words):
    palmetto = Palmetto()
    coherence = palmetto.get_coherence(words)
    assert (coherence == 0.5678879445677241)
Example #15
    def __init__(self, models, dirname, num_topics, threaded, email_network, order, A):

        QtWidgets.QMainWindow.__init__(self)
        self.setupUi(self)

        if email_network.frequent_filer:
            self.i = -9
        elif email_network.temporally_sound:
            self.i = -6
        else:
            self.i = -3
        self.all_models = models
        self.dirname = dirname
        self.num_topics = num_topics
        self.threaded = threaded
        self.email_network = email_network
        self.order = order
        self.wordspertopic = 10
        self.states = [0] * 3
        self.user_phase = True
        self.mapping = [0, 1, 2]
        self.selected_items = [[] for _ in range(3)]  # three independent lists, not one shared
        self.headings = [""] * 3
        self.added_words = [""] * 3
        for model in self.all_models:
            semantic.umass_coherence(dirname, model.topic_tuples, model, A=A, numwords=10)
        orders = self.select_topics()
        # rep1, rep2, rep3 = self.topics_order()
        # self.rep_order = [rep1, rep2, rep3]
        self.rep_order = orders
        for idx, model in enumerate(self.all_models):
            # noinspection PyUnresolvedReferences
            model.representative_topic_tuples = [model.topic_tuples[index] for index in self.rep_order[idx]]
            model.representative_topics_umass_pmi = [model.representative_topics_umass_pmi[index]
                                                     for index in self.rep_order[idx]]
            # semantic.umass_coherence(dirname, model.representative_topic_tuples, model, numwords=10)
            semantic.w2v_coherences(model, email_network.wordvec_dict, num_topics)
        palmetto = Palmetto()
        semantic.other_coherences(palmetto, self.all_models, self.num_topics, numwords=10)

        self.listWidgets = [self.listWidget_1, self.listWidget_2, self.listWidget_3]
        self.showNext10PushButtons = [self.showNext10PushButton_1, self.showNext10PushButton_2,
                                      self.showNext10PushButton_3]
        self.showPrevious10PushButtons = [self.showPrevious10PushButton_1, self.showPrevious10PushButton_2,
                                          self.showPrevious10PushButton_3]

        self.buttonGroups = [self.buttonGroup_1, self.buttonGroup_2, self.buttonGroup_3]
        self.buttonGroupsThematic = [self.buttonGroup_4, self.buttonGroup_5, self.buttonGroup_6]
        self.allButtonGroups = self.buttonGroups + self.buttonGroupsThematic

        self.comboBoxes = [self.comboBox_1, self.comboBox_2, self.comboBox_3]

        path = os.path.dirname(os.path.abspath(__file__)) + "/resources/icon.png"
        self.setWindowIcon(QtGui.QIcon(path))

        self.fontSpinBox.valueChanged.connect(self.change_font_size)

        path = os.path.dirname(os.path.abspath(__file__)) + "/resources/bkdimage.jpg"
        try:
            self.bkdLabel.setPixmap(QtGui.QPixmap(path))
            self.bkdLabel.setScaledContents(True)
        except AttributeError:
            pass

        self.nextCommandLinkButton.clicked.connect(self.show_next_topic)

        # TODO: connecting these in a for loop with a plain lambda does not work,
        # because the lambda captures the loop variable and every button ends up
        # with its final value; binding it via a default argument
        # (lambda _, k=k: self.show_next_10(k)) is the usual fix.
        self.showNext10PushButtons[0].clicked.connect(lambda: self.show_next_10(0))
        self.showNext10PushButtons[1].clicked.connect(lambda: self.show_next_10(1))
        self.showNext10PushButtons[2].clicked.connect(lambda: self.show_next_10(2))
        self.showPrevious10PushButtons[0].clicked.connect(lambda: self.show_previous_10(0))
        self.showPrevious10PushButtons[1].clicked.connect(lambda: self.show_previous_10(1))
        self.showPrevious10PushButtons[2].clicked.connect(lambda: self.show_previous_10(2))
        # for buttonGroup in self.buttonGroups:
        #     buttonGroup.buttonClicked.connect(self.groupbutton_clicked)
        # for buttonGroup in self.buttonGroupsThematic:
        #     buttonGroup.buttonClicked.connect(self.groupbutton_clicked)
        for buttonGroup in self.allButtonGroups:
            buttonGroup.buttonClicked.connect(self.groupbutton_clicked)
        for i in range(len(self.states)):
            self.update_gui(i)
Example #16
    def __init__(self, **kwargs):

        print(kwargs)
        self.palmetto = Palmetto()
        self.output_folder_path = kwargs.get('output_folder_path', "output/")
        self.output_file_name = kwargs.get('output_file_name', "results")
        self.vocab_file = kwargs.get('vocab_file', "top1grams-wiki.txt")
        self.word_vec_file = kwargs.get('word_vec_file', "25000-500-EM.vec")
        self.topic_vec_file = kwargs.get('topic_vec_file', None)
        self.W = kwargs.get('load_embedding_word_count', -1)
        K = kwargs.get('K', 30)

        self.max_l = kwargs.get('max_l', 5)
        self.init_l = kwargs.get('init_l', 1)
        self.max_grad_norm = kwargs.get('max_grad_norm', 1.0)
        self.max_grad_norm_fraction = kwargs.get('max_grad_norm_fraction', 0.2)
        self.grad_scale_Em_base = kwargs.get('grad_scale_Em_base', 0)
        # number of top words to output into logfile
        self.topW = kwargs.get('topW', 12)
        # output the first 'topDim' dimensions of T, for debugging
        self.topDim = kwargs.get('topDim', 10)
        self.topTopicMassFracPrintThres = kwargs.get(
            'topTopicMassFracPrintThres', 1)
        # Dirichlet parameter for the null topic
        self.alpha0 = kwargs.get('alpha0', 5)
        # Dirichlet parameter for all other topics
        self.alpha1 = kwargs.get('alpha1', 1)
        # initial learning rate
        self.delta = self.iniDelta = kwargs.get('iniDelta', 0.1)
        self.MAX_EM_ITERS = kwargs.get('MAX_EM_ITERS', 200)
        self.topicDiff_tolerance = kwargs.get('topicDiff_tolerance', 1e-2)
        # whether fix topic 0 to null topic
        self.zero_topic0 = kwargs.get('zero_topic0', True)
        self.appendLogfile = kwargs.get('appendLogfile', False)
        self.customStopwords = kwargs.get('customStopwords', "")
        self.remove_stop = kwargs.get('remove_stop', True)
        self.seed = kwargs.get('seed', 0)
        self.verbose = kwargs.get('verbose', 1)
        # print topics every so many iters
        self.printTopics_iterNum = kwargs.get('printTopics_iterNum', 20)
        # compute sum_pi_v is slow. Approximate it by calculating it every few iters to speed up
        self.calcSum_pi_v_iterNum = kwargs.get('calcSum_pi_v_iterNum', 1)
        # do V-step every few M-steps to speed up. Default: 1 (each M-step)
        self.VStep_iterNum = kwargs.get('VStep_iterNum', 1)
        self.calcLike_iterNum = kwargs.get('calcLike_iterNum', 1)

        self.useDrdtApprox = kwargs.get('useDrdtApprox', False)
        self.Mstep_sample_topwords = kwargs.get('Mstep_sample_topwords', 0)
        self.normalize_vecs = kwargs.get('normalize_vecs', False)
        self.rebase_vecs = kwargs.get('rebase_vecs', False)
        self.rebase_norm_thres = kwargs.get('rebase_norm_thres', 0)
        self.evalKmeans = kwargs.get('evalKmeans', False)

        self.D = 0
        self.docsName = "Uninitialized"
        self.vocab_dict = loadVocabFile(self.vocab_file)

        embedding_file_name = self.word_vec_file.rsplit('/', 1)[1]

        embedding_npyfile = self.output_folder_path + embedding_file_name + '.npy'

        self.V, self.vocab, self.word2ID, skippedWords_whatever = load_embeddings(
            self.word_vec_file, self.W)
        embedding_arrays = np.array(
            [self.V, self.vocab, self.word2ID, skippedWords_whatever])
        np.save(embedding_npyfile, embedding_arrays)

        # map of word -> id of all words with embeddings
        vocab_dict2 = {}

        if self.normalize_vecs:
            self.V = normalizeF(self.V)

        # dimensionality of topic/word embeddings
        self.N0 = self.V.shape[1]
        # number of all words
        self.vocab_size = self.V.shape[0]

        # set unigram probs
        u2 = []
        oovcount = 0
        unigram_oov_prior = 0.000001
        for wid, w in enumerate(self.vocab):
            if w not in self.vocab_dict:
                oovcount += 1
                u2.append(unigram_oov_prior)
            else:
                # u2.append( self.vocab_dict[w][2] )
                u2.append(unigram_oov_prior)
                vocab_dict2[w] = wid

        if oovcount > 0:
            print "%d words in '%s' but not in '%s'. Unigram prob set to oov prior %.3g" % (
                oovcount, self.word_vec_file, self.vocab_file,
                unigram_oov_prior)

        u2 = np.array(u2)
        self.u = normalize(u2)
        # structure of vocab_dict changed here. Original vocab_dict is w->[id, freq, unigram_prob]
        # now vocab_dict is only w->id
        self.vocab_dict = vocab_dict2
        # u2 is the top "Mstep_sample_topwords" words of u,
        # used for a sampling inference (i.e. only the most
        # important "Mstep_sample_topwords" words are used) in the M-step
        # if Mstep_sample_topwords == 0, sampling is disabled
        if self.Mstep_sample_topwords == 0:
            self.Mstep_sample_topwords = self.vocab_size
            self.u2 = self.u
            self.V2 = self.V
        else:
            self.u2 = self.u[:self.Mstep_sample_topwords]
            self.u2 = normalize(self.u2)
            self.V2 = self.V[:self.Mstep_sample_topwords]

        customStopwordList = re.split(r"\s+", self.customStopwords)
        for stop_w in customStopwordList:
            stopwordDict[stop_w] = 1
        print "Custom stopwords: %s" % (", ".join(customStopwordList))

        if 'fileLogger' not in kwargs:
            self.logfilename = kwargs.get('logfilename', "topicvecDir")
            self.fileLogger = initFileLogger(self.logfilename,
                                             self.appendLogfile)
        else:
            self.fileLogger = kwargs['fileLogger']

        self.fileLogger.debug("topicvecDir() init at %s", time.ctime())
        self.precompute()
        self.setK(K)

        self.docs_name = []
        self.docs_idx = []
        self.docs_wids = []
        self.wid2freq = []
        self.wids_freq = []
        self.expVT = None
        self.T = self.r = self.sum_pi_v = None
        self.docs_L = []
        self.docs_Pi = []
        self.docs_theta = []
        self.totalL = 0
        self.kmeans_xtoc = self.kmeans_distances = None
        # current iteration number
        self.it = 0
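A hedged construction sketch, assuming this __init__ belongs to the topicvecDir class imported in Example #6: only K, the two resource files, and the log file name are passed, and every other kwarg keeps the default listed above.

topicvec = topicvecDir(
    K=30,                                 # number of topics
    vocab_file="top1grams-wiki.txt",      # unigram vocabulary (default above)
    word_vec_file="25000-500-EM.vec",     # pretrained embeddings (default above)
    logfilename="demo",
)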