def logentropybuildspace(searchobject, morphdict, sentences):
    """

    currently unused

    :param searchobject:
    :param morphdict:
    :param sentences:
    :return:
    """

    sentences = [[w for w in words.lower().split() if w] for words in sentences if words]
    sentences = [s for s in sentences if s]

    bagsofwords = buildwordbags(searchobject, morphdict, sentences)

    logentropydictionary = corpora.Dictionary(bagsofwords)
    logentropycorpus = [logentropydictionary.doc2bow(bag) for bag in bagsofwords]
    logentropyxform = LogEntropyModel(logentropycorpus)
    lsixform = LsiModel(corpus=logentropycorpus,
                        id2word=logentropydictionary,
                        onepass=False,
                        num_topics=400)

    corpus = LogEntropyVectorCorpus(lsixform, logentropyxform, logentropydictionary, logentropycorpus, bagsofwords, sentences)

    return corpus

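
# A minimal, self-contained sketch of the gensim pipeline used above (Dictionary ->
# doc2bow -> LogEntropyModel -> LsiModel), run on toy data. The toy bags and the
# helper name are illustrative assumptions only; the production path above instead
# wraps its results in a LogEntropyVectorCorpus.
def _logentropy_lsi_sketch():
    from gensim import corpora
    from gensim.models import LogEntropyModel, LsiModel

    toybags = [['rex', 'regina', 'imperium'], ['canis', 'felis'], ['rex', 'imperium', 'bellum']]
    toydictionary = corpora.Dictionary(toybags)
    toycorpus = [toydictionary.doc2bow(bag) for bag in toybags]

    # reweight the raw counts by log-entropy, then project into a low-rank LSI space
    logent = LogEntropyModel(toycorpus)
    lsi = LsiModel(corpus=toycorpus, id2word=toydictionary, num_topics=2)

    # fold a (pseudo-)document into the reduced space
    query = toydictionary.doc2bow(['rex', 'bellum'])
    return lsi[logent[query]]
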
def lsibuildspace(searchobject, morphdict, sentences):
    """

    :param searchobject:
    :param morphdict:
    :param sentences:
    :return:
    """

    sentences = [[w for w in words.lower().split() if w] for words in sentences if words]
    sentences = [s for s in sentences if s]

    bagsofwords = buildwordbags(searchobject, morphdict, sentences)

    lsidictionary = corpora.Dictionary(bagsofwords)
    lsicorpus = [lsidictionary.doc2bow(bag) for bag in bagsofwords]
    termfreqinversedocfreq = TfidfModel(lsicorpus)
    corpustfidf = termfreqinversedocfreq[lsicorpus]
    semanticindex = LsiModel(corpustfidf, id2word=lsidictionary, num_topics=250)

    """
    "An empirical study of required dimensionality for large-scale latent semantic indexing applications"
    Bradford 2008

    For a term-document matrix that has been decomposed via SVD with a non-zero diagonal...

    Dimensionality is reduced by deleting all but the k largest values on this diagonal, together with
    the corresponding columns in the other two matrices. This truncation process is used to generate a
    k-dimensional vector space. Both terms and documents are represented by k-dimensional vectors in
    this vector space.

    Landauer and Dumais in 1997: they found that the degree of match between cosine measures in the LSI
    space and human judgment was strongly dependent on k, with a maximum for k = 300

    It is clear that there will be a growing degradation of representational fidelity as the
    dimensionality is increased beyond 400. Depending upon the application, such behavior may preclude
    use of dimensionality greater than 400.

    recommendations:
        300: thousands to 10s of thousands
    """

    corpus = LSIVectorCorpus(semanticindex, corpustfidf, lsidictionary, lsicorpus, bagsofwords, sentences)

    return corpus

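
# A minimal sketch of how an LSI space like the one built above can be queried for
# document similarity with gensim's MatrixSimilarity (as in the gensim tutorials).
# The toy bags and the helper name are illustrative assumptions; the production code
# instead hands the pieces back to the caller inside an LSIVectorCorpus.
def _lsi_similarity_sketch():
    from gensim import corpora, similarities
    from gensim.models import LsiModel, TfidfModel

    toybags = [['amor', 'bellum'], ['amor', 'carmen'], ['bellum', 'imperium']]
    toydictionary = corpora.Dictionary(toybags)
    toycorpus = [toydictionary.doc2bow(bag) for bag in toybags]

    tfidf = TfidfModel(toycorpus)
    lsi = LsiModel(tfidf[toycorpus], id2word=toydictionary, num_topics=2)

    # index the documents in LSI space and rank them against a one-word query
    index = similarities.MatrixSimilarity(lsi[tfidf[toycorpus]])
    query = lsi[tfidf[toydictionary.doc2bow(['amor'])]]
    return sorted(enumerate(index[query]), key=lambda x: x[1], reverse=True)
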
def ldatopicgraphing(sentencetuples, workssearched, searchobject, headwordstops=True):
    """

    a sentence tuple looks like:
        ('gr2397w001_ln_42', 'ποίῳ δὴ τούτων ἄξιον τὸν κόϲμον φθείρεϲθαι φάναι')

    see:
        http://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py

    see also:
        https://nlpforhackers.io/topic-modeling/

    CountVectorizer:
    max_df : float in range [0.0, 1.0] or int, default=1.0
        When building the vocabulary ignore terms that have a document frequency strictly higher than
        the given threshold (corpus-specific stop words).

    min_df : float in range [0.0, 1.0] or int, default=1
        When building the vocabulary ignore terms that have a document frequency strictly lower than
        the given threshold. This value is also called cut-off in the literature.

    see:
        https://stackoverflow.com/questions/27697766/understanding-min-df-and-max-df-in-scikit-countvectorizer#35615151

    max_df is used for removing terms that appear too frequently, also known as "corpus-specific stop
    words". For example:

        max_df = 0.50 means "ignore terms that appear in more than 50% of the documents".
        max_df = 25 means "ignore terms that appear in more than 25 documents".

    The default max_df is 1.0, which means "ignore terms that appear in more than 100% of the documents".
    Thus, the default setting does not ignore any terms.

    min_df is used for removing terms that appear too infrequently. For example:

        min_df = 0.01 means "ignore terms that appear in less than 1% of the documents".
        min_df = 5 means "ignore terms that appear in less than 5 documents".

    The default min_df is 1, which means "ignore terms that appear in less than 1 document".
    Thus, the default setting does not ignore any terms.

    (a small worked illustration of max_df/min_df follows this function)

    notes:
        maxfreq of 1 will give you a lot of excessively common words: 'this', 'that', etc.
        maxfreq of

    on the general issue of graphing see also:
        https://speakerdeck.com/bmabey/visualizing-topic-models
        https://de.dariah.eu/tatom/topic_model_visualization.html

    on the axes:
        https://stats.stackexchange.com/questions/222/what-are-principal-component-scores

    :param sentencetuples:
    :param workssearched:
    :param searchobject:
    :param headwordstops:
    :return:
    """

    if headwordstops:
        stops = mostcommonwordsviaheadwords()
    else:
        stops = mostcommoninflectedforms()

    sentencetuples = [(a, removestopwords(b, stops)) for a, b in sentencetuples]

    activepoll = searchobject.poll
    vv = searchobject.vectorvalues

    settings = {
        'maxfeatures': vv.ldamaxfeatures,
        'components': vv.ldacomponents,     # topics
        'maxfreq': vv.ldamaxfreq,           # fewer than n% of sentences should have this word (i.e., purge common words)
        'minfreq': vv.ldaminfreq,           # word must be found >n times
        'iterations': vv.ldaiterations,
        'mustbelongerthan': vv.ldamustbelongerthan
    }

    # not easy to store/fetch since you need both ldavectorizer and ldamodel
    # so we just store the actual graph...
    ldavishtmlandjs = checkforstoredvector(searchobject, 'lda')

    if not ldavishtmlandjs:
        sentencetuples = [s for s in sentencetuples if len(s[1].strip().split(' ')) > settings['mustbelongerthan']]
        sentences = [s[1] for s in sentencetuples]

        sentencesaslists = [s.split(' ') for s in sentences]
        allwordsinorder = [item for sublist in sentencesaslists for item in sublist if item]

        activepoll.statusis('Finding all headwords')
        morphdict = getrequiredmorphobjects(set(allwordsinorder), furtherdeabbreviate=True)
        morphdict = convertmophdicttodict(morphdict)

        bagsofwordlists = buildwordbags(searchobject, morphdict, sentencesaslists)
        bagsofsentences = [' '.join(b) for b in bagsofwordlists]

        # print('bagsofsentences[:3]', bagsofsentences[3:])

        activepoll.statusis('Running the LDA vectorizer')
        # Use tf (raw term count) features for LDA.
        ldavectorizer = CountVectorizer(max_df=settings['maxfreq'],
                                        min_df=settings['minfreq'],
                                        max_features=settings['maxfeatures'])

        ldavectorized = ldavectorizer.fit_transform(bagsofsentences)

        ldamodel = LatentDirichletAllocation(n_components=settings['components'],
                                             max_iter=settings['iterations'],
                                             learning_method='online',
                                             learning_offset=50.,
                                             random_state=0)

        ldamodel.fit(ldavectorized)

        visualisation = ldavis.prepare(ldamodel, ldavectorized, ldavectorizer)
        # pyLDAvis.save_html(visualisation, 'ldavis.html')
        ldavishtmlandjs = pyLDAvis.prepared_data_to_html(visualisation)

        storevectorindatabase(searchobject, 'lda', ldavishtmlandjs)

    jsonoutput = ldatopicsgenerateoutput(ldavishtmlandjs, searchobject)

    return jsonoutput

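
# A small, self-contained illustration of the max_df/min_df semantics discussed in the
# docstring of ldatopicgraphing() above. The toy documents and the helper name are
# hypothetical and independent of the LDA pipeline itself.
def _countvectorizer_df_sketch():
    from sklearn.feature_extraction.text import CountVectorizer

    docs = ['rex bellum amor', 'rex bellum carmen', 'rex imperium', 'rex amor']

    # 'rex' appears in 100% of the documents and is dropped by max_df=0.75;
    # 'carmen' and 'imperium' appear in only one document each and are dropped by min_df=2;
    # the surviving vocabulary is therefore {'amor', 'bellum'}
    vectorizer = CountVectorizer(max_df=0.75, min_df=2)
    counts = vectorizer.fit_transform(docs)
    return vectorizer.vocabulary_, counts.toarray()
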
def ldatopicmodeling(sentencetuples, searchobject):
    """

    see:
        http://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py

    CountVectorizer:
    max_df : float in range [0.0, 1.0] or int, default=1.0
        When building the vocabulary ignore terms that have a document frequency strictly higher than
        the given threshold (corpus-specific stop words).

    min_df : float in range [0.0, 1.0] or int, default=1
        When building the vocabulary ignore terms that have a document frequency strictly lower than
        the given threshold. This value is also called cut-off in the literature.

    see sample results at end of file

    :param sentencetuples:
    :param searchobject:
    :return:
    """

    maxfeatures = 2000
    components = 15
    topwords = 15
    maxfreq = .60
    minfreq = 5
    iterations = 12
    mustbelongerthan = 2

    sentencetuples = [s for s in sentencetuples if len(s[1].strip().split(' ')) > mustbelongerthan]
    sentences = [s[1] for s in sentencetuples]
    sentences = [s.split(' ') for s in sentences]
    allwordsinorder = [item for sublist in sentences for item in sublist if item]

    morphdict = getrequiredmorphobjects(set(allwordsinorder))
    morphdict = convertmophdicttodict(morphdict)

    bagsofwords = buildwordbags(searchobject, morphdict, sentences)
    bagsofsentences = [' '.join(b) for b in bagsofwords]

    # Use tf (raw term count) features for LDA.
    ldavectorizer = CountVectorizer(max_df=maxfreq,
                                    min_df=minfreq,
                                    max_features=maxfeatures)

    ldavectorized = ldavectorizer.fit_transform(bagsofsentences)

    lda = LatentDirichletAllocation(n_components=components,
                                    max_iter=iterations,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)

    lda.fit(ldavectorized)

    print("\nTopics in LDA model:")
    tf_feature_names = ldavectorizer.get_feature_names()
    print_top_words(lda, tf_feature_names, topwords)

    # Use tf-idf features for NMF.
    tfidfvectorizer = TfidfVectorizer(max_df=0.95,
                                      min_df=2,
                                      max_features=maxfeatures)

    tfidf = tfidfvectorizer.fit_transform(bagsofsentences)

    # Fit the NMF model (Frobenius norm)
    nmf = NMF(n_components=components,
              random_state=1,
              alpha=.1,
              l1_ratio=.5).fit(tfidf)

    print("\nTopics in NMF model (Frobenius norm):")
    tfidffeaturenames = tfidfvectorizer.get_feature_names()
    print_top_words(nmf, tfidffeaturenames, topwords)

    # Fit the NMF model (generalized Kullback-Leibler divergence)
    print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
          "tf-idf features, n_samples=%d and n_features=%d..." % (len(sentences), maxfeatures))

    nmf = NMF(n_components=components,
              random_state=1,
              beta_loss='kullback-leibler',
              solver='mu',
              max_iter=1000,
              alpha=.1,
              l1_ratio=.5).fit(tfidf)

    print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
    tfidffeaturenames = tfidfvectorizer.get_feature_names()
    print_top_words(nmf, tfidffeaturenames, topwords)

    return

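
# print_top_words() is defined elsewhere in this file; a typical implementation,
# modelled on the scikit-learn topic-extraction example linked in the docstrings
# above, looks like the sketch below (this body is an assumption about that helper,
# not a verbatim copy of it).
def _print_top_words_sketch(model, feature_names, n_top_words):
    # one line per topic: the n_top_words features with the largest weights
    for topicnumber, topic in enumerate(model.components_):
        topfeatures = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        print('Topic #{n}: {f}'.format(n=topicnumber, f=' '.join(topfeatures)))
    print()
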
def buildgensimmodel(searchobject, morphdict: dict, sentences: List[str]) -> Word2Vec:
    """

    returns a Word2Vec model

    then you use one of the many ill-documented class functions that come with the model to make
    queries against it

    WordEmbeddingsKeyedVectors in keyedvectors.py is your friend here for learning what you can really do

        most_similar(positive=None, negative=None, topn=10, restrict_vocab=None, indexer=None)
            [analogies; most_similar(positive=['woman', 'king'], negative=['man']) --> queen]

        similar_by_word(word, topn=10, restrict_vocab=None)
            [the top-N most similar words]

        similar_by_vector(vector, topn=10, restrict_vocab=None)

        similarity_matrix(dictionary, tfidf=None, threshold=0.0, exponent=2.0, nonzero_limit=100, dtype=REAL)

        wmdistance(document1, document2)
            [Word Mover's Distance between two documents]

        most_similar_cosmul(positive=None, negative=None, topn=10)
            [analogy finder; most_similar_cosmul(positive=['baghdad', 'england'], negative=['london']) --> iraq]

        cosine_similarities(vector_1, vectors_all)

        distances(word_or_vector, other_words=())

        distance(w1, w2)
            [distance('woman', 'man')]

        similarity(w1, w2)
            [similarity('woman', 'man')]

        n_similarity(ws1, ws2)
            [sets of words: n_similarity(['sushi', 'shop'], ['japanese', 'restaurant'])]

    FYI: Doc2VecKeyedVectors
        doesnt_match(docs)
            [Which doc from the given list doesn't go with the others?]

    note that Word2Vec will hurl out lots of DeprecationWarnings; we are blocking them
    one hopes that this does not yield a surprise some day... [surprise: it did...]

    this code is a candidate for refactoring because of the gensim 3.8 vs 4.0 API difference
    a drop down from model to model.wv requires refactoring dependent functions

    :return:
    """

    vv = searchobject.vectorvalues

    sentences = [[w for w in words.lower().split() if w] for words in sentences if words]
    sentences = [s for s in sentences if s]

    bagsofwords = buildwordbags(searchobject, morphdict, sentences)

    # debugmessage('first bag is {b}'.format(b=bagsofwords[0]))
    # debugmessage('# of bags is {b}'.format(b=len(bagsofwords)))

    workers = setthreadcount()

    computeloss = False

    # Note that for a fully deterministically-reproducible run, you must also limit the model to a
    # single worker thread (workers=1), to eliminate ordering jitter from OS thread scheduling.

    try:
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=DeprecationWarning)
            try:
                gensimmodel = Word2Vec(bagsofwords,
                                       min_count=vv.minimumpresence,
                                       seed=1,
                                       iter=vv.trainingiterations,
                                       size=vv.dimensions,
                                       sample=vv.downsample,
                                       sg=1,  # the results seem terrible if you say sg=0
                                       window=vv.window,
                                       workers=workers,
                                       compute_loss=computeloss)
            except TypeError:
                # TypeError: __init__() got an unexpected keyword argument 'iter'
                # i.e., gensim 4.0.0 changed the API
                # see: https://radimrehurek.com/gensim/models/word2vec.html
                #
                # class gensim.models.word2vec.Word2Vec(sentences=None, corpus_file=None, vector_size=100, alpha=0.025,
                #   window=5, min_count=5, max_vocab_size=None, sample=0.001, seed=1, workers=3, min_alpha=0.0001, sg=0,
                #   hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=<built-in function hash>, epochs=5,
                #   null_word=0, trim_rule=None, sorted_vocab=1, batch_words=10000, compute_loss=False, callbacks=(),
                #   comment=None, max_final_vocab=None)
                #
                # epochs (int, optional) – Number of iterations (epochs) over the corpus. (Formerly: iter)
                # vector_size (int, optional) – Dimensionality of the word vectors.
                gensimmodel = Word2Vec(bagsofwords,
                                       min_count=vv.minimumpresence,
                                       seed=1,
                                       epochs=vv.trainingiterations,
                                       vector_size=vv.dimensions,
                                       sample=vv.downsample,
                                       sg=1,  # the results seem terrible if you say sg=0
                                       window=vv.window,
                                       workers=workers,
                                       compute_loss=computeloss)
    except RuntimeError:
        # RuntimeError: you must first build vocabulary before training the model
        # this will happen if you have a tiny author with too few words
        gensimmodel = None

    if computeloss:
        print('loss after {n} iterations was: {l}'.format(n=vv.trainingiterations,
                                                          l=gensimmodel.get_latest_training_loss()))

    reducedmodel = None

    if gensimmodel:
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=DeprecationWarning)
            try:
                gensimmodel.delete_temporary_training_data(replace_word_vectors_with_normalized=True)
            except AttributeError:
                # AttributeError: 'Word2Vec' object has no attribute 'delete_temporary_training_data'
                # i.e., gensim 4.0.0 changed the API
                # see: https://radimrehurek.com/gensim/models/word2vec.html
                # If you're finished training a model (i.e. no more updates, only querying), you can
                # switch to the KeyedVectors instance:
                #     word_vectors = model.wv
                #     del model
                # this complicates our backwards-compatible-life, though.
                # we want to return a Word2Vec and not a KeyedVectors instance
                # gensimmodel = gensimmodel.wv
                reducedmodel = Word2Vec([["cat", "say", "meow"], ["dog", "say", "woof"]], min_count=1)
                reducedmodel.wv = gensimmodel.wv

    if reducedmodel:
        gensimmodel = reducedmodel

    # print(gensimmodel.wv['ludo'])

    storevectorindatabase(searchobject, 'nn', gensimmodel)

    return gensimmodel

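
# A minimal sketch of querying the Word2Vec model returned above via its KeyedVectors
# (model.wv), which works on both gensim 3.8 and 4.x. The helper name and the words
# passed in are hypothetical: in practice the words must already be in the trained
# vocabulary or most_similar()/similarity() will raise a KeyError.
def _gensimmodel_query_sketch(gensimmodel: Word2Vec, word: str, otherword: str):
    # the top five nearest neighbours of `word`, plus the cosine similarity of the pair
    neighbours = gensimmodel.wv.most_similar(positive=[word], topn=5)
    pairscore = gensimmodel.wv.similarity(word, otherword)
    return neighbours, pairscore
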