def parsimonious_wordcloud(docs, w=.5, k=10):
    """Pick the terms that characterise each document within a collection.

    One parsimonious language model is built over the whole collection
    (the background), and every document is then scored against that
    background. The terms that come out on top are the ones that make a
    document "stand out" from the rest, which is what a word cloud is
    normally meant to show.

    Parameters
    ----------
    docs : list
        The collection of documents.
    w : float
        Weight of the document's own terms relative to the background
        model when fitting a per-document model. Expected to lie between
        0 (only the background model counts) and 1 (background model
        switched off).
    k : integer
        How many terms to return for each document.

    Returns
    -------
    terms : list of list of (string, float)
        One list per entry of docs, in the same order, holding that
        document's k most probable words paired with their
        log-probabilities.
    """
    from weighwords import ParsimoniousLM

    language_model = ParsimoniousLM(docs, w=w)

    top_terms_per_doc = []
    for document in docs:
        top_terms_per_doc.append(language_model.top(k, document))
    return top_terms_per_doc
('David Copperfield', '766'), ('Great Expectations', '1400'), ] startbook = """*** START OF THIS PROJECT GUTENBERG EBOOK """ def read_book(title, num): """Returns generator over words in book num""" logger.info("Fetching terms from %s" % title) path = "%s.txt.utf8.gz" % num in_book = False for ln in gzip.open(path): if in_book: for w in re.sub(r"[.,:;!?\"']", " ", ln).lower().split(): yield w elif ln.startswith(startbook): in_book = True book_contents = [(title, list(read_book(title, num))) for title, num in books] model = ParsimoniousLM([terms for title, terms in book_contents], w=.01) for title, terms in book_contents: print("Top %d words in %s:" % (top_k, title)) for term, p in model.top(top_k, terms): print(" %s %.4f" % (term, np.exp(p))) print("")
def weighwords(path):
    """Print top terms for every file under ``path`` and store them in redis.

    Both a ``ParsimoniousLM`` and a ``SignificantWordsLM`` are built from
    the term lists returned by ``files_combined_terms(path)``. For each
    ``(name, terms)`` pair a side-by-side table of the two models' top
    ``MODEL_RESULT_AMOUNT`` terms is printed. Only the SignificantWordsLM
    terms are stored: they replace the redis list ``raadstalk.<name>``.

    A file is stored only when the year (``name[0:4]``) and month
    (``name[5:7]``) parsed from its name are not earlier than the start
    period. The start period is ``START_YEAR`` / ``START_MONTH + 1`` when
    both settings are truthy, otherwise the calendar month before today.
    Terms for which ``term_occurs`` returns less than ``OCCURS_THRESHOLD``
    are left out of both the table and redis.

    Parameters
    ----------
    path : str
        Directory handed to ``files_combined_terms``; also expected to
        contain one ``<name>.dump`` file per entry, which is what
        ``term_occurs`` is pointed at.
    """
    print('Processing parsimonious weighwords for %s' % path)

    corpus = [file_terms for _, file_terms in files_combined_terms(path)]
    plm = ParsimoniousLM(corpus, w=PLM_W)
    swlm = SignificantWordsLM(corpus, SWLM_LAMBDAS)

    print()
    print()

    # Earliest (year, month) whose results are written to redis.
    if START_YEAR and START_MONTH:
        first_period = (START_YEAR, START_MONTH + 1)
    else:
        now = datetime.today()
        month_ago = datetime(now.year, now.month, 1) - relativedelta(months=1)
        first_period = (month_ago.year, month_ago.month)

    for name, terms in files_combined_terms(path):
        print("###### {} ######".format(name))
        if not terms:
            print('<leeg>')
            continue

        dump_file = '%s/%s.dump' % (path, name)

        top_terms = plm.top(MODEL_RESULT_AMOUNT, terms)
        # The significant-words model works on groups of terms; the
        # document is cut into chunks of a tenth of its length (rounded up).
        chunk_size = math.ceil(len(terms) / 10)
        swlm_top = swlm.group_top(
            MODEL_RESULT_AMOUNT,
            grouper(terms, chunk_size),
            fix_lambdas=True,
        )

        print(
            f"{'=ParsimoniousLM (not used)':40} {'score':12} {'count':4} {'=SignificantWordsLM':40} {'score':12} {'count'}"
        )
        for (plm_t, plm_p), (swlm_t, swlm_p) in zip(top_terms, swlm_top):
            plm_c = term_occurs(plm_t, dump_file)
            swlm_c = term_occurs(swlm_t, dump_file)
            # The whole row is dropped based on the SignificantWordsLM
            # term alone.
            if swlm_c < OCCURS_THRESHOLD:
                continue
            print(
                f"{plm_t:<40} {np.exp(plm_p):<12.4f} {plm_c:<4.2} {swlm_t:<40} {swlm_p:<12.4f} {swlm_c:<4.2}"
            )

        print()
        print()

        # Files dated before the start period are reported but not stored.
        period = (int(name[0:4]), int(name[5:7]))
        if period < first_period:
            continue

        redis_key = 'raadstalk.%s' % name
        print('Saving to redis: raadstalk.%s' % name)
        print()
        # Start from an empty list so earlier results are replaced.
        r.delete(redis_key)
        for term, _ in swlm_top:
            if term_occurs(term, dump_file) < OCCURS_THRESHOLD:
                continue
            r.rpush(redis_key, term)
def parsimonious_wordcloud(docs, w=.5, k=10):
    """Fit a parsimonious language model to terms in docs.

    A single ``ParsimoniousLM`` is built from all of ``docs`` and then
    queried once per document for that document's top terms.

    Parameters
    ----------
    docs : list
        The documents; passed as a whole to ``ParsimoniousLM`` and one by
        one to ``model.top``.
    w : float
        Forwarded to ``ParsimoniousLM`` as its ``w`` argument.
    k : int
        Number of top terms to request per document.

    Returns
    -------
    list
        One ``model.top(k, d)`` result per document, in the order of
        ``docs``.
    """
    from weighwords import ParsimoniousLM

    model = ParsimoniousLM(docs, w=w)
    # Pass the caller's k through; a hard-coded 10 here made the k
    # parameter a no-op.
    return [model.top(k, d) for d in docs]
with codecs.open(filename, 'r', 'utf-8') as IN: subtitle = IN.readlines() # Use only the text line and filter out the metadata lines # containing timestamps etc. subtitle = subtitle[4::4] # Tokenize the text using NLTK tokens = nltk.tokenize.word_tokenize(' '.join(subtitle), language='dutch') # Change all characters to lowercase tokens = [i.lower() for i in tokens] for token in [token for token in tokens if re.match('\w+', token)]: words.append(token) loaded_documents_count += 1 print 'Loaded %d documents\n' % (loaded_documents_count) # Add the list with all collected words for this program to # documents documents.append((program, words)) # Create the Parsimonious Language Model for the documents model = ParsimoniousLM([words for program, words in documents], w=weight) for program, words in documents: print "\nTop %d words for %s:" % (top_words, program) with codecs.open('wordcloud_%s.txt' % (program), 'w', 'utf-8') as OUT: # Generate the top words for a program for word, score in model.top(top_words, words): result_line = "%s:%.6f" % (word, math.exp(score)) print result_line OUT.write('%s\n' % (result_line))
def parsimonious_wordcloud(docs, w=.5, k=10):
    """Return the top-k parsimonious-language-model terms of each document.

    Builds one ``ParsimoniousLM`` over the full list of documents and asks
    it for the top terms of every document in turn.

    Parameters
    ----------
    docs : list
        The documents; used both to build the model and, one at a time,
        as the argument to ``model.top``.
    w : float
        Forwarded to ``ParsimoniousLM`` as its ``w`` argument.
    k : int
        Number of top terms to request per document.

    Returns
    -------
    list
        For every document in ``docs``, in order, the result of
        ``model.top(k, document)``.
    """
    from weighwords import ParsimoniousLM

    model = ParsimoniousLM(docs, w=w)
    # Use k rather than a literal 10 so the parameter actually controls
    # how many terms are returned.
    return [model.top(k, d) for d in docs]
# Use only the text line and filter out the metadata lines # containing timestamps etc. subtitle = subtitle[4::4] # Tokenize the text using NLTK tokens = nltk.tokenize.word_tokenize( ' '.join(subtitle), language='dutch' ) # Change all characters to lowercase tokens = [i.lower() for i in tokens] for token in [token for token in tokens if re.match('\w+', token)]: words.append(token) loaded_documents_count += 1 print 'Loaded %d documents\n' % (loaded_documents_count) # Add the list with all collected words for this program to # documents documents.append((program, words)) # Create the Parsimonious Language Model for the documents model = ParsimoniousLM([words for program, words in documents], w=weight) for program, words in documents: print "\nTop %d words for %s:" % (top_words, program) with codecs.open('wordcloud_%s.txt' % (program), 'w', 'utf-8') as OUT: # Generate the top words for a program for word, score in model.top(top_words, words): result_line = "%s:%.6f" % (word, math.exp(score)) print result_line OUT.write('%s\n' % (result_line))