Esempio n. 1
0
def parsimonious_wordcloud(docs, w=.5, k=10):
    """Pick the k most characteristic terms of every document in docs.

    One parsimonious language model is built over the whole collection,
    which serves as the background, and is then queried once per document.
    Terms that "stand out" in a document compared to the full set come out
    on top, which makes them natural candidates for a word cloud.

    Parameters
    ----------
    docs : list
        The documents to compare against each other.

    w : float
        Weight of the document terms relative to the background model.
        Should lie between 0 (background model only) and 1 (background
        model disabled).

    k : integer
        How many terms to report for each document.

    Returns
    -------
    terms : list of list of (string, float)
        One entry per document, in the order of docs: its top-k terms
        paired with their log-probabilities.
    """
    from weighwords import ParsimoniousLM

    plm = ParsimoniousLM(docs, w=w)
    per_document = []
    for doc in docs:
        per_document.append(plm.top(k, doc))
    return per_document
Esempio n. 2
0
    ('David Copperfield',  '766'),
    ('Great Expectations', '1400'),
]

# Prefix of the marker line that precedes the actual book text. read_book
# yields nothing until it has seen a line starting with this prefix, and the
# marker line itself is skipped as well.
startbook = """*** START OF THIS PROJECT GUTENBERG EBOOK """


def read_book(title, num):
    """Yield the lowercased words of book ``num``, skipping its preamble.

    The text is read from the gzip file ``"<num>.txt.utf8.gz"``. Nothing is
    yielded until a line starting with ``startbook`` has been seen; that
    marker line is skipped too. In every later line the characters
    ``. , : ; ! ? " '`` are replaced by spaces, after which the line is
    lowercased and split on whitespace.

    Parameters
    ----------
    title : str
        Title of the book; only used in the log message.

    num : str
        Identifier from which the file name is built.

    Yields
    ------
    word : str
        The next word of the book, in reading order.
    """

    logger.info("Fetching terms from %s", title)
    path = "%s.txt.utf8.gz" % num
    in_book = False
    # gzip.open defaults to binary mode, which on Python 3 produces bytes
    # lines that cannot be matched against the str pattern and marker below.
    # Open as UTF-8 text instead, and let the with-block close the file once
    # the generator is exhausted or closed.
    with gzip.open(path, "rt", encoding="utf-8") as book:
        for ln in book:
            if in_book:
                for w in re.sub(r"[.,:;!?\"']", " ", ln).lower().split():
                    yield w
            elif ln.startswith(startbook):
                in_book = True


# Read every book once and keep its word list next to its title.
book_contents = [
    (title, list(read_book(title, num)))
    for title, num in books
]

# Fit a single model on the word lists of all books together.
model = ParsimoniousLM([words for _, words in book_contents], w=0.01)

# Report the top_k terms per book; the scores returned by model.top are
# passed through np.exp before printing.
for book_title, book_terms in book_contents:
    print("Top %d words in %s:" % (top_k, book_title))
    for word, log_prob in model.top(top_k, book_terms):
        print("    %s %.4f" % (word, np.exp(log_prob)))
    print("")
Esempio n. 3
0
def weighwords(path):
    """Print the top terms of every file under path and store recent ones.

    Two models are fitted on the term lists produced by
    files_combined_terms(path): a ParsimoniousLM and a SignificantWordsLM.
    For each (name, terms) pair a side-by-side table of both models' top
    terms is printed. Rows whose SignificantWordsLM term occurs fewer than
    OCCURS_THRESHOLD times in '<path>/<name>.dump' are left out.

    The year is parsed from characters 0-3 of the name and the month from
    characters 5-6. For names at or after the start period, the list
    'raadstalk.<name>' in r (presumably a redis client, going by the log
    message) is deleted and refilled with the SignificantWordsLM top terms
    that reach the threshold. The start period is START_YEAR and
    START_MONTH + 1 when both are set, otherwise the month before the
    current one.

    Parameters
    ----------
    path : str
        Location handed to files_combined_terms; also used as the directory
        part of the dump file paths.
    """
    print('Processing parsimonious weighwords for %s' % path)

    corpus = [file_terms for _, file_terms in files_combined_terms(path)]
    plm = ParsimoniousLM(corpus, w=PLM_W)
    swlm = SignificantWordsLM(corpus, SWLM_LAMBDAS)

    print()
    print()

    # Work out the first (year, month) whose results are written to r.
    if not (START_YEAR and START_MONTH):
        now = datetime.today()
        first_of_month = datetime(now.year, now.month, 1)
        previous_month = first_of_month - relativedelta(months=1)
        start_year, start_month = previous_month.year, previous_month.month
    else:
        start_year, start_month = START_YEAR, START_MONTH + 1

    for name, terms in files_combined_terms(path):
        print("######  {}  ######".format(name))
        if not terms:
            print('<leeg>')
            continue

        # Both strings depend only on path and name, so build them once.
        dump_path = '%s/%s.dump' % (path, name)
        redis_key = 'raadstalk.%s' % name

        top_terms = plm.top(MODEL_RESULT_AMOUNT, terms)
        group_size = math.ceil(len(terms) / 10)
        swlm_top = swlm.group_top(
            MODEL_RESULT_AMOUNT,
            grouper(terms, group_size),
            fix_lambdas=True,
        )

        print(
            f"{'=ParsimoniousLM (not used)':40} {'score':12} {'count':4}         {'=SignificantWordsLM':40} {'score':12} {'count'}"
        )
        for plm_entry, swlm_entry in zip(top_terms, swlm_top):
            plm_t, plm_p = plm_entry
            swlm_t, swlm_p = swlm_entry
            plm_c = term_occurs(plm_t, dump_path)
            swlm_c = term_occurs(swlm_t, dump_path)

            # Rows are filtered on the SignificantWordsLM count only.
            if swlm_c < OCCURS_THRESHOLD:
                continue

            print(
                f"{plm_t:<40} {np.exp(plm_p):<12.4f} {plm_c:<4.2}          {swlm_t:<40} {swlm_p:<12.4f} {swlm_c:<4.2}"
            )

        print()
        print()

        year = int(name[0:4])
        month = int(name[5:7])

        # Anything before the start period is printed but not stored.
        if (year, month) < (start_year, start_month):
            continue

        print('Saving to redis: raadstalk.%s' % name)
        print()

        # Replace the stored list rather than appending to an old one.
        r.delete(redis_key)
        for term, _ in swlm_top:
            occurrences = term_occurs(term, dump_path)
            if occurrences < OCCURS_THRESHOLD:
                continue
            r.rpush(redis_key, term)
Esempio n. 4
0
def parsimonious_wordcloud(docs, w=.5, k=10):
    """Fit a parsimonious language model to terms in docs.

    Parameters
    ----------
    docs : list
        List of documents; the model is fitted on all of them.

    w : float
        Weight passed on to ParsimoniousLM.

    k : integer
        Number of terms to return per document.

    Returns
    -------
    terms : list
        For each document in docs, the result of model.top(k, document).
    """
    from weighwords import ParsimoniousLM

    model = ParsimoniousLM(docs, w=w)
    # Pass k through; a hard-coded 10 here silently ignored the parameter.
    return [model.top(k, d) for d in docs]
        with codecs.open(filename, 'r', 'utf-8') as IN:
            subtitle = IN.readlines()
        # Use only the text line and filter out the metadata lines
        # containing timestamps etc.
        subtitle = subtitle[4::4]
        # Tokenize the text using NLTK
        tokens = nltk.tokenize.word_tokenize(' '.join(subtitle),
                                             language='dutch')
        # Change all characters to lowercase
        tokens = [i.lower() for i in tokens]
        for token in [token for token in tokens if re.match('\w+', token)]:
            words.append(token)

        loaded_documents_count += 1

    print 'Loaded %d documents\n' % (loaded_documents_count)
    # Add the list with all collected words for this program to
    # documents
    documents.append((program, words))

# Fit one Parsimonious Language Model on the word lists of all programs
model = ParsimoniousLM([words for program, words in documents], w=weight)
for program, words in documents:
    print("\nTop %d words for %s:" % (top_words, program))
    # Each program gets its own output file with one "word:score" per line
    with codecs.open('wordcloud_%s.txt' % program, 'w', 'utf-8') as out_file:
        for term, log_prob in model.top(top_words, words):
            # The score is passed through math.exp before formatting
            entry = "%s:%.6f" % (term, math.exp(log_prob))
            print(entry)
            out_file.write('%s\n' % entry)
Esempio n. 6
0
def parsimonious_wordcloud(docs, w=.5, k=10):
    """Fit a parsimonious language model to terms in docs.

    A single model is fitted on every document in docs and then queried
    once per document for its top terms.

    Parameters
    ----------
    docs : list
        List of documents.

    w : float
        Weight handed to ParsimoniousLM.

    k : integer
        Number of top terms requested for each document.

    Returns
    -------
    terms : list
        One model.top(k, document) result per document, in input order.
    """
    from weighwords import ParsimoniousLM

    model = ParsimoniousLM(docs, w=w)
    # Use k rather than a literal 10, so the caller's value takes effect.
    return [model.top(k, d) for d in docs]
Esempio n. 7
0
def parsimonious_wordcloud(docs, w=.5, k=10):
    """Return the top-k terms of each document under a parsimonious model.

    Parameters
    ----------
    docs : list
        List of documents; ParsimoniousLM is fitted on all of them.

    w : float
        Weight forwarded to ParsimoniousLM.

    k : integer
        Number of terms to return per document.

    Returns
    -------
    terms : list
        The model.top(k, document) result for every document in docs.
    """
    from weighwords import ParsimoniousLM

    model = ParsimoniousLM(docs, w=w)
    # The k argument was previously unused because 10 was hard-coded.
    return [model.top(k, d) for d in docs]
        # Use only the text line and filter out the metadata lines
        # containing timestamps etc.
        subtitle = subtitle[4::4]
        # Tokenize the text using NLTK
        tokens = nltk.tokenize.word_tokenize(
            ' '.join(subtitle),
            language='dutch'
        )
        # Change all characters to lowercase
        tokens = [i.lower() for i in tokens]
        for token in [token for token in tokens if re.match('\w+', token)]:
            words.append(token)

        loaded_documents_count += 1

    print 'Loaded %d documents\n' % (loaded_documents_count)
    # Add the list with all collected words for this program to
    # documents
    documents.append((program, words))

# Build a single Parsimonious Language Model from every program's word list
model = ParsimoniousLM([words for program, words in documents], w=weight)
for program, words in documents:
    print("\nTop %d words for %s:" % (top_words, program))
    # Write this program's top words to its own file, one entry per line
    with codecs.open('wordcloud_%s.txt' % program, 'w', 'utf-8') as cloud_file:
        for top_word, log_score in model.top(top_words, words):
            # math.exp is applied to the score before it is formatted
            line = "%s:%.6f" % (top_word, math.exp(log_score))
            print(line)
            cloud_file.write('%s\n' % line)