# Example #1
# 0
def weighwords(path):
    """Weigh terms in the dump files under *path* with parsimonious
    language models, print the top terms per file, and push the recent
    months' SignificantWordsLM terms into redis.

    NOTE(review): the tail of this function (the subtitle-tokenizing
    section) references names that are not defined in this scope
    (``filename``, ``words``, ``documents``, ``program``,
    ``loaded_documents_count``) — it appears to have been fused in from a
    different script; confirm against the original source before running.
    """
    print('Processing parsimonious weighwords for %s' % path)

    # Background corpus: the combined term lists of every file in `path`.
    files_terms = [data for _, data in files_combined_terms(path)]
    plm = ParsimoniousLM(files_terms, w=PLM_W)
    swlm = SignificantWordsLM(files_terms, SWLM_LAMBDAS)

    print()
    print()

    # First (year, month) whose terms should be saved to redis.
    if START_YEAR and START_MONTH:
        start_year = START_YEAR
        start_month = START_MONTH + 1
    else:
        # Default: the previous calendar month.
        today = datetime.today()
        previous_month = datetime(today.year, today.month,
                                  1) - relativedelta(months=1)
        start_year = previous_month.year
        start_month = previous_month.month

    for name, terms in files_combined_terms(path):
        print("######  {}  ######".format(name))
        if not terms:
            print('<leeg>')
            continue

        top_terms = plm.top(MODEL_RESULT_AMOUNT, terms)
        # Split the document into ~10 roughly equal groups for SWLM.
        swlm_top = swlm.group_top(
            MODEL_RESULT_AMOUNT,
            grouper(terms, math.ceil(len(terms) / 10)),
            fix_lambdas=True,
        )

        print(
            f"{'=ParsimoniousLM (not used)':40} {'score':12} {'count':4}         {'=SignificantWordsLM':40} {'score':12} {'count'}"
        )
        for (plm_t, plm_p), (swlm_t, swlm_p) in zip(top_terms, swlm_top):
            plm_c = term_occurs(plm_t, '%s/%s.dump' % (path, name))
            swlm_c = term_occurs(swlm_t, '%s/%s.dump' % (path, name))

            # Skip rare terms (threshold applied to the SWLM count only).
            if swlm_c < OCCURS_THRESHOLD:
                continue

            print(
                f"{plm_t:<40} {np.exp(plm_p):<12.4f} {plm_c:<4.2}          {swlm_t:<40} {swlm_p:<12.4f} {swlm_c:<4.2}"
            )

        print()
        print()

        # File names are expected to start with "YYYY-MM" — TODO confirm.
        year = int(name[0:4])
        month = int(name[5:7])

        # Only months at or after the configured start get persisted.
        if year < start_year:
            continue

        if year == start_year and month < start_month:
            continue

        print('Saving to redis: raadstalk.%s' % name)
        print()

        r.delete('raadstalk.%s' % name)
        for term, _ in swlm_top:
            if term_occurs(term, '%s/%s.dump' %
                           (path, name)) < OCCURS_THRESHOLD:
                continue
            r.rpush('raadstalk.%s' % name, term)
        # NOTE(review): `filename` is undefined in this scope — this block
        # presumably belongs to a different example. Verify before use.
        with codecs.open(filename, 'r', 'utf-8') as IN:
            subtitle = IN.readlines()
        # Use only the text line and filter out the metadata lines
        # containing timestamps etc.
        subtitle = subtitle[4::4]
        # Tokenize the text using NLTK
        tokens = nltk.tokenize.word_tokenize(' '.join(subtitle),
                                             language='dutch')
        # Change all characters to lowercase
        tokens = [i.lower() for i in tokens]
        # Raw string: '\w' in a non-raw literal is an invalid escape.
        for token in [token for token in tokens if re.match(r'\w+', token)]:
            words.append(token)

        loaded_documents_count += 1

    # Fixed: the original used the Python 2 `print` statement, which is a
    # SyntaxError in Python 3 (the f-strings above require >= 3.6).
    print('Loaded %d documents\n' % loaded_documents_count)
    # Add the list with all collected words for this program to
    # documents
    documents.append((program, words))

# Create the Parsimonious Language Model for the documents
model = ParsimoniousLM([words for program, words in documents], w=weight)
for program, words in documents:
    # Fixed: Python 2 `print` statements are SyntaxErrors in Python 3.
    print("\nTop %d words for %s:" % (top_words, program))
    with codecs.open('wordcloud_%s.txt' % (program), 'w', 'utf-8') as OUT:
        # Generate the top words for a program
        for word, score in model.top(top_words, words):
            # exp() is applied before printing, so `score` is presumably a
            # log-probability — confirm against the weighwords API.
            result_line = "%s:%.6f" % (word, math.exp(score))
            print(result_line)
            OUT.write('%s\n' % (result_line))
# Example #3
# 0
def parsimonious_wordcloud(docs, w=.5, k=10):
    """Fit a parsimonious language model to terms in docs.

    Parameters
    ----------
    docs : iterable of term sequences
        Documents to model.
    w : float
        Document-model weight passed to ParsimoniousLM.
    k : int
        Number of top terms returned per document. The original body
        ignored this parameter and hard-coded 10; since the default is
        10, honoring `k` is backward-compatible.

    Returns
    -------
    list
        One ``model.top`` result per document.
    """
    from weighwords import ParsimoniousLM

    model = ParsimoniousLM(docs, w=w)
    # Fixed: use the requested `k` instead of a hard-coded 10.
    return [model.top(k, d) for d in docs]
# Example #4
# 0
def parsimonious_wordcloud(docs, w=.5, k=10):
    """Fit a parsimonious language model to terms in docs and return the
    ``k`` top-weighted terms for each document.

    The original body ignored ``k`` and hard-coded ``model.top(10, d)``;
    because the default is ``k=10``, honoring the parameter is a
    backward-compatible fix.
    """
    from weighwords import ParsimoniousLM

    model = ParsimoniousLM(docs, w=w)
    # Fixed: respect the caller-supplied `k` instead of a literal 10.
    return [model.top(k, d) for d in docs]