Example #1
0
    # Count-gathering pass: topic-normalize each transaction and attach word
    # counts, then dump the counts to a file for the PLSA clustering binary.
    # NOTE(review): Python 2 syntax (print statements). `data_table`,
    # `language_filter`, `rw_hash`, `debug`, `mt`, `topic_norm`, `dir_temp`,
    # `fn_table`, and `num_topics` are defined outside this excerpt; in
    # particular `rw_hash` is never created here — confirm the enclosing
    # function builds it (cf. mt.create_utf8_rewrite_hash()).
    num = 0
    print "Finding counts: ",
    for ky in data_table.keys():
        # Progress indicator: print the running count every 1000 transactions.
        if ((num % 1000) == 0):
            print "{} ".format(num),
            sys.stdout.flush()  # push progress output through stdout buffering
        xact = data_table[ky]
        if ((language_filter != 'none')
                and (xact['lid_lui'] != language_filter)
            ):  # do topic only for docs that match language filter
            print('Not processing transaction {}'.format(xact))
            continue
        # Topic normalization (in-place rewrite of the transaction's message)
        topic_norm.normalize_msg(xact, rw_hash, debug)
        # Get counts from the normalized message text
        xact['counts'] = mt.get_counts(xact['msg_topic'])
        num += 1  # only transactions that pass the language filter are counted
    print

    # Write out counts to a file and perform topic clustering
    fn_counts = dir_temp + fn_table + '.{}.counts.txt'.format(num_topics)

    # Remove any stale counts file so the write below starts fresh.
    if (os.path.exists(fn_counts)):
        os.remove(fn_counts)
    mt.write_counts_file(data_table, fn_counts)

    # Run topic clustering binary: feature, and model output paths,
    # parameterized by the number of topics.
    fn_feat = dir_temp + fn_table + '.{}.feat.txt'.format(num_topics)
    fn_model = dir_temp + fn_table + '.{}.plsa'.format(num_topics)
    cmd = '{}/bin/plsa_estimation_combined_file -vector_list_in {} '.format(dir_topic, fn_counts) + \
        '-stop_list_in {} '.format(stop_list) + \
Example #2
0
    # Normalization and counts: topic-normalize each transaction, attach word
    # counts, then dump the counts to a file for the PLSA clustering binary.
    # NOTE(review): Python 2 syntax (print statements). `data_table`,
    # `language_filter`, `debug`, `mt`, `topic_norm`, `dir_temp`, `fn_table`,
    # and `num_topics` come from the enclosing (unseen) function scope.
    rw_hash = mt.create_utf8_rewrite_hash()  # rewrite table consumed by normalize_msg below
    num = 0
    print "Finding counts: ",
    for ky in data_table.keys():
        # Progress indicator: print the running count every 1000 transactions.
        if ((num % 1000)==0):
            print "{} ".format(num),
            sys.stdout.flush()  # push progress output through stdout buffering
        xact = data_table[ky]
        if ((language_filter != 'none') and (xact['lid_lui'] != language_filter)):  # do topic only for docs that match language filter
            print('Not processing transaction {}'.format(xact))
            continue
        # Topic normalization (in-place rewrite of the transaction's message)
        topic_norm.normalize_msg(xact, rw_hash, debug)
        # Get counts from the normalized message text
        xact['counts'] = mt.get_counts(xact['msg_topic'])
        num += 1  # only transactions that pass the language filter are counted
    print

    # Write out counts to a file and perform topic clustering
    fn_counts = dir_temp + fn_table + '.{}.counts.txt'.format(num_topics)

    # Remove any stale counts file so the write below starts fresh.
    if (os.path.exists(fn_counts)):
        os.remove(fn_counts)
    mt.write_counts_file(data_table, fn_counts)

    # Run topic clustering binary: feature and model output paths,
    # parameterized by the number of topics.
    fn_feat = dir_temp + fn_table + '.{}.feat.txt'.format(num_topics)
    fn_model = dir_temp + fn_table + '.{}.plsa'.format(num_topics)
    cmd = '{}/bin/plsa_estimation_combined_file -vector_list_in {} '.format(dir_topic, fn_counts) + \
        '-stop_list_in {} '.format(stop_list) + \