Python build_termlistの例、boringmatrix.build_termlist Pythonの例

コード例 #1

0

ファイルを表示

def main():
    """Dump the strongest co-occurring terms for every time slice.

    Command line: ``-in <model_file> -out <output_name> [-short]``.

    The JSON model named by ``-in`` is loaded, optionally pruned to the short
    term list (``-short``), and the slices are ordered by descending vector
    sum.  For each slice up to ten of the highest co-occurrence weights
    between the two NOTE_BEGINS sets are kept.  The result is written as JSON
    to ``<output_name>.out``.

    Exits with -1 on a bad argument count, a missing ``-in``/``-out`` or when
    NOTE_BEGINS does not hold exactly two sets; exits with -2 when a flag is
    missing its value.
    """

    # Did they provide the correct args?
    if len(sys.argv) < 5 or len(sys.argv) > 6:
        usage()
        sys.exit(-1)

    # Pre-set so a missing -in/-out is reported below instead of raising
    # UnboundLocalError at the ``is None`` checks.
    model_file = None
    output_name = None
    use_short_terms = False

    # could use the stdargs parser, but that is meh.
    try:
        for idx in range(1, len(sys.argv)):
            arg = sys.argv[idx]
            if arg == "-in":
                model_file = sys.argv[idx + 1]
            elif arg == "-out":
                output_name = sys.argv[idx + 1]
            elif arg == "-short":
                use_short_terms = True
    except IndexError:
        # A flag that needs a value was the last argument.
        usage()
        sys.exit(-2)

    if model_file is None or output_name is None:
        usage()
        sys.exit(-1)

    if len(NOTE_BEGINS) != 2:
        sys.stderr.write("use this to compare two sets.\n")
        sys.exit(-1)

    # not building the model, only loading it.
    with open(model_file, 'r') as moin:
        results = loads(moin.read(), object_hook=boringmatrix.as_boring)

    # ----------------------------------------------------------------------
    # Compute the term weights.
    boringmatrix.fix_boringmatrix_dicts(results)

    first = NOTE_BEGINS[0]
    second = NOTE_BEGINS[1]

    print("number of slices: %d" % len(results[first]))

    # length of these is used to normalize
    term_list = boringmatrix.build_termlist(results)
    sterm_list = boringmatrix.build_termlist2(results)

    print("Full Dictionary: %d" % len(term_list))
    print("Short Dictionary: %d" % len(sterm_list))

    # ----------------------------------------------------------------------
    # Prune out low term counts; re-compute.
    if use_short_terms:
        for note in results:
            for start in results[note]:
                results[note][start].drop_not_in(sterm_list)
                results[note][start].compute()

    vector_sums = boringmatrix.get_vectorsums(results, NOTE_BEGINS)

    # Largest vector sum first; itemgetter(1) selects the value.
    sorted_sums = sorted(vector_sums.items(),
                         key=itemgetter(1),
                         reverse=True)

    neato_out = []
    for start, vector_sum in sorted_sums:
        weights = boringmatrix.cooccurrence_weights(results[first][start],
                                                    results[second][start])
        sorted_weights = sorted(weights.items(),
                                key=itemgetter(1),
                                reverse=True)

        # Keep at most the ten heaviest terms (slicing copes with fewer).
        neato_out.append((str(boringmatrix.datetime_from_long(start)),
                          vector_sum,
                          len(sorted_weights),
                          sorted_weights[:10]))

    with open("%s.out" % output_name, 'w') as fout:
        fout.write(dumps(neato_out, indent=4))

コード例 #2

0

ファイルを表示

ファイル: build_topterms.py プロジェクト: pstrinkle/thesis-source

def main():
    """Sum term counts over every slice and dump the top terms.

    Command line: ``-in <model_file> -out <output_name>`` (exactly these).

    The JSON model named by ``-in`` is loaded, every slice's ``term_matrix``
    is added into one combined dictionary, and the result of
    ``vectorspace.top_terms_tuples(combined, 10000)`` is written as JSON to
    ``<output_name>_top_terms_tuples.json``.

    Exits with -1 on a bad argument count, a missing ``-in``/``-out`` or when
    NOTE_BEGINS does not hold exactly two sets; exits with -2 when a flag is
    missing its value.
    """

    # Did they provide the correct args?
    if len(sys.argv) != 5:
        usage()
        sys.exit(-1)

    # Pre-set so a missing -in/-out is reported below instead of raising
    # UnboundLocalError at the ``is None`` checks.
    model_file = None
    output_name = None

    # could use the stdargs parser, but that is meh.
    try:
        for idx in range(1, len(sys.argv)):
            arg = sys.argv[idx]
            if arg == "-in":
                model_file = sys.argv[idx + 1]
            elif arg == "-out":
                output_name = sys.argv[idx + 1]
    except IndexError:
        # A flag that needs a value was the last argument.
        usage()
        sys.exit(-2)

    if model_file is None or output_name is None:
        usage()
        sys.exit(-1)

    if len(NOTE_BEGINS) != 2:
        sys.stderr.write("use this to compare two sets.\n")
        sys.exit(-1)

    # not building the model, only loading it.
    with open(model_file, 'r') as moin:
        results = loads(moin.read(), object_hook=boringmatrix.as_boring)

    # ----------------------------------------------------------------------
    # Compute the term weights.
    boringmatrix.fix_boringmatrix_dicts(results)

    print("number of slices: %d" % len(results[NOTE_BEGINS[0]]))

    # length of this is used to normalize
    term_list = boringmatrix.build_termlist(results)

    print("Full Dictionary: %d" % len(term_list))

    # Accumulate the per-slice counts of every set into one dictionary.
    new_matrix = {}
    for note in results:
        for start in results[note]:
            for term, value in results[note][start].term_matrix.items():
                new_matrix[term] = new_matrix.get(term, 0) + value

    with open("%s_top_terms_tuples.json" % output_name, 'w') as fout:
        fout.write(
            dumps(vectorspace.top_terms_tuples(new_matrix, 10000), indent=4))

コード例 #3

0

ファイルを表示

ファイル: build_termcounts.py プロジェクト: pstrinkle/thesis-source

def main():
    """Produce the term-count graphs for the two NOTE_BEGINS sets.

    Command line:
    ``-in <model_file> -out <output_name> -value <int> [-short] [-file]``.

    The JSON model named by ``-in`` is loaded, optionally pruned to the short
    term list (``-short``), and handed to ``output_count_graphs`` together
    with the output prefix ``<output_name>_counters_gt_<value>``, the integer
    ``-value`` and the ``-file`` switch.

    Exits with -1 on a bad argument count, a missing ``-in``/``-out`` or when
    NOTE_BEGINS does not hold exactly two sets; exits with -2 when a flag is
    missing its value or ``-value`` is not an integer.
    """

    # Did they provide the correct args?
    if len(sys.argv) < 7 or len(sys.argv) > 9:
        usage()
        sys.exit(-1)

    # Pre-set so a missing -in/-out is reported below instead of raising
    # UnboundLocalError at the ``is None`` checks.
    model_file = None
    output_name = None
    use_short_terms = False
    use_file_out = False
    value = 0

    # could use the stdargs parser, but that is meh.
    try:
        for idx in range(1, len(sys.argv)):
            arg = sys.argv[idx]
            if arg == "-in":
                model_file = sys.argv[idx + 1]
            elif arg == "-out":
                output_name = sys.argv[idx + 1]
            elif arg == "-short":
                use_short_terms = True
            elif arg == "-file":
                use_file_out = True
            elif arg == "-value":
                value = int(sys.argv[idx + 1])
    except (IndexError, ValueError):
        # Either a flag was missing its value or -value was not an integer.
        usage()
        sys.exit(-2)

    if model_file is None or output_name is None:
        usage()
        sys.exit(-1)

    if len(NOTE_BEGINS) != 2:
        sys.stderr.write("use this to compare two sets.\n")
        sys.exit(-1)

    # not building the model, only loading it.
    with open(model_file, 'r') as moin:
        results = loads(moin.read(), object_hook=boringmatrix.as_boring)

    # ----------------------------------------------------------------------
    # Compute the term weights.
    boringmatrix.fix_boringmatrix_dicts(results)

    print("number of slices: %d" % len(results[NOTE_BEGINS[0]]))

    # length of these is used to normalize
    term_list = boringmatrix.build_termlist(results)
    sterm_list = boringmatrix.build_termlist2(results)

    print("Full Dictionary: %d" % len(term_list))
    print("Short Dictionary: %d" % len(sterm_list))

    # ----------------------------------------------------------------------
    # Prune out low term counts; re-compute.
    if use_short_terms:
        for note in results:
            for start in results[note]:
                results[note][start].drop_not_in(sterm_list)
                results[note][start].compute()

    output_count_graphs(results[NOTE_BEGINS[0]],
                        results[NOTE_BEGINS[1]],
                        "%s_counters_gt_%d" % (output_name, value),
                        value,
                        use_file_out)

コード例 #4

0

ファイルを表示

ファイル: build_distinct.py プロジェクト: pstrinkle/thesis-source

def main():
    """Produce the distinct-term graphs for the two NOTE_BEGINS sets.

    Command line: ``-in <model_file> -out <output_name> [-short] [-file]``.

    The JSON model named by ``-in`` is loaded, optionally pruned to the short
    term list (``-short``), and handed to ``output_distinct_graphs`` together
    with the output prefix ``<output_name>_distinct`` and the ``-file``
    switch.

    Exits with -1 on a bad argument count, a missing ``-in``/``-out`` or when
    NOTE_BEGINS does not hold exactly two sets; exits with -2 when a flag is
    missing its value.
    """

    # Did they provide the correct args?  Up to 7 entries are allowed so that
    # -short and -file can be given together.
    if len(sys.argv) < 5 or len(sys.argv) > 7:
        usage()
        sys.exit(-1)

    # Pre-set so a missing -in/-out is reported below instead of raising
    # UnboundLocalError at the ``is None`` checks.
    model_file = None
    output_name = None
    use_short_terms = False
    use_file_out = False

    # could use the stdargs parser, but that is meh.
    try:
        for idx in range(1, len(sys.argv)):
            arg = sys.argv[idx]
            if arg == "-in":
                model_file = sys.argv[idx + 1]
            elif arg == "-out":
                output_name = sys.argv[idx + 1]
            elif arg == "-short":
                use_short_terms = True
            elif arg == "-file":
                use_file_out = True
    except IndexError:
        # A flag that needs a value was the last argument.
        usage()
        sys.exit(-2)

    if model_file is None or output_name is None:
        usage()
        sys.exit(-1)

    if len(NOTE_BEGINS) != 2:
        sys.stderr.write("use this to compare two sets.\n")
        sys.exit(-1)

    # not building the model, only loading it.
    with open(model_file, 'r') as moin:
        results = loads(moin.read(), object_hook=boringmatrix.as_boring)

    # ----------------------------------------------------------------------
    # Compute the term weights.
    boringmatrix.fix_boringmatrix_dicts(results)

    print("number of slices: %d" % len(results[NOTE_BEGINS[0]]))

    # length of these is used to normalize
    term_list = boringmatrix.build_termlist(results)
    sterm_list = boringmatrix.build_termlist2(results)

    print("Full Dictionary: %d" % len(term_list))
    print("Short Dictionary: %d" % len(sterm_list))

    # ----------------------------------------------------------------------
    # Prune out low term counts; re-compute.
    if use_short_terms:
        for note in results:
            for start in results[note]:
                results[note][start].drop_not_in(sterm_list)
                results[note][start].compute()

    output_distinct_graphs(results[NOTE_BEGINS[0]], results[NOTE_BEGINS[1]],
                           "%s_distinct" % (output_name), use_file_out)

コード例 #5

0

ファイルを表示

def main():
    """Write the term matrices of the model out as CSV files.

    Command line:
    ``-in <model_file> -out <output_name> [-short] [-ftm] [-mtm]``.

    * ``-mtm``: for every slice, merge the term counts of both NOTE_BEGINS
      sets into one matrix and write ``<output_name>_merged.csv`` (using the
      short term list when ``-short`` is also given).
    * ``-ftm``: write ``<output_name>_<note>_full.csv`` per set using the
      full term list.
    * ``-short`` alone: write ``<output_name>_<note>.csv`` per set using the
      short term list.

    With none of the three switches nothing is written.

    Raises:
        Exception: when ``-short`` and ``-ftm`` are combined.

    Exits with -1 on a bad argument count, a missing ``-in``/``-out`` or when
    NOTE_BEGINS does not hold exactly two sets; exits with -2 when a flag is
    missing its value.
    """

    # Did they provide the correct args?  Up to 7 entries are allowed so that
    # -short can be combined with -mtm, which the merge branch supports.
    if len(sys.argv) < 5 or len(sys.argv) > 7:
        usage()
        sys.exit(-1)

    # Pre-set so a missing -in/-out is reported below instead of raising
    # UnboundLocalError at the ``is None`` checks.
    model_file = None
    output_name = None
    use_short_terms = False
    full_term_matrix_out = False
    merged_term_matrix_out = False

    # could use the stdargs parser, but that is meh.
    try:
        for idx in range(1, len(sys.argv)):
            arg = sys.argv[idx]
            if arg == "-in":
                model_file = sys.argv[idx + 1]
            elif arg == "-out":
                output_name = sys.argv[idx + 1]
            elif arg == "-short":
                use_short_terms = True
            elif arg == "-ftm":
                full_term_matrix_out = True
            elif arg == "-mtm":
                merged_term_matrix_out = True
    except IndexError:
        # A flag that needs a value was the last argument.
        usage()
        sys.exit(-2)

    if model_file is None or output_name is None:
        usage()
        sys.exit(-1)

    # Fail fast, before the model is loaded and pruned.
    if use_short_terms and full_term_matrix_out:
        raise Exception("Cannot use short and full at the same time buddy")

    if len(NOTE_BEGINS) != 2:
        sys.stderr.write("use this to compare two sets.\n")
        sys.exit(-1)

    # not building the model, only loading it.
    with open(model_file, 'r') as moin:
        results = loads(moin.read(), object_hook=boringmatrix.as_boring)

    # ----------------------------------------------------------------------
    # Compute the term weights.
    boringmatrix.fix_boringmatrix_dicts(results)

    print("number of slices: %d" % len(results[NOTE_BEGINS[0]]))

    # length of these is used to normalize
    term_list = boringmatrix.build_termlist(results)
    sterm_list = boringmatrix.build_termlist2(results)

    print("Full Dictionary: %d" % len(term_list))
    print("Short Dictionary: %d" % len(sterm_list))

    # ----------------------------------------------------------------------
    # Prune out low term counts; re-compute.
    if use_short_terms:
        for note in results:
            for start in results[note]:
                results[note][start].drop_not_in(sterm_list)
                results[note][start].compute()

    # ----------------------------------------------------------------------
    # Output a CSV with a model built from merging both sets for each
    # t.  Using the short list, or whatever is set.
    if merged_term_matrix_out:
        merged = {}
        for start in results[NOTE_BEGINS[0]]:
            x = boringmatrix.BoringMatrix(None)

            for note in NOTE_BEGINS:
                slice_matrix = results[note][start].term_matrix
                for term in slice_matrix:
                    val = slice_matrix[term]

                    try:
                        x.term_matrix[term] += val
                    except KeyError:
                        x.term_matrix[term] = val

                    x.total_count += val

            if use_short_terms:
                x.drop_not_in(sterm_list)

            x.compute()
            merged[start] = x

        the_terms = sterm_list if use_short_terms else term_list
        boringmatrix.output_full_matrix(the_terms, merged,
                                        "%s_merged.csv" % output_name)
    elif full_term_matrix_out:
        for note in NOTE_BEGINS:
            output = "%s_%s_full.csv" % (output_name, note)
            boringmatrix.output_full_matrix(term_list, results[note], output)
    elif use_short_terms:
        for note in results:
            output = "%s_%s.csv" % (output_name, note)
            boringmatrix.output_full_matrix(sterm_list, results[note], output)

コード例 #6

0

ファイルを表示

ファイル: build_permutation.py プロジェクト: pstrinkle/thesis-source

def main():
    """Count the distinct term orderings seen across the slices of a set.

    Command line: ``-in <model_file> -out <output_name> [-short]``.

    The JSON model named by ``-in`` is loaded and optionally pruned to the
    short term list (``-short``).  For every slice the full term list is
    built, passed through ``sorted_indices`` and the string form of the
    result is tallied; the number of distinct tallies is printed.

    Exits with -1 on a bad argument count, a missing ``-in``/``-out`` or when
    NOTE_BEGINS does not hold exactly two sets; exits with -2 when a flag is
    missing its value.
    """

    # Did they provide the correct args?
    if len(sys.argv) < 5 or len(sys.argv) > 6:
        usage()
        sys.exit(-1)

    # Pre-set so a missing -in/-out is reported below instead of raising
    # UnboundLocalError at the ``is None`` checks.
    model_file = None
    output_name = None
    use_short_terms = False

    # could use the stdargs parser, but that is meh.
    try:
        for idx in range(1, len(sys.argv)):
            arg = sys.argv[idx]
            if arg == "-in":
                model_file = sys.argv[idx + 1]
            elif arg == "-out":
                output_name = sys.argv[idx + 1]
            elif arg == "-short":
                use_short_terms = True
    except IndexError:
        # A flag that needs a value was the last argument.
        usage()
        sys.exit(-2)

    if model_file is None or output_name is None:
        usage()
        sys.exit(-1)

    if len(NOTE_BEGINS) != 2:
        sys.stderr.write("use this to compare two sets.\n")
        sys.exit(-1)

    # not building the model, only loading it.
    with open(model_file, 'r') as moin:
        results = loads(moin.read(), object_hook=boringmatrix.as_boring)

    # ----------------------------------------------------------------------
    # Compute the term weights.
    boringmatrix.fix_boringmatrix_dicts(results)

    print("number of slices: %d" % len(results[NOTE_BEGINS[0]]))

    # length of these is used to normalize
    term_list = boringmatrix.build_termlist(results)
    sterm_list = boringmatrix.build_termlist2(results)

    print("Full Dictionary: %d" % len(term_list))
    print("Short Dictionary: %d" % len(sterm_list))

    # ----------------------------------------------------------------------
    # Prune out low term counts; re-compute.
    if use_short_terms:
        for note in results:
            for start in results[note]:
                results[note][start].drop_not_in(sterm_list)
                results[note][start].compute()

    # ----------------------------------------------------------------------
    # Compute the permutation entropy for the window.
    #
    # Use set resemblance to get entropy probability value.
    #
    # NOTE(review): the tally is reset for every set, so the figure printed
    # after the loop only describes the set iterated last -- confirm whether
    # a per-set report or one combined tally was intended.
    sorted_indices_dict = {}
    for note in results:
        sorted_indices_dict = {}
        for start in results[note]:
            full_list = results[note][start].build_fulllist(term_list)
            key = str(sorted_indices(full_list))
            sorted_indices_dict[key] = sorted_indices_dict.get(key, 0) + 1

    # Compare to the number of slices.
    print("number of sorted indices: %d" % len(sorted_indices_dict))

コード例 #7

0

ファイルを表示

ファイル: build_pca.py プロジェクト: pstrinkle/thesis-source

def main():
    """Write every slice's term counts to its own file for later PCA.

    Command line: ``-in <model_file> -out <output_name> [-short]``.

    The JSON model named by ``-in`` is loaded and optionally pruned to the
    short term list (``-short``).  The directory ``<output_name>_pca1`` is
    recreated from scratch and receives one file per set and slice, named
    ``<note>-<start>``, holding one integer count per line in the order of
    the chosen term list (0 for terms missing from the slice).

    Exits with -1 on a bad argument count, a missing ``-in``/``-out`` or when
    NOTE_BEGINS does not hold exactly two sets; exits with -2 when a flag is
    missing its value or a count cannot be formatted as an integer.
    """
    # Function-scope import: only this entry point needs it.
    import shutil

    # Did they provide the correct args?
    if len(sys.argv) < 5 or len(sys.argv) > 6:
        usage()
        sys.exit(-1)

    # Pre-set so a missing -in/-out is reported below instead of raising
    # UnboundLocalError at the ``is None`` checks.
    model_file = None
    output_name = None
    use_short_terms = False

    # could use the stdargs parser, but that is meh.
    try:
        for idx in range(1, len(sys.argv)):
            arg = sys.argv[idx]
            if arg == "-in":
                model_file = sys.argv[idx + 1]
            elif arg == "-out":
                output_name = sys.argv[idx + 1]
            elif arg == "-short":
                use_short_terms = True
    except IndexError:
        # A flag that needs a value was the last argument.
        usage()
        sys.exit(-2)

    if model_file is None or output_name is None:
        usage()
        sys.exit(-1)

    if len(NOTE_BEGINS) != 2:
        sys.stderr.write("use this to compare two sets.\n")
        sys.exit(-1)

    # not building the model, only loading it.
    with open(model_file, 'r') as moin:
        results = loads(moin.read(), object_hook=boringmatrix.as_boring)

    # ----------------------------------------------------------------------
    # Compute the term weights.
    boringmatrix.fix_boringmatrix_dicts(results)

    print("number of slices: %d" % len(results[NOTE_BEGINS[0]]))

    # length of these is used to normalize
    term_list = boringmatrix.build_termlist(results)
    sterm_list = boringmatrix.build_termlist2(results)

    print("Full Dictionary: %d" % len(term_list))
    print("Short Dictionary: %d" % len(sterm_list))

    # ----------------------------------------------------------------------
    # Prune out low term counts; re-compute.
    if use_short_terms:
        for note in results:
            for start in results[note]:
                results[note][start].drop_not_in(sterm_list)
                results[note][start].compute()

    # ----------------------------------------------------------------------
    # Output each slice for each area as a new-line broken up term count
    # file.  These values aren't normalized, so they're not terribly useful
    # yet.
    outdir = "%s_%s" % (output_name, "pca1")

    # os.rmdir() only removes empty directories, so a second run used to
    # fail on the files left by the first; remove the whole tree instead.
    if os.path.exists(outdir):
        shutil.rmtree(outdir)

    os.mkdir(outdir)

    the_terms = sterm_list if use_short_terms else term_list

    for note in results:
        for start in results[note]:
            filename = "%s-%d" % (note, start)
            term_matrix = results[note][start].term_matrix

            # Terms absent from this slice count as zero.
            values = [term_matrix[term] if term in term_matrix else 0
                      for term in the_terms]

            try:
                data_str = "\n".join(["%d" % value for value in values])
            except TypeError as e:
                # Show what could not be formatted before giving up.
                print("%s %s %s %s" % (type(values), type(values[0]),
                                       values[0], values[1]))
                print(e)
                sys.exit(-2)

            with open(os.path.join(outdir, filename), 'w') as fout:
                fout.write(data_str)

コード例 #8

0

ファイルを表示

ファイル: build_similarity.py プロジェクト: pstrinkle/thesis-source

def main():
    """Plot per-slice similarities between the two NOTE_BEGINS sets.

    Command line: ``-in <model_file> -out <output_name> [-short] [-file]``.

    The JSON model named by ``-in`` is loaded and optionally pruned to the
    short term list (``-short``).  The vector sums plus the count- and
    weight-based similarities of each slice pair are handed to
    ``output_similarity_gnuplot`` with the prefixes ``<output_name>_sims``,
    ``<output_name>_sims_count`` and ``<output_name>_sims_weight``.  Slices
    whose count similarity exceeds 0.8 are additionally printed with their
    common terms and set resemblance.

    Exits with -1 on a bad argument count, a missing ``-in``/``-out`` or when
    NOTE_BEGINS does not hold exactly two sets; exits with -2 when a flag is
    missing its value.
    """

    # Did they provide the correct args?  Up to 7 entries are allowed so that
    # -short and -file can be given together.
    if len(sys.argv) < 5 or len(sys.argv) > 7:
        usage()
        sys.exit(-1)

    # Pre-set so a missing -in/-out is reported below instead of raising
    # UnboundLocalError at the ``is None`` checks.
    model_file = None
    output_name = None
    use_short_terms = False
    use_file_out = False

    # could use the stdargs parser, but that is meh.
    try:
        for idx in range(1, len(sys.argv)):
            arg = sys.argv[idx]
            if arg == "-in":
                model_file = sys.argv[idx + 1]
            elif arg == "-out":
                output_name = sys.argv[idx + 1]
            elif arg == "-short":
                use_short_terms = True
            elif arg == "-file":
                use_file_out = True
    except IndexError:
        # A flag that needs a value was the last argument.
        usage()
        sys.exit(-2)

    if model_file is None or output_name is None:
        usage()
        sys.exit(-1)

    if len(NOTE_BEGINS) != 2:
        sys.stderr.write("use this to compare two sets.\n")
        sys.exit(-1)

    # not building the model, only loading it.
    with open(model_file, 'r') as moin:
        results = loads(moin.read(), object_hook=boringmatrix.as_boring)

    # ----------------------------------------------------------------------
    # Compute the term weights.
    boringmatrix.fix_boringmatrix_dicts(results)

    first = NOTE_BEGINS[0]
    second = NOTE_BEGINS[1]

    print("number of slices: %d" % len(results[first]))

    # length of these is used to normalize
    term_list = boringmatrix.build_termlist(results)
    sterm_list = boringmatrix.build_termlist2(results)

    print("Full Dictionary: %d" % len(term_list))
    print("Short Dictionary: %d" % len(sterm_list))

    # ----------------------------------------------------------------------
    # Prune out low term counts; re-compute.
    if use_short_terms:
        for note in results:
            for start in results[note]:
                results[note][start].drop_not_in(sterm_list)
                results[note][start].compute()

    # ----------------------------------------------------------------------
    # Compute the cosine similarities.
    # YOU NEED TO CALL .compute() before this or you'll get garbage.
    vector_sums = boringmatrix.get_vectorsums(results, NOTE_BEGINS)

    count_cosine = {}
    weight_cosine = {}

    # ----------------------------------------------------------------------
    # Compute the similarity and counts for the given models as well as the
    # entropy.
    for start in results[first]:
        left = results[first][start]
        right = results[second][start]

        count_cosine[int(start)] = \
            boringmatrix.boring_count_similarity(left, right)
        weight_cosine[int(start)] = \
            boringmatrix.boring_weight_similarity(left, right)

    # Consider using a few panes.
    output_similarity_gnuplot(vector_sums,
                              "%s_sims" % output_name,
                              use_file_out)
    output_similarity_gnuplot(count_cosine,
                              "%s_sims_count" % output_name,
                              use_file_out)
    output_similarity_gnuplot(weight_cosine,
                              "%s_sims_weight" % output_name,
                              use_file_out)

    # Report the slices in which both sets look strongly alike.
    for start in count_cosine:
        if count_cosine[start] > 0.8:
            print(start)
            print(terms_in_common(results[first][start],
                                  results[second][start]))
            print(set_resemblance(results[first][start],
                                  results[second][start]))
            print("x" * 20)

コード例 #9

0

ファイルを表示

def main():
    """Plot per-slice similarities between the two NOTE_BEGINS sets.

    Command line: ``-in <model_file> -out <output_name> [-short] [-file]``.

    The JSON model named by ``-in`` is loaded and optionally pruned to the
    short term list (``-short``).  The vector sums plus the count- and
    weight-based similarities of each slice pair are handed to
    ``output_similarity_gnuplot`` with the prefixes ``<output_name>_sims``,
    ``<output_name>_sims_count`` and ``<output_name>_sims_weight``.  Slices
    whose count similarity exceeds 0.8 are additionally printed with their
    common terms and set resemblance.

    Exits with -1 on a bad argument count, a missing ``-in``/``-out`` or when
    NOTE_BEGINS does not hold exactly two sets; exits with -2 when a flag is
    missing its value.
    """

    # Did they provide the correct args?  Up to 7 entries are allowed so that
    # -short and -file can be given together.
    if len(sys.argv) < 5 or len(sys.argv) > 7:
        usage()
        sys.exit(-1)

    # Pre-set so a missing -in/-out is reported below instead of raising
    # UnboundLocalError at the ``is None`` checks.
    model_file = None
    output_name = None
    use_short_terms = False
    use_file_out = False

    # could use the stdargs parser, but that is meh.
    try:
        for idx in range(1, len(sys.argv)):
            arg = sys.argv[idx]
            if arg == "-in":
                model_file = sys.argv[idx + 1]
            elif arg == "-out":
                output_name = sys.argv[idx + 1]
            elif arg == "-short":
                use_short_terms = True
            elif arg == "-file":
                use_file_out = True
    except IndexError:
        # A flag that needs a value was the last argument.
        usage()
        sys.exit(-2)

    if model_file is None or output_name is None:
        usage()
        sys.exit(-1)

    if len(NOTE_BEGINS) != 2:
        sys.stderr.write("use this to compare two sets.\n")
        sys.exit(-1)

    # not building the model, only loading it.
    with open(model_file, 'r') as moin:
        results = loads(moin.read(), object_hook=boringmatrix.as_boring)

    # ----------------------------------------------------------------------
    # Compute the term weights.
    boringmatrix.fix_boringmatrix_dicts(results)

    first = NOTE_BEGINS[0]
    second = NOTE_BEGINS[1]

    print("number of slices: %d" % len(results[first]))

    # length of these is used to normalize
    term_list = boringmatrix.build_termlist(results)
    sterm_list = boringmatrix.build_termlist2(results)

    print("Full Dictionary: %d" % len(term_list))
    print("Short Dictionary: %d" % len(sterm_list))

    # ----------------------------------------------------------------------
    # Prune out low term counts; re-compute.
    if use_short_terms:
        for note in results:
            for start in results[note]:
                results[note][start].drop_not_in(sterm_list)
                results[note][start].compute()

    # ----------------------------------------------------------------------
    # Compute the cosine similarities.
    # YOU NEED TO CALL .compute() before this or you'll get garbage.
    vector_sums = boringmatrix.get_vectorsums(results, NOTE_BEGINS)

    count_cosine = {}
    weight_cosine = {}

    # ----------------------------------------------------------------------
    # Compute the similarity and counts for the given models as well as the
    # entropy.
    for start in results[first]:
        left = results[first][start]
        right = results[second][start]

        count_cosine[int(start)] = \
            boringmatrix.boring_count_similarity(left, right)
        weight_cosine[int(start)] = \
            boringmatrix.boring_weight_similarity(left, right)

    # Consider using a few panes.
    output_similarity_gnuplot(vector_sums, "%s_sims" % output_name,
                              use_file_out)
    output_similarity_gnuplot(count_cosine, "%s_sims_count" % output_name,
                              use_file_out)
    output_similarity_gnuplot(weight_cosine, "%s_sims_weight" % output_name,
                              use_file_out)

    # Report the slices in which both sets look strongly alike.
    for start in count_cosine:
        if count_cosine[start] > 0.8:
            print(start)
            print(terms_in_common(results[first][start],
                                  results[second][start]))
            print(set_resemblance(results[first][start],
                                  results[second][start]))
            print("x" * 20)

コード例 #10

0

ファイルを表示

ファイル: build_csv.py プロジェクト: pstrinkle/thesis-source

def main():
    """Write the term matrices of the model out as CSV files.

    Command line:
    ``-in <model_file> -out <output_name> [-short] [-ftm] [-mtm]``.

    * ``-mtm``: for every slice, merge the term counts of both NOTE_BEGINS
      sets into one matrix and write ``<output_name>_merged.csv`` (using the
      short term list when ``-short`` is also given).
    * ``-ftm``: write ``<output_name>_<note>_full.csv`` per set using the
      full term list.
    * ``-short`` alone: write ``<output_name>_<note>.csv`` per set using the
      short term list.

    With none of the three switches nothing is written.

    Raises:
        Exception: when ``-short`` and ``-ftm`` are combined.

    Exits with -1 on a bad argument count, a missing ``-in``/``-out`` or when
    NOTE_BEGINS does not hold exactly two sets; exits with -2 when a flag is
    missing its value.
    """

    # Did they provide the correct args?  Up to 7 entries are allowed so that
    # -short can be combined with -mtm, which the merge branch supports.
    if len(sys.argv) < 5 or len(sys.argv) > 7:
        usage()
        sys.exit(-1)

    # Pre-set so a missing -in/-out is reported below instead of raising
    # UnboundLocalError at the ``is None`` checks.
    model_file = None
    output_name = None
    use_short_terms = False
    full_term_matrix_out = False
    merged_term_matrix_out = False

    # could use the stdargs parser, but that is meh.
    try:
        for idx in range(1, len(sys.argv)):
            arg = sys.argv[idx]
            if arg == "-in":
                model_file = sys.argv[idx + 1]
            elif arg == "-out":
                output_name = sys.argv[idx + 1]
            elif arg == "-short":
                use_short_terms = True
            elif arg == "-ftm":
                full_term_matrix_out = True
            elif arg == "-mtm":
                merged_term_matrix_out = True
    except IndexError:
        # A flag that needs a value was the last argument.
        usage()
        sys.exit(-2)

    if model_file is None or output_name is None:
        usage()
        sys.exit(-1)

    # Fail fast, before the model is loaded and pruned.
    if use_short_terms and full_term_matrix_out:
        raise Exception("Cannot use short and full at the same time buddy")

    if len(NOTE_BEGINS) != 2:
        sys.stderr.write("use this to compare two sets.\n")
        sys.exit(-1)

    # not building the model, only loading it.
    with open(model_file, 'r') as moin:
        results = loads(moin.read(), object_hook=boringmatrix.as_boring)

    # ----------------------------------------------------------------------
    # Compute the term weights.
    boringmatrix.fix_boringmatrix_dicts(results)

    print("number of slices: %d" % len(results[NOTE_BEGINS[0]]))

    # length of these is used to normalize
    term_list = boringmatrix.build_termlist(results)
    sterm_list = boringmatrix.build_termlist2(results)

    print("Full Dictionary: %d" % len(term_list))
    print("Short Dictionary: %d" % len(sterm_list))

    # ----------------------------------------------------------------------
    # Prune out low term counts; re-compute.
    if use_short_terms:
        for note in results:
            for start in results[note]:
                results[note][start].drop_not_in(sterm_list)
                results[note][start].compute()

    # ----------------------------------------------------------------------
    # Output a CSV with a model built from merging both sets for each
    # t.  Using the short list, or whatever is set.
    if merged_term_matrix_out:
        merged = {}
        for start in results[NOTE_BEGINS[0]]:
            x = boringmatrix.BoringMatrix(None)

            for note in NOTE_BEGINS:
                slice_matrix = results[note][start].term_matrix
                for term in slice_matrix:
                    val = slice_matrix[term]

                    try:
                        x.term_matrix[term] += val
                    except KeyError:
                        x.term_matrix[term] = val

                    x.total_count += val

            if use_short_terms:
                x.drop_not_in(sterm_list)

            x.compute()
            merged[start] = x

        the_terms = sterm_list if use_short_terms else term_list
        boringmatrix.output_full_matrix(the_terms,
                                        merged,
                                        "%s_merged.csv" % output_name)
    elif full_term_matrix_out:
        for note in NOTE_BEGINS:
            output = "%s_%s_full.csv" % (output_name, note)
            boringmatrix.output_full_matrix(term_list,
                                            results[note],
                                            output)
    elif use_short_terms:
        for note in results:
            output = "%s_%s.csv" % (output_name, note)
            boringmatrix.output_full_matrix(sterm_list,
                                            results[note],
                                            output)

コード例 #11

0

ファイルを表示

def main():
    """Print a histogram of set resemblances between all model slices.

    Command line: ``-in <model_file> -out <output_name> [-short]``.

    The model is read from ``model_file`` and deserialized with ``loads``
    using ``boringmatrix.as_boring`` as the object hook.  When ``-short`` is
    given, every slice is pruned to the short term list and recomputed.  Each
    slice is then wrapped in a ``termset.TermSet`` and the resemblance of
    every unordered pair of slices (across all notes) is counted into buckets
    with upper bounds 0.1, 0.2, ..., 1.0.  The histogram is printed as JSON.

    ``-out`` is required by the argument check but ``output_name`` is not
    otherwise used by this entry point.

    Exits with -1 on a bad argument count, when ``-in`` or ``-out`` is
    missing, or when NOTE_BEGINS does not hold exactly two entries; exits
    with -2 when an option is the last argument and so has no value.
    """
    # Local import: only this entry point needs it.
    from itertools import combinations

    # Did they provide the correct args?
    if not 5 <= len(sys.argv) <= 6:
        usage()
        sys.exit(-1)

    # Pre-set so a missing option is detected below rather than raising
    # UnboundLocalError on first use.
    model_file = None
    output_name = None
    use_short_terms = False

    try:
        for idx, arg in enumerate(sys.argv[1:], 1):
            if arg == "-in":
                model_file = sys.argv[idx + 1]
            elif arg == "-out":
                output_name = sys.argv[idx + 1]
            elif arg == "-short":
                use_short_terms = True
    except IndexError:
        # An option was the last argument and has no value following it.
        usage()
        sys.exit(-2)

    if len(NOTE_BEGINS) != 2:
        sys.stderr.write("use this to compare two sets.\n")
        sys.exit(-1)

    if model_file is None or output_name is None:
        usage()
        sys.exit(-1)

    with open(model_file, 'r') as moin:
        results = loads(moin.read(), object_hook=boringmatrix.as_boring)

    # ----------------------------------------------------------------------
    # Compute the term weights.
    boringmatrix.fix_boringmatrix_dicts(results)

    print("number of slices: %d" % len(results[NOTE_BEGINS[0]]))

    term_list = boringmatrix.build_termlist(results)
    sterm_list = boringmatrix.build_termlist2(results)

    print("Full Dictionary: %d" % len(term_list))
    print("Short Dictionary: %d" % len(sterm_list))

    # ----------------------------------------------------------------------
    # Prune out low term counts; re-compute.
    if use_short_terms:
        for note in results:
            for start in results[note]:
                results[note][start].drop_not_in(sterm_list)
                results[note][start].compute()

    # ----------------------------------------------------------------------
    # Compare the two notes against each other at each t.
    # NOTE(review): termSets is not consumed further below; the computation
    # is kept so this pass still runs (and still fails on a slice missing
    # from the second note) exactly as before.
    first_note, second_note = NOTE_BEGINS[0], NOTE_BEGINS[1]
    termSets = {}
    for start in results[first_note]:
        set_a = termset.TermSet(results[first_note][start],
                                "%s.%s" % (first_note, str(start)))
        set_b = termset.TermSet(results[second_note][start],
                                "%s.%s" % (second_note, str(start)))

        termSets[start] = termset.set_resemblance(set_a, set_b)

    # ----------------------------------------------------------------------
    # Build one term set per (note, t) so every slice can be compared with
    # every other slice.
    termSetsFull = []
    for note in results:
        for start in results[note]:
            termSetsFull.append(termset.TermSet(results[note][start],
                                                "%s.%s" % (note, str(start))))

    # Count how often each resemblance value occurs over all unordered pairs.
    # combinations() yields pairs in the same (i, j > i) order as a nested
    # index loop, without storing the full pairwise matrix.
    resem_values = {}
    for left, right in combinations(termSetsFull, 2):
        value = termset.set_resemblance(left, right)
        resem_values[value] = resem_values.get(value, 0) + 1

    print("Resemblance Values Computed")

    resem_histogram = {0.1 : 0, 0.2 : 0, 0.3 : 0, 0.4 : 0, 0.5 : 0,
                       0.6 : 0, 0.7 : 0, 0.8 : 0, 0.9 : 0, 1.0 : 0}

    # A value lands in the first bucket whose upper bound it does not
    # exceed; anything above 0.9 goes into the 1.0 bucket.
    upper_bounds = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)

    for value, count in resem_values.items():
        for bound in upper_bounds:
            if value <= bound:
                resem_histogram[bound] += count
                break
        else:
            resem_histogram[1.0] += count

    print(dumps(resem_histogram, indent=4))

コード例 #12

0

ファイルを表示

ファイル: build_tfidf_docs.py プロジェクト: pstrinkle/thesis-source

def main():
    """Write top tf-idf and top term-frequency summaries for a stored model.

    Command line: ``-in <model_file> -out <output_name> [-short]``.

    The model is read from ``model_file`` and deserialized with ``loads``
    using ``boringmatrix.as_boring`` as the object hook.  When ``-short`` is
    given, every slice is pruned to the short term list and recomputed.
    Each (note, start) slice is then treated as one document, and the
    following files are produced:

    * ``<output_name>_top_tfidf.json`` -- result of
      ``vectorspace.top_terms_overall`` on the tf-idf values, limited to
      TOP_TERM_CNT.
    * ``<output_name>_top_tf.json`` -- result of
      ``vectorspace.top_terms_overall`` on the per-document term matrices,
      limited to 10% of the number of distinct terms.
    * ``<output_name>_<note>_tops.csv`` -- one per note, produced by
      ``boringmatrix.output_full_matrix`` after every slice has been pruned
      to the top term-frequency list.

    Exits with -1 on a bad argument count, when ``-in`` or ``-out`` is
    missing, or when NOTE_BEGINS does not hold exactly two entries; exits
    with -2 when an option is the last argument and so has no value.
    """
    # Did they provide the correct args?
    if not 5 <= len(sys.argv) <= 6:
        usage()
        sys.exit(-1)

    # Pre-set so a missing option is detected below rather than raising
    # UnboundLocalError on first use.
    model_file = None
    output_name = None
    use_short_terms = False

    try:
        for idx, arg in enumerate(sys.argv[1:], 1):
            if arg == "-in":
                model_file = sys.argv[idx + 1]
            elif arg == "-out":
                output_name = sys.argv[idx + 1]
            elif arg == "-short":
                use_short_terms = True
    except IndexError:
        # An option was the last argument and has no value following it.
        usage()
        sys.exit(-2)

    if len(NOTE_BEGINS) != 2:
        sys.stderr.write("use this to compare two sets.\n")
        sys.exit(-1)

    if model_file is None or output_name is None:
        usage()
        sys.exit(-1)

    with open(model_file, 'r') as moin:
        results = loads(moin.read(), object_hook=boringmatrix.as_boring)

    # ----------------------------------------------------------------------
    # Compute the term weights.
    boringmatrix.fix_boringmatrix_dicts(results)

    print("number of slices: %d" % len(results[NOTE_BEGINS[0]]))

    term_list = boringmatrix.build_termlist(results)
    sterm_list = boringmatrix.build_termlist2(results)

    print("Full Dictionary: %d" % len(term_list))
    print("Short Dictionary: %d" % len(sterm_list))

    # ----------------------------------------------------------------------
    # Prune out low term counts; re-compute.
    if use_short_terms:
        for note in results:
            for start in results[note]:
                results[note][start].drop_not_in(sterm_list)
                results[note][start].compute()

    # ----------------------------------------------------------------------
    # Treat every (note, start) slice as a document and collect its term
    # counts, its length, and the per-term document frequency.
    results_as_dict = {}
    doc_length = {}
    doc_freq = {}

    for note in results:
        for start in results[note]:
            slice_matrix = results[note][start]
            doc_id = "%s-%d" % (note, start)

            # Copy so the later drop_not_in() on the slice cannot alter the
            # per-document view.
            results_as_dict[doc_id] = slice_matrix.term_matrix.copy()
            doc_length[doc_id] = slice_matrix.total_count

            for term in results_as_dict[doc_id]:
                doc_freq[term] = doc_freq.get(term, 0) + 1

    invdoc_freq = vectorspace.calculate_invdf(len(results_as_dict), doc_freq)

    doc_tfidf = \
        vectorspace.calculate_tfidf(doc_length, results_as_dict, invdoc_freq)

    with open("%s_%s" % (output_name, "top_tfidf.json"), 'w') as fout:
        fout.write(dumps(vectorspace.top_terms_overall(doc_tfidf,
                                                       TOP_TERM_CNT),
                         indent=4))

    # Keep as many terms as 10% of the distinct-term count.
    top_terms_slist = \
        vectorspace.top_terms_overall(results_as_dict, int(len(doc_freq)*.10))

    with open("%s_%s" % (output_name, "top_tf.json"), 'w') as fout:
        fout.write(dumps(top_terms_slist, indent=4))

    # Restrict every slice to the top terms, then emit one matrix per note.
    for note in results:
        for start in results[note]:
            results[note][start].drop_not_in(top_terms_slist)
            results[note][start].compute()

        boringmatrix.output_full_matrix(top_terms_slist,
                                        results[note],
                                        "%s_%s_tops.csv" % (output_name, note))