Example #1
0
def stage2():
    with open("../working/tX.dat", "rb") as picklefile:
        X = cPickle.load(picklefile)

        # Transform X to tf-idf
    bin_word_counter = pruning.WordCounter(X, binary=True)
    similarity.transform_tfidf(X, bin_word_counter)
    del bin_word_counter
    ##Save state
    with open("../working/tX.dat", "wb") as picklefile:
        cPickle.dump(X, picklefile, -1)
Example #2
0
def stage2():
    with open("../working/tX.dat", 'rb') as picklefile:
        X = cPickle.load(picklefile)

    # Transform X to tf-idf
    bin_word_counter = pruning.WordCounter(X, binary=True)
    similarity.transform_tfidf(X, bin_word_counter)
    del bin_word_counter
    ##Save state
    with open("../working/tX.dat", 'wb') as picklefile:
        cPickle.dump(X, picklefile, -1)
Example #3
0
def onego_main():
    # Test on toyset.
    preproc.subset("../raw_data/train.csv", "../data/train.csv", 1, 200000)

    # Load toyset .csv -> X & Y
    X, Y = preproc.extract_XY("../data/train.csv")

    # Prune corpora
    label_counter = pruning.LabelCounter(Y)
    word_counter = pruning.WordCounter(X)
    label_counter.prune(no_below=2, no_above=1.0, max_n=None)
    word_counter.prune(no_below=2, no_above=0.4, max_n=None)  # assume balanced
    pruning.prune_corpora(X, Y, label_counter, word_counter)
    del word_counter  # free up memory

    # Transform X to tf-idf
    bin_word_counter = pruning.WordCounter(X, binary=True)
    similarity.transform_tfidf(X, bin_word_counter)
    del bin_word_counter  # free up memory

    # Load hierarchy (parents & children indices)
    parents_index = preproc.extract_parents(Y, "../raw_data/hierarchy.txt")
    children_index = preproc.inverse_index(parents_index)

    # CV-split X & Y (using default params)
    v_X, v_Y, t_X, t_Y = cv.prop_sample_CV(X=X, Y=Y)
    del X, Y  # free up memory

    # Obtain k-NN scores & pscores, predict, and calculate F1!
    k = 70
    w1, w2, w3, w4 = 3.4, 0.6, 0.8, 0.2
    alpha = 0.9
    cat_pns = evaluation.CategoryPNCounter()
    for d_i, labels_i in izip(v_X, v_Y):
        scores, pscores = similarity.cossim(d_i, t_X, k, t_Y, parents_index,
                                            children_index)
        ranks = similarity.optimized_ranks(scores, pscores, label_counter, w1,
                                           w2, w3, w4)
        predicted_labels = similarity.predict(ranks, alpha)
        cat_pns.fill_pns(predicted_labels, labels_i)
    cat_pns.calculate_cat_pr()
    MaF = cat_pns.calculate_MaF()

    print "MaF:", MaF
Example #4
0
def onego_main():
    # Test on toyset.
    preproc.subset("../raw_data/train.csv", "../data/train.csv", 1, 200000)

    # Load toyset .csv -> X & Y
    X, Y = preproc.extract_XY("../data/train.csv")

    # Prune corpora
    label_counter = pruning.LabelCounter(Y)
    word_counter = pruning.WordCounter(X)
    label_counter.prune(no_below=2, no_above=1.0, max_n=None)
    word_counter.prune(no_below=2, no_above=0.4, max_n=None)  # assume balanced
    pruning.prune_corpora(X, Y, label_counter, word_counter)
    del word_counter  # free up memory

    # Transform X to tf-idf
    bin_word_counter = pruning.WordCounter(X, binary=True)
    similarity.transform_tfidf(X, bin_word_counter)
    del bin_word_counter  # free up memory

    # Load hierarchy (parents & children indices)
    parents_index = preproc.extract_parents(Y, "../raw_data/hierarchy.txt")
    children_index = preproc.inverse_index(parents_index)

    # CV-split X & Y (using default params)
    v_X, v_Y, t_X, t_Y = cv.prop_sample_CV(X=X, Y=Y)
    del X, Y  # free up memory

    # Obtain k-NN scores & pscores, predict, and calculate F1!
    k = 70
    w1, w2, w3, w4 = 3.4, 0.6, 0.8, 0.2
    alpha = 0.9
    cat_pns = evaluation.CategoryPNCounter()
    for d_i, labels_i in izip(v_X, v_Y):
        scores, pscores = similarity.cossim(d_i, t_X, k, t_Y, parents_index, children_index)
        ranks = similarity.optimized_ranks(scores, pscores, label_counter, w1, w2, w3, w4)
        predicted_labels = similarity.predict(ranks, alpha)
        cat_pns.fill_pns(predicted_labels, labels_i)
    cat_pns.calculate_cat_pr()
    MaF = cat_pns.calculate_MaF()

    print "MaF:", MaF