# Ejemplo n.º 1
# 0
def split_train_test_highest(fname, codeprefixlen, mincodeoccurences, filtered_by):
    """Split the records of *fname* into train/test generator factories.

    Keeps only records whose codes (trimmed to *codeprefixlen*) occur at
    least *mincodeoccurences* times, then assigns roughly 10% of the
    surviving records to the test sample and the rest to training.

    Returns a 5-tuple:
        (train_generator, test_generator, elements_count, labels,
         elements_count)
    NOTE(review): elements_count appears twice to keep the historical
    call signature intact — confirm callers before collapsing it.
    """
    #prepare generators (factories: each call yields a fresh iterator)
    rec_generator = lambda: gen_record(fname, filtered_by)
    prefixed_rec_generator = lambda: gen_record_prefixed(rec_generator, codeprefixlen)
    prefix_code_generator = lambda: gen_lmc(prefixed_rec_generator)
    #generate labels
    PRINTER('generating labels...')
    labels = get_labels_min_occurence(prefix_code_generator, mincodeoccurences)
    PRINTER('labels generated:')
    PRINTER(str(labels))

    #gen filtered records:
    labelsset = set(labels)
    prefix_code_generator = lambda: gen_record_filteredbylabels(prefixed_rec_generator, labelsset)
    PRINTER('counting elements...')
    elements_count = len(list(prefix_code_generator()))
    PRINTER('number of elements' + str(elements_count))

    #split into training and testing samples
    PRINTER('splitting into training and testing...')
    train_inds, test_inds = randomly_divide(elements_count, int(elements_count / 10))
    train_generator = lambda: gen_record_fromshifts(prefix_code_generator, train_inds)
    test_generator = lambda: gen_record_fromshifts(prefix_code_generator, test_inds)
    PRINTER('splitted.')

    # The filtered stream is not modified after the first count above, so
    # the original re-count here was a redundant full pass and was dropped.
    return train_generator, test_generator, elements_count, labels, elements_count
# Ejemplo n.º 2
# 0
def words_freqs(records, features):
    #count each word in abstract and title:
    all_w = 0
    words = defaultdict(lambda: 0)
    for rec in gen_record(records, features+['mc']):
        words_l = text_to_words(" ".join([rec[f] for f in features]) )
        for c in words_l:
            words[c]+=1
            all_w+=1
    print "count of all words:", all_w
    print "words found:"
    w_sorted = sorted(list(words.iteritems()), key=lambda x:x[1], reverse=True)
    for k, v in w_sorted:
        print k, ":", v
def count_label_statistics(fname, fields):
    """
    Counts the following statistics and prints them. 
    D is the dataset filtered by the condition to contain all of the fields
    L is number of distinct labels in D.
    
    -Label cardinality:  the average number of labels of the examples in D
    -Label density: the average number of labels of the examples in D divided by |L|
    -Bolo11: the percentage of documents that contain at least 2 labels of the same 2 code prefix 
    -Bolo12: the percentage of documents that contain at least 2 labels of the same 3 code prefix 
    -Bolo21: the percentage of documents that contain at least 2 labels of different 2 code prefix 
    -Bolo22: the percentage of documents that contain at least 2 labels of different 3 code prefix 

    """

    all = 0
    labels = set()
    lc = 0
    ld = 0
    bolo11 = 0
    bolo12 = 0
    bolo21 = 0
    bolo22 = 0
    #count statistics
    for lmc in gen_lmc(lambda: gen_record(fname, fields), fields):
        all += 1
        for mc in lmc:
            labels.add(mc)
        lc += len(lmc)
        ld += len(lmc)
        if contains2of_same_prefix(lmc, 2):
            bolo11 += 1
        if contains2of_same_prefix(lmc, 3):
            bolo12 += 1
        if contains2of_diff_prefix(lmc, 2):
            bolo21 += 1
        if contains2of_diff_prefix(lmc, 3):
            bolo22 += 1
        print '[count_label_statistics]: lmc:', lmc, contains2of_same_prefix(
            lmc, 2), contains2of_same_prefix(lmc, 3), contains2of_diff_prefix(
                lmc, 2), contains2of_diff_prefix(lmc, 3)

    #print statistics
    print "lc:", lc / all
    print "ld:", ld / (all * len(labels))

    print "bolo11 contain at least 2 of same 2 code prefix:", bolo11 / all
    print "bolo12 contain at least 2 of same 3 code prefix:", bolo12 / all
    print "bolo21 contain at least 2 of diff 2 code prefix:", bolo21 / all
    print "bolo22 contain at least 2 of diff 3 code prefix:", bolo22 / all
def count_label_statistics(fname, fields):
    """
    Counts the following statistics and prints them. 
    D is the dataset filtered by the condition to contain all of the fields
    L is number of distinct labels in D.
    
    -Label cardinality:  the average number of labels of the examples in D
    -Label density: the average number of labels of the examples in D divided by |L|
    -Bolo11: the percentage of documents that contain at least 2 labels of the same 2 code prefix 
    -Bolo12: the percentage of documents that contain at least 2 labels of the same 3 code prefix 
    -Bolo21: the percentage of documents that contain at least 2 labels of different 2 code prefix 
    -Bolo22: the percentage of documents that contain at least 2 labels of different 3 code prefix 

    """
    
    
    all = 0
    labels = set()
    lc = 0
    ld = 0
    bolo11 = 0
    bolo12 = 0
    bolo21 = 0
    bolo22 = 0
    #count statistics
    for lmc in gen_lmc(lambda: gen_record(fname, fields), fields):
        all+=1
        for mc in lmc:
            labels.add(mc)
        lc += len(lmc)
        ld += len(lmc)
        if contains2of_same_prefix(lmc, 2):
            bolo11 += 1
        if contains2of_same_prefix(lmc, 3):
            bolo12 += 1
        if contains2of_diff_prefix(lmc, 2):
            bolo21 += 1
        if contains2of_diff_prefix(lmc, 3):
            bolo22 += 1
        print '[count_label_statistics]: lmc:', lmc, contains2of_same_prefix(lmc, 2), contains2of_same_prefix(lmc, 3), contains2of_diff_prefix(lmc, 2), contains2of_diff_prefix(lmc, 3)
        
    #print statistics
    print "lc:", lc/all
    print "ld:", ld/(all*len(labels))
    
    print "bolo11 contain at least 2 of same 2 code prefix:", bolo11/all
    print "bolo12 contain at least 2 of same 3 code prefix:", bolo12/all
    print "bolo21 contain at least 2 of diff 2 code prefix:", bolo21/all
    print "bolo22 contain at least 2 of diff 3 code prefix:", bolo22/all
def load_labels_codegen_elemcnt(fname, codeprefixlen, mincodeoccurences, filtered_by):
    """Build the filtered-record generator factory for *fname*.

    Filters records to those whose prefixed code occurs at least
    *mincodeoccurences* times, and returns
    (labels, labelsset, prefix_code_generator, elements_count):
    the frequent labels, the same labels as a set, a zero-argument factory
    producing a fresh iterator over the filtered records, and the number
    of filtered records.
    """
    # Generator factories — each call produces a fresh iterator.
    def raw_records():
        return gen_record(fname, filtered_by)

    def prefixed_records():
        return gen_record_prefixed(raw_records, codeprefixlen)

    def code_stream():
        return gen_lmc(prefixed_records)

    # Collect the labels that occur often enough.
    PRINTER('generating labels...')
    frequent_labels = get_labels_min_occurence(code_stream, mincodeoccurences)
    PRINTER('labels generated:')
    PRINTER(str(frequent_labels))

    # Restrict the record stream to those frequent labels.
    label_set = set(frequent_labels)

    def filtered_records():
        return gen_record_filteredbylabels(prefixed_records, label_set)

    PRINTER('counting elements...')
    record_count = len(list(filtered_records()))
    PRINTER('number of elements' + str(record_count))

    return frequent_labels, label_set, filtered_records, record_count
# Ejemplo n.º 6
# 0
def load_labels_codegen_elemcnt(fname, codeprefixlen, mincodeoccurences,
                                filtered_by):
    """Build the filtered-record generator factory for *fname*.

    Filters records to those whose prefixed code occurs at least
    *mincodeoccurences* times.

    Returns (labels, labelsset, prefix_code_generator, elements_count):
    the frequent labels, the same labels as a set, a zero-argument
    factory producing a fresh iterator over the filtered records, and
    the number of filtered records.
    """
    #prepare generators (factories: each call yields a fresh iterator)
    rec_generator = lambda: gen_record(fname, filtered_by)
    prefixed_rec_generator = lambda: gen_record_prefixed(
        rec_generator, codeprefixlen)
    prefix_code_generator = lambda: gen_lmc(prefixed_rec_generator)
    #generate labels
    PRINTER('generating labels...')
    labels = get_labels_min_occurence(prefix_code_generator, mincodeoccurences)
    PRINTER('labels generated:')
    PRINTER(str(labels))

    #gen filtered records:
    labelsset = set(labels)
    prefix_code_generator = lambda: gen_record_filteredbylabels(
        prefixed_rec_generator, labelsset)
    PRINTER('counting elements...')
    elements_count = len(list(prefix_code_generator()))
    PRINTER('number of elements' + str(elements_count))

    return labels, labelsset, prefix_code_generator, elements_count
    # NOTE(review): ~30 lines of unreachable code that followed this
    # return (printing undefined names such as `k`, `smoothingparam`,
    # `distancetrainingsteps`) were removed — they were a fragment of a
    # different script pasted after the function body by mistake.
# Ejemplo n.º 8
# 0
 except:
     print '5th argument: number of labels to consider'
     sys.exit(1)
 try:
     shuffling_cnt = int(sys.argv[6])
 except:
     print '6th argument: How many records to sample before the further filtering.'
     sys.exit(1)
 try:
     filtered_by = sys.argv[7:]
 except:
     print '7th argument: list of the fields to exist in considered records.'
     sys.exit(1)
 
 #prepare generators
 rec_generator_first = lambda: gen_record(fname, filtered_by)
 #choosing shuffling_cnt elements in random:
 PRINTER("shuffling in random")
 import random
 chosen_records = random.sample(list(rec_generator_first()), shuffling_cnt)
 rec_generator = lambda: chosen_records
 
 prefixed_rec_generator = lambda: gen_record_prefixed(rec_generator, codeprefixlen)
 prefix_code_generator = lambda: gen_lmc(prefixed_rec_generator)
 
 #generate labels
 PRINTER("generating labels...")
 labels_counts = get_labels_counts(prefix_code_generator, mincodeoccurences)
 #PRINTER("labels generated."
 #PRINTER(sorted(labels_counts, key = lambda x: x[1], reverse = True)
 biggest_labels = map(lambda x: x[0], sorted(labels_counts, key = lambda x: x[1], 
from zbl2py import record_read
from classifier_tester import LeaveOneOutAllCategories
from classifier_knn import KnnMatrixClassifier

import os
#lib_path = os.path.abspath(os.path.sep.join(['..', 'topic_classification']))
#sys.path.append(lib_path)

lib_path = os.path.abspath(
    os.path.sep.join(['..', '..', '..', 'document_classification']))
sys.path.append(lib_path)
from data_io.zbl_record_generators import gen_record, mc2lmc_tomka_blad

if __name__ == '__main__':
    records_file = sys.argv[1]
    test_samples = int(sys.argv[2])

    print "Arguments read:"
    print "records_file =", records_file
    print "test_samples =", test_samples

    frecords = lambda: gen_record(records_file, ['mc', 'ti', 'ab', 'au'])

    #loo = LeaveOneOutAllCategories(KnnMatrixClassifier, frecords, mc2lmc_tomka_blad)
    #corr = loo.test(test_samples)
    #print "Correctness:", corr
    print "---training a classifier..."
    knn = KnnMatrixClassifier(frecords, 7000, 100, mc2lmc_tomka_blad)
    print "---performing leave one out..."
    corr = knn.loo(test_samples)
    print "COrrectness:", corr
sys.path.append(r'../') 
from zbl2py import record_read
from classifier_tester import LeaveOneOutAllCategories
from classifier_knn import KnnMatrixClassifier

import os
#lib_path = os.path.abspath(os.path.sep.join(['..', 'topic_classification']))
#sys.path.append(lib_path)

lib_path = os.path.abspath(os.path.sep.join(['..', '..', '..', 'document_classification']))
sys.path.append(lib_path)
from data_io.zbl_record_generators import gen_record, mc2lmc_tomka_blad

if __name__ == '__main__':
    records_file = sys.argv[1]
    test_samples = int(sys.argv[2])
    
    print "Arguments read:"
    print "records_file =", records_file
    print "test_samples =", test_samples
    
    frecords = lambda: gen_record(records_file, ['mc', 'ti', 'ab', 'au'])

    #loo = LeaveOneOutAllCategories(KnnMatrixClassifier, frecords, mc2lmc_tomka_blad)
    #corr = loo.test(test_samples)
    #print "Correctness:", corr
    print "---training a classifier..."
    knn = KnnMatrixClassifier(frecords, 7000, 100, mc2lmc_tomka_blad)
    print "---performing leave one out..."
    corr = knn.loo(test_samples)
    print "COrrectness:", corr