import sys
from collections import defaultdict

from zbl2py import record_read


def print_codes_distribution(records):
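    """Print the distribution of category-code usage: for each occurrence count,
    the number of codes that occur that many times (MediaWiki-style table rows)."""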
    categs = defaultdict(lambda: 0)
    for rec in records:
        for c in rec['categories']:
            categs[c]+=1
    #count how many codes share each occurrence count:
    
    categs_occurences = defaultdict(lambda: 0)
    for _, v in categs.iteritems():
        categs_occurences[v]+=1
    
    msc_codes = sorted(list(categs_occurences.iteritems()), key=lambda x:x[1])
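    #rows in MediaWiki-style table markup: "| occurrence count || number of codes", then the row separator "|-"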
    for cd in msc_codes:
        print "|", cd[0], "||", cd[1]
        print "|-"

def print_docs_distribution(records):
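    """Print how many records carry a given number of category codes (MediaWiki-style table rows)."""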
    docs = defaultdict(lambda: 0)
    for rec in records:
        docs[len(rec['categories'])]+=1
    #print the distribution sorted by number of records:
    doc_distr = sorted(list(docs.iteritems()), key=lambda x:x[1])
    for cd in doc_distr:
        print "|", cd[0], "||", cd[1]
        print "|-"
    
    
if __name__ == "__main__":
    records = record_read.read_list_records(sys.argv[1])
    print_codes_distribution(records)
    print_docs_distribution(records)


import sys

import text2words  #tokenizer used below; its package location is not shown in the source
from wordsfreq import select_descriptive_words
from zbl2py import record_read


def calc_word_feats(s, words):
    """Calculate the number of occurrences in s of each word from words"""
    occurences = {}
    for w in words:
        occurences[w] = 0

    for w in text2words.text_to_words(s):
        if w in occurences:
            occurences[w] += 1

    return [occurences[w] for w in words]
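
#behaviour sketch (hypothetical values, assuming text2words.text_to_words splits text into words):
#  calc_word_feats("graph theory and graph colouring", ["graph", "ring"]) -> [2, 0]
#the counts follow the order of `words`, so the result can be used directly as a feature vector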


def convert_records_to_words(records, words):
    """Convert list of records into list of words counts"""
    for rec in records:
        try:
            title = rec['ti']
            descr = rec['descr']
            kw = " ".join(rec['kw'])

            feats = calc_word_feats(" ".join([title, descr, kw]), words)
            yield (feats, rec['categories'])
        except KeyError:
            #skip records missing the title, description or keyword fields
            continue


if __name__ == '__main__':
    words = select_descriptive_words.select_descriptive_words_quotientmethod(
        sys.argv[1], sys.argv[2], int(sys.argv[3]), float(sys.argv[4]))
    #the records file is the fifth command-line argument:
    for i in convert_records_to_words(
            record_read.read_list_records(sys.argv[5]), words):
        print i


import sys

from wordsfreq import select_descriptive_words
from features import records_to_words_weights_converter
from zbl2py import record_read
from classifier_tester import LeaveOneOutAllCategories
from classifier_svm import SvmWordsClassifier

if __name__ == '__main__':
    #read words that are most important:
    extr_fromfname = sys.argv[1]
    basefname = sys.argv[2]
    words_count = int(sys.argv[3])
    thresh_div = float(sys.argv[4])
    records_file = sys.argv[5]
    test_samples = int(sys.argv[6])
    
    print "Arguments read:"
    print "extr_fromfname =", extr_fromfname
    print "basefname =", basefname
    print "words_count =", words_count
    print "thresh_div =", thresh_div
    print "records_file =", records_file
    print "test_samples =", test_samples
    
    words = select_descriptive_words.select_descriptive_words_quotientmethod(
        extr_fromfname, basefname, words_count, thresh_div)
    #read records and convert them into feature vectors;
    #frecords is a list of (feature_vector, category_list) pairs:
    frecords = list(
        records_to_words_weights_converter.convert_records_to_words(
            record_read.read_list_records(records_file), words))
    #build a multi-label SVM on this data and evaluate it with leave-one-out:
    loo = LeaveOneOutAllCategories(SvmWordsClassifier, frecords)
    corr = loo.test(test_samples)
    print "Correctness:", corr
Example #5
import sys

from wordsfreq import select_descriptive_words
from features import records_to_words_weights_converter
from zbl2py import record_read
from classifier_tester import LeaveOneOut  #assumed: defined alongside LeaveOneOutAllCategories
from classifier_tree import TreeSingleTagWordsClassifier
from mainleave1out_biggestcategory_svm import extract_most_common_categ

if __name__ == '__main__':
    #read words that are most important:
    extr_fromfname = sys.argv[1]
    basefname = sys.argv[2]
    words_count = int(sys.argv[3])
    thresh_div = float(sys.argv[4])
    records_file = sys.argv[5]
    test_samples = int(sys.argv[6])
    
    print "Arguments read:"
    print "extr_fromfname =", extr_fromfname
    print "basefname =", basefname
    print "words_count =", words_count
    print "thresh_div =", thresh_div
    print "records_file =", records_file
    print "test_samples =", test_samples
    
    words = select_descriptive_words.select_descriptive_words_quotientmethod(
        extr_fromfname, basefname, words_count, thresh_div)
    #read records and convert them into feature vectors:
    frecords = list(
        records_to_words_weights_converter.convert_records_to_words(
            record_read.read_list_records(records_file), words))
    #find the most common category and build a binary tree classifier for it:
    most_common_categ, max_cnt = extract_most_common_categ(frecords)
    print "Most common category is:", most_common_categ, " with ", max_cnt, " occurences."
    
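    #each sample is labelled 1 or 0 by whether the most common category is present;
    #LeaveOneOut is assumed to train on the remaining samples and test on the held-out one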
    loo = LeaveOneOut(
        lambda samples: TreeSingleTagWordsClassifier(
            most_common_categ, samples, featurenames=words),
        frecords,
        lambda x: [int(most_common_categ in x[1])])
    corr = loo.test(test_samples)
    print "Correctness:", corr
Example #6
import sys

from wordsfreq import select_descriptive_words
from features import records_to_words_weights_converter
from zbl2py import record_read
from classifier_tester import KFold  #assumed: defined alongside the leave-one-out testers
from classifier_tree import TreeSingleTagWordsClassifier
from mainleave1out_biggestcategory_svm import extract_most_common_categ

if __name__ == '__main__':
    #read words that are most important:
    extr_fromfname = sys.argv[1]
    basefname = sys.argv[2]
    words_count = int(sys.argv[3])
    thresh_div = float(sys.argv[4])
    records_file = sys.argv[5]
    k = int(sys.argv[6])
    
    print "Arguments read:"
    print "extr_fromfname =", extr_fromfname
    print "basefname =", basefname
    print "words_count =", words_count
    print "thresh_div =", thresh_div
    print "records_file =", records_file
    print "k =", k
    
    records = record_read.read_list_records(records_file)
    words = select_descriptive_words.select_descriptive_words_keywords(records)
    #words = select_descriptive_words.select_descriptive_words_quotientmethod(extr_fromfname, basefname, words_count, thresh_div)
    #convert records into feature vectors:
    frecords = list(
        records_to_words_weights_converter.convert_records_to_words(records, words))
    #find the most common category and build a binary tree classifier for it:
    most_common_categ, max_cnt = extract_most_common_categ(frecords)
    print "Most common category is:", most_common_categ, "with", max_cnt, "occurrences."
    
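    #same binary labelling as in the leave-one-out variant, here evaluated over k folds: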
    kf = KFold(
        lambda samples: TreeSingleTagWordsClassifier(
            most_common_categ, samples, featurenames=words),
        frecords,
        lambda x: [int(most_common_categ in x[1])],
        k)
    corr = kf.test()
    print "Correctness:", corr


from collections import defaultdict

#field that holds a record's category codes (name assumed from the other snippets):
category_field_name = 'categories'


def filter_out_rare_codes_records(records, thresh_categs_count):
    """Remove categories that occur fewer than thresh_categs_count times,
    then drop records left without any categories."""
    #count occurrences of every category:
    categs = defaultdict(lambda: 0)
    for rec in records:
        for c in rec[category_field_name]:
            categs[c] += 1
    #collect the categories that are too rare:
    to_del = []
    for c, v in categs.iteritems():
        if v < thresh_categs_count:
            to_del.append(c)
    for c in to_del:
        categs.pop(c)
    print "len of categs after filtering rare ones:", len(categs)
    #delete rare categories from records:
    r_to_del = []
    for rec in records:
        to_del = []
        for c in rec[category_field_name]:
            if c not in categs:
                to_del.append(c)
        for c in to_del:
            rec[category_field_name].remove(c)
        if len(rec[category_field_name]) == 0:
            r_to_del.append(rec)
    #delete records left without any codes:
    print "len of records before filtering those without codes:", len(records)
    for rec in r_to_del:
        records.remove(rec)
    print "len of records after filtering those without codes:", len(records)
    return records

if __name__ == "__main__":
    import sys
    sys.path.append(r'../')
    from zbl2py import record_read, record_store

    records = filter_out_rare_codes_records(
        record_read.read_list_records(sys.argv[1]), int(sys.argv[3]))
    record_store.store_py_records(records, sys.argv[2])