def get_feature_avg(svm_in):
    """Average the exponentiated feature values of relevant and irrelevant records.

    Records are read with ``load_svm_feature(svm_in)``.  A record counts as
    relevant when its ``'score'`` is greater than 0 and as irrelevant
    otherwise.

    Every feature value ``v`` is first replaced, in place in the record's
    ``'feature'`` dict, by ``0`` when ``v < -20`` and by ``math.exp(v)``
    otherwise (presumably the stored values are log-domain scores -- confirm
    against the feature extractor).

    The transformed dicts are folded together per group with
    ``add_svm_feature`` (assumed to be a key-wise sum -- confirm), and each
    accumulated value is then divided by the number of records in its group.

    :param svm_in: path handed to ``load_svm_feature``
    :return: tuple ``(h_rel_feature, h_irrel_feature)``
    """
    h_rel_feature, h_irrel_feature = {}, {}
    rel_cnt = irrel_cnt = 0

    for record in load_svm_feature(svm_in):
        label = record['score']
        h_feature = record['feature']

        # Overwrite each value in place; the key set is left untouched.
        for name in list(h_feature):
            raw = h_feature[name]
            h_feature[name] = 0 if raw < -20 else math.exp(raw)

        if label > 0:
            h_rel_feature = add_svm_feature(h_rel_feature, h_feature)
            rel_cnt += 1
        else:
            h_irrel_feature = add_svm_feature(h_irrel_feature, h_feature)
            irrel_cnt += 1

    # Float totals keep the division fractional under Python 2 as well.
    # A group with no records has an empty dict, so it is never divided by 0.
    totals = (
        (h_rel_feature, float(rel_cnt)),
        (h_irrel_feature, float(irrel_cnt)),
    )
    for h_sum, total in totals:
        for name in h_sum:
            h_sum[name] /= total

    return h_rel_feature, h_irrel_feature
def _load_svm_ltr_feature(self):
    """Fill ``self.h_qid_docno_ltr_feature`` from the file at ``self.ltr_f_in``.

    Each record returned by ``load_svm_feature`` contributes one entry: the
    key is the record's ``'qid'`` and ``'comment'`` joined by a tab (the
    comment field is used as the docno), and the value is the record's
    ``'feature'`` entry.  An existing entry with the same key is overwritten.
    """
    logging.info('loading svm ltr feature [%s]', self.ltr_f_in)

    for record in load_svm_feature(self.ltr_f_in):
        qid = record['qid']
        docno = record['comment']
        h_feature = record['feature']
        pair_key = qid + '\t' + docno
        self.h_qid_docno_ltr_feature[pair_key] = h_feature

    pair_cnt = len(self.h_qid_docno_ltr_feature)
    logging.info('loaded [%d] pairs of pre extracted ltr feature', pair_cnt)
def load_multiple_svm_and_feature(svm_files_in):
    """Load several svm feature files together with their feature-name maps.

    ``svm_files_in`` is a text file with one entry per line.  Only the first
    tab-separated column of each line is used; it is the path of an svm
    feature file.  For every such path ``p`` the feature data is read with
    ``load_svm_feature(p)`` and the feature-name mapping is read as JSON from
    ``p + '_name.json'``.

    All svm files are loaded before any of the JSON files, so a missing svm
    file is reported before any name file is touched.

    :param svm_files_in: path of the list file
    :return: tuple ``(ll_svm_data, l_h_feature_name)``; both lists follow the
        line order of ``svm_files_in``
    """
    # Context managers guarantee every handle is closed, even when a later
    # file is missing or holds malformed JSON.
    with open(svm_files_in) as list_in:
        l_name_fields = list_in.read().splitlines()
    l_names = [line.split('\t')[0] for line in l_name_fields]

    ll_svm_data = [load_svm_feature(name) for name in l_names]

    l_h_feature_name = []
    for name in l_names:
        with open(name + '_name.json') as name_in:
            l_h_feature_name.append(json.load(name_in))

    return ll_svm_data, l_h_feature_name
import sys

# Convert an svm feature file to CSV.
# Usage: <script> <svm in> <feature name json in> <csv out>
# Single-argument print(...) calls behave the same on Python 2 and Python 3.
if 4 != len(sys.argv):
    print("convert svm feature to csv format")
    print("3 para: svm in + feature name in + out")
    sys.exit(-1)

# The JSON maps feature name -> value; header columns follow ascending value.
with open(sys.argv[2]) as name_in:
    h_feature_name = json.load(name_in)
l_feature_name = [
    item[0]
    for item in sorted(h_feature_name.items(), key=lambda item: item[1])
]
head_str = 'qid,docno,label,' + ','.join(l_feature_name)

# The with-block closes the output file even if a record fails to format.
with open(sys.argv[3], 'w') as out:
    out.write(head_str + '\n')
    l_svm_data = load_svm_feature(sys.argv[1])
    for svm_data in l_svm_data:
        # The record's 'comment' field fills the docno column.
        line = svm_data['qid'] + ',' + svm_data[
            'comment'] + ',%d,' % svm_data['score']
        h_feature = svm_data['feature']
        # NOTE(review): rows are ordered by feature key while the header is
        # ordered by the name map's values; the columns only line up if every
        # record carries exactly the keys listed in the name map -- confirm.
        l_feature = sorted(h_feature.items(), key=lambda item: item[0])
        line += ','.join('%.4f' % item[1] for item in l_feature)
        out.write(line + '\n')

print("finished")
from knowledge4ir.utils import load_svm_feature, dump_svm_feature
import sys

# Read an svm feature file and write it back out through dump_svm_feature.
# Usage: <script> <svm in> <out>
if len(sys.argv) != 3:
    print("svm in + out")
    sys.exit()

svm_in, svm_out = sys.argv[1], sys.argv[2]
dump_svm_feature(load_svm_feature(svm_in), svm_out)