def process(options, trainCollection, annotationfile, feature, modelName):
    """Train one assembled model per concept and save it under the Models dir.

    ``annotationfile`` is a text file listing one annotation name per line;
    blank lines and lines starting with ``#`` are ignored.  For every concept
    still to be processed, one classifier is trained per listed annotation
    and the classifiers are merged through ``add_fastsvm`` with weights
    ``(1 - 1/t, 1/t)`` for the t-th annotation.  Each merged model is saved as
    ``<rootpath>/<trainCollection>/Models/<annotation file name>/<feature>/<model>/<concept>.model``.

    Args:
        options: object exposing ``rootpath``, ``overwrite``, ``numjobs`` and
            ``job`` (plus ``nr_bins`` when ``modelName`` is ``'fik'``).
        trainCollection: name of the training collection under ``rootpath``.
        annotationfile: path of the file listing the annotation names.
        feature: feature name, a sub-directory of ``FeatureData``.
        modelName: ``'fik'`` or ``'fastlinear'``.

    Returns:
        0 when nothing is trained (no annotation listed, a listed annotation
        file is missing, or no concept is left for this job); None after a
        completed run.

    Raises:
        AssertionError: if ``modelName`` is not one of the supported models.
        ZeroDivisionError: if an annotation has no positive or no negative
            label for a concept (the class weights divide by both counts).
    """
    assert modelName in ['fik', 'fastlinear']
    rootpath = options.rootpath
    autoweight = 1  # options.autoweight
    beta = 0.5
    C = 1
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job

    # Validate the annotation list before loading the SVM back-end, so that
    # bad input is reported without paying for the imports / minmax parsing.
    with open(annotationfile) as fin:
        stripped_lines = [line.strip() for line in fin]
    trainAnnotationNames = [x for x in stripped_lines if x and not x.startswith('#')]
    if not trainAnnotationNames:
        # Previously this fell through to trainAnnotationNames[0] -> IndexError.
        print('%s lists no annotations' % annotationfile)
        return 0
    for annotationName in trainAnnotationNames:
        conceptfile = os.path.join(rootpath, trainCollection, 'Annotations', annotationName)
        if not os.path.exists(conceptfile):
            print('%s does not exist' % conceptfile)
            return 0
    newAnnotationName = os.path.split(annotationfile)[-1]

    params = {'rootpath': rootpath, 'model': modelName}
    if 'fik' == modelName:
        from svms.fiksvm.svmutil import svm_train as train_model
        from svms.fiksvm.fiksvm import svm_to_fiksvm as compress_model
        from svms.fiksvm.fiksvm import fiksvm_save_model as save_model
        from svms.fiksvm.svm import KERNEL_TYPE

        nr_bins = options.nr_bins
        # The number of bins becomes part of the model directory name.
        modelName += str(nr_bins)
        params['nr_bins'] = nr_bins
        minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData', feature, 'minmax.txt')
        # First line: per-dimension minima; second line: per-dimension maxima.
        with open(minmax_file, 'r') as f:
            params['min_vals'] = [float(v) for v in f.readline().split()]
            params['max_vals'] = [float(v) for v in f.readline().split()]
    else:
        from svms.fastlinear.liblinear193.python.liblinearutil import train as train_model
        from svms.fastlinear.fastlinear import liblinear_to_fastlinear as compress_model
        from svms.fastlinear.fastlinear import fastlinear_save_model as save_model

    # The concept list is taken from the first annotation only.
    concepts = readConcepts(trainCollection, trainAnnotationNames[0], rootpath=rootpath)

    resultdir = os.path.join(rootpath, trainCollection, 'Models', newAnnotationName, feature, modelName)
    todo = []
    for concept in concepts:
        resultfile = os.path.join(resultdir, concept + '.model')
        if not checkToSkip(resultfile, overwrite):
            todo.append(concept)
    # Keep only this job's share of the remaining concepts (job is 1-based).
    todo = [concept for i, concept in enumerate(todo) if i % numjobs == (job - 1)]
    printStatus(INFO, 'to process %d concepts: %s' % (len(todo), ' '.join(todo)))
    if not todo:
        return 0

    train_feat_file = BigFile(os.path.join(rootpath, trainCollection, 'FeatureData', feature))
    feat_dim = train_feat_file.ndims

    s_time = time.time()

    for concept in todo:
        assemble_model = None
        for t, annotationName in enumerate(trainAnnotationNames, 1):
            names, labels = readAnnotationsFrom(trainCollection, annotationName, concept,
                                                skip_0=True, rootpath=rootpath)
            name2label = dict(zip(names, labels))
            renamed, vectors = train_feat_file.read(names)
            Ys = [name2label[x] for x in renamed]

            # NOTE(review): the counts come from the annotation labels, not
            # from Ys; they would differ if read() returns fewer names than
            # requested -- confirm against BigFile.read.
            n_pos = sum(1 for lab in labels if lab == 1)
            n_neg = sum(1 for lab in labels if lab == -1)
            total = n_pos + n_neg
            # With beta = 0.5 both classes get the same total weight.
            wp = float(beta) * total / n_pos
            wn = (1.0 - beta) * total / n_neg

            if autoweight:
                svm_params = '-w1 %g -w-1 %g' % (wp * C, wn * C)
            else:
                svm_params = '-c %g' % C

            if modelName.startswith('fik'):
                svm_params += ' -s 0 -t %d' % KERNEL_TYPE.index("HI")
            else:
                svm_params += ' -s 2 -B -1 '

            g_t = train_model(Ys, vectors, svm_params + ' -q')
            if t == 1:
                assemble_model = compress_model([g_t], [1.0], feat_dim, params)
            else:
                new_model = compress_model([g_t], [1.0], feat_dim, params)
                assemble_model.add_fastsvm(new_model, 1 - 1.0 / t, 1.0 / t)

        new_model_file = os.path.join(resultdir, '%s.model' % concept)
        makedirsforfile(new_model_file)
        printStatus(INFO, 'save model to %s' % new_model_file)
        save_model(new_model_file, assemble_model)
        printStatus(INFO, '%s done' % concept)

    timecost = time.time() - s_time
    writeConceptsTo(concepts, trainCollection, newAnnotationName, rootpath)
    printStatus(INFO, 'done for %g concepts: %s' % (len(todo), ' '.join(todo)))
    printStatus(INFO, 'models stored at %s' % resultdir)
    printStatus(INFO, '%g seconds in total' % timecost)
def learn(concept, params):
    """Train an assembled classifier for ``concept`` over several iterations.

    The first iteration trains on the positives and negatives of the start
    annotation.  Every later iteration scores the positives and a random
    sample of the negative pool with the current assembled model, stops early
    when the mean of the two error rates drops below ``MIN_ERROR_RATE``, and
    otherwise picks a new negative bag via ``NegativeBootstrap.sampling``
    before training the next classifier.  The t-th classifier is merged into
    the assembled model through ``add_fastsvm`` with weights ``(1 - 1/t, 1/t)``.

    Args:
        concept: the concept to learn.
        params: dict with the keys ``rootpath``, ``trainCollection``,
            ``baseAnnotationName``, ``startAnnotationName``, ``strategy``,
            ``feat_file``, ``npr``, ``iterations`` and ``model``; it is also
            passed on to ``compress_model``.

    Returns:
        The assembled model, or None when ``iterations`` is smaller than 1.

    Raises:
        ZeroDivisionError: if the positive bag, the current negative bag or
            the sampled negative set is empty.

    NOTE(review): ``train_model``, ``compress_model`` and ``KERNEL_TYPE`` are
    resolved as module-level names here -- confirm they are bound at module
    scope and not only inside other functions.
    """
    rootpath = params['rootpath']
    trainCollection = params['trainCollection']
    baseAnnotationName = params['baseAnnotationName']
    startAnnotationName = params['startAnnotationName']
    strategy = params['strategy']
    feat_file = params['feat_file']
    feat_dim = feat_file.ndims
    npr = params['npr']
    iterations = params['iterations']
    beta = 0.5
    C = 1

    # Solver options do not change between iterations.
    if 'fik' == params['model']:
        solver_opts = ' -s 0 -t %d' % KERNEL_TYPE.index("HI")
    else:
        solver_opts = ' -s 2'

    names, labels = readAnnotationsFrom(trainCollection, startAnnotationName, concept,
                                        skip_0=True, rootpath=rootpath)
    positive_bag = [name for name, lab in zip(names, labels) if lab > 0]
    negative_bag = [name for name, lab in zip(names, labels) if lab < 0]

    names, labels = readAnnotationsFrom(trainCollection, baseAnnotationName, concept,
                                        skip_0=True, rootpath=rootpath)
    negative_pool = [name for name, lab in zip(names, labels) if lab < 0]

    # Sample size: npr negatives per positive, clamped to [5000, 10000] and
    # never more than the pool holds.
    Usize = min(10000, max(5000, len(positive_bag) * npr), len(negative_pool))

    # Bound before the loop so that iterations < 1 returns None instead of
    # raising UnboundLocalError at the return statement.
    assemble_model = None

    for t in range(1, iterations + 1):
        printStatus(INFO, 'iter %d (%s)' % (t, concept))
        if t > 1:  # select relevant negative examples
            # check how good at classifying positive training examples
            results = classify_large_data(assemble_model, positive_bag, feat_file)
            pos_error_rate = len([1 for x in results if x[1] < 0]) / float(len(results))

            U = random.sample(negative_pool, Usize)
            predictions = classify_large_data(assemble_model, U, feat_file)
            neg_error_rate = len([1 for x in predictions if x[1] > 0]) / float(len(predictions))

            error_rate = (pos_error_rate + neg_error_rate) / 2.0

            printStatus(INFO, 'iter %d: %s %.3f -> %s %.3f, pe=%.3f, ne=%.3f, error=%.3f' % (
                t, predictions[-1][0], predictions[-1][1],
                predictions[0][0], predictions[0][1],
                pos_error_rate, neg_error_rate, error_rate))
            if error_rate < MIN_ERROR_RATE:
                printStatus(INFO, 'hit stop criteria: error (%.3f) < MIN_ERROR_RATE (%.3f)' % (
                    error_rate, MIN_ERROR_RATE))
                break

            # Assume that 1% of the randomly sampled set is truly positive and
            # that the classifier ranks those at the top, so skip them.
            # (presumably predictions are sorted by descending score -- TODO confirm)
            nr_of_estimated_pos = int(len(predictions) * 0.01)
            negative_bag = NegativeBootstrap.sampling(predictions[nr_of_estimated_pos:], strategy,
                                                      max(1000, len(positive_bag)))

        new_names = positive_bag + negative_bag
        new_labels = [1] * len(positive_bag) + [-1] * len(negative_bag)
        name2label = dict(zip(new_names, new_labels))
        renamed, vectors = feat_file.read(new_names)
        Ys = [name2label[x] for x in renamed]

        n_pos = len([1 for y in Ys if y > 0])
        n_neg = len([1 for y in Ys if y < 0])
        # Every requested example must have come back from the feature file.
        assert len(positive_bag) == n_pos
        assert len(negative_bag) == n_neg
        # With beta = 0.5 both classes get the same total weight.
        wp = float(beta) * (n_pos + n_neg) / n_pos
        wn = (1.0 - beta) * (n_pos + n_neg) / n_neg
        svm_params = '-w1 %g -w-1 %g' % (wp * C, wn * C) + solver_opts

        g_t = train_model(Ys, vectors, svm_params + ' -q')
        if t == 1:
            assemble_model = compress_model([g_t], [1.0], feat_dim, params)
        else:
            new_model = compress_model([g_t], [1.0], feat_dim, params)
            assemble_model.add_fastsvm(new_model, 1 - 1.0 / t, 1.0 / t)

    return assemble_model
def process(options, trainCollection, annotationfile, feature, modelName):
    """Train one assembled model per concept and save it under the Models dir.

    ``annotationfile`` is a text file listing one annotation name per line;
    blank lines and lines starting with ``#`` are ignored.  For every concept
    still to be processed, one classifier is trained per listed annotation
    and the classifiers are merged through ``add_fastsvm`` with weights
    ``(1 - 1/t, 1/t)`` for the t-th annotation.  Each merged model is saved as
    ``<rootpath>/<trainCollection>/Models/<annotation file name>/<feature>/<model>/<concept>.model``.

    Args:
        options: object exposing ``rootpath``, ``overwrite``, ``numjobs`` and
            ``job`` (plus ``nr_bins`` when ``modelName`` is ``'fik'``).
        trainCollection: name of the training collection under ``rootpath``.
        annotationfile: path of the file listing the annotation names.
        feature: feature name, a sub-directory of ``FeatureData``.
        modelName: ``'fik'`` or ``'fastlinear'``.

    Returns:
        0 when nothing is trained (no annotation listed, a listed annotation
        file is missing, or no concept is left for this job); None after a
        completed run.

    Raises:
        AssertionError: if ``modelName`` is not one of the supported models.
        ZeroDivisionError: if an annotation has no positive or no negative
            label for a concept (the class weights divide by both counts).
    """
    assert modelName in ['fik', 'fastlinear']
    rootpath = options.rootpath
    autoweight = 1  # options.autoweight
    beta = 0.5
    C = 1
    overwrite = options.overwrite
    numjobs = options.numjobs
    job = options.job

    # Validate the annotation list before loading the SVM back-end, so that
    # bad input is reported without paying for the imports / minmax parsing.
    with open(annotationfile) as fin:
        stripped_lines = [line.strip() for line in fin]
    trainAnnotationNames = [x for x in stripped_lines if x and not x.startswith('#')]
    if not trainAnnotationNames:
        # Previously this fell through to trainAnnotationNames[0] -> IndexError.
        print('%s lists no annotations' % annotationfile)
        return 0
    for annotationName in trainAnnotationNames:
        conceptfile = os.path.join(rootpath, trainCollection, 'Annotations', annotationName)
        if not os.path.exists(conceptfile):
            print('%s does not exist' % conceptfile)
            return 0
    newAnnotationName = os.path.split(annotationfile)[-1]

    params = {'rootpath': rootpath, 'model': modelName}
    if 'fik' == modelName:
        from svms.fiksvm.svmutil import svm_train as train_model
        from svms.fiksvm.fiksvm import svm_to_fiksvm as compress_model
        from svms.fiksvm.fiksvm import fiksvm_save_model as save_model
        from svms.fiksvm.svm import KERNEL_TYPE

        nr_bins = options.nr_bins
        # The number of bins becomes part of the model directory name.
        modelName += str(nr_bins)
        params['nr_bins'] = nr_bins
        minmax_file = os.path.join(rootpath, trainCollection, 'FeatureData', feature, 'minmax.txt')
        # First line: per-dimension minima; second line: per-dimension maxima.
        with open(minmax_file, 'r') as f:
            params['min_vals'] = [float(v) for v in f.readline().split()]
            params['max_vals'] = [float(v) for v in f.readline().split()]
    else:
        from svms.fastlinear.liblinear193.python.liblinearutil import train as train_model
        from svms.fastlinear.fastlinear import liblinear_to_fastlinear as compress_model
        from svms.fastlinear.fastlinear import fastlinear_save_model as save_model

    # The concept list is taken from the first annotation only.
    concepts = readConcepts(trainCollection, trainAnnotationNames[0], rootpath=rootpath)

    resultdir = os.path.join(rootpath, trainCollection, 'Models', newAnnotationName, feature, modelName)
    todo = []
    for concept in concepts:
        resultfile = os.path.join(resultdir, concept + '.model')
        if not checkToSkip(resultfile, overwrite):
            todo.append(concept)
    # Keep only this job's share of the remaining concepts (job is 1-based).
    todo = [concept for i, concept in enumerate(todo) if i % numjobs == (job - 1)]
    printStatus(INFO, 'to process %d concepts: %s' % (len(todo), ' '.join(todo)))
    if not todo:
        return 0

    train_feat_file = BigFile(os.path.join(rootpath, trainCollection, 'FeatureData', feature))
    feat_dim = train_feat_file.ndims

    s_time = time.time()

    for concept in todo:
        assemble_model = None
        for t, annotationName in enumerate(trainAnnotationNames, 1):
            names, labels = readAnnotationsFrom(trainCollection, annotationName, concept,
                                                skip_0=True, rootpath=rootpath)
            name2label = dict(zip(names, labels))
            renamed, vectors = train_feat_file.read(names)
            Ys = [name2label[x] for x in renamed]

            # NOTE(review): the counts come from the annotation labels, not
            # from Ys; they would differ if read() returns fewer names than
            # requested -- confirm against BigFile.read.
            n_pos = sum(1 for lab in labels if lab == 1)
            n_neg = sum(1 for lab in labels if lab == -1)
            total = n_pos + n_neg
            # With beta = 0.5 both classes get the same total weight.
            wp = float(beta) * total / n_pos
            wn = (1.0 - beta) * total / n_neg

            if autoweight:
                svm_params = '-w1 %g -w-1 %g' % (wp * C, wn * C)
            else:
                svm_params = '-c %g' % C

            if modelName.startswith('fik'):
                svm_params += ' -s 0 -t %d' % KERNEL_TYPE.index("HI")
            else:
                svm_params += ' -s 2 -B -1 '

            g_t = train_model(Ys, vectors, svm_params + ' -q')
            if t == 1:
                assemble_model = compress_model([g_t], [1.0], feat_dim, params)
            else:
                new_model = compress_model([g_t], [1.0], feat_dim, params)
                assemble_model.add_fastsvm(new_model, 1 - 1.0 / t, 1.0 / t)

        new_model_file = os.path.join(resultdir, '%s.model' % concept)
        makedirsforfile(new_model_file)
        printStatus(INFO, 'save model to %s' % new_model_file)
        save_model(new_model_file, assemble_model)
        printStatus(INFO, '%s done' % concept)

    timecost = time.time() - s_time
    writeConceptsTo(concepts, trainCollection, newAnnotationName, rootpath)
    printStatus(INFO, 'done for %g concepts: %s' % (len(todo), ' '.join(todo)))
    printStatus(INFO, 'models stored at %s' % resultdir)
    printStatus(INFO, '%g seconds in total' % timecost)
def learn(concept, params):
    """Train an assembled classifier for ``concept`` over several iterations.

    The first iteration trains on the positives and negatives of the start
    annotation.  Every later iteration scores the positives and a random
    sample of the negative pool with the current assembled model, stops early
    when the mean of the two error rates drops below ``MIN_ERROR_RATE``, and
    otherwise picks a new negative bag via ``NegativeBootstrap.sampling``
    before training the next classifier.  The t-th classifier is merged into
    the assembled model through ``add_fastsvm`` with weights ``(1 - 1/t, 1/t)``.

    Args:
        concept: the concept to learn.
        params: dict with the keys ``rootpath``, ``trainCollection``,
            ``baseAnnotationName``, ``startAnnotationName``, ``strategy``,
            ``feat_file``, ``npr``, ``iterations`` and ``model``; it is also
            passed on to ``compress_model``.

    Returns:
        The assembled model, or None when ``iterations`` is smaller than 1.

    Raises:
        ZeroDivisionError: if the positive bag, the current negative bag or
            the sampled negative set is empty.

    NOTE(review): ``train_model``, ``compress_model`` and ``KERNEL_TYPE`` are
    resolved as module-level names here -- confirm they are bound at module
    scope and not only inside other functions.
    """
    rootpath = params['rootpath']
    trainCollection = params['trainCollection']
    baseAnnotationName = params['baseAnnotationName']
    startAnnotationName = params['startAnnotationName']
    strategy = params['strategy']
    feat_file = params['feat_file']
    feat_dim = feat_file.ndims
    npr = params['npr']
    iterations = params['iterations']
    beta = 0.5
    C = 1

    # Solver options do not change between iterations.
    if 'fik' == params['model']:
        solver_opts = ' -s 0 -t %d' % KERNEL_TYPE.index("HI")
    else:
        solver_opts = ' -s 2'

    names, labels = readAnnotationsFrom(trainCollection, startAnnotationName, concept,
                                        skip_0=True, rootpath=rootpath)
    positive_bag = [name for name, lab in zip(names, labels) if lab > 0]
    negative_bag = [name for name, lab in zip(names, labels) if lab < 0]

    names, labels = readAnnotationsFrom(trainCollection, baseAnnotationName, concept,
                                        skip_0=True, rootpath=rootpath)
    negative_pool = [name for name, lab in zip(names, labels) if lab < 0]

    # Sample size: npr negatives per positive, clamped to [5000, 10000] and
    # never more than the pool holds.
    Usize = min(10000, max(5000, len(positive_bag) * npr), len(negative_pool))

    # Bound before the loop so that iterations < 1 returns None instead of
    # raising UnboundLocalError at the return statement.
    assemble_model = None

    for t in range(1, iterations + 1):
        printStatus(INFO, 'iter %d (%s)' % (t, concept))
        if t > 1:  # select relevant negative examples
            # check how good at classifying positive training examples
            results = classify_large_data(assemble_model, positive_bag, feat_file)
            pos_error_rate = len([1 for x in results if x[1] < 0]) / float(len(results))

            U = random.sample(negative_pool, Usize)
            predictions = classify_large_data(assemble_model, U, feat_file)
            neg_error_rate = len([1 for x in predictions if x[1] > 0]) / float(len(predictions))

            error_rate = (pos_error_rate + neg_error_rate) / 2.0

            printStatus(INFO, 'iter %d: %s %.3f -> %s %.3f, pe=%.3f, ne=%.3f, error=%.3f' % (
                t, predictions[-1][0], predictions[-1][1],
                predictions[0][0], predictions[0][1],
                pos_error_rate, neg_error_rate, error_rate))
            if error_rate < MIN_ERROR_RATE:
                printStatus(INFO, 'hit stop criteria: error (%.3f) < MIN_ERROR_RATE (%.3f)' % (
                    error_rate, MIN_ERROR_RATE))
                break

            # Assume that 1% of the randomly sampled set is truly positive and
            # that the classifier ranks those at the top, so skip them.
            # (presumably predictions are sorted by descending score -- TODO confirm)
            nr_of_estimated_pos = int(len(predictions) * 0.01)
            negative_bag = NegativeBootstrap.sampling(predictions[nr_of_estimated_pos:], strategy,
                                                      max(1000, len(positive_bag)))

        new_names = positive_bag + negative_bag
        new_labels = [1] * len(positive_bag) + [-1] * len(negative_bag)
        name2label = dict(zip(new_names, new_labels))
        renamed, vectors = feat_file.read(new_names)
        Ys = [name2label[x] for x in renamed]

        n_pos = len([1 for y in Ys if y > 0])
        n_neg = len([1 for y in Ys if y < 0])
        # Every requested example must have come back from the feature file.
        assert len(positive_bag) == n_pos
        assert len(negative_bag) == n_neg
        # With beta = 0.5 both classes get the same total weight.
        wp = float(beta) * (n_pos + n_neg) / n_pos
        wn = (1.0 - beta) * (n_pos + n_neg) / n_neg
        svm_params = '-w1 %g -w-1 %g' % (wp * C, wn * C) + solver_opts

        g_t = train_model(Ys, vectors, svm_params + ' -q')
        if t == 1:
            assemble_model = compress_model([g_t], [1.0], feat_dim, params)
        else:
            new_model = compress_model([g_t], [1.0], feat_dim, params)
            assemble_model.add_fastsvm(new_model, 1 - 1.0 / t, 1.0 / t)

    return assemble_model