# Build the labeled evaluation set from MongoDB and start the per-label
# prediction pass: fetch every document that already has a feature vector,
# collect one entry_t per document and a parallel +1/-1 target list per model
# label, then write the TSV header for the report.
query = {'vector': {'$exists': True}}
query.update(ast.literal_eval(options.find))
cursor = db.find(query)
print >>sys.stderr, 'labeled examples: %s out of %s' % (cursor.count(), db.count())

vectors = []
labels = {}   # label name -> list of +1/-1/None targets, aligned with `vectors`
for x in models.keys():
    labels[x] = []
for ent in cursor:
    for name in labels.keys():
        value = None  # stays None when this document was not coded for `name`
        if ent.has_key('labels') and ent['labels'].has_key(name):
            # BUG FIX: the original read "ent['labels'][name] if 1 else -1".
            # Its condition is the constant 1, so the -1 branch was dead and
            # the raw stored label leaked through.  The intent is a +1/-1
            # encoding of the stored truthy/falsy label value.
            value = 1 if ent['labels'][name] else -1
        labels.setdefault(name, []).append(value)
    vectors.append(entry_t(ent['entry'], ent['features'],
                           myutils.map_key_dict(int, ent['vector'])))

# Every target column must stay aligned with the vector list.
for (name, vals) in labels.items():
    assert len(vectors) == len(vals), [len(vectors), len(vals), name]

# Deterministic column order: sorted (label name, targets) pairs.
labels = sorted(labels.items(), key=lambda x: x[0])

writer = csv.writer(options.output, delimiter='\t')
if options.aggregate:
    writer.writerow([unicode(x) for x in
                     ['id'] + [x[0] for x in labels] + ['diff', 'snippet']])
else:
    writer.writerow([unicode(x) for x in
                     ['id', 'predicted', 'coded', 'confidence',
                      'correct?', 'diff', 'snippet']])

vecs = map(lambda x: x.vector, vectors)
output = {}
# NOTE(review): the body of this loop continues beyond this chunk of the file.
for (lname, labs) in labels:
    m = models[lname]
# Score freshly extracted MediaWiki revisions against every trained model and
# emit the header of an HTML report (per-label counts above the threshold).

# establish MongoDB connection
collection = myutils.get_mongodb_collection(options.hosts, options.database)
# load models for each label
models = test.load_models(collection['models'], ast.literal_eval(options.model))
cursor = myutils.get_mysql_connection(options.host, options.db).cursor()

# construct the testing set from the MediaWiki table
vectors = []
for rev in wikilove_revs.get_entries(cursor, options.start, options.end,
                                     options.window, options.limit, newest=True):
    feats = extract_features.extract_features(
        {'entry': {'content': {'added': [rev.others.message], 'removed': []},
                   'comment': ''}})
    vec = myutils.map_key_dict(int,
                               extract_features.extract_vector(feats, options.bits))
    # drop self-addressed messages
    if rev.receiver_id != rev.sender_id:
        vectors.append(myutils.entry_t(rev, feats, vec))

labels = sorted(models.keys())
vecs = [entry.vector for entry in vectors]

# predictions[row][col]: confidence that vector `row` carries label `col`
predictions = [[[] for _ in xrange(len(labels))] for _ in xrange(len(vectors))]
for (col, lname) in enumerate(labels):
    guessed, _, scores = liblinear.linearutil.predict(
        [0] * len(vecs), vecs, models[lname], '-b 1')
    for (row, pair) in enumerate(zip(guessed, scores)):
        # get the confidence for the label being 'True'
        predictions[row][col] = pair[1][1]

print >>options.output, '<style type="text/css">.prediction{text-align: right;} td{vertical-align: top;} li{border: 1px solid; list-style: none inside; margin: 0.2em;} ul{padding: 0;} blockquote{ font: normal italic 100% serif; }</style>'
print >>options.output, '<body style="background: #EEE;">Generated at %s.' % str(datetime.now())
print >>options.output, '<table style="background: white; width: 100%"><tr>'
for (i, name) in enumerate(labels):
    above = len([p for p in predictions if p[i] > options.threshold])
    print >>options.output, '<th>%s: %d out of %d (>%f)</th>' % (
        name, above, len(predictions), options.threshold)