parser.add_argument("-w", "--wait", metavar="SECS", dest="wait", type=float, default=0.2, help="") parser.add_argument("-b", "--bits", metavar="N", dest="bits", type=int, default=15, help="") parser.add_argument( "-d", "--database", metavar="NAME", dest="database", type=unicode, default="wikisentiment", help="" ) parser.add_argument( "-H", "--hosts", metavar="HOSTS", dest="hosts", type=str, default="alpha,beta", help="MongoDB hosts" ) parser.add_argument( "-v", "--verbose", dest="verbose", action="store_true", default=False, help="turn on verbose message output" ) options = parser.parse_args() # establish MongoDB connection collection = myutils.get_mongodb_collection(options.hosts, options.database) # for each 'entry' in the MongoDB, extract features and put them to 'features' db = collection["talkpage_diffs_raw"] cursor = db.find() entries = [] for ent in cursor: features = extract_features(ent) vector = myutils.map_key_dict(unicode, extract_vector(features, options.bits)) # print db.find({'entry.rev_id': ent['entry']['rev_id']}).count()#! # print vector,features,ent#! ent["vector"] = vector ent["features"] = features ret = db.save(ent, safe=True) if options.verbose: print ent["entry"]["id"]
help='turn on verbose message output')
options = parser.parse_args()

# establish MongoDB connection
collection = myutils.get_mongodb_collection(options.hosts, options.database)

# load one trained classifier per label name
# NOTE(review): options.model is parsed with ast.literal_eval, so it is
# expected to be a Python-literal string — presumably a dict/list naming the
# models to load; confirm against load_models' signature
models = test.load_models(collection['models'], ast.literal_eval(options.model))
cursor = myutils.get_mysql_connection(options.host, options.db).cursor()

# construct the testing set from the MediaWiki table
vectors = []
for ent in wikilove_revs.get_entries(cursor, options.start, options.end, options.window, options.limit, newest=True):
    # wrap the message text in the same entry structure the feature
    # extractor expects for a revision diff (message = added text)
    features = extract_features.extract_features({'entry': {'content': {'added': [ent.others.message], 'removed':[]}, 'comment': ''}})
    vector = myutils.map_key_dict(int, extract_features.extract_vector(features, options.bits))
    # skip self-messages (sender talking on their own page)
    if ent.receiver_id != ent.sender_id:
        vectors.append(myutils.entry_t(ent, features, vector))

# run every per-label model over the whole vector set;
# predictions[i][n] holds entry i's confidence for label n
labels = sorted(models.keys())
vecs = [x.vector for x in vectors]
predictions = [[[] for y in xrange(0, len(labels))] for x in xrange(0,len(vectors))]
for (n,lname) in enumerate(labels):
    # dummy true labels ([0]*len) — only the probability estimates are used
    lab,_,val = liblinear.linearutil.predict([0]*len(vecs), vecs, models[lname], '-b 1')
    for (i,(pred,score)) in enumerate(zip(lab,val)):
        predictions[i][n] = score[1] # get the confidence for the label being 'True'

# emit the HTML report header
print >>options.output, '<style type="text/css">.prediction{text-align: right;} td{vertical-align: top;} li{border: 1px solid; list-style: none inside; margin: 0.2em;} ul{padding: 0;} blockquote{ font: normal italic 100% serif; }</style>'
print >>options.output, '<body style="background: #EEE;">Generated at %s.' % str(datetime.now())
print >>options.output, '<table style="background: white; width: 100%"><tr>'
# select only documents that already have a feature vector
query = {'vector': {'$exists': True}}
query.update(ast.literal_eval(options.find))
cursor = db.find(query)
print >>sys.stderr, 'labeled examples: %s out of %s' % (cursor.count(), db.count())

# collect one coded-label list per label name, parallel to 'vectors'
vectors = []
labels = {}
for x in models.keys():
    labels[x] = []
for ent in cursor:
    for name in labels.keys():
        value = None
        if ent.has_key('labels') and ent['labels'].has_key(name):
            # map a truthy label to +1 and a falsy one to -1
            # (bug fix: the original "ent['labels'][name] if 1 else -1"
            # had the constant 1 as its condition, so it always returned
            # the raw label value and never -1)
            value = 1 if ent['labels'][name] else -1
        labels.setdefault(name, []).append(value)
    vectors.append(entry_t(ent['entry'], ent['features'], myutils.map_key_dict(int, ent['vector'])))

# sanity check: every label column is parallel to the vector list
for (name,vals) in labels.items():
    assert len(vectors) == len(vals), [len(vectors), len(vals), name]
labels = sorted(labels.items(), key=lambda x: x[0])

# emit the TSV header; the column layout depends on aggregate mode
writer = csv.writer(options.output, delimiter='\t')
if options.aggregate:
    writer.writerow([unicode(x) for x in ['id'] + [x[0] for x in labels] + ['diff', 'snippet']])
else:
    writer.writerow([unicode(x) for x in ['id', 'predicted', 'coded', 'confidence', 'correct?', 'diff', 'snippet']])

vecs = [x.vector for x in vectors]
output = {}
for (lname, labs) in labels:
    m = models[lname]
# construct the training set from 'entry' documents in the MongoDB
db = collection['talkpage_diffs_raw']
# only use documents that carry both human labels and a feature vector
query = {'labels': {'$exists': True}, 'vector': {'$exists': True}}
query.update(ast.literal_eval(options.find))
cursor = db.find(query)
print >>sys.stderr, 'using labeled examples: %s out of %s' % (cursor.count(), db.count())

labels = {}
vectors = []
entries = []
for ent in cursor:
    # defensive re-check even though the query requires 'labels'
    if not ent.has_key('labels'):
        print >>sys.stderr, 'skip %s' % ent['entry']['id']
        continue
    vec = myutils.map_key_dict(int, ent['vector'])
    # warn about (but keep) entries whose feature vector is empty
    if not vec:
        print >>sys.stderr, 'empty %s' % ent['entry']['id']
    vectors.append(vec)
    entries.append(ent)
    for (name,value) in ent['labels'].items():
        # +1 for truthy labels, -1 otherwise
        # (bug fix: the original "value if 1 else -1" had the constant 1
        # as its condition, so it always appended the raw label value)
        labels.setdefault(name, []).append(1 if value else -1)
    if options.verbose:
        print >>sys.stderr, str(ent['entry']['id'])
if options.verbose:
    print >>sys.stderr, 'vectors loaded %s' % len(vectors)

# sanity check: every label column is parallel to the vector list
for (name,vals) in labels.items():
    assert len(vectors) == len(vals), [len(vectors), len(vals), name]