Esempio n. 1
0
    # Remaining command-line options, kept in a table and registered in table
    # order so the option set and the generated --help listing are unchanged.
    # In the visible code, --bits is forwarded to extract_vector(), --hosts and
    # --database select the MongoDB collection, and --verbose enables per-entry
    # progress output; --wait is not read anywhere in the visible code.
    option_table = (
        (("-w", "--wait"),
         dict(metavar="SECS", dest="wait", type=float, default=0.2, help="")),
        (("-b", "--bits"),
         dict(metavar="N", dest="bits", type=int, default=15, help="")),
        (("-d", "--database"),
         dict(metavar="NAME", dest="database", type=unicode, default="wikisentiment", help="")),
        (("-H", "--hosts"),
         dict(metavar="HOSTS", dest="hosts", type=str, default="alpha,beta", help="MongoDB hosts")),
        (("-v", "--verbose"),
         dict(dest="verbose", action="store_true", default=False, help="turn on verbose message output")),
    )
    for option_flags, option_settings in option_table:
        parser.add_argument(*option_flags, **option_settings)

    # Parse the command line into the namespace the rest of the function reads.
    options = parser.parse_args()

    # establish MongoDB connection (hosts and database name come from the
    # --hosts / --database options)
    collection = myutils.get_mongodb_collection(options.hosts, options.database)

    # For every document in 'talkpage_diffs_raw', compute its features and its
    # vector, store both on the document, and write the document back.
    db = collection["talkpage_diffs_raw"]
    cursor = db.find()  # no filter: iterates the whole collection
    entries = []  # not used in the visible part of this function
    for ent in cursor:
        features = extract_features(ent)
        # extract_vector() receives the features and the --bits value; its
        # result is passed through map_key_dict(unicode, ...) -- presumably so
        # the keys are strings that can be stored as MongoDB field names;
        # confirm in myutils.
        vector = myutils.map_key_dict(unicode, extract_vector(features, options.bits))
        # print db.find({'entry.rev_id': ent['entry']['rev_id']}).count()#!
        # print vector,features,ent#!
        ent["vector"] = vector
        ent["features"] = features
        # Write the augmented document back; the return value is not inspected
        # in the visible code.
        # NOTE(review): save() and the safe= keyword look like the legacy
        # PyMongo write API -- confirm the pinned driver version before
        # upgrading the driver.
        ret = db.save(ent, safe=True)
        if options.verbose:
            # progress output: the id of the entry that was just saved
            print ent["entry"]["id"]
Esempio n. 2
0
                        help='turn on verbose message output')
    options = parser.parse_args()

    # establish MongoDB connection
    collection = myutils.get_mongodb_collection(options.hosts, options.database)

    # load models for each label; options.model is parsed as a Python literal
    # (ast.literal_eval) before being handed to load_models
    models = test.load_models(collection['models'], ast.literal_eval(options.model))

    # Note the two distinct option pairs: options.hosts/options.database select
    # MongoDB above, options.host/options.db select the MySQL server here.
    cursor = myutils.get_mysql_connection(options.host, options.db).cursor()
    # construct the testing set from the MediaWiki table
    vectors = []
    for ent in wikilove_revs.get_entries(cursor, options.start, options.end, options.window, options.limit, newest=True):
        # Wrap the message in the nested 'entry' dict passed to
        # extract_features: the message is the only 'added' item, 'removed' is
        # empty and 'comment' is the empty string.
        features = extract_features.extract_features({'entry': {'content': {'added': [ent.others.message], 'removed':[]},
                                                                'comment': ''}})
        # keys of the extracted vector are passed through int() by map_key_dict
        vector = myutils.map_key_dict(int, extract_features.extract_vector(features, options.bits))
        # Keep only messages whose receiver differs from the sender. Features
        # are computed before this check, so skipped entries still pay the
        # extraction cost.
        if ent.receiver_id != ent.sender_id:
            vectors.append(myutils.entry_t(ent, features, vector))

    # Label names in sorted order; this order defines the columns of
    # 'predictions' below.
    labels = sorted(models.keys())
    
    vecs = [x.vector for x in vectors]
    # predictions[i][n] holds the score of entry i for label n. Every cell
    # starts as an empty-list placeholder and is overwritten in the loop below.
    predictions = [[[] for y in xrange(0, len(labels))] for x in xrange(0,len(vectors))]
    for (n,lname) in enumerate(labels):
        # The first argument is a list of zeros used as dummy labels (no real
        # labels are supplied here). '-b 1' is passed as the options string --
        # presumably liblinear's probability-estimate switch; see its README.
        lab,_,val = liblinear.linearutil.predict([0]*len(vecs), vecs, models[lname], '-b 1')
        # 'pred' (the predicted label) is unpacked but not used; only the
        # per-entry score list is read.
        for (i,(pred,score)) in enumerate(zip(lab,val)):
            # NOTE(review): index 1 is assumed to be the 'True' class; the
            # position of each class in 'score' presumably follows the model's
            # own label order -- verify against how the models were trained.
            predictions[i][n] = score[1] # get the confidence for the label being 'True'

    # Write the opening of the HTML report to options.output: an inline
    # stylesheet, the <body> tag with a generation timestamp (datetime.now(),
    # i.e. naive local time), and the start of the results table with its
    # first row. The matching closing tags are not emitted in the visible code.
    print >>options.output, '<style type="text/css">.prediction{text-align: right;} td{vertical-align: top;} li{border: 1px solid; list-style: none inside; margin: 0.2em;} ul{padding: 0;} blockquote{ font: normal italic  100% serif; }</style>'
    print >>options.output, '<body style="background: #EEE;">Generated at %s.' % str(datetime.now())
    print >>options.output, '<table style="background: white; width: 100%"><tr>'
Esempio n. 3
0
    # Select documents that have a 'vector' field, then merge in the user
    # filter from options.find (parsed as a Python literal). Keys in the user
    # filter override the default, including 'vector' itself.
    query = {'vector': {'$exists': True}}
    query.update(ast.literal_eval(options.find))
    cursor = db.find(query)
    # Report matched vs. total document counts on stderr.
    # NOTE(review): the message calls these labeled examples, but the default
    # query only requires 'vector', not 'labels'.
    print >>sys.stderr, 'labeld examples: %s out of %s' % (cursor.count(), db.count())

    # Build two parallel structures from the cursor: 'vectors' (one entry_t
    # per document) and 'labels' (label name -> list with one value per
    # document, in the same order as 'vectors').
    vectors = []
    labels = {}
    # one (initially empty) value list per model name
    for x in models.keys():
        labels[x] = []
    for ent in cursor:
        for name in labels.keys():
            # None marks "this document has no value for this label"
            value = None
            if ent.has_key('labels') and ent['labels'].has_key(name):
                # NOTE(review): 'X if 1 else -1' always evaluates to X (the
                # condition is the constant 1), so the stored label value is
                # used unchanged. The intent was probably
                # '1 if ent['labels'][name] else -1'. Changing it alters the
                # label encoding, so it must be done together with the
                # training code and any stored models -- confirm before fixing.
                value = ent['labels'][name] if 1 else -1
            labels.setdefault(name, []).append(value)
        # keys of the stored vector are passed through int() by map_key_dict
        vectors.append(entry_t(ent['entry'], ent['features'], myutils.map_key_dict(int, ent['vector'])))

    # Sanity check: every label list must line up one-to-one with 'vectors'.
    # The list after the comma is the assertion message (lengths and name).
    for (name,vals) in labels.items():
        assert len(vectors) == len(vals), [len(vectors), len(vals), name]

    # From here on 'labels' is a list of (name, values) pairs sorted by name,
    # no longer a dict.
    labels = sorted(labels.items(), key=lambda x: x[0])

    # Tab-separated report written to options.output.
    writer = csv.writer(options.output, delimiter='\t')
    if options.aggregate:
        # aggregate layout: id, one column per label name (sorted), diff, snippet
        writer.writerow([unicode(x) for x in ['id'] + [x[0] for x in labels] + ['diff', 'snippet']])
    else:
        # per-prediction layout with fixed columns
        writer.writerow([unicode(x) for x in ['id', 'predicted', 'coded', 'confidence', 'correct?', 'diff', 'snippet']])
    # feature vectors only, in the same order as 'vectors'
    vecs = map(lambda x: x.vector, vectors)
    output = {}
    # 'labels' is the sorted list of (label name, values) pairs built above;
    # the loop body continues beyond the visible part of the file.
    for (lname, labs) in labels:
        m = models[lname]
Esempio n. 4
0
    # construct the training set from 'entry's in the MongoDB
    db = collection['talkpage_diffs_raw']
    # Default filter: documents that have both 'labels' and 'vector'. The user
    # filter from options.find (parsed as a Python literal) is merged on top
    # and can override either default key.
    query = {'labels': {'$exists': True},
             'vector': {'$exists': True}}
    query.update(ast.literal_eval(options.find))
    cursor = db.find(query)
    # report matched vs. total document counts on stderr
    print >>sys.stderr, 'using labeld examples: %s out of %s' % (cursor.count(), db.count())
    # Build parallel lists from the cursor: 'vectors' and 'entries' (one item
    # per kept document) and 'labels' (label name -> list of values, appended
    # in document order).
    labels = {}
    vectors = []
    entries = []
    for ent in cursor:
        # The default query already requires 'labels'; this guard only matters
        # when the user filter has overridden that key.
        if not ent.has_key('labels'):
            print >>sys.stderr, 'skip %s' % ent['entry']['id']
            continue
        # keys of the stored vector are passed through int() by map_key_dict
        vec = myutils.map_key_dict(int, ent['vector'])
        if len(vec.items()) == 0:
            # Empty vectors are reported but still kept: the skip below is
            # commented out.
            print >>sys.stderr, 'empty %s' % ent['entry']['id']
            #continue
        vectors.append(vec)
        entries.append(ent)
        for (name,value) in ent['labels'].items():
            # NOTE(review): 'value if 1 else -1' always evaluates to 'value'
            # (the condition is the constant 1), so labels are stored
            # unchanged. The intent was probably '1 if value else -1'.
            # Changing it alters the label encoding seen by whatever consumes
            # 'labels' and by any code that reads the resulting models --
            # confirm and change them together.
            labels.setdefault(name, []).append(value if 1 else -1)
        if options.verbose:
            print >>sys.stderr, str(ent['entry']['id'])

    if options.verbose:
        # summary of how many vectors were kept by the loop above
        print >>sys.stderr, 'vectors loaded %s' % len(vectors)

    # Sanity check: every label list must have exactly one value per vector.
    # This fails if some kept document lacks a label name that another
    # document has, because values are appended only for the names present on
    # each document. The list after the comma is the assertion message.
    for (name,vals) in labels.items():
        assert len(vectors) == len(vals), [len(vectors), len(vals), name]