Beispiel #1
0
def insert_LINE_vector(file_name=RAW_DATA_DIR + 'normalize2.data'):
    '''
    Read LINE embedding vectors from file_name (first line "<count> <dim>",
    then one "<id> <v1> ... <vdim>" line per entity) and store each train
    and test user's vector in Mongo under 'user_product_vector_from_line'.
    '''
    vectors = dict()
    fin = open(file_name)
    line = fin.readline().strip().split(' ')
    count, dimention = int(line[0]), int(line[1])
    bar = progress_bar(count)
    for index in xrange(count):
        line = fin.readline()
        line = line.strip().split(' ')
        vector = map(lambda d: float(d), line[1:])
        vectors[line[0]] = vector
        bar.draw(index + 1)
    fin.close()  # fix: the input file handle was previously leaked
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    for index, user in enumerate(collection.find()):
        if user['_id'] not in vectors:
            # Train users without a LINE vector get a zero vector in memory
            # (consumed by the test pass below) but are NOT written to Mongo.
            # NOTE(review): the 'continue' also skips bar.draw, so the bar
            # may not reach 100% — presumably intentional; confirm.
            vectors[user['_id']] = [0.] * dimention
            continue
        collection.update(
            {'_id': user['_id']},
            {'$set': {
                'user_product_vector_from_line': vectors[user['_id']]
            }})
        bar.draw(index + 1)
    collection = Connection().jd.test_users
    bar = progress_bar(collection.count())
    for index, user in enumerate(collection.find()):
        if user['_id'] not in vectors:
            continue
        collection.update(
            {'_id': user['_id']},
            {'$set': {
                'user_product_vector_from_line': vectors[user['_id']]
            }})
        bar.draw(index + 1)
Beispiel #2
0
def construct_test_set(attribute):
    # Write a mallet/libsvm-style test file for `attribute`:
    # one "<label> <feat_id>:<count> ..." line per sampled labeled test user.
    all_features=get_features(feature_file_name=base_dir+'/features/mention.feature')
    collection=Connection().jd.test_users
    # per-label keep probabilities used to balance the class distribution
    balance_params=get_balance_params(attribute,collection)
    print balance_params
    bar=progress_bar(collection.count())
    fout=open(self_training_file_dir+'test_%s.data'%attribute,'w')
    for index,user in enumerate(collection.find()):
        # features = product purchase counts overlaid with mention counts
        features=dict(Counter(user['products']))
        for m in user['mentions']:
            features[m]=user['mentions'][m]
        try:
            # label = index of the 1 in the one-hot profile vector
            label=user['profile'][attribute].index(1)
        except Exception as e:
            continue  # user has no ground-truth label for this attribute
        if random.random()>balance_params[label]:
            continue  # random drop to balance classes
        sorted_feature=[]
        for f in features:
            if f not in all_features:
                continue  # keep only features in the known vocabulary
            sorted_feature.append((all_features[f],features[f]))
        if len(sorted_feature)==0:
            continue  # no usable features for this user
        fout.write('%d'%label)
        sorted_feature=sorted(sorted_feature,key=lambda d:d[0])
        for f in sorted_feature:
            fout.write(' %s:%d'%f)
        fout.write('\n')
        bar.draw(index+1)
Beispiel #3
0
def construct_all_data():
    '''
    The format of labeled_feature_file is as the same as mallet
    '''
    # Every train user is emitted with placeholder label 0 plus its sorted
    # feature vector; uids go to a parallel file, one per line.
    all_features = get_features(feature_file_name=feature_file_name)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    fout = open(RAW_DATA_DIR + 'label2trainset/all_train.data', 'w')
    uid_output = open(RAW_DATA_DIR + 'label2trainset/all_train_uids.data', 'w')
    for index, user in enumerate(collection.find()):
        # product purchase counts, overridden by mention counts on clash
        features = dict(Counter(user['products']))
        features.update(user['mentions'])
        fout.write('%d' % 0)
        uid_output.write('%s\n' % user['_id'])
        # keep only known features, ordered by their numeric id
        indexed = [(all_features[f], features[f])
                   for f in features if f in all_features]
        indexed.sort(key=lambda pair: pair[0])
        for pair in indexed:
            fout.write(' %s:%d' % pair)
        fout.write('\n')
        bar.draw(index + 1)
Beispiel #4
0
def construct_test_data(attribute):
    # Build [uid, label, {feature_id: value}] rows for every labeled test
    # user and dump them in mallet format via output().
    collection=Connection().jd.test_users
    all_features=get_features(feature_file_name=feature_file_name)
    # review features get ids after the mention/product feature ids
    review_features=get_features(feature_file_name=base_dir+'/features/review.feature',start_index=max(all_features.values())+1)
    data=[]
    bar=progress_bar(collection.count())
    for index,user in enumerate(collection.find()):
        uid=user['_id']
        features=combine_features(user['mentions'],Counter(user['products']))
        try:
            # label = index of the 1 in the one-hot profile vector
            y=user['profile'][attribute].index(1)
        except:
            continue  # unlabeled for this attribute
        x=dict()
        for f in features:
            if f not in all_features:
                continue  # drop features outside the vocabulary
            x[all_features[f]]=features[f]

        for f,v in Counter(user['review']).items():
            if f not in review_features:
                continue
            x[review_features[f]]=v
        data.append([uid,y,x])
        bar.draw(index+1)
    #data=balance(data,target_index=1)
    output(RAW_DATA_DIR+'mallet/mallet_test_%s.data'%attribute,data)
def construct_all_data():
    '''
    The format of labeled_feature_file is as the same as mallet
    '''
    # Emits every train user with placeholder label 0 plus mention_1 and
    # product features; uids go to a parallel file.
    all_features=get_features(feature_file_name=feature_file_name)
    # '_1' mention features are indexed after the base feature ids
    all_features_1=get_features(feature_file_name=base_dir+'/features/mention_1.feature',start_index=max(all_features.values())+1)
    collection=Connection().jd.train_users
    bar=progress_bar(collection.count())
    fout=open(RAW_DATA_DIR+'iterate_label2trainset/all_train.data','w')
    uid_output=open(RAW_DATA_DIR+'iterate_label2trainset/all_train_uids.data','w')
    for index,user in enumerate(collection.find()):
        label=0
        fout.write('%d'%label)
        uid_output.write('%s\n'%user['_id'])
        features=combine_features(user['mentions_1'],Counter(user['products']))
        sorted_feature=[]
        for f in features:
            if f not in all_features:
                continue  # drop features outside the vocabulary
            sorted_feature.append((all_features[f],features[f]))
        for f,v in user['mentions_1_1'].items():
            f=f+'_1'  # secondary mentions are namespaced with a '_1' suffix
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f],v))
        sorted_feature=sorted(sorted_feature,key=lambda d:d[0])
        for f in sorted_feature:
            fout.write(' %s:%d'%f)
        fout.write('\n')
        bar.draw(index+1)
Beispiel #6
0
def validate_knn(attribute):
    # Leave-one-out KNN: predict each user's label distribution from its
    # top-100 neighbours and write "<true_label> <p0> <p1>" lines.
    print attribute
    features, labels = get_features_and_labels(attribute)
    neibor_count = 100
    f = open('./%s_knn_result.data' % attribute, 'w')
    bar = progress_bar(len(features))
    for index, uid1 in enumerate(features):
        distance = []
        for uid2 in features:
            if uid2 == uid1:
                continue  # skip self
            d = get_distance(features[uid1], features[uid2])
            distance.append((uid2, d))
        # NOTE(review): reverse=True keeps the LARGEST values as neighbours,
        # so get_distance presumably returns a similarity — confirm.
        distance = sorted(distance, key=lambda d: d[1],
                          reverse=True)[:neibor_count]
        label = labels[uid1]
        plabel = numpy.zeros((len(label)), dtype='float32')
        for d in distance:
            #plabel+=d[1]*labels[d[0]]
            plabel += labels[d[0]]  # unweighted vote of neighbour labels
        if sum(plabel) == 0:
            continue  # no labeled neighbours to vote
        plabel /= sum(plabel)
        if label[0] > 0:
            f.write('%d %f %f\n' % (0, plabel[0], plabel[1]))
        else:
            f.write('%d %f %f\n' % (1, plabel[0], plabel[1]))
        bar.draw(index + 1)
Beispiel #7
0
def construct_train_set(attribute, training_count):
    '''
    The format of labeled_feature_file is as the same as mallet
    '''
    # Pseudo-label train users with a naive-Bayes-style product of the
    # per-feature label weights from the constraint file, then write
    # "<label> <features...>" plus a parallel uid file.
    # NOTE(review): training_count is only used by the commented-out
    # balancing code below.
    all_features = get_features(feature_file_name=feature_file_name)
    labeled_feature_file = open('%s/review_constraint_%s.constraints' %
                                (labeled_feature_file_dir, attribute))
    labeled_features = dict()
    for line in labeled_feature_file:
        # "<feature> 0:<w0> 1:<w1>" -> {feature: [w0, w1]}
        line = line[:-1].split(' ')
        labeled_features[line[0].decode('utf8')] = map(
            lambda d: float(d.split(':')[1]), line[1:])
    collection = Connection().jd.train_users

    bar = progress_bar(collection.count())
    confidence = []
    for index, user in enumerate(collection.find()):
        label_distributed = [1, 1]
        for f, value in user['mentions'].items():
            if f in labeled_features:
                label_distributed[0] *= labeled_features[f][0] * value
                label_distributed[1] *= labeled_features[f][1] * value
        # NOTE(review): if both products underflow/multiply to 0, s is 0 and
        # the divisions below raise ZeroDivisionError — confirm inputs.
        s = 1.0 * sum(label_distributed)
        label_distributed[0] /= s
        label_distributed[1] /= s
        if label_distributed[0] > label_distributed[1]:
            label = 0
        elif label_distributed[0] < label_distributed[1]:
            label = 1
        else:
            continue  # tie: no confident pseudo-label for this user
        features = user['mentions']
        #features=Counter(user['products'])
        #features=combine_features(user['mentions'],Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue  # drop features outside the vocabulary
            sorted_feature.append((all_features[f], features[f]))
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        str_features = ' '.join(map(lambda f: '%s:%f' % f, sorted_feature))
        confidence.append(
            (user['_id'], label,
             abs(label_distributed[0] - label_distributed[1]), str_features))
        bar.draw(index + 1)

    #confidence=sorted(confidence,key=lambda d:d[2],reverse=True)
    confidence0 = filter(lambda d: d[1] == 0, confidence)
    confidence1 = filter(lambda d: d[1] == 1, confidence)
    #dimention=min(len(confidence0),len(confidence1),training_count/2)
    #confidence0=confidence0#[:dimention]
    #confidence1=confidence1#[:dimention]
    print len(confidence0), len(confidence1)
    fout = open(RAW_DATA_DIR + 'label2trainset/%s_train.data' % attribute, 'w')
    uid_output = open(
        RAW_DATA_DIR + 'label2trainset/%s_train_uids.data' % attribute, 'w')
    #for d in confidence0+confidence1:
    for d in confidence:
        fout.write('%d %s\n' % (d[1], d[3]))
        uid_output.write('%s\n' % d[0])
Beispiel #8
0
def construct_test_set(attribute):
    # Write a mallet/libsvm-style test file for `attribute`:
    # one "<label> <feat_id>:<count> ..." line per sampled labeled test user.
    all_features = get_features(feature_file_name=base_dir +
                                '/features/mention.feature')
    collection = Connection().jd.test_users
    # per-label keep probabilities used to balance the class distribution
    balance_params = get_balance_params(attribute, collection)
    print balance_params
    bar = progress_bar(collection.count())
    fout = open(self_training_file_dir + 'test_%s.data' % attribute, 'w')
    for index, user in enumerate(collection.find()):
        # features = product purchase counts overlaid with mention counts
        features = dict(Counter(user['products']))
        for m in user['mentions']:
            features[m] = user['mentions'][m]
        try:
            # label = index of the 1 in the one-hot profile vector
            label = user['profile'][attribute].index(1)
        except Exception as e:
            continue  # no ground-truth label for this attribute
        if random.random() > balance_params[label]:
            continue  # random drop to balance classes
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue  # keep only features in the known vocabulary
            sorted_feature.append((all_features[f], features[f]))
        if len(sorted_feature) == 0:
            continue  # no usable features for this user
        fout.write('%d' % label)
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        for f in sorted_feature:
            fout.write(' %s:%d' % f)
        fout.write('\n')
        bar.draw(index + 1)
Beispiel #9
0
def validate_knn(attribute):
    # Leave-one-out KNN: predict each user's label distribution from its
    # top-100 neighbours and write "<true_label> <p0> <p1>" lines.
    print attribute
    features,labels=get_features_and_labels(attribute)
    neibor_count=100
    f=open('./%s_knn_result.data'%attribute,'w')
    bar=progress_bar(len(features))
    for index,uid1 in enumerate(features):
        distance=[]
        for uid2 in features:
            if uid2==uid1:
                continue  # skip self
            d=get_distance(features[uid1],features[uid2])
            distance.append((uid2,d))
        # NOTE(review): reverse=True keeps the LARGEST values as neighbours,
        # so get_distance presumably returns a similarity — confirm.
        distance=sorted(distance,key=lambda d:d[1],reverse=True)[:neibor_count]
        label=labels[uid1]
        plabel=numpy.zeros((len(label)),dtype='float32')
        for d in distance:
            #plabel+=d[1]*labels[d[0]]
            plabel+=labels[d[0]]  # unweighted vote of neighbour labels
        if sum(plabel)==0:
            continue  # no labeled neighbours to vote
        plabel/=sum(plabel)
        if label[0]>0:
            f.write('%d %f %f\n'%(0,plabel[0],plabel[1]))
        else:
            f.write('%d %f %f\n'%(1,plabel[0],plabel[1]))
        bar.draw(index+1)
Beispiel #10
0
def age_distribute():
    # Backfill binary age labels ([under-30, 30-plus]) for test users from
    # their linked weibo birthday, then print the cumulative distribution.
    from small_utils.progress_bar import progress_bar
    from pymongo import Connection
    from collections import Counter
    collection=Connection().jd.test_users
    weibo_collection=Connection().jd.weibo_users
    linked_jd_ids=dict()
    ages=[]
    # "weibo_uid jd_uid" lines -> {jd_uid: weibo_uid}
    for line in open('/mnt/data1/adoni/data/linked_uids.data'):
        linked_jd_ids[line[:-1].split(' ')[1]]=line.split(' ')[0]
    bar=progress_bar(collection.count())
    for index,user in enumerate(collection.find()):
        if sum(user['profile']['age'])==0:
            continue  # one-hot vector all zero: no age label at all
        # NOTE(review): raises KeyError when the user is missing from the
        # link file — presumably all age-labeled users are linked; confirm.
        weibo_id=linked_jd_ids[user['_id']]
        weibo_user=weibo_collection.find_one({'_id':weibo_id})
        if weibo_user==None:
            continue
        # birthday starts with the year followed by u'年'
        age=2015-int(weibo_user['birthday'].split(u'年')[0])
        if age>50 or age<10:
            continue  # drop implausible ages
        ages.append(age)
        if age<30:
            user['profile']['age']=[1,0]
        else:
            user['profile']['age']=[0,1]
        collection.update({'_id':user['_id']},{'$set':{'profile':user['profile']}})
        bar.draw(index)
    s=sum(Counter(ages).values())
    ages=sorted(Counter(ages).items(),key=lambda d:d[0])
    ss=0.
    for age in ages:
        ss+=age[1]
        print age[0],(ss)/s  # age, cumulative fraction of users
Beispiel #11
0
def construct_test_data(attribute):
    # Build [uid, label, {feature_id: value}] rows for every labeled test
    # user and dump them in mallet format via output().
    collection = Connection().jd.test_users
    all_features = get_features(feature_file_name=feature_file_name)
    # review features get ids after the mention/product feature ids
    review_features = get_features(feature_file_name=base_dir +
                                   '/features/review.feature',
                                   start_index=max(all_features.values()) + 1)
    data = []
    bar = progress_bar(collection.count())
    for index, user in enumerate(collection.find()):
        uid = user['_id']
        features = combine_features(user['mentions'],
                                    Counter(user['products']))
        try:
            # label = index of the 1 in the one-hot profile vector
            y = user['profile'][attribute].index(1)
        except:
            continue  # unlabeled for this attribute
        x = dict()
        for f in features:
            if f not in all_features:
                continue  # drop features outside the vocabulary
            x[all_features[f]] = features[f]

        for f, v in Counter(user['review']).items():
            if f not in review_features:
                continue
            x[review_features[f]] = v
        data.append([uid, y, x])
        bar.draw(index + 1)
    #data=balance(data,target_index=1)
    output(RAW_DATA_DIR + 'mallet/mallet_test_%s.data' % attribute, data)
Beispiel #12
0
def construct_train_data():
    # Build [uid, random_label, features] rows for train users NOT in the
    # test uid set and dump them in mallet format.
    # NOTE(review): y is random — presumably a placeholder overwritten or
    # ignored by the downstream (constraint-driven) trainer; confirm.
    import random
    all_features = get_features(feature_file_name=feature_file_name)
    # review features get ids after the mention/product feature ids
    review_features = get_features(feature_file_name=base_dir +
                                   '/features/review.feature',
                                   start_index=max(all_features.values()) + 1)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    data = []
    uids = get_test_uids()
    for index, user in enumerate(collection.find()):
        uid = user['_id']
        if uid in uids:
            continue  # exclude held-out test users
        features = combine_features(user['mentions'],
                                    Counter(user['products']))
        x = dict()
        for f in features:
            if f not in all_features:
                continue  # drop features outside the vocabulary
            x[all_features[f]] = features[f]
        for f, v in Counter(user['review']).items():
            if f not in review_features:
                continue
            x[review_features[f]] = v
        y = random.randint(0, 1)
        data.append([uid, y, x])
        bar.draw(index + 1)
    output(RAW_DATA_DIR + 'mallet/mallet_train.data', data)
Beispiel #13
0
def construct_train_data():
    # Build [uid, random_label, features] rows for train users NOT in the
    # test uid set and dump them in mallet format.
    # NOTE(review): y is random — presumably a placeholder overwritten or
    # ignored downstream; confirm.
    import random
    all_features=get_features(feature_file_name=feature_file_name)
    # review features get ids after the mention/product feature ids
    review_features=get_features(feature_file_name=base_dir+'/features/review.feature',start_index=max(all_features.values())+1)
    collection=Connection().jd.train_users
    bar=progress_bar(collection.count())
    data=[]
    uids=get_test_uids()
    for index,user in enumerate(collection.find()):
        uid=user['_id']
        if uid in uids:
            continue  # exclude held-out test users
        features=combine_features(user['mentions'],Counter(user['products']))
        x=dict()
        for f in features:
            if f not in all_features:
                continue  # drop features outside the vocabulary
            x[all_features[f]]=features[f]
        for f,v in Counter(user['review']).items():
            if f not in review_features:
                continue
            x[review_features[f]]=v
        y=random.randint(0,1)
        data.append([uid,y,x])
        bar.draw(index+1)
    output(RAW_DATA_DIR+'mallet/mallet_train.data',data)
Beispiel #14
0
def construct_test_set(attribute):
    # Write balanced mention-feature test data plus a parallel uid file
    # under label2trainset/. Values use %f (cf. %d in the other variants).
    all_features = get_features(feature_file_name=feature_file_name)
    collection = Connection().jd.test_users
    # per-label keep probabilities used to balance the class distribution
    balance_params = get_balance_params(attribute, collection)
    print balance_params
    bar = progress_bar(collection.count())
    fout = open(RAW_DATA_DIR + 'label2trainset/%s_test.data' % attribute, 'w')
    uid_output = open(
        RAW_DATA_DIR + 'label2trainset/%s_test_uids.data' % attribute, 'w')
    for index, user in enumerate(collection.find()):
        try:
            # label = index of the 1 in the one-hot profile vector
            label = user['profile'][attribute].index(1)
        except Exception as e:
            continue  # unlabeled for this attribute
        if random.random() > balance_params[label]:
            continue  # random drop to balance classes
        features = user['mentions']
        #features=Counter(user['products'])
        #features=combine_features(user['mentions'],Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue  # drop features outside the vocabulary
            sorted_feature.append((all_features[f], features[f]))
        if len(sorted_feature) == 0:
            continue  # no usable features for this user
        fout.write('%d' % label)
        uid_output.write('%s\n' % user['_id'])
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        for f in sorted_feature:
            fout.write(' %s:%f' % f)
        fout.write('\n')
        bar.draw(index + 1)
Beispiel #15
0
def construct_all_data():
    '''
    The format of labeled_feature_file is as the same as mallet
    '''
    # Emits every train user with placeholder label 0 plus mention_1 and
    # product features; uids go to a parallel file.
    all_features = get_features(feature_file_name=feature_file_name)
    # '_1' mention features are indexed after the base feature ids
    all_features_1 = get_features(feature_file_name=base_dir +
                                  '/features/mention_1.feature',
                                  start_index=max(all_features.values()) + 1)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    fout = open(RAW_DATA_DIR + 'mylabel2trainset/all_train.data', 'w')
    uid_output = open(RAW_DATA_DIR + 'mylabel2trainset/all_train_uids.data',
                      'w')
    for index, user in enumerate(collection.find()):
        label = 0
        fout.write('%d' % label)
        uid_output.write('%s\n' % user['_id'])
        features = combine_features(user['mentions_1'],
                                    Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue  # drop features outside the vocabulary
            sorted_feature.append((all_features[f], features[f]))
        for f, v in user['mentions_1_1'].items():
            f = f + '_1'  # secondary mentions are namespaced with '_1'
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f], v))
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        for f in sorted_feature:
            fout.write(' %s:%d' % f)
        fout.write('\n')
        bar.draw(index + 1)
Beispiel #16
0
 def train(self,data):
     # Propagation over the bipartite U-V graph: each node's vector is a
     # convex blend of its own init vector and the sum of its neighbours'
     # current vectors. Returns (f_u, f_v) after self.max_iter sweeps.
     f_u=dict()
     f_v=dict()
     # C_u/C_v: the largest init-vector mass on each side, used to scale
     # the per-node alpha into [0, 1]
     C_u=numpy.max(numpy.sum(data.f_u_init.values(),axis=1))
     C_v=numpy.max(numpy.sum(data.f_v_init.values(),axis=1))
     for u in data.U:
         f_u[u]=data.f_u_init[u]
     for v in data.V:
         f_v[v]=data.f_v_init[v]
     bar=progress_bar(self.max_iter)
     t1=datetime.datetime.now()
     for index in xrange(self.max_iter):
         for u in data.U:
             n_u=numpy.zeros((data.vector_size))
             for v in data.U[u]:  # data.U maps u -> its neighbour v's
                 n_u+=f_v[v]
             # alpha: how strongly the node trusts its own init vector
             alpha=numpy.sum(data.f_u_init[u])/C_u
             f_u[u]=alpha*data.f_u_init[u]+(1.-alpha)*n_u
         for v in data.V:
             n_v=numpy.zeros((data.vector_size))
             for u in data.V[v]:  # data.V maps v -> its neighbour u's
                 n_v+=f_u[u]
             alpha=numpy.sum(data.f_v_init[v])/C_v
             f_v[v]=alpha*data.f_v_init[v]+(1.-alpha)*n_v
         t2=datetime.datetime.now()
         print 'Iter: %d, minutes: %d'%(index,(t2-t1).seconds/60)
         t1=t2
         #bar.draw(index+1)
     return f_u,f_v
Beispiel #17
0
def prp_product(u_collection,p_collection,iterate):
    print 'prp product'
    pids=get_id(p_collection,iterate)
    print len(pids)
    bar=progress_bar(len(pids))
    for index,pid in enumerate(pids):
        prp_single_product(u_collection,p_collection,pid,iterate)
        bar.draw(index+1)
Beispiel #18
0
def prp_user(u_collection, p_collection, iterate):
    print 'prp user'
    uids = get_id(u_collection, iterate)
    print len(uids)
    bar = progress_bar(len(uids))
    for index, uid in enumerate(uids):
        prp_single_user(u_collection, p_collection, uid, iterate)
        bar.draw(index + 1)
Beispiel #19
0
def output_user_product_graph():
    # Dump every (user, product) edge from the train and then the test
    # collection as "uid pid" lines into graph.data.
    fout = open(RAW_DATA_DIR + 'graph.data', 'w')
    for collection in (Connection().jd.train_users,
                       Connection().jd.test_users):
        bar = progress_bar(collection.count())
        for index, user in enumerate(collection.find()):
            for pid in user['products']:
                fout.write('%s %s\n' % (user['_id'], pid))
            bar.draw(index + 1)
Beispiel #20
0
def prp_product(u_collection, p_collection, iterate):
    print 'prp product'
    pids = get_id(p_collection, iterate)
    print len(pids)
    bar = progress_bar(len(pids))
    for index, pid in enumerate(pids):
        prp_single_product(u_collection, p_collection, pid, iterate)
        bar.draw(index + 1)
Beispiel #21
0
def prp_user(u_collection,p_collection,iterate):
    print 'prp user'
    uids=get_id(u_collection,iterate)
    print len(uids)
    bar=progress_bar(len(uids))
    for index,uid in enumerate(uids):
        prp_single_user(u_collection,p_collection,uid,iterate)
        bar.draw(index+1)
def get_train_user_products():
    # Return {user_id: {product_id: purchase_count}} for all train users.
    collection=Connection().jd.train_users
    bar=progress_bar(collection.count())
    user_products={}
    position=0
    for user in collection.find():
        # product id list -> {product id: count}
        user_products[user['_id']]=dict(Counter(user['products']))
        bar.draw(position)
        position+=1
    return user_products
def construct_test_set(attribute):
    all_features = get_features(feature_file=feature_file_name)
    all_features_1 = get_features(feature_file=base_dir +
                                  '/features/mention_1.feature',
                                  existent_features=all_features)
    review_featuers = get_features(feature_file=base_dir +
                                   '/features/review.feature',
                                   existent_features=all_features_1)
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    print balance_params
    bar = progress_bar(collection.count())
    fout = open(
        RAW_DATA_DIR + 'iterate_label2trainset/%s_test.data' % attribute, 'w')
    uid_output = open(
        RAW_DATA_DIR + 'iterate_label2trainset/%s_test_uids.data' % attribute,
        'w')
    for index, user in enumerate(collection.find()):
        try:
            label = user['profile'][attribute].index(1)
        except Exception as e:
            continue
        if random.random() > balance_params[label]:
            continue

        features = combine_features(user['mentions_0'],
                                    Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))

        for f, v in user['mentions_1_1'].items():
            f = f + '_1'
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f], v))

        for f, v in Counter(user['review']).items():
            if f not in review_featuers:
                continue
            sorted_feature.append((review_featuers[f], v))

        if len(sorted_feature) == 0:
            continue
        fout.write('%d' % label)
        uid_output.write('%s\n' % user['_id'])
        keys = map(lambda d: d[0], sorted_feature)
        if not len(keys) == len(set(keys)):
            print Counter(keys).values()
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        for f in sorted_feature:
            fout.write(' %s:%f' % f)
        fout.write('\n')
        bar.draw(index + 1)
Beispiel #24
0
def get_train_uids():
    # WARNING: despite its name, this collects all train user ids and then
    # DELETES every one of them from jd.train_users. Nothing is returned.
    collection = Connection().jd.train_users
    uids = set()
    for user in collection.find():
        uids.add(user['_id'])
    # fresh collection handle, then remove the users one by one
    collection = Connection().jd.train_users
    bar = progress_bar(len(uids))
    for index, uid in enumerate(uids):
        collection.delete_one({'_id': uid})
        bar.draw(index + 1)
Beispiel #25
0
def get_train_uids():
    # WARNING: despite its name, this collects all train user ids and then
    # DELETES every one of them from jd.train_users. Nothing is returned.
    collection=Connection().jd.train_users
    uids=set()
    for user in collection.find():
        uids.add(user['_id'])
    # fresh collection handle, then remove the users one by one
    collection=Connection().jd.train_users
    bar=progress_bar(len(uids))
    for index,uid in enumerate(uids):
        collection.delete_one({'_id':uid})
        bar.draw(index+1)
Beispiel #26
0
def statistics(attribute,
               threshold=-1,
               feature_file_name=base_dir + '/features/mention.feature',
               show=False):
    # Per-feature label distribution over labeled test users:
    # distribute[f] = [count under label 0, count under label 1],
    # normalized first by label prior and then per feature.
    # Returns the dict unless show=True, in which case the 50 most
    # label-skewed features are printed instead.
    import random
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    all_features = get_features(feature_file_name)
    bar = progress_bar(collection.count())
    distribute = dict([f, [0., 0.]] for f in all_features)
    labels_distribute = [0., 0.]
    for index, user in enumerate(collection.find()):
        try:
            label = user['profile'][attribute].index(1)
        except:
            continue  # unlabeled for this attribute
        #if random.random()>balance_params[label]:
        #    continue
        # mentions overlaid with product purchase counts
        features = dict(user['mentions'])
        products = Counter(user['products'])
        for p in products:
            features[p] = products[p]
        if len(features) < 10:
            continue  # too sparse to be informative
        for f in features:
            if f in distribute:
                distribute[f][label] += 1  #features[f]
        labels_distribute[label] += 1
        bar.draw(index)
    # drop rare features. NOTE(review): Python 2 only — .keys() returns a
    # list there, so popping while iterating is safe; breaks on Python 3.
    for f in distribute.keys():
        if sum(distribute[f]) < threshold:
            distribute.pop(f)
    print labels_distribute
    for f in distribute:
        distribute[f][0] /= labels_distribute[0]
        distribute[f][1] /= labels_distribute[1]
    for f in distribute:
        s = sum(distribute[f])
        distribute[f][0] /= s
        distribute[f][1] /= s
    if not show:
        return distribute
    #distribute=filter(lambda d:d[1][0]<d[1][1], distribute.items())
    # rank by how far the (0.1-smoothed) label ratio is from 50/50
    distribute = sorted(distribute.items(),
                        key=lambda d: abs(1 - 2 * (d[1][0] + 0.1) /
                                          (sum(d[1]) + 0.1)),
                        reverse=True)
    #distribute=sorted(distribute,key=lambda d:sum(d[1]), reverse=True)
    print ''
    for d in distribute[:50]:
        print '%s 0:%0.3f 1:%0.3f' % (
            d[0].encode('utf8'),
            (d[1][0] + 0.1) / (sum(d[1]) + 0.1),
            1 - (d[1][0] + 0.1) / (sum(d[1]) + 0.1),
        )
Beispiel #27
0
def construct_test_user():
    # Rebuild jd.test_users from the raw review dump: for each user read
    # (uid, products, review text), attach gender/age/location from the
    # linked weibo profile and a kids label from the uid list files.
    all_products = get_all_ids_from_file('product')
    collection = Connection().jd.test_users
    collection.drop()  # start from an empty collection
    linked_users = Connection().jd.weibo_users
    fname = RAW_DATA_DIR + 'test_user_review.data'
    uids_with_kids = [
        line[:-1] for line in open(RAW_DATA_DIR + 'uids_with_kids.data')
    ]
    uids_without_kids = [
        line[:-1] for line in open(RAW_DATA_DIR + 'uids_without_kids.data')
    ]
    # "weibo_uid jd_uid" lines -> {jd_uid: weibo_uid}
    linked_uids = dict([(line[:-1].split(' ')[1], line[:-1].split(' ')[0])
                        for line in open(RAW_DATA_DIR + 'linked_uids.data')])
    # mentions stripped from kids-labeled users below — presumably to
    # avoid trivial label leakage; confirm.
    prone_words = ['宝宝', '女儿', '儿子', '男朋友', '女朋友']
    f = open(fname)
    # file layout: first line is the user count, then 3 lines per user:
    # uid / space-separated product ids / raw review text
    count = int(f.readline()[:-1])
    bar = progress_bar(count)
    for i in xrange(count):
        uid = f.readline()[:-1]
        products = f.readline()[:-1].split(' ')
        products = list(set(products) & all_products)  # keep known products
        mentions = count_mentions(f.readline())
        # one-hot [class0, class1] slots; all-zero means unlabeled
        profile = {
            'gender': [0] * 2,
            'age': [0] * 2,
            'location': [0] * 2,
            'kids': [0] * 2,
        }
        if uid in linked_uids:
            user = linked_users.find_one({'_id': linked_uids[uid]})
            if user == None:
                pass  # linked weibo account not found: leave unlabeled
            else:
                profile['gender'] = user['profile']['gender']
                profile['age'] = user['profile']['age']
                profile['location'] = user['profile']['location']
        if uid in uids_with_kids:
            profile['kids'] = [0, 1]
        if uid in uids_without_kids:
            profile['kids'] = [1, 0]
        if uid in uids_without_kids or uid in uids_with_kids:
            for w in prone_words:
                if w in mentions:
                    mentions.pop(w)
        collection.insert({
            '_id': uid,
            'products': products,
            'mentions': mentions,
            'profile': profile
        })
        bar.draw(i + 1)
def construct_graph(fname, uids):
    print '==========='
    print fname
    print len(uids)
    bar = progress_bar(len(uids))
    fout = open(fname, 'w')
    index = 0
    for line in open('./remap_weibo_graph.data'):
        uid = line[0:line.find(' ')]
        if uid in uids:
            fout.write(line)
            index += 1
            bar.draw(index)
Beispiel #29
0
def construct_test_set(attribute):
    # Write the multi_clf test file for `attribute`: mention/product
    # features, optional '_1' mention features and deepwalk vector dims.
    all_features = get_features(feature_file_name=feature_file_name)
    # '_1' mention features are indexed after the base feature ids
    all_features_1 = get_features(feature_file_name=base_dir +
                                  '/features/mention_1.feature',
                                  start_index=max(all_features.values()) + 1)
    collection = Connection().jd.test_users
    # per-label keep probabilities used to balance the class distribution
    balance_params = get_balance_params(attribute, collection)
    print balance_params
    bar = progress_bar(collection.count())
    fout = open(RAW_DATA_DIR + 'multi_clf/%s_test.data' % attribute, 'w')
    for index, user in enumerate(collection.find()):
        try:
            label = user['profile'][attribute].index(1)
        except Exception as e:
            continue  # unlabeled for this attribute
        if random.random() > balance_params[label]:
            continue  # random drop to balance classes
        features = {}
        #features=user['mentions_0']
        #features=Counter(user['products'])
        features = combine_features(user['mentions_0'],
                                    Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue  # drop features outside the vocabulary
            sorted_feature.append((all_features[f], features[f]))
        # NOTE(review): this assignment empties mentions_1_1, so the loop
        # below never runs — looks like a deliberate kill-switch for the
        # '_1' features, but confirm before removing.
        user['mentions_1_1'] = {}
        for f, v in user['mentions_1_1'].items():
            f = f + '_1'
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f], v))
        if 'user_product_vector_from_deepwalk' in user:
            #if False:
            # deepwalk dims are appended after all other feature ids
            start_index = max(all_features_1.values()) + 1
            for i, v in enumerate(user['user_product_vector_from_deepwalk']):
                v = abs(v)
                sorted_feature.append((i + start_index, v))

        if len(sorted_feature) == 0:
            continue  # no usable features for this user
        fout.write('%d' % label)
        # diagnostic: warn when two sources map to the same feature id
        keys = map(lambda d: d[0], sorted_feature)
        if not len(keys) == len(set(keys)):
            print Counter(keys).values()
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        for f in sorted_feature:
            fout.write(' %s:%f' % f)
        fout.write('\n')
        bar.draw(index + 1)
Beispiel #30
0
def output_vector(entity_name):
    """Dump a mention-count vector for every entity of the named collection.

    Output format is word2vec-like: a header line "<entity count> <dimension>"
    followed by one "<id> <c1> <c2> ..." line per entity whose vector contains
    at least one non-zero count.
    """
    collection = Connection().jd[entity_name]
    mentions = get_mentions()
    total = collection.count()
    bar = progress_bar(total)
    fout = open(RAW_DATA_DIR + '%s_init_vec.data' % entity_name, 'w')
    fout.write('%d %d\n' % (total, len(mentions)))
    for index, entity in enumerate(collection.find()):
        # Each record holds the review text in slot [1]; join distinct texts.
        distinct_texts = set(record[1] for record in entity['records'])
        reviews = ' '.join(distinct_texts)
        vector = [reviews.count(m) for m in mentions]
        if numpy.any(vector):
            counts = ' '.join(str(c) for c in vector)
            fout.write('%s %s\n' % (entity['_id'], counts))
        bar.draw(index + 1)
Beispiel #31
0
def output_features(fname, key):
    """Aggregate the *key* field over all train users and write counts to *fname*.

    Each output line is "<feature> <count>", sorted by count descending.
    *key* names a per-user list field (e.g. 'review') whose items are tallied.
    """
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    features = []
    for index, user in enumerate(collection.find()):
        features += user[key]
        bar.draw(index + 1)
    features = sorted(Counter(features).items(),
                      key=lambda d: d[1],
                      reverse=True)
    # BUG FIX: the original reopened a hard-coded './features/review.feature'
    # here, silently ignoring the *fname* argument and leaking the handle
    # opened at the top of the function.  Write to *fname* as advertised.
    fout = open(fname, 'w')
    for f in features:
        fout.write('%s %d\n' % (f[0].encode('utf8'), f[1]))
    fout.close()
Beispiel #32
0
def statistics_after_train(attribute,method,threshold=-1,feature_file_name=base_dir+'/features/mention.feature',show=False):
    """Per-feature label distribution using labels produced by a trained model.

    Counts, for every known feature, how often it occurs with label 0 vs 1
    among jd.train_users, normalises first by overall label frequency and
    then per feature.  Returns the resulting dict unless *show* is set, in
    which case the 50 most label-polarised features are printed instead.
    """
    import random
    labels=get_labels_after_train(attribute,method)
    print len(labels)
    collection=Connection().jd.train_users
    label_distribute=Counter(labels.values())
    # Down-sampling weights per label (currently unused -- see commented code).
    balance_params=dict()
    for label in label_distribute:
        balance_params[label]=1.0*min(label_distribute.values())/label_distribute[label]
    all_features=get_features(feature_file_name)
    bar=progress_bar(collection.count())
    distribute=dict([f,[0.,0.]] for f in all_features)
    for index,user in enumerate(collection.find()):
        try:
            label=labels[user['_id']]
        except:
            continue
        #if random.random()>balance_params[label]:
        #    continue
        # Feature bag = mention counts overlaid with product counts.
        features=dict(user['mentions'])
        products=Counter(user['products'])
        for p in products:
            features[p]=products[p]
        for f in features:
            if f in distribute:
                distribute[f][label]+=1
        bar.draw(index)
    # Drop rare features.  (Py2 .keys() returns a list, so pop() while
    # iterating is safe here.)
    for f in distribute.keys():
        if sum(distribute[f])<threshold:
            distribute.pop(f)
    print label_distribute
    # Normalise by how often each label occurs overall ...
    for f in distribute:
        distribute[f][0]/=label_distribute[0]
        distribute[f][1]/=label_distribute[1]
    # ... then convert each row into a distribution over the two labels,
    # dropping features that were never counted at all.
    for f in distribute.keys():
        s=sum(distribute[f])
        if s==0:
            distribute.pop(f)
            continue
        distribute[f][0]/=s
        distribute[f][1]/=s
    if not show:
        return distribute
    #distribute=filter(lambda d:d[1][0]<d[1][1], distribute)
    # Most polarised features first; +0.1 smoothing avoids division artifacts.
    distribute=sorted(distribute.items(),key=lambda d:max(d[1])/sum(d[1]), reverse=True)
    #distribute=sorted(distribute,key=lambda d:sum(d[1]), reverse=True)
    print ''
    for d in distribute[:50]:
        print '%s 0:%0.3f 1:%0.3f'%(d[0].encode('utf8'), (d[1][0]+0.1)/(sum(d[1])+0.1),1-(d[1][0]+0.1)/(sum(d[1])+0.1),)
Beispiel #33
0
def construct_train_product():
    """Load product records from product_review.data into jd.train_products.

    File layout: first line is the record count; then for each product three
    lines follow -- pid, space-separated reviewer uids, and raw review text
    that count_mentions() turns into a mention-count dict.
    """
    all_users = get_all_ids_from_file('user') | get_all_ids_from_file(
        'test_user')
    collection = Connection().jd.train_products
    fname = RAW_DATA_DIR + 'product_review.data'
    f = open(fname)
    # Header line: number of product records that follow.
    count = int(f.readline()[:-1])
    bar = progress_bar(count)
    for i in xrange(count):
        pid = f.readline()[:-1]
        users = f.readline()[:-1].split(' ')
        # Keep only reviewers present in the known user/test_user id files.
        users = list(set(users) & all_users)
        mentions = count_mentions(f.readline())
        collection.insert({'_id': pid, 'users': users, 'mentions': mentions})
        bar.draw(i + 1)
def construct_test_set(attribute):
    """Write an SVMlight-style test file for *attribute* from jd.test_users.

    Features combine mention counts, product counts, '_1'-suffixed
    second-order mentions and review-word counts.  Each kept user yields one
    "<label> <idx>:<val> ..." line plus its uid in a parallel uid file under
    RAW_DATA_DIR/iterate_label2trainset/.
    """
    all_features=get_features(feature_file=feature_file_name)
    all_features_1=get_features(feature_file=base_dir+'/features/mention_1.feature',existent_features=all_features)
    review_featuers=get_features(feature_file=base_dir+'/features/review.feature',existent_features=all_features_1)
    collection=Connection().jd.test_users
    balance_params=get_balance_params(attribute,collection)
    print balance_params
    bar=progress_bar(collection.count())
    fout=open(RAW_DATA_DIR+'iterate_label2trainset/%s_test.data'%attribute,'w')
    uid_output=open(RAW_DATA_DIR+'iterate_label2trainset/%s_test_uids.data'%attribute,'w')
    for index,user in enumerate(collection.find()):
        try:
            # Ground truth: position of the 1 in the one-hot profile vector.
            label=user['profile'][attribute].index(1)
        except Exception as e:
            continue
        # Random down-sampling to balance the label distribution.
        if random.random()>balance_params[label]:
            continue

        features=combine_features(user['mentions_0'],Counter(user['products']))
        sorted_feature=[]
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f],features[f]))

        # Second-order mention features are keyed with a '_1' suffix.
        for f,v in user['mentions_1_1'].items():
            f=f+'_1'
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f],v))

        for f,v in Counter(user['review']).items():
            if f not in review_featuers:
                continue
            sorted_feature.append((review_featuers[f],v))

        if len(sorted_feature)==0:
            continue
        fout.write('%d'%label)
        uid_output.write('%s\n'%user['_id'])
        # Sanity check: duplicate feature indices would corrupt the output.
        keys=map(lambda d:d[0], sorted_feature)
        if not len(keys)==len(set(keys)):
            print Counter(keys).values()
        sorted_feature=sorted(sorted_feature,key=lambda d:d[0])
        for f in sorted_feature:
            fout.write(' %s:%f'%f)
        fout.write('\n')
        bar.draw(index+1)
Beispiel #35
0
def construct_test_set(attribute):
    """Write the multi_clf test file for *attribute* from jd.test_users.

    One "<label> <idx>:<val> ..." line per kept user, combining product and
    mention features plus, when present, the user's deepwalk embedding
    (absolute component values appended after all symbolic features).
    """
    all_features=get_features(feature_file_name=feature_file_name)
    all_features_1=get_features(feature_file_name=base_dir+'/features/mention_1.feature',start_index=max(all_features.values())+1)
    collection=Connection().jd.test_users
    balance_params=get_balance_params(attribute,collection)
    print balance_params
    bar=progress_bar(collection.count())
    fout=open(RAW_DATA_DIR+'multi_clf/%s_test.data'%attribute,'w')
    for index,user in enumerate(collection.find()):
        try:
            # Ground truth: position of the 1 in the one-hot profile vector.
            label=user['profile'][attribute].index(1)
        except Exception as e:
            continue
        # Random down-sampling to balance the label distribution.
        if random.random()>balance_params[label]:
            continue
        features={}
        #features=user['mentions_0']
        #features=Counter(user['products'])
        features=combine_features(user['mentions_0'],Counter(user['products']))
        sorted_feature=[]
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f],features[f]))
        # NOTE(review): mentions_1_1 is cleared here, so the loop below is a
        # no-op -- apparently a deliberate switch to disable these features.
        user['mentions_1_1']={}
        for f,v in user['mentions_1_1'].items():
            f=f+'_1'
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f],v))
        if 'user_product_vector_from_deepwalk' in user:
        #if False:
            start_index=max(all_features_1.values())+1
            for i,v in enumerate(user['user_product_vector_from_deepwalk']):
                # Embedding components are folded to their absolute value.
                v=abs(v)
                sorted_feature.append((i+start_index,v))

        if len(sorted_feature)==0:
            continue
        fout.write('%d'%label)
        # Sanity check: duplicate feature indices would corrupt the output.
        keys=map(lambda d:d[0], sorted_feature)
        if not len(keys)==len(set(keys)):
            print Counter(keys).values()
        sorted_feature=sorted(sorted_feature,key=lambda d:d[0])
        for f in sorted_feature:
            fout.write(' %s:%f'%f)
        fout.write('\n')
        bar.draw(index+1)
Beispiel #36
0
def get_a_random_path_from_graph(graph, length):
    """Sample a weighted random walk of *length* node ids from *graph*.

    Whenever the walk reaches a node with no adjacency entry
    (graph[nid] is None), it restarts from a fresh weighted start node.
    """
    from small_utils.progress_bar import progress_bar
    current = weighted_random_select(graph.nodes_weight)
    walk = [current]
    bar = progress_bar(length - 1)

    for step in xrange(length - 1):
        neighbours = graph[current]
        # Dead end: re-seed from the global node-weight distribution.
        while neighbours is None:
            current = weighted_random_select(graph.nodes_weight)
            neighbours = graph[current]
        current = weighted_random_select(neighbours)
        walk.append(current)
        bar.draw(step)
    return walk
Beispiel #37
0
def get_a_random_path_from_graph(graph, length):
    """Return a weighted random walk containing *length* node ids."""
    from small_utils.progress_bar import progress_bar

    bar = progress_bar(length - 1)
    nid = weighted_random_select(graph.nodes_weight)
    path = [nid]
    step = 0
    while step < length - 1:
        node = graph[nid]
        if node is None:
            # Dead end: draw a new start node and re-check without
            # advancing the step counter.
            nid = weighted_random_select(graph.nodes_weight)
            continue
        nid = weighted_random_select(node)
        path.append(nid)
        bar.draw(step)
        step += 1
    return path
Beispiel #38
0
def statistics(attribute,threshold=-1,feature_file_name=base_dir+'/features/mention.feature',show=False):
    """Per-feature label distribution for *attribute* over jd.test_users.

    Counts how often each known feature co-occurs with label 0 vs 1,
    normalises by label frequency and then per feature.  Returns the dict
    unless *show* is set, in which case the 50 most polarised features are
    printed instead.
    """
    import random
    collection=Connection().jd.test_users
    balance_params=get_balance_params(attribute,collection)
    all_features=get_features(feature_file_name)
    bar=progress_bar(collection.count())
    distribute=dict([f,[0.,0.]] for f in all_features)
    labels_distribute=[0.,0.]
    for index,user in enumerate(collection.find()):
        try:
            # Ground truth: position of the 1 in the one-hot profile vector.
            label=user['profile'][attribute].index(1)
        except:
            continue
        #if random.random()>balance_params[label]:
        #    continue
        # Feature bag = mention counts overlaid with product counts.
        features=dict(user['mentions'])
        products=Counter(user['products'])
        for p in products:
            features[p]=products[p]
        # Users with too few features are treated as unreliable evidence.
        if len(features)<10:
            continue
        for f in features:
            if f in distribute:
                distribute[f][label]+=1#features[f]
        labels_distribute[label]+=1
        bar.draw(index)
    # Drop rare features.  (Py2 .keys() returns a list, so pop() is safe.)
    for f in distribute.keys():
        if sum(distribute[f])<threshold:
            distribute.pop(f)
    print labels_distribute
    # Normalise by overall label frequency ...
    for f in distribute:
        distribute[f][0]/=labels_distribute[0]
        distribute[f][1]/=labels_distribute[1]
    # ... then to a per-feature distribution over labels.
    # NOTE(review): unlike statistics_after_train there is no s==0 guard, so
    # a feature never seen with either label would divide by zero here.
    for f in distribute:
        s=sum(distribute[f])
        distribute[f][0]/=s
        distribute[f][1]/=s
    if not show:
        return distribute
    #distribute=filter(lambda d:d[1][0]<d[1][1], distribute.items())
    # Most polarised features first; +0.1 smoothing avoids division artifacts.
    distribute=sorted(distribute.items(),key=lambda d:abs(1-2*(d[1][0]+0.1)/(sum(d[1])+0.1)), reverse=True)
    #distribute=sorted(distribute,key=lambda d:sum(d[1]), reverse=True)
    print ''
    for d in distribute[:50]:
        print '%s 0:%0.3f 1:%0.3f'%(d[0].encode('utf8'), (d[1][0]+0.1)/(sum(d[1])+0.1),1-(d[1][0]+0.1)/(sum(d[1])+0.1),)
def test(attribute):
    """Smoke-test statistics(): gather labels for up to ~100k test users,
    score mention features against them and print the 50 best."""
    from pymongo import Connection
    collection=Connection().jd.test_users
    bar=progress_bar(collection.count())
    labels=dict()
    for index,user in enumerate(collection.find()):
        try:
            # Ground truth: position of the 1 in the one-hot profile vector.
            label=user['profile'][attribute].index(1)
        except:
            continue
        labels[user['_id']]=label
        bar.draw(index+1)
        # Cap the scan so the smoke test stays quick.
        if index>100000:
            break
    score,feature_distribute=statistics(labels,feature_file_name=base_dir+'/features/mention.feature',threshold=20)
    for f,v in sorted(score.items(),key=lambda d:d[1],reverse=True)[:50]:
        print f,'0:%0.2f 1:%0.2f'%tuple(feature_distribute[f])
    # Spot-check one known mention ("classmate").
    print feature_distribute[u'同学']
Beispiel #40
0
def statistics(labels,
               feature_file_name,
               threshold,
               collection=None):
    """Score features by how unevenly they co-occur with the given labels.

    *labels* maps user id -> integer label.  Counts per-feature label
    occurrences over *collection* (default: jd.train_users), normalises by
    label frequency and then per feature, and returns
    (score, feature_distribute).
    """
    # BUG FIX: the default used to be evaluated in the signature as
    # Connection().jd.train_users, which opened a Mongo connection as a side
    # effect at import time and shared one object across all calls.
    # Resolve the default lazily at call time instead.
    if collection is None:
        collection = Connection().jd.train_users
    label_dimention = max(labels.values()) + 1
    label_distribute = Counter(labels.values())
    # Dense per-label totals, with 0 for labels that never occur.
    label_distribute = [
        label_distribute[i] if i in label_distribute else 0
        for i in xrange(label_dimention)
    ]
    all_features = get_features(feature_file_name)
    bar = progress_bar(collection.count())
    feature_distribute = dict([f, [0.] * label_dimention]
                              for f in all_features)
    for index, user in enumerate(collection.find()):
        try:
            label = labels[user['_id']]
        except:
            continue
        features = combine_dict(user['mentions'], Counter(user['products']))
        for f in features:
            if f in feature_distribute:
                feature_distribute[f][label] += 1.0
        bar.draw(index)

    # Drop empty / rare features, then normalise by label frequency.
    # (Py2 .keys() returns a list, so pop() while iterating is safe.)
    for f in feature_distribute.keys():
        s = 1.0 * sum(feature_distribute[f])
        if s == 0 or s < threshold:
            feature_distribute.pop(f)
            continue
        for i in xrange(label_dimention):
            feature_distribute[f][i] /= label_distribute[i]

    # Convert each surviving row into a distribution over labels.
    for f in feature_distribute.keys():
        s = 1.0 * sum(feature_distribute[f])
        for i in xrange(label_dimention):
            feature_distribute[f][i] /= s
    score = dict()
    for f, v in feature_distribute.items():
        #score[f]=eta_score(v)
        score[f] = abs_score(v)
    return score, feature_distribute
Beispiel #41
0
def construct_train_user():
    from pyltp import Segmentor
    all_products = get_all_ids_from_file('product')
    collection = Connection().jd.train_users
    fname = RAW_DATA_DIR + 'user_review.data'
    f = open(fname)
    count = int(f.readline()[:-1])
    print count
    bar = progress_bar(count)
    for i in xrange(count):
        uid = f.readline()[:-1]
        products = f.readline()[:-1].split(' ')
        products = list(set(products) & all_products)
        mentions = count_mentions(f.readline())
        collection.insert({
            '_id': uid,
            'products': products,
            'mentions': mentions
        })
        bar.draw(i + 1)
Beispiel #42
0
def test(attribute):
    """Smoke-test statistics(): gather labels for up to ~100k test users and
    print the 50 highest-scoring mention features."""
    from pymongo import Connection
    collection = Connection().jd.test_users
    bar = progress_bar(collection.count())
    labels = dict()
    for index, user in enumerate(collection.find()):
        try:
            # Ground truth: position of the 1 in the one-hot profile vector.
            label = user['profile'][attribute].index(1)
        except:
            continue
        labels[user['_id']] = label
        bar.draw(index + 1)
        # Cap the scan so the smoke test stays quick.
        if index > 100000:
            break
    score, feature_distribute = statistics(labels,
                                           feature_file_name=base_dir +
                                           '/features/mention.feature',
                                           threshold=20)
    for f, v in sorted(score.items(), key=lambda d: d[1], reverse=True)[:50]:
        print f, '0:%0.2f 1:%0.2f' % tuple(feature_distribute[f])
    # Spot-check one known mention ("classmate").
    print feature_distribute[u'同学']
Beispiel #43
0
def analyze_feature_count(attribute):
    from small_utils.progress_bar import progress_bar
    from pymongo import Connection
    from collections import Counter
    collection=Connection().jd.test_users
    bar=progress_bar(collection.count())
    x=[]
    y=[]
    labels=[]
    for index,user in enumerate(collection.find()):
        try:
            label=user['profile'][attribute].index(1)
        except:
            continue
        labels.append(label)
        x.append(len(user['products']))
        y.append(len(user['mentions'].values()))
        bar.draw(index)
    f=open('./tmp.data','w')
    for i in xrange(len(labels)):
        f.write('%d %d %d\n'%(labels[i],x[i],y[i]))
    print Counter(labels)
Beispiel #44
0
def insert_review(collection, fname):
    """Scan a review dump and report its total character volume.

    NOTE(review): the unconditional `continue` below skips segmentation and
    the Mongo update on every iteration, so in its current state the
    function only accumulates review_count -- presumably a temporary
    measurement mode; confirm before re-enabling the update path.
    """
    from collections import Counter
    from pyltp import Segmentor
    f = open(fname)
    # Header line: number of user records that follow.
    count = int(f.readline()[:-1])
    print count
    segmentor = Segmentor()
    segmentor.load('/home/adoni/cws.model')
    bar = progress_bar(count)
    review_count = 0
    for i in xrange(count):
        uid = f.readline()[:-1]
        products = f.readline()
        # Reviews are '|&|'-delimited; flatten to one space-separated string.
        review = f.readline()[:-1].replace('|&|', ' ')
        review_count += len(review)
        continue
        # Dead code below (see NOTE above): segment and store the review.
        review = [w for w in segmentor.segment(review)]
        collection.update({'_id': uid}, {'$set': {
            'review': review
        }},
                          safe=True)
        bar.draw(i + 1)
    print review_count
def statistics(labels,feature_file_name,threshold,collection=None):
    """Score features by how unevenly they co-occur with the given labels.

    *labels* maps user id -> integer label.  Returns
    (score, feature_distribute), where feature_distribute maps each
    surviving feature to its normalised per-label distribution.
    """
    # BUG FIX: the default used to be Connection().jd.train_users evaluated
    # in the signature, which opened a Mongo connection at import time and
    # shared one object across all calls; resolve it lazily instead.
    if collection is None:
        collection=Connection().jd.train_users
    label_dimention=max(labels.values())+1
    label_distribute=Counter(labels.values())
    # Dense per-label totals, with 0 for labels that never occur.
    label_distribute=[label_distribute[i] if i in label_distribute else 0 for i in xrange(label_dimention)]
    all_features=get_features(feature_file_name)
    bar=progress_bar(collection.count())
    feature_distribute=dict([f,[0.]*label_dimention] for f in all_features)
    for index,user in enumerate(collection.find()):
        try:
            label=labels[user['_id']]
        except:
            continue
        features=combine_dict(user['mentions'],Counter(user['products']))
        for f in features:
            if f in feature_distribute:
                feature_distribute[f][label]+=1.0
        bar.draw(index)

    # Drop empty / rare features, then normalise by label frequency.
    # (Py2 .keys() returns a list, so pop() while iterating is safe.)
    for f in feature_distribute.keys():
        s=1.0*sum(feature_distribute[f])
        if s==0 or s<threshold:
            feature_distribute.pop(f)
            continue
        for i in xrange(label_dimention):
            feature_distribute[f][i]/=label_distribute[i]

    # Convert each surviving row into a distribution over labels.
    for f in feature_distribute.keys():
        s=1.0*sum(feature_distribute[f])
        for i in xrange(label_dimention):
            feature_distribute[f][i]/=s
    score=dict()
    for f,v in feature_distribute.items():
        #score[f]=eta_score(v)
        score[f]=abs_score(v)
    return score,feature_distribute
def remove_surrounding_nodes(fname):
    print 'Start'
    uids = [line.split(' ')[0] for line in open(fname)]
    uids = set(uids)
    if '/' in fname:
        out_file_name = '/'.join(
            fname.split(
                '/'
            )[:-1]
        ) + '/cleaned_' + fname.split('/')[-1]
    else:
        out_file_name = 'cleaned_' + fname
    fout = open(out_file_name, 'w')
    bar = progress_bar(len(uids))
    for index, line in enumerate(open(fname)):
        #bar.draw(index+1)
        if index % 10000 == 0:
            bar.draw(index + 1)

        line = line.strip().split(' ')
        line = filter(lambda uid: uid in uids, line)
        if len(line) <= 1:
            continue
        fout.write(' '.join(line) + '\n')
Beispiel #47
0
def construct_train_set(labeled_features, training_count):
    '''
    The format of labeled_feature_file is as the same as mallet.

    Pseudo-labels every train user from the labeled feature weights, keeps
    the most confident users of each class (up to training_count/2 each) and
    writes SVMlight-style lines to multi_clf/labeled_train.data.  Returns
    False when either class ends up empty, True otherwise.
    '''
    all_features = get_features(feature_file_name=feature_file_name)
    all_features_1 = get_features(feature_file_name=base_dir +
                                  '/features/mention_1.feature',
                                  start_index=max(all_features.values()) + 1)
    collection = Connection().jd.train_users

    bar = progress_bar(collection.count())
    confidence = []
    for index, user in enumerate(collection.find()):
        # Naive-Bayes-style vote of the labeled features over this user.
        label_distributed = [1, 1]
        for f, value in combine_features(user['mentions'],
                                         Counter(user['products'])).items():
            if f in labeled_features:
                label_distributed[0] *= labeled_features[f][0] * value
                label_distributed[1] *= labeled_features[f][1] * value
        s = 1.0 * sum(label_distributed)
        if not s == 0:
            label_distributed[0] /= s
            label_distributed[1] /= s
        # -1 marks users the vote could not separate.
        if label_distributed[0] > label_distributed[1]:
            label = 0
        elif label_distributed[0] < label_distributed[1]:
            label = 1
        else:
            label = -1
        features = {}
        #features=user['mentions_0']
        #features=Counter(user['products'])
        features = combine_features(user['mentions_0'],
                                    Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        # NOTE(review): mentions_1_1 is cleared here, making the next loop a
        # no-op -- apparently a deliberate switch to disable these features.
        user['mentions_1_1'] = {}
        for f, v in user['mentions_1_1'].items():
            f = f + '_1'
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f], v))

        if 'user_product_vector_from_deepwalk' in user:
            #if False:
            start_index = max(all_features_1.values()) + 1
            for i, v in enumerate(user['user_product_vector_from_deepwalk']):
                # Embedding components are folded to their absolute value.
                v = abs(v)
                sorted_feature.append((i + start_index, v))

        # Sanity check: duplicate feature indices would corrupt the output.
        keys = map(lambda d: d[0], sorted_feature)
        if not len(keys) == len(set(keys)):
            print Counter(keys).values()
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        str_features = ' '.join(map(lambda f: '%s:%f' % f, sorted_feature))
        confidence.append((
            user['_id'],
            label,
            abs(label_distributed[0] - label_distributed[1]),
            str_features,
            sum(user['mentions'].values()),
        ))
        bar.draw(index + 1)

    # Most confident users first within each predicted class; undecided
    # users (-1) are ranked by mention volume instead.
    confidence0 = filter(lambda d: d[1] == 0, confidence)
    confidence0 = sorted(confidence0, key=lambda d: d[2], reverse=True)
    confidence1 = filter(lambda d: d[1] == 1, confidence)
    confidence1 = sorted(confidence1, key=lambda d: d[2], reverse=True)
    confidence2 = filter(lambda d: d[1] == -1, confidence)
    confidence2 = sorted(confidence2, key=lambda d: d[4], reverse=True)

    dimention = min(len(confidence0), len(confidence1), training_count / 2)
    confidence0 = confidence0[:dimention]
    confidence1 = confidence1[:dimention]
    confidence2 = confidence2[:dimention]

    print len(confidence0), len(confidence1)

    if len(confidence0) == 0 or len(confidence1) == 0:
        return False
    labeled_train_data = open(RAW_DATA_DIR + 'multi_clf/labeled_train.data',
                              'w')
    for d in confidence0 + confidence1:
        labeled_train_data.write('%d %s\n' % (d[1], d[3]))

    # NOTE(review): unlabeled_train.data is opened into `labeled_train_data`
    # while `unlabeled_train_data` stays an in-memory StringIO, and the loop
    # iterates confidence0+confidence1 rather than confidence2 -- this looks
    # like a copy-paste slip; confirm intent before relying on that file.
    unlabeled_train_data = StringIO.StringIO()
    labeled_train_data = open(RAW_DATA_DIR + 'multi_clf/unlabeled_train.data',
                              'w')
    for d in confidence0 + confidence1:
        unlabeled_train_data.write('%d %s\n' % (d[1], d[3]))
    return True
Beispiel #48
0
def construct_train_set(labeled_features,training_count):
    '''
    The format of labeled_feature_file is as the same as mallet.

    Pseudo-labels every train user from the labeled feature weights, keeps
    the most confident users of each class (up to training_count/2 each) and
    writes SVMlight-style lines to multi_clf/labeled_train.data.  Returns
    False when either class ends up empty, True otherwise.
    '''
    all_features=get_features(feature_file_name=feature_file_name)
    all_features_1=get_features(feature_file_name=base_dir+'/features/mention_1.feature',start_index=max(all_features.values())+1)
    collection=Connection().jd.train_users

    bar=progress_bar(collection.count())
    confidence=[]
    for index,user in enumerate(collection.find()):
        # Naive-Bayes-style vote of the labeled features over this user.
        label_distributed=[1,1]
        for f,value in combine_features(user['mentions'],Counter(user['products'])).items():
            if f in labeled_features:
                label_distributed[0]*=labeled_features[f][0]*value
                label_distributed[1]*=labeled_features[f][1]*value
        s=1.0*sum(label_distributed)
        if not s==0:
            label_distributed[0]/=s
            label_distributed[1]/=s
        # -1 marks users the vote could not separate.
        if label_distributed[0]>label_distributed[1]:
            label=0
        elif label_distributed[0]<label_distributed[1]:
            label=1
        else:
            label=-1
        features={}
        #features=user['mentions_0']
        #features=Counter(user['products'])
        features=combine_features(user['mentions_0'],Counter(user['products']))
        sorted_feature=[]
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f],features[f]))
        # NOTE(review): mentions_1_1 is cleared here, making the next loop a
        # no-op -- apparently a deliberate switch to disable these features.
        user['mentions_1_1']={}
        for f,v in user['mentions_1_1'].items():
            f=f+'_1'
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f],v))

        if 'user_product_vector_from_deepwalk' in user:
        #if False:
            start_index=max(all_features_1.values())+1
            for i,v in enumerate(user['user_product_vector_from_deepwalk']):
                # Embedding components are folded to their absolute value.
                v=abs(v)
                sorted_feature.append((i+start_index,v))

        # Sanity check: duplicate feature indices would corrupt the output.
        keys=map(lambda d:d[0], sorted_feature)
        if not len(keys)==len(set(keys)):
            print Counter(keys).values()
        sorted_feature=sorted(sorted_feature,key=lambda d:d[0])
        str_features=' '.join(map(lambda f:'%s:%f'%f,sorted_feature))
        confidence.append(
                (user['_id'],
                    label,
                    abs(label_distributed[0]-label_distributed[1]),
                    str_features,
                    sum(user['mentions'].values()),
                    ))
        bar.draw(index+1)

    # Most confident users first within each predicted class; undecided
    # users (-1) are ranked by mention volume instead.
    confidence0=filter(lambda d:d[1]==0,confidence)
    confidence0=sorted(confidence0,key=lambda d:d[2],reverse=True)
    confidence1=filter(lambda d:d[1]==1,confidence)
    confidence1=sorted(confidence1,key=lambda d:d[2],reverse=True)
    confidence2=filter(lambda d:d[1]==-1,confidence)
    confidence2=sorted(confidence2,key=lambda d:d[4],reverse=True)

    dimention=min(len(confidence0),len(confidence1),training_count/2)
    confidence0=confidence0[:dimention]
    confidence1=confidence1[:dimention]
    confidence2=confidence2[:dimention]

    print len(confidence0),len(confidence1)

    if len(confidence0)==0 or len(confidence1)==0:
        return False
    labeled_train_data=open(RAW_DATA_DIR+'multi_clf/labeled_train.data','w')
    for d in confidence0+confidence1:
        labeled_train_data.write('%d %s\n'%(d[1],d[3]))

    # NOTE(review): unlabeled_train.data is opened into `labeled_train_data`
    # while `unlabeled_train_data` stays an in-memory StringIO, and the loop
    # iterates confidence0+confidence1 rather than confidence2 -- this looks
    # like a copy-paste slip; confirm intent before relying on that file.
    unlabeled_train_data=StringIO.StringIO()
    labeled_train_data=open(RAW_DATA_DIR+'multi_clf/unlabeled_train.data','w')
    for d in confidence0+confidence1:
        unlabeled_train_data.write('%d %s\n'%(d[1],d[3]))
    return True
def construct_test_set(attribute):
    """Write iterate_label2trainset test data for *attribute*.

    Combines five feature families (products, mentions, review words and two
    higher-order mention sets) into one SVMlight line per labelled test
    user, plus a parallel uid file.
    """
    product_features=get_features(feature_file=base_dir+'/features/product.feature')
    mention_features=get_features(feature_file=base_dir+'/features/mention.feature',existent_features=product_features)
    review_featuers=get_features(feature_file=base_dir+'/features/review.feature',existent_features=mention_features)
    mention_features_1=get_features(feature_file=base_dir+'/features/mention_1.feature',existent_features=review_featuers)
    mention_features_2=get_features(feature_file=base_dir+'/features/mention_2.feature',existent_features=mention_features_1)

    collection=Connection().jd.test_users
    balance_params=get_balance_params(attribute,collection)
    print 'Balance params: ',balance_params
    bar=progress_bar(collection.count())
    fout=open(RAW_DATA_DIR+'iterate_label2trainset/%s_test.data'%attribute,'w')
    uid_output=open(RAW_DATA_DIR+'iterate_label2trainset/%s_test_uids.data'%attribute,'w')
    for index,user in enumerate(collection.find()):
        try:
            # Ground truth: position of the 1 in the one-hot profile vector.
            label=user['profile'][attribute].index(1)
        except Exception as e:
            continue
        #if random.random()>balance_params[label]:
        #    continue

        # Stray no-op string literal kept from the original source.
        '============'
        x=[]

        #user['products']=[]
        for f,v in Counter(user['products']).items():
            if f not in product_features:
                continue
            x.append((product_features[f],v))

        #user['mentions_0']={}
        for f,v in user['mentions_0'].items():
            if f not in mention_features:
                continue
            x.append((mention_features[f],v))

        #user['review']=[]
        for f,v in Counter(user['review']).items():
            if f not in review_featuers:
                continue
            x.append((review_featuers[f],v))

        # NOTE(review): mentions_1 and mentions_2 are cleared before use, so
        # the two loops below are no-ops -- apparently deliberate switches
        # to disable these feature families.
        user['mentions_1']={}
        for f,v in user['mentions_1'].items():
            f=f+'_1'
            if f not in mention_features_1:
                continue
            x.append((mention_features_1[f],v))

        user['mentions_2']={}
        for f,v in user['mentions_2'].items():
            if f not in mention_features_2:
                continue
            x.append((mention_features_2[f],v))

        x=sorted(x,key=lambda d:d[0])
        str_x=' '.join(map(lambda f:'%s:%f'%f,x))

        fout.write('%d %s\n'%(label,str_x))
        uid_output.write('%s\n'%(user['_id']))
        bar.draw(index+1)
def construct_train_set(attribute,training_count):
    """Build iterate_label2trainset train data for *attribute*.

    Pseudo-labels train users via LabelArbiter, keeps the most confident
    users per class (up to training_count/2 each) and writes labelled and
    unlabelled SVMlight files plus parallel uid files.
    """
    product_features=get_features(feature_file=base_dir+'/features/product.feature')
    mention_features=get_features(feature_file=base_dir+'/features/mention.feature',existent_features=product_features)
    review_featuers=get_features(feature_file=base_dir+'/features/review.feature',existent_features=mention_features)
    mention_features_1=get_features(feature_file=base_dir+'/features/mention_1.feature',existent_features=review_featuers)
    mention_features_2=get_features(feature_file=base_dir+'/features/mention_2.feature',existent_features=mention_features_1)
    test_uids=get_test_uids()

    labeled_feature_file='%s/review_constraint_%s.constraints'%(labeled_feature_file_dir,attribute)
    label_arbiter=LabelArbiter(labeled_feature_file=labeled_feature_file)
    collection=Connection().jd.train_users
    bar=progress_bar(collection.count())
    guess=[]
    for index,user in enumerate(collection.find()):
        # Never leak test users into the training pool.
        if user['_id'] in test_uids:
            continue
        #features=combine_dict(user['mentions_0'],Counter(user['products']))
        # NOTE(review): Counter('products') counts the characters of the
        # literal string 'products', not the user's product list -- compare
        # the commented-out line above.  Looks like a typo; confirm intent.
        features=combine_dict(user['mentions_0'],Counter('products'))
        label,confidence=label_arbiter.arbitrate_label(features)
        x=[]

        #user['products']=[]
        for f,v in Counter(user['products']).items():
            if f not in product_features:
                continue
            x.append((product_features[f],v))

        #user['mentions_0']={}
        for f,v in user['mentions_0'].items():
            if f not in mention_features:
                continue
            x.append((mention_features[f],v))

        #user['review']=[]
        for f,v in Counter(user['review']).items():
            if f not in review_featuers:
                continue
            x.append((review_featuers[f],v))

        # NOTE(review): mentions_1 and mentions_2 are cleared before use, so
        # the two loops below are no-ops -- apparently deliberate switches.
        user['mentions_1']={}
        for f,v in user['mentions_1'].items():
            f=f+'_1'
            if f not in mention_features_1:
                continue
            x.append((mention_features_1[f],v))

        user['mentions_2']={}
        for f,v in user['mentions_2'].items():
            if f not in mention_features_2:
                continue
            x.append((mention_features_2[f],v))

        x=sorted(x,key=lambda d:d[0])
        str_x=' '.join(map(lambda f:'%s:%f'%f,x))
        guess.append(
                (user['_id'],
                    label,
                    abs(confidence),
                    str_x,
                    sum(user['mentions'].values()),
                    ))
        bar.draw(index+1)

    # Most confident users first within each predicted class; undecided
    # users (-1) are ranked by mention volume instead.
    data0=filter(lambda d:d[1]==0,guess)
    data0=sorted(data0,key=lambda d:d[2],reverse=True)
    data1=filter(lambda d:d[1]==1,guess)
    data1=sorted(data1,key=lambda d:d[2],reverse=True)
    data2=filter(lambda d:d[1]==-1,guess)
    data2=sorted(data2,key=lambda d:d[4],reverse=True)

    dimention=min(len(data0),len(data1),training_count/2)

    data0=data0[:dimention]
    data1=data1[:dimention]
    data2=data2[:dimention]


    fout=open(RAW_DATA_DIR+'iterate_label2trainset/%s_train.data'%attribute,'w')
    uid_output=open(RAW_DATA_DIR+'iterate_label2trainset/%s_train_uids.data'%attribute,'w')
    for d in data0+data1:
        fout.write('%d %s\n'%(d[1],d[3]))
        uid_output.write('%s\n'%d[0])

    # Undecided users go to the unlabelled pool for self-training.
    fout=open(RAW_DATA_DIR+'iterate_label2trainset/%s_train_unlabel.data'%attribute,'w')
    uid_output=open(RAW_DATA_DIR+'iterate_label2trainset/%s_train_unlabel_uids.data'%attribute,'w')
    for d in data2:
        fout.write('%d %s\n'%(d[1],d[3]))
        uid_output.write('%s\n'%d[0])
def construct_train_set(attribute,training_count):
    '''
    The format of labeled_feature_file is as the same as mallet
    '''
    all_features=get_features(feature_file=feature_file_name)
    all_features_1=get_features(feature_file=base_dir+'/features/mention_1.feature',existent_features=all_features)
    review_featuers=get_features(feature_file=base_dir+'/features/review.feature',existent_features=all_features_1)
    labeled_feature_file=open('%s/review_constraint_%s.constraints'%(labeled_feature_file_dir,attribute))
    label_arbiter=LabelArbiter(labeled_feature_file='%s/review_constraint_%s.constraints'%(labeled_feature_file_dir,attribute))
    labeled_features=dict()
    for line in labeled_feature_file:
        line=line[:-1].split(' ')
        labeled_features[line[0].decode('utf8')]=map(lambda d:float(d.split(':')[1]),line[1:])
    collection=Connection().jd.train_users

    bar=progress_bar(collection.count())
    confidence=[]
    for index,user in enumerate(collection.find()):
        label_distributed=[1,1]
        for f,value in combine_features(user['mentions'],Counter('products')).items():
            if f in labeled_features:
                label_distributed[0]*=labeled_features[f][0]*value
                label_distributed[1]*=labeled_features[f][1]*value
        s=1.0*sum(label_distributed)
        if not s==0:
            label_distributed[0]/=s
            label_distributed[1]/=s
        label_distributed=label_arbiter.get_label_distribute(combine_features(user['mentions'],Counter('products')))
        if label_distributed[0]>label_distributed[1]:
            label=0
        elif label_distributed[0]<label_distributed[1]:
            label=1
        else:
            label=-1

        features=combine_features(user['mentions_0'],Counter(user['products']))
        sorted_feature=[]
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f],features[f]))

        user['mentions_1_1']={}
        for f,v in user['mentions_1_1'].items():
            f=f+'_1'
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f],v))

        for f,v in Counter(user['review']).items():
            if f not in review_featuers:
                continue
            sorted_feature.append((review_featuers[f],v))

        keys=map(lambda d:d[0], sorted_feature)
        if not len(keys)==len(set(keys)):
            print Counter(keys).values()
        sorted_feature=sorted(sorted_feature,key=lambda d:d[0])
        str_features=' '.join(map(lambda f:'%s:%f'%f,sorted_feature))
        confidence.append(
                (user['_id'],
                    label,
                    abs(label_distributed[0]-label_distributed[1]),
                    str_features,
                    sum(user['mentions'].values()),
                    ))
        bar.draw(index+1)

    confidence0=filter(lambda d:d[1]==0,confidence)
    confidence0=sorted(confidence0,key=lambda d:d[2],reverse=True)
    confidence1=filter(lambda d:d[1]==1,confidence)
    confidence1=sorted(confidence1,key=lambda d:d[2],reverse=True)
    confidence2=filter(lambda d:d[1]==-1,confidence)
    confidence2=sorted(confidence2,key=lambda d:d[4],reverse=True)

    dimention=min(len(confidence0),len(confidence1),training_count/2)
    confidence0=confidence0[:dimention]
    confidence1=confidence1[:dimention]
    confidence2=confidence2[:dimention]


    fout=open(RAW_DATA_DIR+'iterate_label2trainset/%s_train.data'%attribute,'w')
    uid_output=open(RAW_DATA_DIR+'iterate_label2trainset/%s_train_uids.data'%attribute,'w')
    for d in confidence0+confidence1:
        fout.write('%d %s\n'%(d[1],d[3]))
        uid_output.write('%s\n'%d[0])

    fout=open(RAW_DATA_DIR+'iterate_label2trainset/%s_train_unlabel.data'%attribute,'w')
    uid_output=open(RAW_DATA_DIR+'iterate_label2trainset/%s_train_unlabel_uids.data'%attribute,'w')
    for d in confidence2:
        fout.write('%d %s\n'%(d[1],d[3]))
        uid_output.write('%s\n'%d[0])
Beispiel #52
0
def construct_train_set(attribute,training_count):
    '''
    The format of labeled_feature_file is as the same as mallet
    '''
    all_features=get_features(feature_file_name=base_dir+'/features/mention.feature')
    labeled_feature_file=open('%s/review_constraint_%s.constraints'%(labeled_feature_file_dir,attribute))
    labeled_features=dict()
    for line in labeled_feature_file:
        line=line[:-1].split(' ')
        labeled_features[line[0].decode('utf8')]=map(lambda d:float(d.split(':')[1]),line[1:])

    collection=Connection().jd.train_users
    bar=progress_bar(collection.count())
    confidence=[]
    for index,user in enumerate(collection.find()):
        features=dict(Counter(user['products']))
        for m in user['mentions']:
            features[m]=user['mentions'][m]
        label_distributed=[1,1]
        for f,value in user['mentions'].items():
            if f in labeled_features:
                label_distributed[0]*=labeled_features[f][0]*value
                label_distributed[1]*=labeled_features[f][1]*value
        s=1.0*sum(label_distributed)
        label_distributed[0]/=s
        label_distributed[1]/=s
        #print label_distributed
        #if abs(label_distributed[0]-label_distributed[1])<0.5:
        #    continue
        if label_distributed[0]>label_distributed[1]:
            label=0
        elif label_distributed[0]<label_distributed[1]:
            label=1
        else:
            label=-1
        sorted_feature=[]
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f],features[f]))
        sorted_feature=sorted(sorted_feature,key=lambda d:d[0])
        str_features=' '.join(map(lambda f:'%s:%d'%f,sorted_feature))
        confidence.append((
                    user['_id'],
                    label,
                    abs(label_distributed[0]-label_distributed[1]),
                    str_features
                    ))
        bar.draw(index+1)

    confidence=sorted(confidence,key=lambda d:d[2],reverse=True)
    confidence0=filter(lambda d:d[1]==0,confidence)[:training_count/2]
    confidence1=filter(lambda d:d[1]==1,confidence)[:training_count/2]
    confidence_unlabel=[]
    confidence_unlabel+=filter(lambda d:d[1]==-1,confidence)
    #confidence_unlabel+=filter(lambda d:d[1]==0,confidence)[training_count/2:training_count*5]
    #confidence_unlabel+=filter(lambda d:d[1]==1,confidence)[training_count/2:training_count*5]
    confidence_unlabel=confidence_unlabel[:5*training_count]
    print len(confidence0),len(confidence1)
    fout=open(self_training_file_dir+'labeled_train_%s.data'%attribute,'w')
    for d in set(confidence0+confidence1):
        fout.write('%d %s\n'%(d[1],d[3]))
    fout_unlabel=open(self_training_file_dir+'unlabeled_train_%s.data'%attribute,'w')
    for d in confidence_unlabel:
        fout_unlabel.write('%d %s\n'%(d[1],d[3]))