def construct_all_data():
    '''
    The format of labeled_feature_file is as the same as mallet.

    Dump every train user as a label-0 instance to all_train.data and the
    matching user id to all_train_uids.data (mylabel2trainset variant).
    '''
    all_features = get_features(feature_file_name=feature_file_name)
    # Secondary feature ids start right after the primary ones to avoid clashes.
    all_features_1 = get_features(feature_file_name=base_dir +
                                  '/features/mention_1.feature',
                                  start_index=max(all_features.values()) + 1)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    # 'with' guarantees both output files are flushed and closed even on
    # error (the originals were never closed explicitly).
    with open(RAW_DATA_DIR + 'mylabel2trainset/all_train.data', 'w') as fout, \
         open(RAW_DATA_DIR + 'mylabel2trainset/all_train_uids.data',
              'w') as uid_output:
        for index, user in enumerate(collection.find()):
            label = 0  # unlabeled dump: every instance gets the dummy label 0
            fout.write('%d' % label)
            uid_output.write('%s\n' % user['_id'])
            features = combine_features(user['mentions_1'],
                                        Counter(user['products']))
            sorted_feature = []
            for f in features:
                if f not in all_features:
                    continue
                sorted_feature.append((all_features[f], features[f]))
            for f, v in user['mentions_1_1'].items():
                f = f + '_1'  # suffix keeps these keys in their own namespace
                if f not in all_features_1:
                    continue
                sorted_feature.append((all_features_1[f], v))
            # mallet expects feature ids in ascending order
            sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
            for f in sorted_feature:
                fout.write(' %s:%d' % f)
            fout.write('\n')
            bar.draw(index + 1)
def construct_train_data():
    """Build mallet training instances from train users, skipping held-out
    test uids.

    Each instance is [uid, random 0/1 placeholder label, {feature_id: value}]
    and the list is written via output() to mallet/mallet_train.data.
    """
    import random
    all_features = get_features(feature_file_name=feature_file_name)
    review_features = get_features(feature_file_name=base_dir +
                                   '/features/review.feature',
                                   start_index=max(all_features.values()) + 1)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    excluded = get_test_uids()
    data = []
    for idx, user in enumerate(collection.find()):
        if user['_id'] in excluded:
            continue  # reserved for the test set
        combined = combine_features(user['mentions'],
                                    Counter(user['products']))
        vector = dict((all_features[name], combined[name])
                      for name in combined if name in all_features)
        for name, count in Counter(user['review']).items():
            if name in review_features:
                vector[review_features[name]] = count
        # label is random: downstream tooling only needs a placeholder here
        data.append([user['_id'], random.randint(0, 1), vector])
        bar.draw(idx + 1)
    output(RAW_DATA_DIR + 'mallet/mallet_train.data', data)
def construct_test_data(attribute):
    '''
    Build mallet test instances [uid, label, {feature_id: value}] for one
    profile attribute; users with no usable one-hot label are skipped.
    '''
    collection = Connection().jd.test_users
    all_features = get_features(feature_file_name=feature_file_name)
    review_features = get_features(feature_file_name=base_dir +
                                   '/features/review.feature',
                                   start_index=max(all_features.values()) + 1)
    data = []
    bar = progress_bar(collection.count())
    for index, user in enumerate(collection.find()):
        uid = user['_id']
        features = combine_features(user['mentions'],
                                    Counter(user['products']))
        try:
            # index of the 1 in the one-hot profile vector = class label
            y = user['profile'][attribute].index(1)
        except (KeyError, TypeError, ValueError):
            # KeyError: attribute missing; ValueError: no 1 in the list.
            # The original bare 'except:' also swallowed KeyboardInterrupt.
            continue
        x = dict()
        for f in features:
            if f not in all_features:
                continue
            x[all_features[f]] = features[f]

        for f, v in Counter(user['review']).items():
            if f not in review_features:
                continue
            x[review_features[f]] = v
        data.append([uid, y, x])
        bar.draw(index + 1)
    #data=balance(data,target_index=1)
    output(RAW_DATA_DIR + 'mallet/mallet_test_%s.data' % attribute, data)
def construct_all_data():
    '''
    The format of labeled_feature_file is as the same as mallet.

    Dump every train user as a label-0 instance (mention_1 + product counts,
    plus '_1'-suffixed mentions_1_1 features) into
    iterate_label2trainset/all_train.data with uids alongside.
    '''
    all_features = get_features(feature_file_name=feature_file_name)
    # secondary feature ids start after the primary ones
    all_features_1 = get_features(
        feature_file_name=base_dir + '/features/mention_1.feature',
        start_index=max(all_features.values()) + 1)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    # 'with' closes both outputs (the originals were never closed)
    with open(RAW_DATA_DIR + 'iterate_label2trainset/all_train.data',
              'w') as fout, \
         open(RAW_DATA_DIR + 'iterate_label2trainset/all_train_uids.data',
              'w') as uid_output:
        for index, user in enumerate(collection.find()):
            label = 0  # unlabeled dump
            fout.write('%d' % label)
            uid_output.write('%s\n' % user['_id'])
            features = combine_features(user['mentions_1'],
                                        Counter(user['products']))
            sorted_feature = []
            for f in features:
                if f not in all_features:
                    continue
                sorted_feature.append((all_features[f], features[f]))
            for f, v in user['mentions_1_1'].items():
                f = f + '_1'  # keep the secondary namespace distinct
                if f not in all_features_1:
                    continue
                sorted_feature.append((all_features_1[f], v))
            # mallet expects feature ids in ascending order
            sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
            for f in sorted_feature:
                fout.write(' %s:%d' % f)
            fout.write('\n')
            bar.draw(index + 1)
def construct_train_data():
    """Build mallet training data from train users, excluding held-out
    test uids.

    Each instance is [uid, random 0/1 placeholder label,
    {feature_id: value}]; written via output() to mallet/mallet_train.data.
    """
    import random
    all_features=get_features(feature_file_name=feature_file_name)
    # review feature ids start after the mention/product feature ids
    review_features=get_features(feature_file_name=base_dir+'/features/review.feature',start_index=max(all_features.values())+1)
    collection=Connection().jd.train_users
    bar=progress_bar(collection.count())
    data=[]
    uids=get_test_uids()
    for index,user in enumerate(collection.find()):
        uid=user['_id']
        if uid in uids:
            # held out for testing
            continue
        features=combine_features(user['mentions'],Counter(user['products']))
        x=dict()
        for f in features:
            if f not in all_features:
                continue
            x[all_features[f]]=features[f]
        for f,v in Counter(user['review']).items():
            if f not in review_features:
                continue
            x[review_features[f]]=v
        # label is random: only a placeholder for downstream tooling
        y=random.randint(0,1)
        data.append([uid,y,x])
        bar.draw(index+1)
    output(RAW_DATA_DIR+'mallet/mallet_train.data',data)
def construct_test_data(attribute):
    '''
    Build mallet test instances [uid, label, {feature_id: value}] for one
    profile attribute and write them via output() (unformatted twin).
    '''
    collection = Connection().jd.test_users
    all_features = get_features(feature_file_name=feature_file_name)
    review_features = get_features(
        feature_file_name=base_dir + '/features/review.feature',
        start_index=max(all_features.values()) + 1)
    data = []
    bar = progress_bar(collection.count())
    for index, user in enumerate(collection.find()):
        uid = user['_id']
        features = combine_features(user['mentions'],
                                    Counter(user['products']))
        try:
            y = user['profile'][attribute].index(1)
        except (KeyError, TypeError, ValueError):
            # the original bare 'except:' also swallowed KeyboardInterrupt
            continue
        x = dict()
        for f in features:
            if f not in all_features:
                continue
            x[all_features[f]] = features[f]

        for f, v in Counter(user['review']).items():
            if f not in review_features:
                continue
            x[review_features[f]] = v
        data.append([uid, y, x])
        bar.draw(index + 1)
    #data=balance(data,target_index=1)
    output(RAW_DATA_DIR + 'mallet/mallet_test_%s.data' % attribute, data)
def construct_test_set(attribute):
    all_features = get_features(feature_file=feature_file_name)
    all_features_1 = get_features(feature_file=base_dir +
                                  '/features/mention_1.feature',
                                  existent_features=all_features)
    review_featuers = get_features(feature_file=base_dir +
                                   '/features/review.feature',
                                   existent_features=all_features_1)
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    print balance_params
    bar = progress_bar(collection.count())
    fout = open(
        RAW_DATA_DIR + 'iterate_label2trainset/%s_test.data' % attribute, 'w')
    uid_output = open(
        RAW_DATA_DIR + 'iterate_label2trainset/%s_test_uids.data' % attribute,
        'w')
    for index, user in enumerate(collection.find()):
        try:
            label = user['profile'][attribute].index(1)
        except Exception as e:
            continue
        if random.random() > balance_params[label]:
            continue

        features = combine_features(user['mentions_0'],
                                    Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))

        for f, v in user['mentions_1_1'].items():
            f = f + '_1'
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f], v))

        for f, v in Counter(user['review']).items():
            if f not in review_featuers:
                continue
            sorted_feature.append((review_featuers[f], v))

        if len(sorted_feature) == 0:
            continue
        fout.write('%d' % label)
        uid_output.write('%s\n' % user['_id'])
        keys = map(lambda d: d[0], sorted_feature)
        if not len(keys) == len(set(keys)):
            print Counter(keys).values()
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        for f in sorted_feature:
            fout.write(' %s:%f' % f)
        fout.write('\n')
        bar.draw(index + 1)
Exemple #8
0
def construct_test_set(attribute):
    all_features = get_features(feature_file_name=feature_file_name)
    all_features_1 = get_features(feature_file_name=base_dir +
                                  '/features/mention_1.feature',
                                  start_index=max(all_features.values()) + 1)
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    print balance_params
    bar = progress_bar(collection.count())
    fout = open(RAW_DATA_DIR + 'multi_clf/%s_test.data' % attribute, 'w')
    for index, user in enumerate(collection.find()):
        try:
            label = user['profile'][attribute].index(1)
        except Exception as e:
            continue
        if random.random() > balance_params[label]:
            continue
        features = {}
        #features=user['mentions_0']
        #features=Counter(user['products'])
        features = combine_features(user['mentions_0'],
                                    Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        user['mentions_1_1'] = {}
        for f, v in user['mentions_1_1'].items():
            f = f + '_1'
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f], v))
        if 'user_product_vector_from_deepwalk' in user:
            #if False:
            start_index = max(all_features_1.values()) + 1
            for i, v in enumerate(user['user_product_vector_from_deepwalk']):
                v = abs(v)
                sorted_feature.append((i + start_index, v))

        if len(sorted_feature) == 0:
            continue
        fout.write('%d' % label)
        keys = map(lambda d: d[0], sorted_feature)
        if not len(keys) == len(set(keys)):
            print Counter(keys).values()
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        for f in sorted_feature:
            fout.write(' %s:%f' % f)
        fout.write('\n')
        bar.draw(index + 1)
def construct_test_set(attribute):
    """Write the balanced mallet-format test set for one attribute, using
    the feature_file/existent_features variant of get_features, to
    iterate_label2trainset/<attr>_test.data plus a matching uid file."""
    all_features=get_features(feature_file=feature_file_name)
    all_features_1=get_features(feature_file=base_dir+'/features/mention_1.feature',existent_features=all_features)
    # NOTE: 'review_featuers' is a typo but used consistently below
    review_featuers=get_features(feature_file=base_dir+'/features/review.feature',existent_features=all_features_1)
    collection=Connection().jd.test_users
    balance_params=get_balance_params(attribute,collection)
    print balance_params
    bar=progress_bar(collection.count())
    # NOTE(review): these file handles are never closed explicitly
    fout=open(RAW_DATA_DIR+'iterate_label2trainset/%s_test.data'%attribute,'w')
    uid_output=open(RAW_DATA_DIR+'iterate_label2trainset/%s_test_uids.data'%attribute,'w')
    for index,user in enumerate(collection.find()):
        try:
            # index of the 1 in the one-hot profile vector = class label
            label=user['profile'][attribute].index(1)
        except Exception as e:
            continue
        # random subsampling keeps the label distribution balanced
        if random.random()>balance_params[label]:
            continue

        features=combine_features(user['mentions_0'],Counter(user['products']))
        sorted_feature=[]
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f],features[f]))

        # secondary mention features carry a '_1' suffix in their own space
        for f,v in user['mentions_1_1'].items():
            f=f+'_1'
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f],v))

        for f,v in Counter(user['review']).items():
            if f not in review_featuers:
                continue
            sorted_feature.append((review_featuers[f],v))

        if len(sorted_feature)==0:
            continue
        fout.write('%d'%label)
        uid_output.write('%s\n'%user['_id'])
        keys=map(lambda d:d[0], sorted_feature)
        if not len(keys)==len(set(keys)):
            # diagnostic: duplicate feature ids mean the spaces overlap
            print Counter(keys).values()
        # mallet expects feature ids in ascending order
        sorted_feature=sorted(sorted_feature,key=lambda d:d[0])
        for f in sorted_feature:
            fout.write(' %s:%f'%f)
        fout.write('\n')
        bar.draw(index+1)
def construct_test_set(attribute):
    """Write the balanced multi_clf test file for one attribute; optionally
    appends |deepwalk| embedding values as extra features."""
    all_features=get_features(feature_file_name=feature_file_name)
    all_features_1=get_features(feature_file_name=base_dir+'/features/mention_1.feature',start_index=max(all_features.values())+1)
    collection=Connection().jd.test_users
    balance_params=get_balance_params(attribute,collection)
    print balance_params
    bar=progress_bar(collection.count())
    # NOTE(review): fout is never closed explicitly
    fout=open(RAW_DATA_DIR+'multi_clf/%s_test.data'%attribute,'w')
    for index,user in enumerate(collection.find()):
        try:
            # index of the 1 in the one-hot profile vector = class label
            label=user['profile'][attribute].index(1)
        except Exception as e:
            continue
        # random subsampling keeps the label distribution balanced
        if random.random()>balance_params[label]:
            continue
        features={}
        #features=user['mentions_0']
        #features=Counter(user['products'])
        features=combine_features(user['mentions_0'],Counter(user['products']))
        sorted_feature=[]
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f],features[f]))
        # NOTE(review): mentions_1_1 is blanked here, so the loop below never
        # adds anything -- presumably a deliberate switch-off; confirm
        # before removing.
        user['mentions_1_1']={}
        for f,v in user['mentions_1_1'].items():
            f=f+'_1'
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f],v))
        if 'user_product_vector_from_deepwalk' in user:
        #if False:
            # embedding dimensions get ids after all other features
            start_index=max(all_features_1.values())+1
            for i,v in enumerate(user['user_product_vector_from_deepwalk']):
                v=abs(v)
                sorted_feature.append((i+start_index,v))

        if len(sorted_feature)==0:
            continue
        fout.write('%d'%label)
        keys=map(lambda d:d[0], sorted_feature)
        if not len(keys)==len(set(keys)):
            # diagnostic: duplicate feature ids mean the spaces overlap
            print Counter(keys).values()
        sorted_feature=sorted(sorted_feature,key=lambda d:d[0])
        for f in sorted_feature:
            fout.write(' %s:%f'%f)
        fout.write('\n')
        bar.draw(index+1)
Exemple #11
0
def construct_all_data():
    '''
    The format of labeled_feature_file is as the same as mallet.

    Dump every train user as a label-0 instance (mention + product counts)
    into label2trainset/all_train.data, with uids in all_train_uids.data.
    '''
    all_features = get_features(feature_file_name=feature_file_name)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    # 'with' guarantees the outputs are closed (they were leaked before)
    with open(RAW_DATA_DIR + 'label2trainset/all_train.data', 'w') as fout, \
         open(RAW_DATA_DIR + 'label2trainset/all_train_uids.data',
              'w') as uid_output:
        for index, user in enumerate(collection.find()):
            # product counts first; mention counts override duplicate keys
            features = dict(Counter(user['products']))
            for m in user['mentions']:
                features[m] = user['mentions'][m]
            label = 0  # unlabeled dump
            fout.write('%d' % label)
            uid_output.write('%s\n' % user['_id'])
            sorted_feature = []
            for f in features:
                if f not in all_features:
                    continue
                sorted_feature.append((all_features[f], features[f]))
            # mallet expects feature ids in ascending order
            sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
            for f in sorted_feature:
                fout.write(' %s:%d' % f)
            fout.write('\n')
            bar.draw(index + 1)
Exemple #12
0
def construct_train_set(attribute, training_count):
    '''
    The format of labeled_feature_file is as the same as mallet
    '''
    all_features = get_features(feature_file_name=feature_file_name)
    labeled_feature_file = open('%s/review_constraint_%s.constraints' %
                                (labeled_feature_file_dir, attribute))
    labeled_features = dict()
    for line in labeled_feature_file:
        line = line[:-1].split(' ')
        labeled_features[line[0].decode('utf8')] = map(
            lambda d: float(d.split(':')[1]), line[1:])
    collection = Connection().jd.train_users

    bar = progress_bar(collection.count())
    confidence = []
    for index, user in enumerate(collection.find()):
        label_distributed = [1, 1]
        for f, value in user['mentions'].items():
            if f in labeled_features:
                label_distributed[0] *= labeled_features[f][0] * value
                label_distributed[1] *= labeled_features[f][1] * value
        s = 1.0 * sum(label_distributed)
        label_distributed[0] /= s
        label_distributed[1] /= s
        if label_distributed[0] > label_distributed[1]:
            label = 0
        elif label_distributed[0] < label_distributed[1]:
            label = 1
        else:
            continue
        features = user['mentions']
        #features=Counter(user['products'])
        #features=combine_features(user['mentions'],Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        str_features = ' '.join(map(lambda f: '%s:%f' % f, sorted_feature))
        confidence.append(
            (user['_id'], label,
             abs(label_distributed[0] - label_distributed[1]), str_features))
        bar.draw(index + 1)

    #confidence=sorted(confidence,key=lambda d:d[2],reverse=True)
    confidence0 = filter(lambda d: d[1] == 0, confidence)
    confidence1 = filter(lambda d: d[1] == 1, confidence)
    #dimention=min(len(confidence0),len(confidence1),training_count/2)
    #confidence0=confidence0#[:dimention]
    #confidence1=confidence1#[:dimention]
    print len(confidence0), len(confidence1)
    fout = open(RAW_DATA_DIR + 'label2trainset/%s_train.data' % attribute, 'w')
    uid_output = open(
        RAW_DATA_DIR + 'label2trainset/%s_train_uids.data' % attribute, 'w')
    #for d in confidence0+confidence1:
    for d in confidence:
        fout.write('%d %s\n' % (d[1], d[3]))
        uid_output.write('%s\n' % d[0])
Exemple #13
0
def construct_test_set(attribute):
    all_features = get_features(feature_file_name=feature_file_name)
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    print balance_params
    bar = progress_bar(collection.count())
    fout = open(RAW_DATA_DIR + 'label2trainset/%s_test.data' % attribute, 'w')
    uid_output = open(
        RAW_DATA_DIR + 'label2trainset/%s_test_uids.data' % attribute, 'w')
    for index, user in enumerate(collection.find()):
        try:
            label = user['profile'][attribute].index(1)
        except Exception as e:
            continue
        if random.random() > balance_params[label]:
            continue
        features = user['mentions']
        #features=Counter(user['products'])
        #features=combine_features(user['mentions'],Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        if len(sorted_feature) == 0:
            continue
        fout.write('%d' % label)
        uid_output.write('%s\n' % user['_id'])
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        for f in sorted_feature:
            fout.write(' %s:%f' % f)
        fout.write('\n')
        bar.draw(index + 1)
Exemple #14
0
def construct_test_set(attribute):
    all_features = get_features(feature_file_name=base_dir +
                                '/features/mention.feature')
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    print balance_params
    bar = progress_bar(collection.count())
    fout = open(self_training_file_dir + 'test_%s.data' % attribute, 'w')
    for index, user in enumerate(collection.find()):
        features = dict(Counter(user['products']))
        for m in user['mentions']:
            features[m] = user['mentions'][m]
        try:
            label = user['profile'][attribute].index(1)
        except Exception as e:
            continue
        if random.random() > balance_params[label]:
            continue
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        if len(sorted_feature) == 0:
            continue
        fout.write('%d' % label)
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        for f in sorted_feature:
            fout.write(' %s:%d' % f)
        fout.write('\n')
        bar.draw(index + 1)
Exemple #15
0
def compair_single(attribute, method):
    d1 = statistics(attribute,
                    threshold=50,
                    feature_file_name=base_dir +
                    '/features/all_features.feature')
    d2 = statistics_after_train(attribute,
                                method,
                                feature_file_name=base_dir +
                                '/features/all_features.feature')
    result = []
    labeled_features = [
        line.split(' ')[0].decode('utf8')
        for line in open(labeled_feature_file_dir +
                         'review_constraint_%s.constraints' % attribute)
    ]
    all_features = get_features(feature_file_name=base_dir +
                                '/features/all_features.feature')
    print '\n======%s======' % attribute
    for f in labeled_features:
        print f
        if f in d1:
            #print d1[f]
            print '%0.2f , %0.2f' % (1. * d1[f][0] / sum(d1[f]),
                                     1. * d1[f][1] / sum(d1[f]))
        if f in d2:
            #print d2[f]
            print '%0.2f , %0.2f' % (1. * d2[f][0] / sum(d2[f]),
                                     1. * d2[f][1] / sum(d2[f]))
def construct_test_set(attribute):
    """Write the balanced self-training test file for one attribute using
    mention + product-count features."""
    all_features=get_features(feature_file_name=base_dir+'/features/mention.feature')
    collection=Connection().jd.test_users
    balance_params=get_balance_params(attribute,collection)
    print balance_params
    bar=progress_bar(collection.count())
    # NOTE(review): fout is never closed explicitly
    fout=open(self_training_file_dir+'test_%s.data'%attribute,'w')
    for index,user in enumerate(collection.find()):
        # product counts first; mention counts override duplicate keys
        features=dict(Counter(user['products']))
        for m in user['mentions']:
            features[m]=user['mentions'][m]
        try:
            # index of the 1 in the one-hot profile vector = class label
            label=user['profile'][attribute].index(1)
        except Exception as e:
            continue
        # random subsampling keeps the label distribution balanced
        if random.random()>balance_params[label]:
            continue
        sorted_feature=[]
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f],features[f]))
        if len(sorted_feature)==0:
            continue
        fout.write('%d'%label)
        # mallet expects feature ids in ascending order
        sorted_feature=sorted(sorted_feature,key=lambda d:d[0])
        for f in sorted_feature:
            fout.write(' %s:%d'%f)
        fout.write('\n')
        bar.draw(index+1)
Exemple #17
0
def statistics(attribute,
               threshold=-1,
               feature_file_name=base_dir + '/features/mention.feature',
               show=False):
    """Per-feature label distribution over test users for one attribute.

    Counts, for every known feature, how often it co-occurs with each of
    the two labels, normalizes by label frequency and then per feature.
    Returns the distribution dict unless show=True, in which case the top
    50 most label-discriminative features are printed instead.
    """
    import random
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    all_features = get_features(feature_file_name)
    bar = progress_bar(collection.count())
    distribute = dict([f, [0., 0.]] for f in all_features)
    labels_distribute = [0., 0.]
    for index, user in enumerate(collection.find()):
        try:
            # index of the 1 in the one-hot profile vector = class label
            label = user['profile'][attribute].index(1)
        except:
            continue
        #if random.random()>balance_params[label]:
        #    continue
        features = dict(user['mentions'])
        products = Counter(user['products'])
        for p in products:
            features[p] = products[p]
        # skip users with too few features to be informative
        if len(features) < 10:
            continue
        for f in features:
            if f in distribute:
                distribute[f][label] += 1  #features[f]
        labels_distribute[label] += 1
        bar.draw(index)
    # drop rare features (Python 2: .keys() returns a list, so popping
    # while iterating is safe here)
    for f in distribute.keys():
        if sum(distribute[f]) < threshold:
            distribute.pop(f)
    print labels_distribute
    # normalize by label frequency, then per feature
    for f in distribute:
        distribute[f][0] /= labels_distribute[0]
        distribute[f][1] /= labels_distribute[1]
    for f in distribute:
        s = sum(distribute[f])
        distribute[f][0] /= s
        distribute[f][1] /= s
    if not show:
        return distribute
    #distribute=filter(lambda d:d[1][0]<d[1][1], distribute.items())
    # rank by how far the smoothed label-0 share is from 0.5
    distribute = sorted(distribute.items(),
                        key=lambda d: abs(1 - 2 * (d[1][0] + 0.1) /
                                          (sum(d[1]) + 0.1)),
                        reverse=True)
    #distribute=sorted(distribute,key=lambda d:sum(d[1]), reverse=True)
    print ''
    for d in distribute[:50]:
        print '%s 0:%0.3f 1:%0.3f' % (
            d[0].encode('utf8'),
            (d[1][0] + 0.1) / (sum(d[1]) + 0.1),
            1 - (d[1][0] + 0.1) / (sum(d[1]) + 0.1),
        )
Exemple #18
0
def feature_based(args, valid=False):
    """Feature-based disparity estimation for a stereo image pair.

    Loads both images in grayscale, extracts features from each, computes
    a feature-matched disparity map and smooths it with a mean filter.
    valid=True swaps left/right for the consistency-check pass.
    """
    left = tools.load_image(args.left_image, 0)
    right = tools.load_image(args.right_image, 0)
    if valid:
        left, right = right, left
    features_left = tools.get_features(left)
    features_right = tools.get_features(right)
    raw = functions.GetDisparity_feature(left,
                                         right,
                                         args.kernel_size,
                                         features_left,
                                         features_right,
                                         args.measure,
                                         np.zeros(left.shape).astype(np.int),
                                         valid=valid)
    smoothed = cv2.filter2D(raw.astype(np.float32), -1, tools.mean_kernel2D)
    print
    return smoothed.astype(np.int)
Exemple #19
0
def statistics_after_train(attribute,method,threshold=-1,feature_file_name=base_dir+'/features/mention.feature',show=False):
    """Per-feature label distribution over train users, using labels
    produced by a trained model (get_labels_after_train) rather than gold
    profile labels.  Returns the distribution dict unless show=True, in
    which case the top 50 most skewed features are printed."""
    import random
    labels=get_labels_after_train(attribute,method)
    print len(labels)
    collection=Connection().jd.train_users
    label_distribute=Counter(labels.values())
    balance_params=dict()
    for label in label_distribute:
        balance_params[label]=1.0*min(label_distribute.values())/label_distribute[label]
    all_features=get_features(feature_file_name)
    bar=progress_bar(collection.count())
    distribute=dict([f,[0.,0.]] for f in all_features)
    for index,user in enumerate(collection.find()):
        try:
            label=labels[user['_id']]
        except:
            # user has no model-assigned label
            continue
        #if random.random()>balance_params[label]:
        #    continue
        # mentions first, then product counts override duplicate keys
        features=dict(user['mentions'])
        products=Counter(user['products'])
        for p in products:
            features[p]=products[p]
        for f in features:
            if f in distribute:
                distribute[f][label]+=1
        bar.draw(index)
    # drop rare features (Python 2: .keys() returns a list, popping is safe)
    for f in distribute.keys():
        if sum(distribute[f])<threshold:
            distribute.pop(f)
    print label_distribute
    # normalize by label frequency, then per feature (dropping empty rows)
    for f in distribute:
        distribute[f][0]/=label_distribute[0]
        distribute[f][1]/=label_distribute[1]
    for f in distribute.keys():
        s=sum(distribute[f])
        if s==0:
            distribute.pop(f)
            continue
        distribute[f][0]/=s
        distribute[f][1]/=s
    if not show:
        return distribute
    #distribute=filter(lambda d:d[1][0]<d[1][1], distribute)
    distribute=sorted(distribute.items(),key=lambda d:max(d[1])/sum(d[1]), reverse=True)
    #distribute=sorted(distribute,key=lambda d:sum(d[1]), reverse=True)
    print ''
    for d in distribute[:50]:
        print '%s 0:%0.3f 1:%0.3f'%(d[0].encode('utf8'), (d[1][0]+0.1)/(sum(d[1])+0.1),1-(d[1][0]+0.1)/(sum(d[1])+0.1),)
Exemple #20
0
def compair_single(attribute,method):
    """Print, for each constrained feature, its label distribution before
    (statistics) and after (statistics_after_train) training."""
    d1=statistics(attribute,threshold=50,feature_file_name=base_dir+'/features/all_features.feature')
    d2=statistics_after_train(attribute,method,feature_file_name=base_dir+'/features/all_features.feature')
    result=[]
    # the first token of each constraint line is the feature name
    labeled_features=[line.split(' ')[0].decode('utf8') for line in
            open(labeled_feature_file_dir+'review_constraint_%s.constraints'%attribute)]
    all_features=get_features(feature_file_name=base_dir+'/features/all_features.feature')
    print '\n======%s======'%attribute
    for f in labeled_features:
        print f
        if f in d1:
            #print d1[f]
            print '%0.2f , %0.2f'%(1.*d1[f][0]/sum(d1[f]),1.*d1[f][1]/sum(d1[f]))
        if f in d2:
            #print d2[f]
            print '%0.2f , %0.2f'%(1.*d2[f][0]/sum(d2[f]),1.*d2[f][1]/sum(d2[f]))
Exemple #21
0
def statistics(attribute,threshold=-1,feature_file_name=base_dir+'/features/mention.feature',show=False):
    """Per-feature label distribution over test users for one attribute.

    Counts feature/label co-occurrence, normalizes by label frequency and
    then per feature.  Returns the distribution dict unless show=True, in
    which case the top 50 most discriminative features are printed."""
    import random
    collection=Connection().jd.test_users
    balance_params=get_balance_params(attribute,collection)
    all_features=get_features(feature_file_name)
    bar=progress_bar(collection.count())
    distribute=dict([f,[0.,0.]] for f in all_features)
    labels_distribute=[0.,0.]
    for index,user in enumerate(collection.find()):
        try:
            # index of the 1 in the one-hot profile vector = class label
            label=user['profile'][attribute].index(1)
        except:
            continue
        #if random.random()>balance_params[label]:
        #    continue
        # mentions first, then product counts override duplicate keys
        features=dict(user['mentions'])
        products=Counter(user['products'])
        for p in products:
            features[p]=products[p]
        # skip users with too few features to be informative
        if len(features)<10:
            continue
        for f in features:
            if f in distribute:
                distribute[f][label]+=1#features[f]
        labels_distribute[label]+=1
        bar.draw(index)
    # drop rare features (Python 2: .keys() returns a list, popping is safe)
    for f in distribute.keys():
        if sum(distribute[f])<threshold:
            distribute.pop(f)
    print labels_distribute
    # normalize by label frequency, then per feature
    for f in distribute:
        distribute[f][0]/=labels_distribute[0]
        distribute[f][1]/=labels_distribute[1]
    for f in distribute:
        s=sum(distribute[f])
        distribute[f][0]/=s
        distribute[f][1]/=s
    if not show:
        return distribute
    #distribute=filter(lambda d:d[1][0]<d[1][1], distribute.items())
    # rank by how far the smoothed label-0 share is from 0.5
    distribute=sorted(distribute.items(),key=lambda d:abs(1-2*(d[1][0]+0.1)/(sum(d[1])+0.1)), reverse=True)
    #distribute=sorted(distribute,key=lambda d:sum(d[1]), reverse=True)
    print ''
    for d in distribute[:50]:
        print '%s 0:%0.3f 1:%0.3f'%(d[0].encode('utf8'), (d[1][0]+0.1)/(sum(d[1])+0.1),1-(d[1][0]+0.1)/(sum(d[1])+0.1),)
Exemple #22
0
def statistics(labels,
               feature_file_name,
               threshold,
               collection=None):
    '''
    Per-feature label-distribution statistics over a user collection.

    labels: dict user_id -> integer label in [0, max].
    feature_file_name: feature list file understood by get_features().
    threshold: features observed fewer than this many times are dropped.
    collection: mongo collection to scan; defaults to jd.train_users.
        (The default used to be ``Connection().jd.train_users`` evaluated
        at import time, which opened a DB connection as a side effect of
        merely importing this module -- it is now created lazily.)

    Returns (score, feature_distribute): score maps each surviving
    feature to abs_score of its normalized per-label distribution.
    '''
    if collection is None:
        collection = Connection().jd.train_users
    label_dimention = max(labels.values()) + 1
    label_distribute = Counter(labels.values())
    label_distribute = [
        label_distribute[i] if i in label_distribute else 0
        for i in xrange(label_dimention)
    ]
    all_features = get_features(feature_file_name)
    bar = progress_bar(collection.count())
    feature_distribute = dict([f, [0.] * label_dimention]
                              for f in all_features)
    for index, user in enumerate(collection.find()):
        # users without an assigned label are skipped
        try:
            label = labels[user['_id']]
        except KeyError:
            continue
        features = combine_dict(user['mentions'], Counter(user['products']))
        for f in features:
            if f in feature_distribute:
                feature_distribute[f][label] += 1.0
        bar.draw(index)

    # prune rare features and normalize by label frequency
    # (.keys() is a list in python2, so popping while iterating is safe)
    for f in feature_distribute.keys():
        s = 1.0 * sum(feature_distribute[f])
        if s == 0 or s < threshold:
            feature_distribute.pop(f)
            continue
        for i in xrange(label_dimention):
            feature_distribute[f][i] /= label_distribute[i]

    # second pass: normalize each feature's distribution to unit sum
    for f in feature_distribute.keys():
        s = 1.0 * sum(feature_distribute[f])
        for i in xrange(label_dimention):
            feature_distribute[f][i] /= s
    score = dict()
    for f, v in feature_distribute.items():
        #score[f]=eta_score(v)
        score[f] = abs_score(v)
    return score, feature_distribute
def statistics(labels,feature_file_name,threshold,collection=Connection().jd.train_users):
    '''
    Count, per feature, how often it co-occurs with each label.

    ``labels`` maps user id -> integer label.  For every user in
    ``collection`` the merged mention/product features are tallied into a
    per-label histogram; rare features (total below ``threshold`` or zero)
    are dropped, the rest are normalized by label frequency and then to
    unit sum.  Returns (score, distribution) where score maps each
    surviving feature to its abs_score discriminativeness.
    '''
    n_labels = max(labels.values()) + 1
    freq = Counter(labels.values())
    label_totals = [freq[i] if i in freq else 0 for i in xrange(n_labels)]
    dist = {feat: [0.] * n_labels for feat in get_features(feature_file_name)}
    bar = progress_bar(collection.count())
    for idx, user in enumerate(collection.find()):
        try:
            lab = labels[user['_id']]
        except:
            continue
        merged = combine_dict(user['mentions'], Counter(user['products']))
        for feat in merged:
            if feat in dist:
                dist[feat][lab] += 1.0
        bar.draw(idx)

    # drop rare features; normalize survivors by label frequency
    for feat in dist.keys():
        total = 1.0 * sum(dist[feat])
        if total == 0 or total < threshold:
            dist.pop(feat)
            continue
        for i in xrange(n_labels):
            dist[feat][i] /= label_totals[i]

    # second pass: each feature's vector sums to one
    for feat in dist.keys():
        total = 1.0 * sum(dist[feat])
        for i in xrange(n_labels):
            dist[feat][i] /= total

    #score = {feat: eta_score(vec) for feat, vec in dist.items()}
    score = {feat: abs_score(vec) for feat, vec in dist.items()}
    return score, dist
Exemple #24
0
def construct_train_set(labeled_features, training_count):
    '''
    The format of labeled_feature_file is as the same as mallet
    '''
    all_features = get_features(feature_file_name=feature_file_name)
    all_features_1 = get_features(feature_file_name=base_dir +
                                  '/features/mention_1.feature',
                                  start_index=max(all_features.values()) + 1)
    collection = Connection().jd.train_users

    bar = progress_bar(collection.count())
    confidence = []
    for index, user in enumerate(collection.find()):
        label_distributed = [1, 1]
        for f, value in combine_features(user['mentions'],
                                         Counter(user['products'])).items():
            if f in labeled_features:
                label_distributed[0] *= labeled_features[f][0] * value
                label_distributed[1] *= labeled_features[f][1] * value
        s = 1.0 * sum(label_distributed)
        if not s == 0:
            label_distributed[0] /= s
            label_distributed[1] /= s
        if label_distributed[0] > label_distributed[1]:
            label = 0
        elif label_distributed[0] < label_distributed[1]:
            label = 1
        else:
            label = -1
        features = {}
        #features=user['mentions_0']
        #features=Counter(user['products'])
        features = combine_features(user['mentions_0'],
                                    Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        user['mentions_1_1'] = {}
        for f, v in user['mentions_1_1'].items():
            f = f + '_1'
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f], v))

        if 'user_product_vector_from_deepwalk' in user:
            #if False:
            start_index = max(all_features_1.values()) + 1
            for i, v in enumerate(user['user_product_vector_from_deepwalk']):
                v = abs(v)
                sorted_feature.append((i + start_index, v))

        keys = map(lambda d: d[0], sorted_feature)
        if not len(keys) == len(set(keys)):
            print Counter(keys).values()
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        str_features = ' '.join(map(lambda f: '%s:%f' % f, sorted_feature))
        confidence.append((
            user['_id'],
            label,
            abs(label_distributed[0] - label_distributed[1]),
            str_features,
            sum(user['mentions'].values()),
        ))
        bar.draw(index + 1)

    confidence0 = filter(lambda d: d[1] == 0, confidence)
    confidence0 = sorted(confidence0, key=lambda d: d[2], reverse=True)
    confidence1 = filter(lambda d: d[1] == 1, confidence)
    confidence1 = sorted(confidence1, key=lambda d: d[2], reverse=True)
    confidence2 = filter(lambda d: d[1] == -1, confidence)
    confidence2 = sorted(confidence2, key=lambda d: d[4], reverse=True)

    dimention = min(len(confidence0), len(confidence1), training_count / 2)
    confidence0 = confidence0[:dimention]
    confidence1 = confidence1[:dimention]
    confidence2 = confidence2[:dimention]

    print len(confidence0), len(confidence1)

    if len(confidence0) == 0 or len(confidence1) == 0:
        return False
    labeled_train_data = open(RAW_DATA_DIR + 'multi_clf/labeled_train.data',
                              'w')
    for d in confidence0 + confidence1:
        labeled_train_data.write('%d %s\n' % (d[1], d[3]))

    unlabeled_train_data = StringIO.StringIO()
    labeled_train_data = open(RAW_DATA_DIR + 'multi_clf/unlabeled_train.data',
                              'w')
    for d in confidence0 + confidence1:
        unlabeled_train_data.write('%d %s\n' % (d[1], d[3]))
    return True
def construct_train_set(attribute, training_count):
    '''
    Build labeled and unlabeled training files for ``attribute`` under
    RAW_DATA_DIR/iterate_label2trainset/, using LabelArbiter guesses over
    jd.train_users (held-out test uids are excluded).

    Labeled files get the most confident guesses of classes 0 and 1,
    balanced to at most training_count/2 each; the unlabeled files get the
    undecided users (label -1).
    '''
    product_features = get_features(feature_file=base_dir +
                                    '/features/product.feature')
    mention_features = get_features(feature_file=base_dir +
                                    '/features/mention.feature',
                                    existent_features=product_features)
    review_featuers = get_features(feature_file=base_dir +
                                   '/features/review.feature',
                                   existent_features=mention_features)
    mention_features_1 = get_features(feature_file=base_dir +
                                      '/features/mention_1.feature',
                                      existent_features=review_featuers)
    mention_features_2 = get_features(feature_file=base_dir +
                                      '/features/mention_2.feature',
                                      existent_features=mention_features_1)
    test_uids = get_test_uids()

    labeled_feature_file = '%s/review_constraint_%s.constraints' % (
        labeled_feature_file_dir, attribute)
    label_arbiter = LabelArbiter(labeled_feature_file=labeled_feature_file)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    guess = []
    for index, user in enumerate(collection.find()):
        if user['_id'] in test_uids:
            continue
        # BUG FIX: Counter('products') counted the characters of the
        # literal string 'products'; the commented-out line that preceded
        # it shows the intent -- count the user's purchased products.
        features = combine_dict(user['mentions_0'], Counter(user['products']))
        label, confidence = label_arbiter.arbitrate_label(features)
        x = []

        #user['products']=[]
        for f, v in Counter(user['products']).items():
            if f not in product_features:
                continue
            x.append((product_features[f], v))

        #user['mentions_0']={}
        for f, v in user['mentions_0'].items():
            if f not in mention_features:
                continue
            x.append((mention_features[f], v))

        #user['review']=[]
        for f, v in Counter(user['review']).items():
            if f not in review_featuers:
                continue
            x.append((review_featuers[f], v))

        # NOTE(review): mentions_1/mentions_2 are emptied before use, so
        # the two loops below contribute nothing -- consistent with the
        # test-set builder, so it looks like a deliberate ablation switch.
        user['mentions_1'] = {}
        for f, v in user['mentions_1'].items():
            f = f + '_1'
            if f not in mention_features_1:
                continue
            x.append((mention_features_1[f], v))

        user['mentions_2'] = {}
        for f, v in user['mentions_2'].items():
            if f not in mention_features_2:
                continue
            x.append((mention_features_2[f], v))

        x = sorted(x, key=lambda d: d[0])
        str_x = ' '.join(map(lambda f: '%s:%f' % f, x))
        guess.append((
            user['_id'],
            label,
            abs(confidence),
            str_x,
            sum(user['mentions'].values()),
        ))
        bar.draw(index + 1)

    data0 = filter(lambda d: d[1] == 0, guess)
    data0 = sorted(data0, key=lambda d: d[2], reverse=True)
    data1 = filter(lambda d: d[1] == 1, guess)
    data1 = sorted(data1, key=lambda d: d[2], reverse=True)
    data2 = filter(lambda d: d[1] == -1, guess)
    data2 = sorted(data2, key=lambda d: d[4], reverse=True)

    # balance the two classes (python2 integer division intended)
    dimention = min(len(data0), len(data1), training_count / 2)

    data0 = data0[:dimention]
    data1 = data1[:dimention]
    data2 = data2[:dimention]

    fout = open(
        RAW_DATA_DIR + 'iterate_label2trainset/%s_train.data' % attribute, 'w')
    uid_output = open(
        RAW_DATA_DIR + 'iterate_label2trainset/%s_train_uids.data' % attribute,
        'w')
    for d in data0 + data1:
        fout.write('%d %s\n' % (d[1], d[3]))
        uid_output.write('%s\n' % d[0])
    fout.close()
    uid_output.close()

    fout = open(
        RAW_DATA_DIR +
        'iterate_label2trainset/%s_train_unlabel.data' % attribute, 'w')
    uid_output = open(
        RAW_DATA_DIR +
        'iterate_label2trainset/%s_train_unlabel_uids.data' % attribute, 'w')
    for d in data2:
        fout.write('%d %s\n' % (d[1], d[3]))
        uid_output.write('%s\n' % d[0])
    fout.close()
    uid_output.close()
Exemple #26
0
        i = 0
        for line in f:
            line_split = line.strip().split(" ")
            if(len(line_split) < 2):
                print "broken line "+line
                continue

            if(label_map_index.has_key(line_split[-1])):
                label_images[label_map_index[line_split[-1]] ].append(line_split[0])
            else:
                label_map_index[line_split[-1]] = i
                i = i + 1
                label_images.append([line_split[0]])


    images_feature = get_features(label_images,deploy,model,use_gpu)

    #求出平均
    images_label_ave = []
    for features in images_feature:
        images_label_ave.append(np.mean(np.array(features),0))

    numerator = []
    denominator = []
    reslut = []
    sort_index = []

    for i in xrange(len(images_label_ave)):
        #计算均值 排序处理
        numerator.append(np.dot(images_feature[i],images_label_ave[i][:]))
        denominator.append(np.linalg.norm(images_feature[i],axis=1))
def construct_train_set(labeled_features,training_count):
    '''
    The format of labeled_feature_file is as the same as mallet.

    Soft-labels every user in jd.train_users by multiplying the per-class
    weights of the seed ``labeled_features`` over the user's mentions and
    products, then keeps the most confident users of each class as the
    labeled training file.  Returns True when both classes are non-empty,
    False otherwise.
    '''
    all_features=get_features(feature_file_name=feature_file_name)
    all_features_1=get_features(feature_file_name=base_dir+'/features/mention_1.feature',start_index=max(all_features.values())+1)
    collection=Connection().jd.train_users

    bar=progress_bar(collection.count())
    confidence=[]
    for index,user in enumerate(collection.find()):
        # unnormalized class likelihoods, naive-Bayes style
        label_distributed=[1,1]
        for f,value in combine_features(user['mentions'],Counter(user['products'])).items():
            if f in labeled_features:
                label_distributed[0]*=labeled_features[f][0]*value
                label_distributed[1]*=labeled_features[f][1]*value
        s=1.0*sum(label_distributed)
        if not s==0:
            label_distributed[0]/=s
            label_distributed[1]/=s
        if label_distributed[0]>label_distributed[1]:
            label=0
        elif label_distributed[0]<label_distributed[1]:
            label=1
        else:
            # tie -> undecided
            label=-1
        features={}
        #features=user['mentions_0']
        #features=Counter(user['products'])
        features=combine_features(user['mentions_0'],Counter(user['products']))
        sorted_feature=[]
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f],features[f]))
        # NOTE(review): this reset empties mentions_1_1, so the loop below
        # never executes -- looks like a leftover ablation switch; confirm.
        user['mentions_1_1']={}
        for f,v in user['mentions_1_1'].items():
            f=f+'_1'
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f],v))

        if 'user_product_vector_from_deepwalk' in user:
        #if False:
            start_index=max(all_features_1.values())+1
            for i,v in enumerate(user['user_product_vector_from_deepwalk']):
                v=abs(v)
                sorted_feature.append((i+start_index,v))

        # warn when two feature sources collide on the same feature id
        keys=map(lambda d:d[0], sorted_feature)
        if not len(keys)==len(set(keys)):
            print Counter(keys).values()
        sorted_feature=sorted(sorted_feature,key=lambda d:d[0])
        str_features=' '.join(map(lambda f:'%s:%f'%f,sorted_feature))
        confidence.append(
                (user['_id'],
                    label,
                    abs(label_distributed[0]-label_distributed[1]),
                    str_features,
                    sum(user['mentions'].values()),
                    ))
        bar.draw(index+1)

    confidence0=filter(lambda d:d[1]==0,confidence)
    confidence0=sorted(confidence0,key=lambda d:d[2],reverse=True)
    confidence1=filter(lambda d:d[1]==1,confidence)
    confidence1=sorted(confidence1,key=lambda d:d[2],reverse=True)
    confidence2=filter(lambda d:d[1]==-1,confidence)
    confidence2=sorted(confidence2,key=lambda d:d[4],reverse=True)

    # balance the two classes (python2 integer division)
    dimention=min(len(confidence0),len(confidence1),training_count/2)
    confidence0=confidence0[:dimention]
    confidence1=confidence1[:dimention]
    confidence2=confidence2[:dimention]

    print len(confidence0),len(confidence1)

    if len(confidence0)==0 or len(confidence1)==0:
        return False
    labeled_train_data=open(RAW_DATA_DIR+'multi_clf/labeled_train.data','w')
    for d in confidence0+confidence1:
        labeled_train_data.write('%d %s\n'%(d[1],d[3]))

    # NOTE(review): the writes below go to an in-memory StringIO that is
    # then discarded; the file opened as unlabeled_train.data receives
    # nothing, and the loop iterates the *labeled* entries rather than
    # confidence2.  Almost certainly a bug -- do not rely on
    # multi_clf/unlabeled_train.data produced by this version.
    unlabeled_train_data=StringIO.StringIO()
    labeled_train_data=open(RAW_DATA_DIR+'multi_clf/unlabeled_train.data','w')
    for d in confidence0+confidence1:
        unlabeled_train_data.write('%d %s\n'%(d[1],d[3]))
    return True
def construct_train_set(attribute,training_count):
    '''
    Build labeled (arbiter-guessed) and unlabeled training files for
    ``attribute`` under RAW_DATA_DIR/iterate_label2trainset/, from
    jd.train_users excluding the held-out test uids.
    '''
    product_features=get_features(feature_file=base_dir+'/features/product.feature')
    mention_features=get_features(feature_file=base_dir+'/features/mention.feature',existent_features=product_features)
    review_featuers=get_features(feature_file=base_dir+'/features/review.feature',existent_features=mention_features)
    mention_features_1=get_features(feature_file=base_dir+'/features/mention_1.feature',existent_features=review_featuers)
    mention_features_2=get_features(feature_file=base_dir+'/features/mention_2.feature',existent_features=mention_features_1)
    test_uids=get_test_uids()

    labeled_feature_file='%s/review_constraint_%s.constraints'%(labeled_feature_file_dir,attribute)
    label_arbiter=LabelArbiter(labeled_feature_file=labeled_feature_file)
    collection=Connection().jd.train_users
    bar=progress_bar(collection.count())
    guess=[]
    for index,user in enumerate(collection.find()):
        if user['_id'] in test_uids:
            continue
        #features=combine_dict(user['mentions_0'],Counter(user['products']))
        # NOTE(review): Counter('products') counts the characters of the
        # literal string 'products', not the user's purchases; the
        # commented-out line above shows the probable intent
        # (Counter(user['products'])).  Likely a bug.
        features=combine_dict(user['mentions_0'],Counter('products'))
        label,confidence=label_arbiter.arbitrate_label(features)
        x=[]

        #user['products']=[]
        for f,v in Counter(user['products']).items():
            if f not in product_features:
                continue
            x.append((product_features[f],v))

        #user['mentions_0']={}
        for f,v in user['mentions_0'].items():
            if f not in mention_features:
                continue
            x.append((mention_features[f],v))

        #user['review']=[]
        for f,v in Counter(user['review']).items():
            if f not in review_featuers:
                continue
            x.append((review_featuers[f],v))

        # NOTE(review): mentions_1/mentions_2 are emptied before use, so
        # the two loops below contribute nothing -- looks like a
        # deliberate ablation switch; confirm.
        user['mentions_1']={}
        for f,v in user['mentions_1'].items():
            f=f+'_1'
            if f not in mention_features_1:
                continue
            x.append((mention_features_1[f],v))

        user['mentions_2']={}
        for f,v in user['mentions_2'].items():
            if f not in mention_features_2:
                continue
            x.append((mention_features_2[f],v))

        x=sorted(x,key=lambda d:d[0])
        str_x=' '.join(map(lambda f:'%s:%f'%f,x))
        guess.append(
                (user['_id'],
                    label,
                    abs(confidence),
                    str_x,
                    sum(user['mentions'].values()),
                    ))
        bar.draw(index+1)

    data0=filter(lambda d:d[1]==0,guess)
    data0=sorted(data0,key=lambda d:d[2],reverse=True)
    data1=filter(lambda d:d[1]==1,guess)
    data1=sorted(data1,key=lambda d:d[2],reverse=True)
    data2=filter(lambda d:d[1]==-1,guess)
    data2=sorted(data2,key=lambda d:d[4],reverse=True)

    # balance the two classes (python2 integer division)
    dimention=min(len(data0),len(data1),training_count/2)

    data0=data0[:dimention]
    data1=data1[:dimention]
    data2=data2[:dimention]


    fout=open(RAW_DATA_DIR+'iterate_label2trainset/%s_train.data'%attribute,'w')
    uid_output=open(RAW_DATA_DIR+'iterate_label2trainset/%s_train_uids.data'%attribute,'w')
    for d in data0+data1:
        fout.write('%d %s\n'%(d[1],d[3]))
        uid_output.write('%s\n'%d[0])

    fout=open(RAW_DATA_DIR+'iterate_label2trainset/%s_train_unlabel.data'%attribute,'w')
    uid_output=open(RAW_DATA_DIR+'iterate_label2trainset/%s_train_unlabel_uids.data'%attribute,'w')
    for d in data2:
        fout.write('%d %s\n'%(d[1],d[3]))
        uid_output.write('%s\n'%d[0])
def construct_test_set(attribute):
    product_features=get_features(feature_file=base_dir+'/features/product.feature')
    mention_features=get_features(feature_file=base_dir+'/features/mention.feature',existent_features=product_features)
    review_featuers=get_features(feature_file=base_dir+'/features/review.feature',existent_features=mention_features)
    mention_features_1=get_features(feature_file=base_dir+'/features/mention_1.feature',existent_features=review_featuers)
    mention_features_2=get_features(feature_file=base_dir+'/features/mention_2.feature',existent_features=mention_features_1)

    collection=Connection().jd.test_users
    balance_params=get_balance_params(attribute,collection)
    print 'Balance params: ',balance_params
    bar=progress_bar(collection.count())
    fout=open(RAW_DATA_DIR+'iterate_label2trainset/%s_test.data'%attribute,'w')
    uid_output=open(RAW_DATA_DIR+'iterate_label2trainset/%s_test_uids.data'%attribute,'w')
    for index,user in enumerate(collection.find()):
        try:
            label=user['profile'][attribute].index(1)
        except Exception as e:
            continue
        #if random.random()>balance_params[label]:
        #    continue

        '============'
        x=[]

        #user['products']=[]
        for f,v in Counter(user['products']).items():
            if f not in product_features:
                continue
            x.append((product_features[f],v))

        #user['mentions_0']={}
        for f,v in user['mentions_0'].items():
            if f not in mention_features:
                continue
            x.append((mention_features[f],v))

        #user['review']=[]
        for f,v in Counter(user['review']).items():
            if f not in review_featuers:
                continue
            x.append((review_featuers[f],v))

        user['mentions_1']={}
        for f,v in user['mentions_1'].items():
            f=f+'_1'
            if f not in mention_features_1:
                continue
            x.append((mention_features_1[f],v))

        user['mentions_2']={}
        for f,v in user['mentions_2'].items():
            if f not in mention_features_2:
                continue
            x.append((mention_features_2[f],v))

        x=sorted(x,key=lambda d:d[0])
        str_x=' '.join(map(lambda f:'%s:%f'%f,x))

        fout.write('%d %s\n'%(label,str_x))
        uid_output.write('%s\n'%(user['_id']))
        bar.draw(index+1)
def construct_train_set(attribute,training_count):
    '''
    The format of labeled_feature_file is as the same as mallet
    '''
    all_features=get_features(feature_file_name=base_dir+'/features/mention.feature')
    labeled_feature_file=open('%s/review_constraint_%s.constraints'%(labeled_feature_file_dir,attribute))
    labeled_features=dict()
    for line in labeled_feature_file:
        line=line[:-1].split(' ')
        labeled_features[line[0].decode('utf8')]=map(lambda d:float(d.split(':')[1]),line[1:])

    collection=Connection().jd.train_users
    bar=progress_bar(collection.count())
    confidence=[]
    for index,user in enumerate(collection.find()):
        features=dict(Counter(user['products']))
        for m in user['mentions']:
            features[m]=user['mentions'][m]
        label_distributed=[1,1]
        for f,value in user['mentions'].items():
            if f in labeled_features:
                label_distributed[0]*=labeled_features[f][0]*value
                label_distributed[1]*=labeled_features[f][1]*value
        s=1.0*sum(label_distributed)
        label_distributed[0]/=s
        label_distributed[1]/=s
        #print label_distributed
        #if abs(label_distributed[0]-label_distributed[1])<0.5:
        #    continue
        if label_distributed[0]>label_distributed[1]:
            label=0
        elif label_distributed[0]<label_distributed[1]:
            label=1
        else:
            label=-1
        sorted_feature=[]
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f],features[f]))
        sorted_feature=sorted(sorted_feature,key=lambda d:d[0])
        str_features=' '.join(map(lambda f:'%s:%d'%f,sorted_feature))
        confidence.append((
                    user['_id'],
                    label,
                    abs(label_distributed[0]-label_distributed[1]),
                    str_features
                    ))
        bar.draw(index+1)

    confidence=sorted(confidence,key=lambda d:d[2],reverse=True)
    confidence0=filter(lambda d:d[1]==0,confidence)[:training_count/2]
    confidence1=filter(lambda d:d[1]==1,confidence)[:training_count/2]
    confidence_unlabel=[]
    confidence_unlabel+=filter(lambda d:d[1]==-1,confidence)
    #confidence_unlabel+=filter(lambda d:d[1]==0,confidence)[training_count/2:training_count*5]
    #confidence_unlabel+=filter(lambda d:d[1]==1,confidence)[training_count/2:training_count*5]
    confidence_unlabel=confidence_unlabel[:5*training_count]
    print len(confidence0),len(confidence1)
    fout=open(self_training_file_dir+'labeled_train_%s.data'%attribute,'w')
    for d in set(confidence0+confidence1):
        fout.write('%d %s\n'%(d[1],d[3]))
    fout_unlabel=open(self_training_file_dir+'unlabeled_train_%s.data'%attribute,'w')
    for d in confidence_unlabel:
        fout_unlabel.write('%d %s\n'%(d[1],d[3]))
Exemple #31
0
def statistics_after_train(attribute,
                           method,
                           threshold=-1,
                           feature_file_name=base_dir +
                           '/features/mention.feature',
                           show=False):
    '''
    Feature/label distribution over jd.train_users using the labels
    predicted after training (get_labels_after_train), normalized first by
    label frequency and then to unit sum per feature.

    Returns the distribution dict unless ``show`` is True, in which case
    the 50 features with the most skewed distribution are printed instead.
    '''
    import random
    labels = get_labels_after_train(attribute, method)
    print len(labels)
    collection = Connection().jd.train_users
    label_distribute = Counter(labels.values())
    # undersampling weights (currently unused -- see commented block below)
    balance_params = dict()
    for label in label_distribute:
        balance_params[label] = 1.0 * min(
            label_distribute.values()) / label_distribute[label]
    all_features = get_features(feature_file_name)
    bar = progress_bar(collection.count())
    distribute = dict([f, [0., 0.]] for f in all_features)
    for index, user in enumerate(collection.find()):
        # users the trained model produced no label for are skipped
        try:
            label = labels[user['_id']]
        except:
            continue
        #if random.random()>balance_params[label]:
        #    continue
        # merge mention counts with purchased-product counts
        features = dict(user['mentions'])
        products = Counter(user['products'])
        for p in products:
            features[p] = products[p]
        for f in features:
            if f in distribute:
                distribute[f][label] += 1
        bar.draw(index)
    # prune rare features (.keys() is a list in python2, so popping while
    # iterating is safe)
    for f in distribute.keys():
        if sum(distribute[f]) < threshold:
            distribute.pop(f)
    print label_distribute
    # NOTE(review): label_distribute is a Counter; if one class never
    # occurs this divides by zero -- confirm both classes are always
    # present in the predicted labels.
    for f in distribute:
        distribute[f][0] /= label_distribute[0]
        distribute[f][1] /= label_distribute[1]
    for f in distribute.keys():
        s = sum(distribute[f])
        if s == 0:
            distribute.pop(f)
            continue
        distribute[f][0] /= s
        distribute[f][1] /= s
    if not show:
        return distribute
    #distribute=filter(lambda d:d[1][0]<d[1][1], distribute)
    distribute = sorted(distribute.items(),
                        key=lambda d: max(d[1]) / sum(d[1]),
                        reverse=True)
    #distribute=sorted(distribute,key=lambda d:sum(d[1]), reverse=True)
    print ''
    for d in distribute[:50]:
        print '%s 0:%0.3f 1:%0.3f' % (
            d[0].encode('utf8'),
            (d[1][0] + 0.1) / (sum(d[1]) + 0.1),
            1 - (d[1][0] + 0.1) / (sum(d[1]) + 0.1),
        )
def construct_train_set(attribute, training_count):
    '''
    The format of labeled_feature_file is as the same as mallet.

    Soft-labels jd.train_users with the seed constraint features
    (re-weighted by how often each seed feature occurs in the propagated
    ``mentions_1``) and writes the most confident half of each class to
    RAW_DATA_DIR/prlabel2trainset/.
    '''
    # NOTE(review): called without a feature file here, unlike the sibling
    # variants -- confirm get_features() has a suitable default.
    all_features = get_features()
    labeled_feature_file = open('%s/review_constraint_%s.constraints' %
                                (labeled_feature_file_dir, attribute))
    labeled_features = dict()
    # each line: <feature> <label:weight> <label:weight> ...
    for line in labeled_feature_file:
        line = line[:-1].split(' ')
        labeled_features[line[0].decode('utf8')] = map(
            lambda d: float(d.split(':')[1]), line[1:])
    collection = Connection().jd.train_users
    # total propagated-mention mass of each seed feature, normalized to a
    # distribution; used below to re-weight the per-class likelihoods
    labeled_feature_distribute = dict()
    for f in labeled_features:
        labeled_feature_distribute[f] = 0
    for user in collection.find():
        for f in user['mentions_1']:
            if f in labeled_features:
                labeled_feature_distribute[f] += user['mentions_1'][f]
    s = 1. * sum(labeled_feature_distribute.values())
    for f in labeled_features:
        labeled_feature_distribute[f] /= s

    # debug dump of the seed-feature distribution
    for f in labeled_features:
        print f
        print labeled_feature_distribute[f]
    bar = progress_bar(collection.count())
    confidence = []
    for index, user in enumerate(collection.find()):
        features = dict(Counter(user['products']))
        '''
        归一化
        '''
        # (the no-op string above says: "normalize")
        for m in user['mentions']:
            features[m] = user['mentions'][m]
        s = sum(features.values())
        if s < 10:   # skip users with too little evidence
            continue
        label_distributed = [1, 1]
        #for f,value in features.items():
        '''
        使用传播后的mention
        '''
        # (the no-op string above says: "use the propagated mentions")
        for f, value in user['mentions_1'].items():
            if f in labeled_features:
                label_distributed[0] *= labeled_features[f][
                    0] * value / labeled_feature_distribute[f]
                label_distributed[1] *= labeled_features[f][
                    1] * value / labeled_feature_distribute[f]
        s = 1.0 * sum(label_distributed)
        # NOTE(review): no s == 0 guard here (other variants use
        # ``if not s==0``); underflow to zero would raise
        # ZeroDivisionError -- confirm.
        label_distributed[0] /= s
        label_distributed[1] /= s
        # NOTE(review): the per-user debug dump below floods stdout;
        # presumably leftover debugging.
        print ''
        for f in labeled_features:
            print f
            if f in user['mentions']:
                print user['mentions'][f]
            else:
                print 0
        for f in labeled_features:
            print f
            if f in user['mentions_1']:
                print user['mentions_1'][f]
            else:
                print 0

        #if abs(label_distributed[0]-label_distributed[1])<0.5:
        #    continue
        if label_distributed[0] > label_distributed[1]:
            label = 0
        elif label_distributed[0] < label_distributed[1]:
            label = 1
        else:
            continue
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        str_features = ' '.join(map(lambda f: '%s:%d' % f, sorted_feature))
        confidence.append(
            (user['_id'], label,
             abs(label_distributed[0] - label_distributed[1]), str_features))
        bar.draw(index + 1)

    # most confident first; python2 integer division intended below
    confidence = sorted(confidence, key=lambda d: d[2], reverse=True)
    confidence0 = filter(lambda d: d[1] == 0, confidence)[:training_count / 2]
    confidence1 = filter(lambda d: d[1] == 1, confidence)[:training_count / 2]
    print len(confidence0), len(confidence1)
    fout = open(RAW_DATA_DIR + 'prlabel2trainset/%s_train.data' % attribute,
                'w')
    uid_output = open(
        RAW_DATA_DIR + 'prlabel2trainset/%s_train_uids.data' % attribute, 'w')
    for d in confidence0 + confidence1:
        fout.write('%d %s\n' % (d[1], d[3]))
        uid_output.write('%s\n' % d[0])
def construct_train_set(attribute,training_count):
    '''
    The format of labeled_feature_file is as the same as mallet
    '''
    all_features=get_features(feature_file=feature_file_name)
    all_features_1=get_features(feature_file=base_dir+'/features/mention_1.feature',existent_features=all_features)
    review_featuers=get_features(feature_file=base_dir+'/features/review.feature',existent_features=all_features_1)
    labeled_feature_file=open('%s/review_constraint_%s.constraints'%(labeled_feature_file_dir,attribute))
    label_arbiter=LabelArbiter(labeled_feature_file='%s/review_constraint_%s.constraints'%(labeled_feature_file_dir,attribute))
    labeled_features=dict()
    for line in labeled_feature_file:
        line=line[:-1].split(' ')
        labeled_features[line[0].decode('utf8')]=map(lambda d:float(d.split(':')[1]),line[1:])
    collection=Connection().jd.train_users

    bar=progress_bar(collection.count())
    confidence=[]
    for index,user in enumerate(collection.find()):
        label_distributed=[1,1]
        for f,value in combine_features(user['mentions'],Counter('products')).items():
            if f in labeled_features:
                label_distributed[0]*=labeled_features[f][0]*value
                label_distributed[1]*=labeled_features[f][1]*value
        s=1.0*sum(label_distributed)
        if not s==0:
            label_distributed[0]/=s
            label_distributed[1]/=s
        label_distributed=label_arbiter.get_label_distribute(combine_features(user['mentions'],Counter('products')))
        if label_distributed[0]>label_distributed[1]:
            label=0
        elif label_distributed[0]<label_distributed[1]:
            label=1
        else:
            label=-1

        features=combine_features(user['mentions_0'],Counter(user['products']))
        sorted_feature=[]
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f],features[f]))

        user['mentions_1_1']={}
        for f,v in user['mentions_1_1'].items():
            f=f+'_1'
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f],v))

        for f,v in Counter(user['review']).items():
            if f not in review_featuers:
                continue
            sorted_feature.append((review_featuers[f],v))

        keys=map(lambda d:d[0], sorted_feature)
        if not len(keys)==len(set(keys)):
            print Counter(keys).values()
        sorted_feature=sorted(sorted_feature,key=lambda d:d[0])
        str_features=' '.join(map(lambda f:'%s:%f'%f,sorted_feature))
        confidence.append(
                (user['_id'],
                    label,
                    abs(label_distributed[0]-label_distributed[1]),
                    str_features,
                    sum(user['mentions'].values()),
                    ))
        bar.draw(index+1)

    confidence0=filter(lambda d:d[1]==0,confidence)
    confidence0=sorted(confidence0,key=lambda d:d[2],reverse=True)
    confidence1=filter(lambda d:d[1]==1,confidence)
    confidence1=sorted(confidence1,key=lambda d:d[2],reverse=True)
    confidence2=filter(lambda d:d[1]==-1,confidence)
    confidence2=sorted(confidence2,key=lambda d:d[4],reverse=True)

    dimention=min(len(confidence0),len(confidence1),training_count/2)
    confidence0=confidence0[:dimention]
    confidence1=confidence1[:dimention]
    confidence2=confidence2[:dimention]


    fout=open(RAW_DATA_DIR+'iterate_label2trainset/%s_train.data'%attribute,'w')
    uid_output=open(RAW_DATA_DIR+'iterate_label2trainset/%s_train_uids.data'%attribute,'w')
    for d in confidence0+confidence1:
        fout.write('%d %s\n'%(d[1],d[3]))
        uid_output.write('%s\n'%d[0])

    fout=open(RAW_DATA_DIR+'iterate_label2trainset/%s_train_unlabel.data'%attribute,'w')
    uid_output=open(RAW_DATA_DIR+'iterate_label2trainset/%s_train_unlabel_uids.data'%attribute,'w')
    for d in confidence2:
        fout.write('%d %s\n'%(d[1],d[3]))
        uid_output.write('%s\n'%d[0])
# Exemple #34
def construct_train_set(attribute, training_count):
    '''
    The format of labeled_feature_file is as the same as mallet
    '''
    all_features = get_features(feature_file_name=base_dir +
                                '/features/mention.feature')
    labeled_feature_file = open('%s/review_constraint_%s.constraints' %
                                (labeled_feature_file_dir, attribute))
    labeled_features = dict()
    for line in labeled_feature_file:
        line = line[:-1].split(' ')
        labeled_features[line[0].decode('utf8')] = map(
            lambda d: float(d.split(':')[1]), line[1:])

    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    confidence = []
    for index, user in enumerate(collection.find()):
        features = dict(Counter(user['products']))
        for m in user['mentions']:
            features[m] = user['mentions'][m]
        label_distributed = [1, 1]
        for f, value in user['mentions'].items():
            if f in labeled_features:
                label_distributed[0] *= labeled_features[f][0] * value
                label_distributed[1] *= labeled_features[f][1] * value
        s = 1.0 * sum(label_distributed)
        label_distributed[0] /= s
        label_distributed[1] /= s
        #print label_distributed
        #if abs(label_distributed[0]-label_distributed[1])<0.5:
        #    continue
        if label_distributed[0] > label_distributed[1]:
            label = 0
        elif label_distributed[0] < label_distributed[1]:
            label = 1
        else:
            label = -1
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        str_features = ' '.join(map(lambda f: '%s:%d' % f, sorted_feature))
        confidence.append(
            (user['_id'], label,
             abs(label_distributed[0] - label_distributed[1]), str_features))
        bar.draw(index + 1)

    confidence = sorted(confidence, key=lambda d: d[2], reverse=True)
    confidence0 = filter(lambda d: d[1] == 0, confidence)[:training_count / 2]
    confidence1 = filter(lambda d: d[1] == 1, confidence)[:training_count / 2]
    confidence_unlabel = []
    confidence_unlabel += filter(lambda d: d[1] == -1, confidence)
    #confidence_unlabel+=filter(lambda d:d[1]==0,confidence)[training_count/2:training_count*5]
    #confidence_unlabel+=filter(lambda d:d[1]==1,confidence)[training_count/2:training_count*5]
    confidence_unlabel = confidence_unlabel[:5 * training_count]
    print len(confidence0), len(confidence1)
    fout = open(self_training_file_dir + 'labeled_train_%s.data' % attribute,
                'w')
    for d in set(confidence0 + confidence1):
        fout.write('%d %s\n' % (d[1], d[3]))
    fout_unlabel = open(
        self_training_file_dir + 'unlabeled_train_%s.data' % attribute, 'w')
    for d in confidence_unlabel:
        fout_unlabel.write('%d %s\n' % (d[1], d[3]))
def construct_test_set(attribute):
    product_features = get_features(feature_file=base_dir +
                                    '/features/product.feature')
    mention_features = get_features(feature_file=base_dir +
                                    '/features/mention.feature',
                                    existent_features=product_features)
    review_featuers = get_features(feature_file=base_dir +
                                   '/features/review.feature',
                                   existent_features=mention_features)
    mention_features_1 = get_features(feature_file=base_dir +
                                      '/features/mention_1.feature',
                                      existent_features=review_featuers)
    mention_features_2 = get_features(feature_file=base_dir +
                                      '/features/mention_2.feature',
                                      existent_features=mention_features_1)

    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    print 'Balance params: ', balance_params
    bar = progress_bar(collection.count())
    fout = open(
        RAW_DATA_DIR + 'iterate_label2trainset/%s_test.data' % attribute, 'w')
    uid_output = open(
        RAW_DATA_DIR + 'iterate_label2trainset/%s_test_uids.data' % attribute,
        'w')
    for index, user in enumerate(collection.find()):
        try:
            label = user['profile'][attribute].index(1)
        except Exception as e:
            continue
        #if random.random()>balance_params[label]:
        #    continue

        '============'
        x = []

        #user['products']=[]
        for f, v in Counter(user['products']).items():
            if f not in product_features:
                continue
            x.append((product_features[f], v))

        #user['mentions_0']={}
        for f, v in user['mentions_0'].items():
            if f not in mention_features:
                continue
            x.append((mention_features[f], v))

        #user['review']=[]
        for f, v in Counter(user['review']).items():
            if f not in review_featuers:
                continue
            x.append((review_featuers[f], v))

        user['mentions_1'] = {}
        for f, v in user['mentions_1'].items():
            f = f + '_1'
            if f not in mention_features_1:
                continue
            x.append((mention_features_1[f], v))

        user['mentions_2'] = {}
        for f, v in user['mentions_2'].items():
            if f not in mention_features_2:
                continue
            x.append((mention_features_2[f], v))

        x = sorted(x, key=lambda d: d[0])
        str_x = ' '.join(map(lambda f: '%s:%f' % f, x))

        fout.write('%d %s\n' % (label, str_x))
        uid_output.write('%s\n' % (user['_id']))
        bar.draw(index + 1)
def construct_train_set(attribute, training_count):
    '''
    The format of labeled_feature_file is as the same as mallet
    '''
    all_features = get_features(feature_file_name=feature_file_name)
    all_features_1 = get_features(feature_file_name=base_dir +
                                  '/features/mention_1.feature',
                                  start_index=max(all_features.values()) + 1)
    review_featuers = get_features(
        feature_file_name=base_dir + '/features/review.feature',
        start_index=max(all_features_1.values()) + 1)
    labeled_feature_file = open('%s/review_constraint_%s.constraints' %
                                (labeled_feature_file_dir, attribute))
    labeled_features = dict()
    for line in labeled_feature_file:
        line = line[:-1].split(' ')
        labeled_features[line[0].decode('utf8')] = map(
            lambda d: float(d.split(':')[1]), line[1:])
    collection = Connection().jd.train_users

    bar = progress_bar(collection.count())
    confidence = []
    for index, user in enumerate(collection.find()):
        label_distributed = [1, 1]
        for f, value in combine_features(user['mentions'],
                                         Counter('products')).items():
            if f in labeled_features:
                label_distributed[0] *= labeled_features[f][0] * value
                label_distributed[1] *= labeled_features[f][1] * value
        s = 1.0 * sum(label_distributed)
        if not s == 0:
            label_distributed[0] /= s
            label_distributed[1] /= s
        if label_distributed[0] > label_distributed[1]:
            label = 0
        elif label_distributed[0] < label_distributed[1]:
            label = 1
        else:
            label = -1

        features = combine_features(user['mentions_0'],
                                    Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))

        user['mentions_1_1'] = {}
        for f, v in user['mentions_1_1'].items():
            f = f + '_1'
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f], v))

        for f, v in Counter(user['review']).items():
            if f not in review_featuers:
                continue
            sorted_feature.append((review_featuers[f], v))

        keys = map(lambda d: d[0], sorted_feature)
        if not len(keys) == len(set(keys)):
            print Counter(keys).values()
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        str_features = ' '.join(map(lambda f: '%s:%f' % f, sorted_feature))
        confidence.append((
            user['_id'],
            label,
            abs(label_distributed[0] - label_distributed[1]),
            str_features,
            sum(user['mentions'].values()),
        ))
        bar.draw(index + 1)

    confidence0 = filter(lambda d: d[1] == 0, confidence)
    confidence0 = sorted(confidence0, key=lambda d: d[2], reverse=True)
    confidence1 = filter(lambda d: d[1] == 1, confidence)
    confidence1 = sorted(confidence1, key=lambda d: d[2], reverse=True)
    confidence2 = filter(lambda d: d[1] == -1, confidence)
    confidence2 = sorted(confidence2, key=lambda d: d[4], reverse=True)

    dimention = min(len(confidence0), len(confidence1), training_count / 2)
    confidence0 = confidence0[:dimention]
    confidence1 = confidence1[:dimention]
    confidence2 = confidence2[:dimention]

    fout = open(RAW_DATA_DIR + 'mylabel2trainset/%s_train.data' % attribute,
                'w')
    uid_output = open(
        RAW_DATA_DIR + 'mylabel2trainset/%s_train_uids.data' % attribute, 'w')
    for d in confidence0 + confidence1:
        fout.write('%d %s\n' % (d[1], d[3]))
        uid_output.write('%s\n' % d[0])

    fout = open(
        RAW_DATA_DIR + 'mylabel2trainset/%s_train_unlabel.data' % attribute,
        'w')
    uid_output = open(
        RAW_DATA_DIR +
        'mylabel2trainset/%s_train_unlabel_uids.data' % attribute, 'w')
    for d in confidence2:
        fout.write('%d %s\n' % (d[1], d[3]))
        uid_output.write('%s\n' % d[0])