def construct_train_set(attribute,training_count): ''' The format of labeled_feature_file is as the same as mallet ''' all_features=get_features(feature_file=feature_file_name) all_features_1=get_features(feature_file=base_dir+'/features/mention_1.feature',existent_features=all_features) review_featuers=get_features(feature_file=base_dir+'/features/review.feature',existent_features=all_features_1) labeled_feature_file=open('%s/review_constraint_%s.constraints'%(labeled_feature_file_dir,attribute)) label_arbiter=LabelArbiter(labeled_feature_file='%s/review_constraint_%s.constraints'%(labeled_feature_file_dir,attribute)) labeled_features=dict() for line in labeled_feature_file: line=line[:-1].split(' ') labeled_features[line[0].decode('utf8')]=map(lambda d:float(d.split(':')[1]),line[1:]) collection=Connection().jd.train_users bar=progress_bar(collection.count()) confidence=[] for index,user in enumerate(collection.find()): label_distributed=[1,1] for f,value in combine_features(user['mentions'],Counter('products')).items(): if f in labeled_features: label_distributed[0]*=labeled_features[f][0]*value label_distributed[1]*=labeled_features[f][1]*value s=1.0*sum(label_distributed) if not s==0: label_distributed[0]/=s label_distributed[1]/=s label_distributed=label_arbiter.get_label_distribute(combine_features(user['mentions'],Counter('products'))) if label_distributed[0]>label_distributed[1]: label=0 elif label_distributed[0]<label_distributed[1]: label=1 else: label=-1 features=combine_features(user['mentions_0'],Counter(user['products'])) sorted_feature=[] for f in features: if f not in all_features: continue sorted_feature.append((all_features[f],features[f])) user['mentions_1_1']={} for f,v in user['mentions_1_1'].items(): f=f+'_1' if f not in all_features_1: continue sorted_feature.append((all_features_1[f],v)) for f,v in Counter(user['review']).items(): if f not in review_featuers: continue sorted_feature.append((review_featuers[f],v)) keys=map(lambda d:d[0], 
sorted_feature) if not len(keys)==len(set(keys)): print Counter(keys).values() sorted_feature=sorted(sorted_feature,key=lambda d:d[0]) str_features=' '.join(map(lambda f:'%s:%f'%f,sorted_feature)) confidence.append( (user['_id'], label, abs(label_distributed[0]-label_distributed[1]), str_features, sum(user['mentions'].values()), )) bar.draw(index+1) confidence0=filter(lambda d:d[1]==0,confidence) confidence0=sorted(confidence0,key=lambda d:d[2],reverse=True) confidence1=filter(lambda d:d[1]==1,confidence) confidence1=sorted(confidence1,key=lambda d:d[2],reverse=True) confidence2=filter(lambda d:d[1]==-1,confidence) confidence2=sorted(confidence2,key=lambda d:d[4],reverse=True) dimention=min(len(confidence0),len(confidence1),training_count/2) confidence0=confidence0[:dimention] confidence1=confidence1[:dimention] confidence2=confidence2[:dimention] fout=open(RAW_DATA_DIR+'iterate_label2trainset/%s_train.data'%attribute,'w') uid_output=open(RAW_DATA_DIR+'iterate_label2trainset/%s_train_uids.data'%attribute,'w') for d in confidence0+confidence1: fout.write('%d %s\n'%(d[1],d[3])) uid_output.write('%s\n'%d[0]) fout=open(RAW_DATA_DIR+'iterate_label2trainset/%s_train_unlabel.data'%attribute,'w') uid_output=open(RAW_DATA_DIR+'iterate_label2trainset/%s_train_unlabel_uids.data'%attribute,'w') for d in confidence2: fout.write('%d %s\n'%(d[1],d[3])) uid_output.write('%s\n'%d[0])
def construct_train_set(attribute, training_count):
    '''Build the labeled / unlabeled training files for one attribute.

    NOTE(review): this is a redefinition of construct_train_set -- at
    import time it silently replaces the earlier version in this module.

    Users that appear in the test set are skipped.  Each remaining user
    is labeled by LabelArbiter.arbitrate_label(); the most confident
    users of each class (at most training_count/2 per class) go to the
    train file and the most active unlabeled (-1) users to a separate
    file.

    :param attribute: attribute name used to locate the constraint file
        and to name the output files
    :param training_count: total size of the labeled training set,
        split evenly between the two classes
    '''
    # Feature dictionaries; each later call extends the index space of
    # the previous one via existent_features.
    product_features = get_features(
        feature_file=base_dir + '/features/product.feature')
    mention_features = get_features(
        feature_file=base_dir + '/features/mention.feature',
        existent_features=product_features)
    review_features = get_features(
        feature_file=base_dir + '/features/review.feature',
        existent_features=mention_features)
    mention_features_1 = get_features(
        feature_file=base_dir + '/features/mention_1.feature',
        existent_features=review_features)
    mention_features_2 = get_features(
        feature_file=base_dir + '/features/mention_2.feature',
        existent_features=mention_features_1)
    test_uids = get_test_uids()
    labeled_feature_file = '%s/review_constraint_%s.constraints' % (
        labeled_feature_file_dir, attribute)
    label_arbiter = LabelArbiter(labeled_feature_file=labeled_feature_file)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    guess = []
    for index, user in enumerate(collection.find()):
        if user['_id'] in test_uids:
            continue  # never train on test users
        # NOTE(review): Counter('products') counts the characters of the
        # literal string 'products'; the commented-out original used
        # Counter(user['products']) -- confirm which is intended.
        #features=combine_dict(user['mentions_0'],Counter(user['products']))
        features = combine_dict(user['mentions_0'], Counter('products'))
        label, confidence = label_arbiter.arbitrate_label(features)
        x = []
        #user['products']=[]
        for f, v in Counter(user['products']).items():
            if f not in product_features:
                continue
            x.append((product_features[f], v))
        #user['mentions_0']={}
        for f, v in user['mentions_0'].items():
            if f not in mention_features:
                continue
            x.append((mention_features[f], v))
        #user['review']=[]
        for f, v in Counter(user['review']).items():
            if f not in review_features:
                continue
            x.append((review_features[f], v))
        # NOTE(review): mentions_1 / mentions_2 are cleared immediately
        # before being iterated, so the two loops below are no-ops --
        # apparently experiment toggles; kept as-is to preserve behavior.
        user['mentions_1'] = {}
        for f, v in user['mentions_1'].items():
            f = f + '_1'
            if f not in mention_features_1:
                continue
            x.append((mention_features_1[f], v))
        user['mentions_2'] = {}
        for f, v in user['mentions_2'].items():
            if f not in mention_features_2:
                continue
            x.append((mention_features_2[f], v))
        x = sorted(x, key=lambda d: d[0])
        str_x = ' '.join(['%s:%f' % f for f in x])
        guess.append((
            user['_id'],
            label,
            abs(confidence),
            str_x,
            sum(user['mentions'].values()),  # activity, ranks unlabeled users
        ))
        bar.draw(index + 1)
    # Rank classes 0/1 by confidence and unlabeled (-1) users by activity.
    data0 = sorted([d for d in guess if d[1] == 0],
                   key=lambda d: d[2], reverse=True)
    data1 = sorted([d for d in guess if d[1] == 1],
                   key=lambda d: d[2], reverse=True)
    data2 = sorted([d for d in guess if d[1] == -1],
                   key=lambda d: d[4], reverse=True)
    # // keeps integer semantics under both Python 2 and Python 3.
    dimension = min(len(data0), len(data1), training_count // 2)
    data0 = data0[:dimension]
    data1 = data1[:dimension]
    data2 = data2[:dimension]
    out_dir = RAW_DATA_DIR + 'iterate_label2trainset/'
    # `with` guarantees the output files are flushed and closed
    # (the original handles were leaked).
    with open(out_dir + '%s_train.data' % attribute, 'w') as fout, \
            open(out_dir + '%s_train_uids.data' % attribute, 'w') as uid_output:
        for d in data0 + data1:
            fout.write('%d %s\n' % (d[1], d[3]))
            uid_output.write('%s\n' % d[0])
    with open(out_dir + '%s_train_unlabel.data' % attribute, 'w') as fout, \
            open(out_dir + '%s_train_unlabel_uids.data' % attribute, 'w') as uid_output:
        for d in data2:
            fout.write('%d %s\n' % (d[1], d[3]))
            uid_output.write('%s\n' % d[0])
def construct_train_set(attribute, training_count): ''' The format of labeled_feature_file is as the same as mallet ''' all_features = get_features(feature_file=feature_file_name) all_features_1 = get_features(feature_file=base_dir + '/features/mention_1.feature', existent_features=all_features) review_featuers = get_features(feature_file=base_dir + '/features/review.feature', existent_features=all_features_1) labeled_feature_file = open('%s/review_constraint_%s.constraints' % (labeled_feature_file_dir, attribute)) label_arbiter = LabelArbiter( labeled_feature_file='%s/review_constraint_%s.constraints' % (labeled_feature_file_dir, attribute)) labeled_features = dict() for line in labeled_feature_file: line = line[:-1].split(' ') labeled_features[line[0].decode('utf8')] = map( lambda d: float(d.split(':')[1]), line[1:]) collection = Connection().jd.train_users bar = progress_bar(collection.count()) confidence = [] for index, user in enumerate(collection.find()): label_distributed = [1, 1] for f, value in combine_features(user['mentions'], Counter('products')).items(): if f in labeled_features: label_distributed[0] *= labeled_features[f][0] * value label_distributed[1] *= labeled_features[f][1] * value s = 1.0 * sum(label_distributed) if not s == 0: label_distributed[0] /= s label_distributed[1] /= s label_distributed = label_arbiter.get_label_distribute( combine_features(user['mentions'], Counter('products'))) if label_distributed[0] > label_distributed[1]: label = 0 elif label_distributed[0] < label_distributed[1]: label = 1 else: label = -1 features = combine_features(user['mentions_0'], Counter(user['products'])) sorted_feature = [] for f in features: if f not in all_features: continue sorted_feature.append((all_features[f], features[f])) user['mentions_1_1'] = {} for f, v in user['mentions_1_1'].items(): f = f + '_1' if f not in all_features_1: continue sorted_feature.append((all_features_1[f], v)) for f, v in Counter(user['review']).items(): if f not in 
review_featuers: continue sorted_feature.append((review_featuers[f], v)) keys = map(lambda d: d[0], sorted_feature) if not len(keys) == len(set(keys)): print Counter(keys).values() sorted_feature = sorted(sorted_feature, key=lambda d: d[0]) str_features = ' '.join(map(lambda f: '%s:%f' % f, sorted_feature)) confidence.append(( user['_id'], label, abs(label_distributed[0] - label_distributed[1]), str_features, sum(user['mentions'].values()), )) bar.draw(index + 1) confidence0 = filter(lambda d: d[1] == 0, confidence) confidence0 = sorted(confidence0, key=lambda d: d[2], reverse=True) confidence1 = filter(lambda d: d[1] == 1, confidence) confidence1 = sorted(confidence1, key=lambda d: d[2], reverse=True) confidence2 = filter(lambda d: d[1] == -1, confidence) confidence2 = sorted(confidence2, key=lambda d: d[4], reverse=True) dimention = min(len(confidence0), len(confidence1), training_count / 2) confidence0 = confidence0[:dimention] confidence1 = confidence1[:dimention] confidence2 = confidence2[:dimention] fout = open( RAW_DATA_DIR + 'iterate_label2trainset/%s_train.data' % attribute, 'w') uid_output = open( RAW_DATA_DIR + 'iterate_label2trainset/%s_train_uids.data' % attribute, 'w') for d in confidence0 + confidence1: fout.write('%d %s\n' % (d[1], d[3])) uid_output.write('%s\n' % d[0]) fout = open( RAW_DATA_DIR + 'iterate_label2trainset/%s_train_unlabel.data' % attribute, 'w') uid_output = open( RAW_DATA_DIR + 'iterate_label2trainset/%s_train_unlabel_uids.data' % attribute, 'w') for d in confidence2: fout.write('%d %s\n' % (d[1], d[3])) uid_output.write('%s\n' % d[0])
def construct_train_set(attribute, training_count):
    '''Build the labeled / unlabeled training files for one attribute.

    NOTE(review): this is a redefinition of construct_train_set -- at
    import time it silently replaces the earlier version in this module.

    Users that appear in the test set are skipped.  Each remaining user
    is labeled by LabelArbiter.arbitrate_label(); the most confident
    users of each class (at most training_count/2 per class) go to the
    train file and the most active unlabeled (-1) users to a separate
    file.

    :param attribute: attribute name used to locate the constraint file
        and to name the output files
    :param training_count: total size of the labeled training set,
        split evenly between the two classes
    '''
    # Feature dictionaries; each later call extends the index space of
    # the previous one via existent_features.
    product_features = get_features(
        feature_file=base_dir + '/features/product.feature')
    mention_features = get_features(
        feature_file=base_dir + '/features/mention.feature',
        existent_features=product_features)
    review_features = get_features(
        feature_file=base_dir + '/features/review.feature',
        existent_features=mention_features)
    mention_features_1 = get_features(
        feature_file=base_dir + '/features/mention_1.feature',
        existent_features=review_features)
    mention_features_2 = get_features(
        feature_file=base_dir + '/features/mention_2.feature',
        existent_features=mention_features_1)
    test_uids = get_test_uids()
    labeled_feature_file = '%s/review_constraint_%s.constraints' % (
        labeled_feature_file_dir, attribute)
    label_arbiter = LabelArbiter(labeled_feature_file=labeled_feature_file)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    guess = []
    for index, user in enumerate(collection.find()):
        if user['_id'] in test_uids:
            continue  # never train on test users
        # NOTE(review): Counter('products') counts the characters of the
        # literal string 'products'; the commented-out original used
        # Counter(user['products']) -- confirm which is intended.
        #features=combine_dict(user['mentions_0'],Counter(user['products']))
        features = combine_dict(user['mentions_0'], Counter('products'))
        label, confidence = label_arbiter.arbitrate_label(features)
        x = []
        #user['products']=[]
        for f, v in Counter(user['products']).items():
            if f not in product_features:
                continue
            x.append((product_features[f], v))
        #user['mentions_0']={}
        for f, v in user['mentions_0'].items():
            if f not in mention_features:
                continue
            x.append((mention_features[f], v))
        #user['review']=[]
        for f, v in Counter(user['review']).items():
            if f not in review_features:
                continue
            x.append((review_features[f], v))
        # NOTE(review): mentions_1 / mentions_2 are cleared immediately
        # before being iterated, so the two loops below are no-ops --
        # apparently experiment toggles; kept as-is to preserve behavior.
        user['mentions_1'] = {}
        for f, v in user['mentions_1'].items():
            f = f + '_1'
            if f not in mention_features_1:
                continue
            x.append((mention_features_1[f], v))
        user['mentions_2'] = {}
        for f, v in user['mentions_2'].items():
            if f not in mention_features_2:
                continue
            x.append((mention_features_2[f], v))
        x = sorted(x, key=lambda d: d[0])
        str_x = ' '.join(['%s:%f' % f for f in x])
        guess.append((
            user['_id'],
            label,
            abs(confidence),
            str_x,
            sum(user['mentions'].values()),  # activity, ranks unlabeled users
        ))
        bar.draw(index + 1)
    # Rank classes 0/1 by confidence and unlabeled (-1) users by activity.
    data0 = sorted([d for d in guess if d[1] == 0],
                   key=lambda d: d[2], reverse=True)
    data1 = sorted([d for d in guess if d[1] == 1],
                   key=lambda d: d[2], reverse=True)
    data2 = sorted([d for d in guess if d[1] == -1],
                   key=lambda d: d[4], reverse=True)
    # // keeps integer semantics under both Python 2 and Python 3.
    dimension = min(len(data0), len(data1), training_count // 2)
    data0 = data0[:dimension]
    data1 = data1[:dimension]
    data2 = data2[:dimension]
    out_dir = RAW_DATA_DIR + 'iterate_label2trainset/'
    # `with` guarantees the output files are flushed and closed
    # (the original handles were leaked).
    with open(out_dir + '%s_train.data' % attribute, 'w') as fout, \
            open(out_dir + '%s_train_uids.data' % attribute, 'w') as uid_output:
        for d in data0 + data1:
            fout.write('%d %s\n' % (d[1], d[3]))
            uid_output.write('%s\n' % d[0])
    with open(out_dir + '%s_train_unlabel.data' % attribute, 'w') as fout, \
            open(out_dir + '%s_train_unlabel_uids.data' % attribute, 'w') as uid_output:
        for d in data2:
            fout.write('%d %s\n' % (d[1], d[3]))
            uid_output.write('%s\n' % d[0])