def construct_all_data():
    '''
    The format of labeled_feature_file is the same as mallet
    '''
    all_features = get_features(feature_file_name=feature_file_name)
    all_features_1 = get_features(
        feature_file_name=base_dir + '/features/mention_1.feature',
        start_index=max(all_features.values()) + 1)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    fout = open(RAW_DATA_DIR + 'mylabel2trainset/all_train.data', 'w')
    uid_output = open(RAW_DATA_DIR + 'mylabel2trainset/all_train_uids.data', 'w')
    for index, user in enumerate(collection.find()):
        label = 0
        fout.write('%d' % label)
        uid_output.write('%s\n' % user['_id'])
        features = combine_features(user['mentions_1'], Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        for f, v in user['mentions_1_1'].items():
            f = f + '_1'
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f], v))
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        for f in sorted_feature:
            fout.write(' %s:%d' % f)
        fout.write('\n')
        bar.draw(index + 1)
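# `combine_features` / `combine_dict` are defined elsewhere in this project;
# the code here only assumes they merge a mention dict and a product Counter
# into one feature->value dict, mirroring the inline merge done in statistics()
# further below (the two key spaces are expected to be disjoint). A minimal
# sketch of that assumed behavior, under a hypothetical name:
def combine_features_sketch(mentions, products):
    combined = dict(mentions)
    for key, value in products.items():
        combined[key] = value
    return combined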
def construct_train_data():
    import random
    all_features = get_features(feature_file_name=feature_file_name)
    review_features = get_features(
        feature_file_name=base_dir + '/features/review.feature',
        start_index=max(all_features.values()) + 1)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    data = []
    uids = get_test_uids()
    for index, user in enumerate(collection.find()):
        uid = user['_id']
        if uid in uids:
            continue
        features = combine_features(user['mentions'], Counter(user['products']))
        x = dict()
        for f in features:
            if f not in all_features:
                continue
            x[all_features[f]] = features[f]
        for f, v in Counter(user['review']).items():
            if f not in review_features:
                continue
            x[review_features[f]] = v
        # the training label is assigned at random here
        y = random.randint(0, 1)
        data.append([uid, y, x])
        bar.draw(index + 1)
    output(RAW_DATA_DIR + 'mallet/mallet_train.data', data)
def construct_test_data(attribute):
    collection = Connection().jd.test_users
    all_features = get_features(feature_file_name=feature_file_name)
    review_features = get_features(
        feature_file_name=base_dir + '/features/review.feature',
        start_index=max(all_features.values()) + 1)
    data = []
    bar = progress_bar(collection.count())
    for index, user in enumerate(collection.find()):
        uid = user['_id']
        features = combine_features(user['mentions'], Counter(user['products']))
        try:
            y = user['profile'][attribute].index(1)
        except:
            continue
        x = dict()
        for f in features:
            if f not in all_features:
                continue
            x[all_features[f]] = features[f]
        for f, v in Counter(user['review']).items():
            if f not in review_features:
                continue
            x[review_features[f]] = v
        data.append([uid, y, x])
        bar.draw(index + 1)
    #data=balance(data,target_index=1)
    output(RAW_DATA_DIR + 'mallet/mallet_test_%s.data' % attribute, data)
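# The *.data files written throughout this module use the SVMlight-style
# 'label index:value' line format. A minimal loading sketch, assuming
# scikit-learn is available (it is not otherwise used in this project):
from sklearn.datasets import load_svmlight_file

def load_constructed_data(path):
    # returns a sparse feature matrix X and a label vector y
    X, y = load_svmlight_file(path)
    return X, y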
def construct_all_data():
    '''
    The format of labeled_feature_file is the same as mallet
    '''
    all_features = get_features(feature_file_name=feature_file_name)
    all_features_1 = get_features(
        feature_file_name=base_dir + '/features/mention_1.feature',
        start_index=max(all_features.values()) + 1)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    fout = open(RAW_DATA_DIR + 'iterate_label2trainset/all_train.data', 'w')
    uid_output = open(RAW_DATA_DIR + 'iterate_label2trainset/all_train_uids.data', 'w')
    for index, user in enumerate(collection.find()):
        label = 0
        fout.write('%d' % label)
        uid_output.write('%s\n' % user['_id'])
        features = combine_features(user['mentions_1'], Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        for f, v in user['mentions_1_1'].items():
            f = f + '_1'
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f], v))
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        for f in sorted_feature:
            fout.write(' %s:%d' % f)
        fout.write('\n')
        bar.draw(index + 1)
def construct_test_set(attribute):
    all_features = get_features(feature_file=feature_file_name)
    all_features_1 = get_features(
        feature_file=base_dir + '/features/mention_1.feature',
        existent_features=all_features)
    review_features = get_features(
        feature_file=base_dir + '/features/review.feature',
        existent_features=all_features_1)
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    print balance_params
    bar = progress_bar(collection.count())
    fout = open(RAW_DATA_DIR + 'iterate_label2trainset/%s_test.data' % attribute, 'w')
    uid_output = open(RAW_DATA_DIR + 'iterate_label2trainset/%s_test_uids.data' % attribute, 'w')
    for index, user in enumerate(collection.find()):
        try:
            label = user['profile'][attribute].index(1)
        except Exception as e:
            continue
        if random.random() > balance_params[label]:
            continue
        features = combine_features(user['mentions_0'], Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        for f, v in user['mentions_1_1'].items():
            f = f + '_1'
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f], v))
        for f, v in Counter(user['review']).items():
            if f not in review_features:
                continue
            sorted_feature.append((review_features[f], v))
        if len(sorted_feature) == 0:
            continue
        fout.write('%d' % label)
        uid_output.write('%s\n' % user['_id'])
        keys = map(lambda d: d[0], sorted_feature)
        if not len(keys) == len(set(keys)):
            # warn when two feature files assign the same index
            print Counter(keys).values()
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        for f in sorted_feature:
            fout.write(' %s:%f' % f)
        fout.write('\n')
        bar.draw(index + 1)
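# `get_balance_params` is defined elsewhere; the sampling above only assumes
# it returns, per label, a keep-probability that downsamples the majority
# classes. A sketch of that assumption, mirroring the inline computation in
# statistics_after_train further below:
def get_balance_params_sketch(attribute, collection):
    label_counts = Counter()
    for user in collection.find():
        try:
            label_counts[user['profile'][attribute].index(1)] += 1
        except (KeyError, ValueError):
            continue
    smallest = min(label_counts.values())
    return dict((label, 1.0 * smallest / count)
                for label, count in label_counts.items())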
def construct_test_set(attribute):
    all_features = get_features(feature_file_name=feature_file_name)
    all_features_1 = get_features(
        feature_file_name=base_dir + '/features/mention_1.feature',
        start_index=max(all_features.values()) + 1)
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    print balance_params
    bar = progress_bar(collection.count())
    fout = open(RAW_DATA_DIR + 'multi_clf/%s_test.data' % attribute, 'w')
    for index, user in enumerate(collection.find()):
        try:
            label = user['profile'][attribute].index(1)
        except Exception as e:
            continue
        if random.random() > balance_params[label]:
            continue
        #features=user['mentions_0']
        #features=Counter(user['products'])
        features = combine_features(user['mentions_0'], Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        # emptying this disables the mention_1_1 features below
        user['mentions_1_1'] = {}
        for f, v in user['mentions_1_1'].items():
            f = f + '_1'
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f], v))
        #if False:
        if 'user_product_vector_from_deepwalk' in user:
            start_index = max(all_features_1.values()) + 1
            for i, v in enumerate(user['user_product_vector_from_deepwalk']):
                v = abs(v)
                sorted_feature.append((i + start_index, v))
        if len(sorted_feature) == 0:
            continue
        fout.write('%d' % label)
        keys = map(lambda d: d[0], sorted_feature)
        if not len(keys) == len(set(keys)):
            print Counter(keys).values()
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        for f in sorted_feature:
            fout.write(' %s:%f' % f)
        fout.write('\n')
        bar.draw(index + 1)
def construct_all_data():
    '''
    The format of labeled_feature_file is the same as mallet
    '''
    all_features = get_features(feature_file_name=feature_file_name)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    fout = open(RAW_DATA_DIR + 'label2trainset/all_train.data', 'w')
    uid_output = open(RAW_DATA_DIR + 'label2trainset/all_train_uids.data', 'w')
    for index, user in enumerate(collection.find()):
        features = dict(Counter(user['products']))
        for m in user['mentions']:
            features[m] = user['mentions'][m]
        label = 0
        fout.write('%d' % label)
        uid_output.write('%s\n' % user['_id'])
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        for f in sorted_feature:
            fout.write(' %s:%d' % f)
        fout.write('\n')
        bar.draw(index + 1)
def construct_train_set(attribute, training_count):
    '''
    The format of labeled_feature_file is the same as mallet
    '''
    all_features = get_features(feature_file_name=feature_file_name)
    labeled_feature_file = open('%s/review_constraint_%s.constraints'
                                % (labeled_feature_file_dir, attribute))
    labeled_features = dict()
    for line in labeled_feature_file:
        line = line[:-1].split(' ')
        labeled_features[line[0].decode('utf8')] = map(
            lambda d: float(d.split(':')[1]), line[1:])
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    confidence = []
    for index, user in enumerate(collection.find()):
        label_distributed = [1, 1]
        for f, value in user['mentions'].items():
            if f in labeled_features:
                label_distributed[0] *= labeled_features[f][0] * value
                label_distributed[1] *= labeled_features[f][1] * value
        s = 1.0 * sum(label_distributed)
        label_distributed[0] /= s
        label_distributed[1] /= s
        if label_distributed[0] > label_distributed[1]:
            label = 0
        elif label_distributed[0] < label_distributed[1]:
            label = 1
        else:
            continue
        features = user['mentions']
        #features=Counter(user['products'])
        #features=combine_features(user['mentions'],Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        str_features = ' '.join(map(lambda f: '%s:%f' % f, sorted_feature))
        confidence.append((user['_id'], label,
                           abs(label_distributed[0] - label_distributed[1]),
                           str_features))
        bar.draw(index + 1)
    #confidence=sorted(confidence,key=lambda d:d[2],reverse=True)
    confidence0 = filter(lambda d: d[1] == 0, confidence)
    confidence1 = filter(lambda d: d[1] == 1, confidence)
    #dimension=min(len(confidence0),len(confidence1),training_count/2)
    #confidence0=confidence0[:dimension]
    #confidence1=confidence1[:dimension]
    print len(confidence0), len(confidence1)
    fout = open(RAW_DATA_DIR + 'label2trainset/%s_train.data' % attribute, 'w')
    uid_output = open(RAW_DATA_DIR + 'label2trainset/%s_train_uids.data' % attribute, 'w')
    #for d in confidence0+confidence1:
    for d in confidence:
        fout.write('%d %s\n' % (d[1], d[3]))
        uid_output.write('%s\n' % d[0])
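# The .constraints files parsed above follow mallet's labeled-feature format:
# one feature per line, followed by 'label:weight' pairs, e.g.
#
#   some_feature 0:0.9 1:0.1
#
# A standalone sketch of the same parsing, under a hypothetical name:
def parse_constraints_sketch(path):
    labeled_features = dict()
    for line in open(path):
        parts = line.rstrip('\n').split(' ')
        # map each 'label:weight' pair to its float weight, in label order
        labeled_features[parts[0].decode('utf8')] = [
            float(pair.split(':')[1]) for pair in parts[1:]]
    return labeled_features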
def construct_test_set(attribute):
    all_features = get_features(feature_file_name=feature_file_name)
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    print balance_params
    bar = progress_bar(collection.count())
    fout = open(RAW_DATA_DIR + 'label2trainset/%s_test.data' % attribute, 'w')
    uid_output = open(RAW_DATA_DIR + 'label2trainset/%s_test_uids.data' % attribute, 'w')
    for index, user in enumerate(collection.find()):
        try:
            label = user['profile'][attribute].index(1)
        except Exception as e:
            continue
        if random.random() > balance_params[label]:
            continue
        features = user['mentions']
        #features=Counter(user['products'])
        #features=combine_features(user['mentions'],Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        if len(sorted_feature) == 0:
            continue
        fout.write('%d' % label)
        uid_output.write('%s\n' % user['_id'])
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        for f in sorted_feature:
            fout.write(' %s:%f' % f)
        fout.write('\n')
        bar.draw(index + 1)
def construct_test_set(attribute):
    all_features = get_features(feature_file_name=base_dir + '/features/mention.feature')
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    print balance_params
    bar = progress_bar(collection.count())
    fout = open(self_training_file_dir + 'test_%s.data' % attribute, 'w')
    for index, user in enumerate(collection.find()):
        features = dict(Counter(user['products']))
        for m in user['mentions']:
            features[m] = user['mentions'][m]
        try:
            label = user['profile'][attribute].index(1)
        except Exception as e:
            continue
        if random.random() > balance_params[label]:
            continue
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        if len(sorted_feature) == 0:
            continue
        fout.write('%d' % label)
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        for f in sorted_feature:
            fout.write(' %s:%d' % f)
        fout.write('\n')
        bar.draw(index + 1)
def compair_single(attribute, method):
    d1 = statistics(attribute, threshold=50,
                    feature_file_name=base_dir + '/features/all_features.feature')
    d2 = statistics_after_train(attribute, method,
                                feature_file_name=base_dir + '/features/all_features.feature')
    result = []
    labeled_features = [
        line.split(' ')[0].decode('utf8')
        for line in open(labeled_feature_file_dir +
                         'review_constraint_%s.constraints' % attribute)
    ]
    all_features = get_features(feature_file_name=base_dir + '/features/all_features.feature')
    print '\n======%s======' % attribute
    for f in labeled_features:
        print f
        if f in d1:
            #print d1[f]
            print '%0.2f , %0.2f' % (1. * d1[f][0] / sum(d1[f]),
                                     1. * d1[f][1] / sum(d1[f]))
        if f in d2:
            #print d2[f]
            print '%0.2f , %0.2f' % (1. * d2[f][0] / sum(d2[f]),
                                     1. * d2[f][1] / sum(d2[f]))
def statistics(attribute, threshold=-1,
               feature_file_name=base_dir + '/features/mention.feature',
               show=False):
    import random
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    all_features = get_features(feature_file_name)
    bar = progress_bar(collection.count())
    distribute = dict([f, [0., 0.]] for f in all_features)
    labels_distribute = [0., 0.]
    for index, user in enumerate(collection.find()):
        try:
            label = user['profile'][attribute].index(1)
        except:
            continue
        #if random.random()>balance_params[label]:
        #    continue
        features = dict(user['mentions'])
        products = Counter(user['products'])
        for p in products:
            features[p] = products[p]
        if len(features) < 10:
            continue
        for f in features:
            if f in distribute:
                distribute[f][label] += 1  #features[f]
        labels_distribute[label] += 1
        bar.draw(index)
    for f in distribute.keys():
        if sum(distribute[f]) < threshold:
            distribute.pop(f)
    print labels_distribute
    for f in distribute:
        distribute[f][0] /= labels_distribute[0]
        distribute[f][1] /= labels_distribute[1]
    for f in distribute:
        s = sum(distribute[f])
        distribute[f][0] /= s
        distribute[f][1] /= s
    if not show:
        return distribute
    #distribute=filter(lambda d:d[1][0]<d[1][1], distribute.items())
    distribute = sorted(distribute.items(),
                        key=lambda d: abs(1 - 2 * (d[1][0] + 0.1) / (sum(d[1]) + 0.1)),
                        reverse=True)
    #distribute=sorted(distribute,key=lambda d:sum(d[1]), reverse=True)
    print ''
    for d in distribute[:50]:
        print '%s 0:%0.3f 1:%0.3f' % (
            d[0].encode('utf8'),
            (d[1][0] + 0.1) / (sum(d[1]) + 0.1),
            1 - (d[1][0] + 0.1) / (sum(d[1]) + 0.1),
        )
def feature_based(args, valid=False):
    img_l = tools.load_image(args.left_image, 0)
    img_r = tools.load_image(args.right_image, 0)
    if valid:
        img_l, img_r = img_r, img_l
    feat_l = tools.get_features(img_l)
    feat_r = tools.get_features(img_r)
    disparity = np.zeros(img_l.shape).astype(np.int)
    disparity = functions.GetDisparity_feature(img_l, img_r, args.kernel_size,
                                               feat_l, feat_r, args.measure,
                                               disparity, valid=valid)
    # smooth the disparity map with a mean filter
    disparity = cv2.filter2D(disparity.astype(np.float32), -1, tools.mean_kernel2D)
    print
    return disparity.astype(np.int)
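# A hedged usage sketch for feature_based: the attribute names below mirror
# what the function accesses (left_image, right_image, kernel_size, measure);
# the file names and the measure value are illustrative assumptions.
import argparse

args = argparse.Namespace(left_image='left.png', right_image='right.png',
                          kernel_size=5, measure='SSD')
disparity_map = feature_based(args)                 # left-to-right disparity
disparity_check = feature_based(args, valid=True)   # swapped pair, for validation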
def statistics_after_train(attribute, method, threshold=-1,
                           feature_file_name=base_dir + '/features/mention.feature',
                           show=False):
    import random
    labels = get_labels_after_train(attribute, method)
    print len(labels)
    collection = Connection().jd.train_users
    label_distribute = Counter(labels.values())
    balance_params = dict()
    for label in label_distribute:
        balance_params[label] = 1.0 * min(label_distribute.values()) / label_distribute[label]
    all_features = get_features(feature_file_name)
    bar = progress_bar(collection.count())
    distribute = dict([f, [0., 0.]] for f in all_features)
    for index, user in enumerate(collection.find()):
        try:
            label = labels[user['_id']]
        except:
            continue
        #if random.random()>balance_params[label]:
        #    continue
        features = dict(user['mentions'])
        products = Counter(user['products'])
        for p in products:
            features[p] = products[p]
        for f in features:
            if f in distribute:
                distribute[f][label] += 1
        bar.draw(index)
    for f in distribute.keys():
        if sum(distribute[f]) < threshold:
            distribute.pop(f)
    print label_distribute
    for f in distribute:
        distribute[f][0] /= label_distribute[0]
        distribute[f][1] /= label_distribute[1]
    for f in distribute.keys():
        s = sum(distribute[f])
        if s == 0:
            distribute.pop(f)
            continue
        distribute[f][0] /= s
        distribute[f][1] /= s
    if not show:
        return distribute
    #distribute=filter(lambda d:d[1][0]<d[1][1], distribute)
    distribute = sorted(distribute.items(),
                        key=lambda d: max(d[1]) / sum(d[1]), reverse=True)
    #distribute=sorted(distribute,key=lambda d:sum(d[1]), reverse=True)
    print ''
    for d in distribute[:50]:
        print '%s 0:%0.3f 1:%0.3f' % (
            d[0].encode('utf8'),
            (d[1][0] + 0.1) / (sum(d[1]) + 0.1),
            1 - (d[1][0] + 0.1) / (sum(d[1]) + 0.1),
        )
def statistics(labels, feature_file_name, threshold,
               collection=Connection().jd.train_users):
    # note: the default collection is evaluated once, at definition time
    label_dimension = max(labels.values()) + 1
    label_distribute = Counter(labels.values())
    label_distribute = [label_distribute[i] if i in label_distribute else 0
                        for i in xrange(label_dimension)]
    all_features = get_features(feature_file_name)
    bar = progress_bar(collection.count())
    feature_distribute = dict([f, [0.] * label_dimension] for f in all_features)
    for index, user in enumerate(collection.find()):
        try:
            label = labels[user['_id']]
        except:
            continue
        features = combine_dict(user['mentions'], Counter(user['products']))
        for f in features:
            if f in feature_distribute:
                feature_distribute[f][label] += 1.0
        bar.draw(index)
    for f in feature_distribute.keys():
        s = 1.0 * sum(feature_distribute[f])
        if s == 0 or s < threshold:
            feature_distribute.pop(f)
            continue
        for i in xrange(label_dimension):
            feature_distribute[f][i] /= label_distribute[i]
    for f in feature_distribute.keys():
        s = 1.0 * sum(feature_distribute[f])
        for i in xrange(label_dimension):
            feature_distribute[f][i] /= s
    score = dict()
    for f, v in feature_distribute.items():
        #score[f]=eta_score(v)
        score[f] = abs_score(v)
    return score, feature_distribute
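# `abs_score` and `eta_score` are defined elsewhere in the project. From the
# usage above they score a normalized per-label distribution by how far it is
# from uniform. A hypothetical stand-in consistent with that usage (the real
# definitions may differ):
def abs_score_sketch(v):
    # gap between the strongest and weakest label share
    return max(v) - min(v)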
def construct_train_set(labeled_features, training_count):
    '''
    The format of labeled_feature_file is the same as mallet
    '''
    all_features = get_features(feature_file_name=feature_file_name)
    all_features_1 = get_features(
        feature_file_name=base_dir + '/features/mention_1.feature',
        start_index=max(all_features.values()) + 1)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    confidence = []
    for index, user in enumerate(collection.find()):
        label_distributed = [1, 1]
        for f, value in combine_features(user['mentions'],
                                         Counter(user['products'])).items():
            if f in labeled_features:
                label_distributed[0] *= labeled_features[f][0] * value
                label_distributed[1] *= labeled_features[f][1] * value
        s = 1.0 * sum(label_distributed)
        if not s == 0:
            label_distributed[0] /= s
            label_distributed[1] /= s
        if label_distributed[0] > label_distributed[1]:
            label = 0
        elif label_distributed[0] < label_distributed[1]:
            label = 1
        else:
            label = -1
        #features=user['mentions_0']
        #features=Counter(user['products'])
        features = combine_features(user['mentions_0'], Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        # emptying this disables the mention_1_1 features below
        user['mentions_1_1'] = {}
        for f, v in user['mentions_1_1'].items():
            f = f + '_1'
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f], v))
        #if False:
        if 'user_product_vector_from_deepwalk' in user:
            start_index = max(all_features_1.values()) + 1
            for i, v in enumerate(user['user_product_vector_from_deepwalk']):
                v = abs(v)
                sorted_feature.append((i + start_index, v))
        keys = map(lambda d: d[0], sorted_feature)
        if not len(keys) == len(set(keys)):
            print Counter(keys).values()
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        str_features = ' '.join(map(lambda f: '%s:%f' % f, sorted_feature))
        confidence.append((
            user['_id'],
            label,
            abs(label_distributed[0] - label_distributed[1]),
            str_features,
            sum(user['mentions'].values()),
        ))
        bar.draw(index + 1)
    confidence0 = filter(lambda d: d[1] == 0, confidence)
    confidence0 = sorted(confidence0, key=lambda d: d[2], reverse=True)
    confidence1 = filter(lambda d: d[1] == 1, confidence)
    confidence1 = sorted(confidence1, key=lambda d: d[2], reverse=True)
    confidence2 = filter(lambda d: d[1] == -1, confidence)
    confidence2 = sorted(confidence2, key=lambda d: d[4], reverse=True)
    dimension = min(len(confidence0), len(confidence1), training_count / 2)
    confidence0 = confidence0[:dimension]
    confidence1 = confidence1[:dimension]
    confidence2 = confidence2[:dimension]
    print len(confidence0), len(confidence1)
    if len(confidence0) == 0 or len(confidence1) == 0:
        return False
    labeled_train_data = open(RAW_DATA_DIR + 'multi_clf/labeled_train.data', 'w')
    for d in confidence0 + confidence1:
        labeled_train_data.write('%d %s\n' % (d[1], d[3]))
    # the unlabeled pool holds the users no labeled feature could decide
    unlabeled_train_data = open(RAW_DATA_DIR + 'multi_clf/unlabeled_train.data', 'w')
    for d in confidence2:
        unlabeled_train_data.write('%d %s\n' % (d[1], d[3]))
    return True
def construct_train_set(attribute, training_count):
    product_features = get_features(feature_file=base_dir + '/features/product.feature')
    mention_features = get_features(feature_file=base_dir + '/features/mention.feature',
                                    existent_features=product_features)
    review_features = get_features(feature_file=base_dir + '/features/review.feature',
                                   existent_features=mention_features)
    mention_features_1 = get_features(feature_file=base_dir + '/features/mention_1.feature',
                                      existent_features=review_features)
    mention_features_2 = get_features(feature_file=base_dir + '/features/mention_2.feature',
                                      existent_features=mention_features_1)
    test_uids = get_test_uids()
    labeled_feature_file = '%s/review_constraint_%s.constraints' % (
        labeled_feature_file_dir, attribute)
    label_arbiter = LabelArbiter(labeled_feature_file=labeled_feature_file)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    guess = []
    for index, user in enumerate(collection.find()):
        if user['_id'] in test_uids:
            continue
        features = combine_dict(user['mentions_0'], Counter(user['products']))
        label, confidence = label_arbiter.arbitrate_label(features)
        x = []
        #user['products']=[]
        for f, v in Counter(user['products']).items():
            if f not in product_features:
                continue
            x.append((product_features[f], v))
        #user['mentions_0']={}
        for f, v in user['mentions_0'].items():
            if f not in mention_features:
                continue
            x.append((mention_features[f], v))
        #user['review']=[]
        for f, v in Counter(user['review']).items():
            if f not in review_features:
                continue
            x.append((review_features[f], v))
        # emptying this disables the mention_1 features below
        user['mentions_1'] = {}
        for f, v in user['mentions_1'].items():
            f = f + '_1'
            if f not in mention_features_1:
                continue
            x.append((mention_features_1[f], v))
        # emptying this disables the mention_2 features below
        user['mentions_2'] = {}
        for f, v in user['mentions_2'].items():
            if f not in mention_features_2:
                continue
            x.append((mention_features_2[f], v))
        x = sorted(x, key=lambda d: d[0])
        str_x = ' '.join(map(lambda f: '%s:%f' % f, x))
        guess.append((
            user['_id'],
            label,
            abs(confidence),
            str_x,
            sum(user['mentions'].values()),
        ))
        bar.draw(index + 1)
    data0 = filter(lambda d: d[1] == 0, guess)
    data0 = sorted(data0, key=lambda d: d[2], reverse=True)
    data1 = filter(lambda d: d[1] == 1, guess)
    data1 = sorted(data1, key=lambda d: d[2], reverse=True)
    data2 = filter(lambda d: d[1] == -1, guess)
    data2 = sorted(data2, key=lambda d: d[4], reverse=True)
    dimension = min(len(data0), len(data1), training_count / 2)
    data0 = data0[:dimension]
    data1 = data1[:dimension]
    data2 = data2[:dimension]
    fout = open(RAW_DATA_DIR + 'iterate_label2trainset/%s_train.data' % attribute, 'w')
    uid_output = open(RAW_DATA_DIR + 'iterate_label2trainset/%s_train_uids.data' % attribute, 'w')
    for d in data0 + data1:
        fout.write('%d %s\n' % (d[1], d[3]))
        uid_output.write('%s\n' % d[0])
    fout = open(RAW_DATA_DIR + 'iterate_label2trainset/%s_train_unlabel.data' % attribute, 'w')
    uid_output = open(RAW_DATA_DIR + 'iterate_label2trainset/%s_train_unlabel_uids.data' % attribute, 'w')
    for d in data2:
        fout.write('%d %s\n' % (d[1], d[3]))
        uid_output.write('%s\n' % d[0])
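# `LabelArbiter` is defined elsewhere; the call sites here only assume the
# interface sketched below, which is inferred from usage and mirrors the inline
# weighting logic in the other construct_train_set variants. This is a sketch
# under those assumptions, not the project's actual class:
class LabelArbiterSketch(object):
    def __init__(self, labeled_feature_file):
        # reuses the hypothetical parser sketched earlier in this module
        self.labeled_features = parse_constraints_sketch(labeled_feature_file)

    def get_label_distribute(self, features):
        # multiply per-label weights of every labeled feature the user has
        distributed = [1.0, 1.0]
        for f, value in features.items():
            if f in self.labeled_features:
                distributed[0] *= self.labeled_features[f][0] * value
                distributed[1] *= self.labeled_features[f][1] * value
        s = sum(distributed)
        if s != 0:
            distributed = [d / s for d in distributed]
        return distributed

    def arbitrate_label(self, features):
        # returns (label, confidence); label is -1 when undecided
        p0, p1 = self.get_label_distribute(features)
        if p0 > p1:
            return 0, p0 - p1
        if p1 > p0:
            return 1, p1 - p0
        return -1, 0.0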
i = 0
for line in f:
    line_split = line.strip().split(" ")
    if len(line_split) < 2:
        print "broken line " + line
        continue
    if line_split[-1] in label_map_index:
        label_images[label_map_index[line_split[-1]]].append(line_split[0])
    else:
        label_map_index[line_split[-1]] = i
        i = i + 1
        label_images.append([line_split[0]])
images_feature = get_features(label_images, deploy, model, use_gpu)
# compute the per-label mean feature vector
images_label_ave = []
for features in images_feature:
    images_label_ave.append(np.mean(np.array(features), 0))
numerator = []
denominator = []
result = []
sort_index = []
# compute the similarity terms used for the mean-based sorting
for i in xrange(len(images_label_ave)):
    numerator.append(np.dot(images_feature[i], images_label_ave[i][:]))
    denominator.append(np.linalg.norm(images_feature[i], axis=1))
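# The loop above stops after the dot products and per-image norms; a hedged
# sketch of how those pieces would combine into a per-image cosine similarity
# against the label mean (assuming that is the intended use of `result`):
for i in xrange(len(images_label_ave)):
    mean_norm = np.linalg.norm(images_label_ave[i])
    # element-wise cosine similarity of every image in label i to the mean
    result.append(numerator[i] / (denominator[i] * mean_norm))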
def construct_test_set(attribute):
    product_features = get_features(feature_file=base_dir + '/features/product.feature')
    mention_features = get_features(feature_file=base_dir + '/features/mention.feature',
                                    existent_features=product_features)
    review_features = get_features(feature_file=base_dir + '/features/review.feature',
                                   existent_features=mention_features)
    mention_features_1 = get_features(feature_file=base_dir + '/features/mention_1.feature',
                                      existent_features=review_features)
    mention_features_2 = get_features(feature_file=base_dir + '/features/mention_2.feature',
                                      existent_features=mention_features_1)
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    print 'Balance params: ', balance_params
    bar = progress_bar(collection.count())
    fout = open(RAW_DATA_DIR + 'iterate_label2trainset/%s_test.data' % attribute, 'w')
    uid_output = open(RAW_DATA_DIR + 'iterate_label2trainset/%s_test_uids.data' % attribute, 'w')
    for index, user in enumerate(collection.find()):
        try:
            label = user['profile'][attribute].index(1)
        except Exception as e:
            continue
        #if random.random()>balance_params[label]:
        #    continue
        x = []
        #user['products']=[]
        for f, v in Counter(user['products']).items():
            if f not in product_features:
                continue
            x.append((product_features[f], v))
        #user['mentions_0']={}
        for f, v in user['mentions_0'].items():
            if f not in mention_features:
                continue
            x.append((mention_features[f], v))
        #user['review']=[]
        for f, v in Counter(user['review']).items():
            if f not in review_features:
                continue
            x.append((review_features[f], v))
        # emptying this disables the mention_1 features below
        user['mentions_1'] = {}
        for f, v in user['mentions_1'].items():
            f = f + '_1'
            if f not in mention_features_1:
                continue
            x.append((mention_features_1[f], v))
        # emptying this disables the mention_2 features below
        user['mentions_2'] = {}
        for f, v in user['mentions_2'].items():
            if f not in mention_features_2:
                continue
            x.append((mention_features_2[f], v))
        x = sorted(x, key=lambda d: d[0])
        str_x = ' '.join(map(lambda f: '%s:%f' % f, x))
        fout.write('%d %s\n' % (label, str_x))
        uid_output.write('%s\n' % (user['_id']))
        bar.draw(index + 1)
def construct_train_set(attribute, training_count):
    '''
    The format of labeled_feature_file is the same as mallet
    '''
    all_features = get_features(feature_file_name=base_dir + '/features/mention.feature')
    labeled_feature_file = open('%s/review_constraint_%s.constraints'
                                % (labeled_feature_file_dir, attribute))
    labeled_features = dict()
    for line in labeled_feature_file:
        line = line[:-1].split(' ')
        labeled_features[line[0].decode('utf8')] = map(
            lambda d: float(d.split(':')[1]), line[1:])
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    confidence = []
    for index, user in enumerate(collection.find()):
        features = dict(Counter(user['products']))
        for m in user['mentions']:
            features[m] = user['mentions'][m]
        label_distributed = [1, 1]
        for f, value in user['mentions'].items():
            if f in labeled_features:
                label_distributed[0] *= labeled_features[f][0] * value
                label_distributed[1] *= labeled_features[f][1] * value
        s = 1.0 * sum(label_distributed)
        label_distributed[0] /= s
        label_distributed[1] /= s
        #print label_distributed
        #if abs(label_distributed[0]-label_distributed[1])<0.5:
        #    continue
        if label_distributed[0] > label_distributed[1]:
            label = 0
        elif label_distributed[0] < label_distributed[1]:
            label = 1
        else:
            label = -1
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        str_features = ' '.join(map(lambda f: '%s:%d' % f, sorted_feature))
        confidence.append((
            user['_id'],
            label,
            abs(label_distributed[0] - label_distributed[1]),
            str_features
        ))
        bar.draw(index + 1)
    confidence = sorted(confidence, key=lambda d: d[2], reverse=True)
    confidence0 = filter(lambda d: d[1] == 0, confidence)[:training_count / 2]
    confidence1 = filter(lambda d: d[1] == 1, confidence)[:training_count / 2]
    confidence_unlabel = []
    confidence_unlabel += filter(lambda d: d[1] == -1, confidence)
    #confidence_unlabel+=filter(lambda d:d[1]==0,confidence)[training_count/2:training_count*5]
    #confidence_unlabel+=filter(lambda d:d[1]==1,confidence)[training_count/2:training_count*5]
    confidence_unlabel = confidence_unlabel[:5 * training_count]
    print len(confidence0), len(confidence1)
    fout = open(self_training_file_dir + 'labeled_train_%s.data' % attribute, 'w')
    for d in set(confidence0 + confidence1):
        fout.write('%d %s\n' % (d[1], d[3]))
    fout_unlabel = open(self_training_file_dir + 'unlabeled_train_%s.data' % attribute, 'w')
    for d in confidence_unlabel:
        fout_unlabel.write('%d %s\n' % (d[1], d[3]))
def construct_train_set(attribute, training_count):
    '''
    The format of labeled_feature_file is the same as mallet
    '''
    all_features = get_features()
    labeled_feature_file = open('%s/review_constraint_%s.constraints'
                                % (labeled_feature_file_dir, attribute))
    labeled_features = dict()
    for line in labeled_feature_file:
        line = line[:-1].split(' ')
        labeled_features[line[0].decode('utf8')] = map(
            lambda d: float(d.split(':')[1]), line[1:])
    collection = Connection().jd.train_users
    labeled_feature_distribute = dict()
    for f in labeled_features:
        labeled_feature_distribute[f] = 0
    for user in collection.find():
        for f in user['mentions_1']:
            if f in labeled_features:
                labeled_feature_distribute[f] += user['mentions_1'][f]
    s = 1. * sum(labeled_feature_distribute.values())
    for f in labeled_features:
        labeled_feature_distribute[f] /= s
    for f in labeled_features:
        print f
        print labeled_feature_distribute[f]
    bar = progress_bar(collection.count())
    confidence = []
    for index, user in enumerate(collection.find()):
        features = dict(Counter(user['products']))
        # normalization
        for m in user['mentions']:
            features[m] = user['mentions'][m]
        s = sum(features.values())
        if s < 10:
            continue
        label_distributed = [1, 1]
        #for f,value in features.items():
        # use the propagated mentions
        for f, value in user['mentions_1'].items():
            if f in labeled_features:
                label_distributed[0] *= labeled_features[f][0] * value / labeled_feature_distribute[f]
                label_distributed[1] *= labeled_features[f][1] * value / labeled_feature_distribute[f]
        s = 1.0 * sum(label_distributed)
        label_distributed[0] /= s
        label_distributed[1] /= s
        print ''
        for f in labeled_features:
            print f
            if f in user['mentions']:
                print user['mentions'][f]
            else:
                print 0
        for f in labeled_features:
            print f
            if f in user['mentions_1']:
                print user['mentions_1'][f]
            else:
                print 0
        #if abs(label_distributed[0]-label_distributed[1])<0.5:
        #    continue
        if label_distributed[0] > label_distributed[1]:
            label = 0
        elif label_distributed[0] < label_distributed[1]:
            label = 1
        else:
            continue
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        str_features = ' '.join(map(lambda f: '%s:%d' % f, sorted_feature))
        confidence.append((user['_id'], label,
                           abs(label_distributed[0] - label_distributed[1]),
                           str_features))
        bar.draw(index + 1)
    confidence = sorted(confidence, key=lambda d: d[2], reverse=True)
    confidence0 = filter(lambda d: d[1] == 0, confidence)[:training_count / 2]
    confidence1 = filter(lambda d: d[1] == 1, confidence)[:training_count / 2]
    print len(confidence0), len(confidence1)
    fout = open(RAW_DATA_DIR + 'prlabel2trainset/%s_train.data' % attribute, 'w')
    uid_output = open(RAW_DATA_DIR + 'prlabel2trainset/%s_train_uids.data' % attribute, 'w')
    for d in confidence0 + confidence1:
        fout.write('%d %s\n' % (d[1], d[3]))
        uid_output.write('%s\n' % d[0])
def construct_train_set(attribute, training_count):
    '''
    The format of labeled_feature_file is the same as mallet
    '''
    all_features = get_features(feature_file=feature_file_name)
    all_features_1 = get_features(feature_file=base_dir + '/features/mention_1.feature',
                                  existent_features=all_features)
    review_features = get_features(feature_file=base_dir + '/features/review.feature',
                                   existent_features=all_features_1)
    labeled_feature_file = open('%s/review_constraint_%s.constraints'
                                % (labeled_feature_file_dir, attribute))
    label_arbiter = LabelArbiter(labeled_feature_file='%s/review_constraint_%s.constraints'
                                 % (labeled_feature_file_dir, attribute))
    labeled_features = dict()
    for line in labeled_feature_file:
        line = line[:-1].split(' ')
        labeled_features[line[0].decode('utf8')] = map(
            lambda d: float(d.split(':')[1]), line[1:])
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    confidence = []
    for index, user in enumerate(collection.find()):
        # this hand-rolled estimate is overwritten by label_arbiter below
        label_distributed = [1, 1]
        for f, value in combine_features(user['mentions'],
                                         Counter(user['products'])).items():
            if f in labeled_features:
                label_distributed[0] *= labeled_features[f][0] * value
                label_distributed[1] *= labeled_features[f][1] * value
        s = 1.0 * sum(label_distributed)
        if not s == 0:
            label_distributed[0] /= s
            label_distributed[1] /= s
        label_distributed = label_arbiter.get_label_distribute(
            combine_features(user['mentions'], Counter(user['products'])))
        if label_distributed[0] > label_distributed[1]:
            label = 0
        elif label_distributed[0] < label_distributed[1]:
            label = 1
        else:
            label = -1
        features = combine_features(user['mentions_0'], Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        # emptying this disables the mention_1_1 features below
        user['mentions_1_1'] = {}
        for f, v in user['mentions_1_1'].items():
            f = f + '_1'
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f], v))
        for f, v in Counter(user['review']).items():
            if f not in review_features:
                continue
            sorted_feature.append((review_features[f], v))
        keys = map(lambda d: d[0], sorted_feature)
        if not len(keys) == len(set(keys)):
            print Counter(keys).values()
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        str_features = ' '.join(map(lambda f: '%s:%f' % f, sorted_feature))
        confidence.append((
            user['_id'],
            label,
            abs(label_distributed[0] - label_distributed[1]),
            str_features,
            sum(user['mentions'].values()),
        ))
        bar.draw(index + 1)
    confidence0 = filter(lambda d: d[1] == 0, confidence)
    confidence0 = sorted(confidence0, key=lambda d: d[2], reverse=True)
    confidence1 = filter(lambda d: d[1] == 1, confidence)
    confidence1 = sorted(confidence1, key=lambda d: d[2], reverse=True)
    confidence2 = filter(lambda d: d[1] == -1, confidence)
    confidence2 = sorted(confidence2, key=lambda d: d[4], reverse=True)
    dimension = min(len(confidence0), len(confidence1), training_count / 2)
    confidence0 = confidence0[:dimension]
    confidence1 = confidence1[:dimension]
    confidence2 = confidence2[:dimension]
    fout = open(RAW_DATA_DIR + 'iterate_label2trainset/%s_train.data' % attribute, 'w')
    uid_output = open(RAW_DATA_DIR + 'iterate_label2trainset/%s_train_uids.data' % attribute, 'w')
    for d in confidence0 + confidence1:
        fout.write('%d %s\n' % (d[1], d[3]))
        uid_output.write('%s\n' % d[0])
    fout = open(RAW_DATA_DIR + 'iterate_label2trainset/%s_train_unlabel.data' % attribute, 'w')
    uid_output = open(RAW_DATA_DIR + 'iterate_label2trainset/%s_train_unlabel_uids.data' % attribute, 'w')
    for d in confidence2:
        fout.write('%d %s\n' % (d[1], d[3]))
        uid_output.write('%s\n' % d[0])
def construct_train_set(attribute, training_count):
    '''
    The format of labeled_feature_file is the same as mallet
    '''
    all_features = get_features(feature_file_name=feature_file_name)
    all_features_1 = get_features(
        feature_file_name=base_dir + '/features/mention_1.feature',
        start_index=max(all_features.values()) + 1)
    review_features = get_features(
        feature_file_name=base_dir + '/features/review.feature',
        start_index=max(all_features_1.values()) + 1)
    labeled_feature_file = open('%s/review_constraint_%s.constraints'
                                % (labeled_feature_file_dir, attribute))
    labeled_features = dict()
    for line in labeled_feature_file:
        line = line[:-1].split(' ')
        labeled_features[line[0].decode('utf8')] = map(
            lambda d: float(d.split(':')[1]), line[1:])
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    confidence = []
    for index, user in enumerate(collection.find()):
        label_distributed = [1, 1]
        for f, value in combine_features(user['mentions'],
                                         Counter(user['products'])).items():
            if f in labeled_features:
                label_distributed[0] *= labeled_features[f][0] * value
                label_distributed[1] *= labeled_features[f][1] * value
        s = 1.0 * sum(label_distributed)
        if not s == 0:
            label_distributed[0] /= s
            label_distributed[1] /= s
        if label_distributed[0] > label_distributed[1]:
            label = 0
        elif label_distributed[0] < label_distributed[1]:
            label = 1
        else:
            label = -1
        features = combine_features(user['mentions_0'], Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        # emptying this disables the mention_1_1 features below
        user['mentions_1_1'] = {}
        for f, v in user['mentions_1_1'].items():
            f = f + '_1'
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f], v))
        for f, v in Counter(user['review']).items():
            if f not in review_features:
                continue
            sorted_feature.append((review_features[f], v))
        keys = map(lambda d: d[0], sorted_feature)
        if not len(keys) == len(set(keys)):
            print Counter(keys).values()
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        str_features = ' '.join(map(lambda f: '%s:%f' % f, sorted_feature))
        confidence.append((
            user['_id'],
            label,
            abs(label_distributed[0] - label_distributed[1]),
            str_features,
            sum(user['mentions'].values()),
        ))
        bar.draw(index + 1)
    confidence0 = filter(lambda d: d[1] == 0, confidence)
    confidence0 = sorted(confidence0, key=lambda d: d[2], reverse=True)
    confidence1 = filter(lambda d: d[1] == 1, confidence)
    confidence1 = sorted(confidence1, key=lambda d: d[2], reverse=True)
    confidence2 = filter(lambda d: d[1] == -1, confidence)
    confidence2 = sorted(confidence2, key=lambda d: d[4], reverse=True)
    dimension = min(len(confidence0), len(confidence1), training_count / 2)
    confidence0 = confidence0[:dimension]
    confidence1 = confidence1[:dimension]
    confidence2 = confidence2[:dimension]
    fout = open(RAW_DATA_DIR + 'mylabel2trainset/%s_train.data' % attribute, 'w')
    uid_output = open(RAW_DATA_DIR + 'mylabel2trainset/%s_train_uids.data' % attribute, 'w')
    for d in confidence0 + confidence1:
        fout.write('%d %s\n' % (d[1], d[3]))
        uid_output.write('%s\n' % d[0])
    fout = open(RAW_DATA_DIR + 'mylabel2trainset/%s_train_unlabel.data' % attribute, 'w')
    uid_output = open(RAW_DATA_DIR + 'mylabel2trainset/%s_train_unlabel_uids.data' % attribute, 'w')
    for d in confidence2:
        fout.write('%d %s\n' % (d[1], d[3]))
        uid_output.write('%s\n' % d[0])
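# A hedged end-to-end usage sketch: the attribute name 'gender' and the
# training size below are illustrative assumptions, not values taken from
# this project.
if __name__ == '__main__':
    attribute = 'gender'  # hypothetical profile attribute
    construct_train_set(attribute, training_count=2000)
    construct_test_set(attribute)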