def insert_LINE_vector(file_name=RAW_DATA_DIR + 'normalize2.data'):
    """Load LINE embedding vectors from *file_name* and store each user's
    vector in MongoDB under ``user_product_vector_from_line``.

    File format: header line "<count> <dimension>", then one line per
    entity: "<id> <v1> <v2> ...".
    """
    vectors = dict()
    fin = open(file_name)  # NOTE(review): file handle is never closed
    line = fin.readline().strip().split(' ')
    count, dimention = int(line[0]), int(line[1])
    bar = progress_bar(count)
    for index in xrange(count):
        line = fin.readline()
        line = line.strip().split(' ')
        vector = map(lambda d: float(d), line[1:])
        vectors[line[0]] = vector
        bar.draw(index + 1)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    for index, user in enumerate(collection.find()):
        if user['_id'] not in vectors:
            # Train user missing from the embedding file: record a zero
            # vector in the local dict but skip the DB write -- the zero
            # vector is never persisted for train users.
            vectors[user['_id']] = [0.] * dimention
            continue
        collection.update(
            {'_id': user['_id']},
            {'$set': {
                'user_product_vector_from_line': vectors[user['_id']]
            }})
        bar.draw(index + 1)
    collection = Connection().jd.test_users
    bar = progress_bar(collection.count())
    for index, user in enumerate(collection.find()):
        if user['_id'] not in vectors:
            continue
        collection.update(
            {'_id': user['_id']},
            {'$set': {
                'user_product_vector_from_line': vectors[user['_id']]
            }})
        bar.draw(index + 1)
def construct_test_set(attribute):
    """Write a class-balanced libSVM-style test file for *attribute*.

    Features are product purchase counts overlaid with mention counts.
    Users lacking the attribute label, users dropped by the balancing
    coin-flip, and users with no known features are skipped.
    """
    all_features = get_features(feature_file_name=base_dir + '/features/mention.feature')
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    print balance_params
    bar = progress_bar(collection.count())
    fout = open(self_training_file_dir + 'test_%s.data' % attribute, 'w')
    for index, user in enumerate(collection.find()):
        # Product counts first; mention counts override on key collision.
        features = dict(Counter(user['products']))
        for m in user['mentions']:
            features[m] = user['mentions'][m]
        try:
            # Label = index of the 1 in the one-hot profile vector.
            label = user['profile'][attribute].index(1)
        except Exception as e:
            continue
        # Randomly sub-sample the majority class to balance labels.
        if random.random() > balance_params[label]:
            continue
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        if len(sorted_feature) == 0:
            continue
        fout.write('%d' % label)
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        for f in sorted_feature:
            fout.write(' %s:%d' % f)
        fout.write('\n')
        bar.draw(index + 1)
def construct_all_data():
    '''
    Write every train user as an unlabeled (label 0) example in
    mallet/libSVM format, plus a parallel file of user ids (one per
    output line).
    '''
    all_features = get_features(feature_file_name=feature_file_name)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    fout = open(RAW_DATA_DIR + 'label2trainset/all_train.data', 'w')
    uid_output = open(RAW_DATA_DIR + 'label2trainset/all_train_uids.data', 'w')
    for index, user in enumerate(collection.find()):
        # Product counts first; mention counts override on key collision.
        features = dict(Counter(user['products']))
        for m in user['mentions']:
            features[m] = user['mentions'][m]
        label = 0  # placeholder: this set is unlabeled
        fout.write('%d' % label)
        uid_output.write('%s\n' % user['_id'])
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        for f in sorted_feature:
            fout.write(' %s:%d' % f)
        fout.write('\n')
        bar.draw(index + 1)
def construct_test_data(attribute):
    """Collect [uid, label, sparse-feature-dict] triples for test users
    of *attribute* and dump them via output() for mallet."""
    collection = Connection().jd.test_users
    all_features = get_features(feature_file_name=feature_file_name)
    # Review features get ids above the mention/product feature range.
    review_features = get_features(feature_file_name=base_dir + '/features/review.feature', start_index=max(all_features.values()) + 1)
    data = []
    bar = progress_bar(collection.count())
    for index, user in enumerate(collection.find()):
        uid = user['_id']
        features = combine_features(user['mentions'], Counter(user['products']))
        try:
            y = user['profile'][attribute].index(1)
        except:
            continue  # user has no label for this attribute
        x = dict()
        for f in features:
            if f not in all_features:
                continue
            x[all_features[f]] = features[f]
        for f, v in Counter(user['review']).items():
            if f not in review_features:
                continue
            x[review_features[f]] = v
        data.append([uid, y, x])
        bar.draw(index + 1)
    #data=balance(data,target_index=1)
    output(RAW_DATA_DIR + 'mallet/mallet_test_%s.data' % attribute, data)
def construct_all_data():
    '''
    Write every train user as an unlabeled (label 0) example, combining
    mentions_1/product features with the secondary "_1" mention feature
    space; output format matches mallet.
    '''
    all_features = get_features(feature_file_name=feature_file_name)
    all_features_1 = get_features(feature_file_name=base_dir + '/features/mention_1.feature', start_index=max(all_features.values()) + 1)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    fout = open(RAW_DATA_DIR + 'iterate_label2trainset/all_train.data', 'w')
    uid_output = open(RAW_DATA_DIR + 'iterate_label2trainset/all_train_uids.data', 'w')
    for index, user in enumerate(collection.find()):
        label = 0  # placeholder: unlabeled
        fout.write('%d' % label)
        uid_output.write('%s\n' % user['_id'])
        features = combine_features(user['mentions_1'], Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        for f, v in user['mentions_1_1'].items():
            f = f + '_1'  # suffix keys into the secondary feature space
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f], v))
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        for f in sorted_feature:
            fout.write(' %s:%d' % f)
        fout.write('\n')
        bar.draw(index + 1)
def validate_knn(attribute):
    """Leave-one-out k-NN validation for *attribute*: predict each user's
    label distribution from its 100 top-scoring neighbours and write
    "<true_label> <p0> <p1>" per user."""
    print attribute
    features, labels = get_features_and_labels(attribute)
    neibor_count = 100
    f = open('./%s_knn_result.data' % attribute, 'w')
    bar = progress_bar(len(features))
    for index, uid1 in enumerate(features):
        distance = []
        for uid2 in features:
            if uid2 == uid1:
                continue
            d = get_distance(features[uid1], features[uid2])
            distance.append((uid2, d))
        # NOTE(review): reverse=True keeps the LARGEST scores, which
        # assumes get_distance is a similarity measure -- confirm.
        distance = sorted(distance, key=lambda d: d[1], reverse=True)[:neibor_count]
        label = labels[uid1]
        plabel = numpy.zeros((len(label)), dtype='float32')
        for d in distance:
            #plabel+=d[1]*labels[d[0]]
            plabel += labels[d[0]]
        if sum(plabel) == 0:
            continue  # no neighbour carries a label
        plabel /= sum(plabel)
        if label[0] > 0:
            f.write('%d %f %f\n' % (0, plabel[0], plabel[1]))
        else:
            f.write('%d %f %f\n' % (1, plabel[0], plabel[1]))
        bar.draw(index + 1)
def construct_train_set(attribute, training_count):
    '''
    Pseudo-label train users for *attribute* from per-feature label
    priors (a mallet-style constraint file) and write the labeled
    examples plus their uids.  *training_count* is currently unused:
    the size-capping code below is commented out.
    '''
    all_features = get_features(feature_file_name=feature_file_name)
    labeled_feature_file = open('%s/review_constraint_%s.constraints' % (labeled_feature_file_dir, attribute))
    labeled_features = dict()
    for line in labeled_feature_file:
        # Constraint line: "<feature> 0:<p0> 1:<p1>"
        line = line[:-1].split(' ')
        labeled_features[line[0].decode('utf8')] = map(
            lambda d: float(d.split(':')[1]), line[1:])
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    confidence = []
    for index, user in enumerate(collection.find()):
        # Naive-Bayes-like product of priors over the user's mentions.
        label_distributed = [1, 1]
        for f, value in user['mentions'].items():
            if f in labeled_features:
                label_distributed[0] *= labeled_features[f][0] * value
                label_distributed[1] *= labeled_features[f][1] * value
        # NOTE(review): if both products reach exactly 0 this divides by
        # zero -- presumably inputs keep s positive; confirm.
        s = 1.0 * sum(label_distributed)
        label_distributed[0] /= s
        label_distributed[1] /= s
        if label_distributed[0] > label_distributed[1]:
            label = 0
        elif label_distributed[0] < label_distributed[1]:
            label = 1
        else:
            continue  # tie: no confident pseudo-label
        features = user['mentions']
        #features=Counter(user['products'])
        #features=combine_features(user['mentions'],Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        str_features = ' '.join(map(lambda f: '%s:%f' % f, sorted_feature))
        confidence.append(
            (user['_id'], label,
             abs(label_distributed[0] - label_distributed[1]),
             str_features))
        bar.draw(index + 1)
    #confidence=sorted(confidence,key=lambda d:d[2],reverse=True)
    confidence0 = filter(lambda d: d[1] == 0, confidence)
    confidence1 = filter(lambda d: d[1] == 1, confidence)
    #dimention=min(len(confidence0),len(confidence1),training_count/2)
    #confidence0=confidence0#[:dimention]
    #confidence1=confidence1#[:dimention]
    print len(confidence0), len(confidence1)
    fout = open(RAW_DATA_DIR + 'label2trainset/%s_train.data' % attribute, 'w')
    uid_output = open(
        RAW_DATA_DIR + 'label2trainset/%s_train_uids.data' % attribute, 'w')
    #for d in confidence0+confidence1:
    for d in confidence:
        fout.write('%d %s\n' % (d[1], d[3]))
        uid_output.write('%s\n' % d[0])
def construct_test_set(attribute):
    """Write a class-balanced libSVM-style test file for *attribute*.

    Features are product purchase counts overlaid with mention counts.
    Users lacking the attribute label, users dropped by the balancing
    coin-flip, and users with no known features are skipped.
    """
    all_features = get_features(feature_file_name=base_dir + '/features/mention.feature')
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    print balance_params
    bar = progress_bar(collection.count())
    fout = open(self_training_file_dir + 'test_%s.data' % attribute, 'w')
    for index, user in enumerate(collection.find()):
        # Product counts first; mention counts override on key collision.
        features = dict(Counter(user['products']))
        for m in user['mentions']:
            features[m] = user['mentions'][m]
        try:
            label = user['profile'][attribute].index(1)
        except Exception as e:
            continue
        # Randomly sub-sample the majority class to balance labels.
        if random.random() > balance_params[label]:
            continue
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        if len(sorted_feature) == 0:
            continue
        fout.write('%d' % label)
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        for f in sorted_feature:
            fout.write(' %s:%d' % f)
        fout.write('\n')
        bar.draw(index + 1)
def validate_knn(attribute):
    """Leave-one-out k-NN validation for *attribute*: predict each user's
    label distribution from its 100 top-scoring neighbours and write
    "<true_label> <p0> <p1>" per user."""
    print attribute
    features, labels = get_features_and_labels(attribute)
    neibor_count = 100
    f = open('./%s_knn_result.data' % attribute, 'w')
    bar = progress_bar(len(features))
    for index, uid1 in enumerate(features):
        distance = []
        for uid2 in features:
            if uid2 == uid1:
                continue
            d = get_distance(features[uid1], features[uid2])
            distance.append((uid2, d))
        # NOTE(review): reverse=True keeps the LARGEST scores, which
        # assumes get_distance is a similarity measure -- confirm.
        distance = sorted(distance, key=lambda d: d[1], reverse=True)[:neibor_count]
        label = labels[uid1]
        plabel = numpy.zeros((len(label)), dtype='float32')
        for d in distance:
            #plabel+=d[1]*labels[d[0]]
            plabel += labels[d[0]]
        if sum(plabel) == 0:
            continue  # no neighbour carries a label
        plabel /= sum(plabel)
        if label[0] > 0:
            f.write('%d %f %f\n' % (0, plabel[0], plabel[1]))
        else:
            f.write('%d %f %f\n' % (1, plabel[0], plabel[1]))
        bar.draw(index + 1)
def age_distribute():
    """Back-fill binary age profiles for test users from linked Weibo
    birthdays, then print the cumulative age distribution.

    Age < 30 maps to [1, 0], otherwise [0, 1]; ages outside (10, 50) are
    discarded.  Updates each user's 'profile' field in MongoDB in place.
    """
    from small_utils.progress_bar import progress_bar
    from pymongo import Connection
    from collections import Counter
    collection = Connection().jd.test_users
    weibo_collection = Connection().jd.weibo_users
    linked_jd_ids = dict()
    ages = []
    for line in open('/mnt/data1/adoni/data/linked_uids.data'):
        # Link line: "<weibo_id> <jd_id>" -> map jd_id back to weibo_id.
        linked_jd_ids[line[:-1].split(' ')[1]] = line.split(' ')[0]
    bar = progress_bar(collection.count())
    for index, user in enumerate(collection.find()):
        if sum(user['profile']['age']) == 0:
            continue  # no age signal at all
        # NOTE(review): raises KeyError if a test user is absent from the
        # link file -- presumably every such user is linked; confirm.
        weibo_id = linked_jd_ids[user['_id']]
        weibo_user = weibo_collection.find_one({'_id': weibo_id})
        if weibo_user == None:
            continue
        # Birthday string starts with the year before the CJK year mark.
        age = 2015 - int(weibo_user['birthday'].split(u'年')[0])
        if age > 50 or age < 10:
            continue  # implausible for this user base
        ages.append(age)
        if age < 30:
            user['profile']['age'] = [1, 0]
        else:
            user['profile']['age'] = [0, 1]
        collection.update({'_id': user['_id']}, {'$set': {'profile': user['profile']}})
        bar.draw(index)
    s = sum(Counter(ages).values())
    ages = sorted(Counter(ages).items(), key=lambda d: d[0])
    ss = 0.
    for age in ages:
        ss += age[1]
        print age[0], (ss) / s
def construct_test_data(attribute):
    """Collect [uid, label, sparse-feature-dict] triples for test users
    of *attribute* and dump them via output() for mallet."""
    collection = Connection().jd.test_users
    all_features = get_features(feature_file_name=feature_file_name)
    # Review features get ids above the mention/product feature range.
    review_features = get_features(feature_file_name=base_dir + '/features/review.feature', start_index=max(all_features.values()) + 1)
    data = []
    bar = progress_bar(collection.count())
    for index, user in enumerate(collection.find()):
        uid = user['_id']
        features = combine_features(user['mentions'], Counter(user['products']))
        try:
            y = user['profile'][attribute].index(1)
        except:
            continue  # user has no label for this attribute
        x = dict()
        for f in features:
            if f not in all_features:
                continue
            x[all_features[f]] = features[f]
        for f, v in Counter(user['review']).items():
            if f not in review_features:
                continue
            x[review_features[f]] = v
        data.append([uid, y, x])
        bar.draw(index + 1)
    #data=balance(data,target_index=1)
    output(RAW_DATA_DIR + 'mallet/mallet_test_%s.data' % attribute, data)
def construct_train_data():
    """Build mallet training data from train users (excluding test uids),
    assigning each example a RANDOM 0/1 placeholder label."""
    import random
    all_features = get_features(feature_file_name=feature_file_name)
    # Review features get ids above the mention/product feature range.
    review_features = get_features(feature_file_name=base_dir + '/features/review.feature', start_index=max(all_features.values()) + 1)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    data = []
    uids = get_test_uids()
    for index, user in enumerate(collection.find()):
        uid = user['_id']
        if uid in uids:
            continue  # never train on held-out test users
        features = combine_features(user['mentions'], Counter(user['products']))
        x = dict()
        for f in features:
            if f not in all_features:
                continue
            x[all_features[f]] = features[f]
        for f, v in Counter(user['review']).items():
            if f not in review_features:
                continue
            x[review_features[f]] = v
        y = random.randint(0, 1)  # placeholder label
        data.append([uid, y, x])
        bar.draw(index + 1)
    output(RAW_DATA_DIR + 'mallet/mallet_train.data', data)
def construct_train_data():
    """Build mallet training data from train users (excluding test uids),
    assigning each example a RANDOM 0/1 placeholder label."""
    import random
    all_features = get_features(feature_file_name=feature_file_name)
    # Review features get ids above the mention/product feature range.
    review_features = get_features(feature_file_name=base_dir + '/features/review.feature', start_index=max(all_features.values()) + 1)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    data = []
    uids = get_test_uids()
    for index, user in enumerate(collection.find()):
        uid = user['_id']
        if uid in uids:
            continue  # never train on held-out test users
        features = combine_features(user['mentions'], Counter(user['products']))
        x = dict()
        for f in features:
            if f not in all_features:
                continue
            x[all_features[f]] = features[f]
        for f, v in Counter(user['review']).items():
            if f not in review_features:
                continue
            x[review_features[f]] = v
        y = random.randint(0, 1)  # placeholder label
        data.append([uid, y, x])
        bar.draw(index + 1)
    output(RAW_DATA_DIR + 'mallet/mallet_train.data', data)
def construct_test_set(attribute):
    """Write a class-balanced test set (and the matching uid list) for
    *attribute* using mention features only, in libSVM format."""
    all_features = get_features(feature_file_name=feature_file_name)
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    print balance_params
    bar = progress_bar(collection.count())
    fout = open(RAW_DATA_DIR + 'label2trainset/%s_test.data' % attribute, 'w')
    uid_output = open(
        RAW_DATA_DIR + 'label2trainset/%s_test_uids.data' % attribute, 'w')
    for index, user in enumerate(collection.find()):
        try:
            label = user['profile'][attribute].index(1)
        except Exception as e:
            continue
        # Randomly sub-sample the majority class to balance labels.
        if random.random() > balance_params[label]:
            continue
        features = user['mentions']
        #features=Counter(user['products'])
        #features=combine_features(user['mentions'],Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        if len(sorted_feature) == 0:
            continue
        fout.write('%d' % label)
        uid_output.write('%s\n' % user['_id'])
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        for f in sorted_feature:
            fout.write(' %s:%f' % f)
        fout.write('\n')
        bar.draw(index + 1)
def construct_all_data():
    '''
    Write every train user as an unlabeled (label 0) example, combining
    mentions_1/product features with the secondary "_1" mention feature
    space; output format matches mallet.
    '''
    all_features = get_features(feature_file_name=feature_file_name)
    all_features_1 = get_features(feature_file_name=base_dir + '/features/mention_1.feature', start_index=max(all_features.values()) + 1)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    fout = open(RAW_DATA_DIR + 'mylabel2trainset/all_train.data', 'w')
    uid_output = open(RAW_DATA_DIR + 'mylabel2trainset/all_train_uids.data', 'w')
    for index, user in enumerate(collection.find()):
        label = 0  # placeholder: unlabeled
        fout.write('%d' % label)
        uid_output.write('%s\n' % user['_id'])
        features = combine_features(user['mentions_1'], Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        for f, v in user['mentions_1_1'].items():
            f = f + '_1'  # suffix keys into the secondary feature space
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f], v))
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        for f in sorted_feature:
            fout.write(' %s:%d' % f)
        fout.write('\n')
        bar.draw(index + 1)
def train(self, data):
    """Label-propagation style training: iteratively blend each node's
    initial feature vector with the sum of its neighbours' current
    vectors, on a bipartite graph.

    data provides adjacency (data.U: u -> neighbours in V, data.V:
    v -> neighbours in U), initial vectors f_u_init/f_v_init, and
    vector_size.  Returns the propagated (f_u, f_v) dicts after
    self.max_iter rounds.
    """
    f_u = dict()
    f_v = dict()
    # Normalisers: the largest initial vector mass on each side, so the
    # blending weight alpha below stays in [0, 1].
    C_u = numpy.max(numpy.sum(data.f_u_init.values(), axis=1))
    C_v = numpy.max(numpy.sum(data.f_v_init.values(), axis=1))
    for u in data.U:
        f_u[u] = data.f_u_init[u]
    for v in data.V:
        f_v[v] = data.f_v_init[v]
    bar = progress_bar(self.max_iter)
    t1 = datetime.datetime.now()
    for index in xrange(self.max_iter):
        for u in data.U:
            n_u = numpy.zeros((data.vector_size))
            for v in data.U[u]:
                n_u += f_v[v]
            # alpha: trust in the initial vector, proportional to its mass.
            alpha = numpy.sum(data.f_u_init[u]) / C_u
            f_u[u] = alpha * data.f_u_init[u] + (1. - alpha) * n_u
        for v in data.V:
            n_v = numpy.zeros((data.vector_size))
            for u in data.V[v]:
                n_v += f_u[u]
            alpha = numpy.sum(data.f_v_init[v]) / C_v
            f_v[v] = alpha * data.f_v_init[v] + (1. - alpha) * n_v
        t2 = datetime.datetime.now()
        print 'Iter: %d, minutes: %d' % (index, (t2 - t1).seconds / 60)
        t1 = t2
        #bar.draw(index+1)
    return f_u, f_v
def prp_product(u_collection,p_collection,iterate): print 'prp product' pids=get_id(p_collection,iterate) print len(pids) bar=progress_bar(len(pids)) for index,pid in enumerate(pids): prp_single_product(u_collection,p_collection,pid,iterate) bar.draw(index+1)
def prp_user(u_collection, p_collection, iterate): print 'prp user' uids = get_id(u_collection, iterate) print len(uids) bar = progress_bar(len(uids)) for index, uid in enumerate(uids): prp_single_user(u_collection, p_collection, uid, iterate) bar.draw(index + 1)
def output_user_product_graph():
    """Dump the user-product bipartite graph as "<uid> <pid>" edge lines,
    covering train users first, then test users."""
    fout = open(RAW_DATA_DIR + 'graph.data', 'w')
    for collection in (Connection().jd.train_users, Connection().jd.test_users):
        bar = progress_bar(collection.count())
        for position, user in enumerate(collection.find(), 1):
            uid = user['_id']
            for pid in user['products']:
                fout.write('%s %s\n' % (uid, pid))
            bar.draw(position)
def prp_product(u_collection, p_collection, iterate): print 'prp product' pids = get_id(p_collection, iterate) print len(pids) bar = progress_bar(len(pids)) for index, pid in enumerate(pids): prp_single_product(u_collection, p_collection, pid, iterate) bar.draw(index + 1)
def prp_user(u_collection,p_collection,iterate): print 'prp user' uids=get_id(u_collection,iterate) print len(uids) bar=progress_bar(len(uids)) for index,uid in enumerate(uids): prp_single_user(u_collection,p_collection,uid,iterate) bar.draw(index+1)
def get_train_user_products():
    """Return {uid: {pid: purchase_count}} for every train user."""
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    user_products = {}
    for position, user in enumerate(collection.find()):
        user_products[user['_id']] = dict(Counter(user['products']))
        bar.draw(position)
    return user_products
def construct_test_set(attribute):
    """Write a balanced test set (plus a uid list) for *attribute*
    combining three feature spaces: mentions_0/products, secondary "_1"
    mentions, and review terms.  libSVM format, one user per line."""
    all_features = get_features(feature_file=feature_file_name)
    all_features_1 = get_features(feature_file=base_dir + '/features/mention_1.feature', existent_features=all_features)
    review_featuers = get_features(feature_file=base_dir + '/features/review.feature', existent_features=all_features_1)
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    print balance_params
    bar = progress_bar(collection.count())
    fout = open(
        RAW_DATA_DIR + 'iterate_label2trainset/%s_test.data' % attribute, 'w')
    uid_output = open(
        RAW_DATA_DIR + 'iterate_label2trainset/%s_test_uids.data' % attribute,
        'w')
    for index, user in enumerate(collection.find()):
        try:
            label = user['profile'][attribute].index(1)
        except Exception as e:
            continue
        # Randomly sub-sample the majority class to balance labels.
        if random.random() > balance_params[label]:
            continue
        features = combine_features(user['mentions_0'], Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        for f, v in user['mentions_1_1'].items():
            f = f + '_1'  # suffix keys into the secondary feature space
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f], v))
        for f, v in Counter(user['review']).items():
            if f not in review_featuers:
                continue
            sorted_feature.append((review_featuers[f], v))
        if len(sorted_feature) == 0:
            continue
        fout.write('%d' % label)
        uid_output.write('%s\n' % user['_id'])
        # Sanity check: duplicate feature ids mean overlapping id ranges.
        keys = map(lambda d: d[0], sorted_feature)
        if not len(keys) == len(set(keys)):
            print Counter(keys).values()
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        for f in sorted_feature:
            fout.write(' %s:%f' % f)
        fout.write('\n')
        bar.draw(index + 1)
def get_train_uids():
    """Collect every train-user id, delete those users from the
    collection, and return the collected ids.

    WARNING: despite the "get" name this is destructive -- it removes
    every visited user from jd.train_users.

    Fix: the original never returned the ids it gathered, so its name
    was unfulfillable; it now returns them (previously returned None,
    so callers ignoring the result are unaffected).

    Returns:
        set: ids of the (now deleted) train users.
    """
    collection = Connection().jd.train_users
    uids = set()
    for user in collection.find():
        uids.add(user['_id'])
    collection = Connection().jd.train_users
    bar = progress_bar(len(uids))
    for index, uid in enumerate(uids):
        collection.delete_one({'_id': uid})
        bar.draw(index + 1)
    return uids
def get_train_uids():
    """Collect every train-user id, delete those users from the
    collection, and return the collected ids.

    WARNING: despite the "get" name this is destructive -- it removes
    every visited user from jd.train_users.

    Fix: the original never returned the ids it gathered, so its name
    was unfulfillable; it now returns them (previously returned None,
    so callers ignoring the result are unaffected).

    Returns:
        set: ids of the (now deleted) train users.
    """
    collection = Connection().jd.train_users
    uids = set()
    for user in collection.find():
        uids.add(user['_id'])
    collection = Connection().jd.train_users
    bar = progress_bar(len(uids))
    for index, uid in enumerate(uids):
        collection.delete_one({'_id': uid})
        bar.draw(index + 1)
    return uids
def statistics(attribute, threshold=-1, feature_file_name=base_dir + '/features/mention.feature', show=False):
    """Per-feature [p0, p1] label distribution over test users for
    *attribute*.

    With show=False returns {feature: [p0, p1]}; with show=True prints
    the 50 most label-discriminative features instead.
    """
    import random
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    all_features = get_features(feature_file_name)
    bar = progress_bar(collection.count())
    distribute = dict([f, [0., 0.]] for f in all_features)
    labels_distribute = [0., 0.]
    for index, user in enumerate(collection.find()):
        try:
            label = user['profile'][attribute].index(1)
        except:
            continue
        #if random.random()>balance_params[label]:
        #    continue
        # Mention counts plus product purchase counts as features.
        features = dict(user['mentions'])
        products = Counter(user['products'])
        for p in products:
            features[p] = products[p]
        if len(features) < 10:
            continue  # skip users with too little signal
        for f in features:
            if f in distribute:
                distribute[f][label] += 1  #features[f]
        labels_distribute[label] += 1
        bar.draw(index)
    # Drop rare features (Py2: .keys() is a list snapshot, pop is safe).
    for f in distribute.keys():
        if sum(distribute[f]) < threshold:
            distribute.pop(f)
    print labels_distribute
    # Normalise per label, then per feature.
    for f in distribute:
        distribute[f][0] /= labels_distribute[0]
        distribute[f][1] /= labels_distribute[1]
    for f in distribute:
        s = sum(distribute[f])
        distribute[f][0] /= s
        distribute[f][1] /= s
    if not show:
        return distribute
    #distribute=filter(lambda d:d[1][0]<d[1][1], distribute.items())
    # Rank by how far the (smoothed) split deviates from 50/50.
    distribute = sorted(distribute.items(),
                        key=lambda d: abs(1 - 2 * (d[1][0] + 0.1) / (sum(d[1]) + 0.1)),
                        reverse=True)
    #distribute=sorted(distribute,key=lambda d:sum(d[1]), reverse=True)
    print ''
    for d in distribute[:50]:
        print '%s 0:%0.3f 1:%0.3f' % (
            d[0].encode('utf8'),
            (d[1][0] + 0.1) / (sum(d[1]) + 0.1),
            1 - (d[1][0] + 0.1) / (sum(d[1]) + 0.1),
        )
def construct_test_user():
    """Rebuild jd.test_users from the test review file, attaching
    gender/age/location from linked Weibo profiles and a kids label
    from the with/without-kids uid lists.

    For users whose kids label is known, mentions of tell-tale words
    (baby, daughter, son, ...) are removed to avoid label leakage.
    """
    all_products = get_all_ids_from_file('product')
    collection = Connection().jd.test_users
    collection.drop()  # rebuild from scratch
    linked_users = Connection().jd.weibo_users
    fname = RAW_DATA_DIR + 'test_user_review.data'
    uids_with_kids = [
        line[:-1] for line in open(RAW_DATA_DIR + 'uids_with_kids.data')
    ]
    uids_without_kids = [
        line[:-1] for line in open(RAW_DATA_DIR + 'uids_without_kids.data')
    ]
    # linked_uids: jd uid -> weibo uid  (file lines: "<weibo> <jd>")
    linked_uids = dict([(line[:-1].split(' ')[1], line[:-1].split(' ')[0])
                        for line in open(RAW_DATA_DIR + 'linked_uids.data')])
    prone_words = ['宝宝', '女儿', '儿子', '男朋友', '女朋友']
    f = open(fname)
    count = int(f.readline()[:-1])
    bar = progress_bar(count)
    for i in xrange(count):
        # Per-user record: id line, product-id line, review line.
        uid = f.readline()[:-1]
        products = f.readline()[:-1].split(' ')
        products = list(set(products) & all_products)
        mentions = count_mentions(f.readline())
        profile = {
            'gender': [0] * 2,
            'age': [0] * 2,
            'location': [0] * 2,
            'kids': [0] * 2,
        }
        if uid in linked_uids:
            user = linked_users.find_one({'_id': linked_uids[uid]})
            if user == None:
                pass  # link exists but Weibo profile is missing
            else:
                profile['gender'] = user['profile']['gender']
                profile['age'] = user['profile']['age']
                profile['location'] = user['profile']['location']
        if uid in uids_with_kids:
            profile['kids'] = [0, 1]
        if uid in uids_without_kids:
            profile['kids'] = [1, 0]
        if uid in uids_without_kids or uid in uids_with_kids:
            # Drop words that directly reveal the kids label.
            for w in prone_words:
                if w in mentions:
                    mentions.pop(w)
        collection.insert({
            '_id': uid,
            'products': products,
            'mentions': mentions,
            'profile': profile
        })
        bar.draw(i + 1)
def construct_graph(fname, uids): print '===========' print fname print len(uids) bar = progress_bar(len(uids)) fout = open(fname, 'w') index = 0 for line in open('./remap_weibo_graph.data'): uid = line[0:line.find(' ')] if uid in uids: fout.write(line) index += 1 bar.draw(index)
def construct_test_set(attribute):
    """Write a balanced multi_clf test set for *attribute*, combining
    mentions_0/product features, secondary "_1" mention features
    (currently disabled, see note below), and optional deepwalk
    embedding dimensions."""
    all_features = get_features(feature_file_name=feature_file_name)
    all_features_1 = get_features(feature_file_name=base_dir + '/features/mention_1.feature', start_index=max(all_features.values()) + 1)
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    print balance_params
    bar = progress_bar(collection.count())
    fout = open(RAW_DATA_DIR + 'multi_clf/%s_test.data' % attribute, 'w')
    for index, user in enumerate(collection.find()):
        try:
            label = user['profile'][attribute].index(1)
        except Exception as e:
            continue
        # Randomly sub-sample the majority class to balance labels.
        if random.random() > balance_params[label]:
            continue
        features = {}
        #features=user['mentions_0']
        #features=Counter(user['products'])
        features = combine_features(user['mentions_0'], Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        # NOTE(review): clearing mentions_1_1 makes the next loop a no-op;
        # presumably a deliberate switch to disable these features -- confirm.
        user['mentions_1_1'] = {}
        for f, v in user['mentions_1_1'].items():
            f = f + '_1'
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f], v))
        if 'user_product_vector_from_deepwalk' in user:
            #if False:
            # Embedding dims get ids above both mention feature spaces.
            start_index = max(all_features_1.values()) + 1
            for i, v in enumerate(user['user_product_vector_from_deepwalk']):
                v = abs(v)
                sorted_feature.append((i + start_index, v))
        if len(sorted_feature) == 0:
            continue
        fout.write('%d' % label)
        # Sanity check: duplicate feature ids mean overlapping id ranges.
        keys = map(lambda d: d[0], sorted_feature)
        if not len(keys) == len(set(keys)):
            print Counter(keys).values()
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        for f in sorted_feature:
            fout.write(' %s:%f' % f)
        fout.write('\n')
        bar.draw(index + 1)
def output_vector(entity_name):
    """Write an initial mention-count vector for every entity whose
    vector is non-zero, preceded by a "<count> <dim>" header line."""
    collection = Connection().jd[entity_name]
    bar = progress_bar(collection.count())
    mentions = get_mentions()
    fout = open(RAW_DATA_DIR + '%s_init_vec.data' % entity_name, 'w')
    fout.write('%d %d\n' % (collection.count(), len(mentions)))
    for position, entity in enumerate(collection.find()):
        # Deduplicate review texts, then count each mention term.
        joined_reviews = ' '.join(set(record[1] for record in entity['records']))
        counts = [joined_reviews.count(m) for m in mentions]
        if numpy.any(counts):
            rendered = ' '.join(str(c) for c in counts)
            fout.write('%s %s\n' % (entity['_id'], rendered))
        bar.draw(position + 1)
def output_features(fname, key):
    """Count occurrences of user[key] values across all train users and
    write "<feature> <count>" lines, most frequent first, to *fname*.

    Fix: the original opened *fname* and then immediately rebound the
    handle to a hard-coded './features/review.feature', so the requested
    path was always ignored (and left behind empty and unclosed).
    """
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    features = []
    for index, user in enumerate(collection.find()):
        features += user[key]
        bar.draw(index + 1)
    # Sort by frequency, descending.
    features = sorted(Counter(features).items(), key=lambda d: d[1], reverse=True)
    fout = open(fname, 'w')
    for f in features:
        fout.write('%s %d\n' % (f[0].encode('utf8'), f[1]))
    fout.close()
def statistics_after_train(attribute, method, threshold=-1, feature_file_name=base_dir + '/features/mention.feature', show=False):
    """Like statistics(), but labels users with post-training
    predictions from get_labels_after_train(attribute, method) instead
    of ground truth.

    With show=False returns {feature: [p0, p1]}; with show=True prints
    the top 50 features.
    """
    import random
    labels = get_labels_after_train(attribute, method)
    print len(labels)
    collection = Connection().jd.train_users
    label_distribute = Counter(labels.values())
    # Down-sampling coefficients per label (sampling is commented out below).
    balance_params = dict()
    for label in label_distribute:
        balance_params[label] = 1.0 * min(label_distribute.values()) / label_distribute[label]
    all_features = get_features(feature_file_name)
    bar = progress_bar(collection.count())
    distribute = dict([f, [0., 0.]] for f in all_features)
    for index, user in enumerate(collection.find()):
        try:
            label = labels[user['_id']]
        except:
            continue  # no prediction for this user
        #if random.random()>balance_params[label]:
        #    continue
        # Mention counts plus product purchase counts as features.
        features = dict(user['mentions'])
        products = Counter(user['products'])
        for p in products:
            features[p] = products[p]
        for f in features:
            if f in distribute:
                distribute[f][label] += 1
        bar.draw(index)
    # Drop rare features (Py2: .keys() is a list snapshot, pop is safe).
    for f in distribute.keys():
        if sum(distribute[f]) < threshold:
            distribute.pop(f)
    print label_distribute
    for f in distribute:
        distribute[f][0] /= label_distribute[0]
        distribute[f][1] /= label_distribute[1]
    for f in distribute.keys():
        s = sum(distribute[f])
        if s == 0:
            distribute.pop(f)
            continue
        distribute[f][0] /= s
        distribute[f][1] /= s
    if not show:
        return distribute
    #distribute=filter(lambda d:d[1][0]<d[1][1], distribute)
    distribute = sorted(distribute.items(), key=lambda d: max(d[1]) / sum(d[1]), reverse=True)
    #distribute=sorted(distribute,key=lambda d:sum(d[1]), reverse=True)
    print ''
    for d in distribute[:50]:
        print '%s 0:%0.3f 1:%0.3f' % (d[0].encode('utf8'),
                                      (d[1][0] + 0.1) / (sum(d[1]) + 0.1),
                                      1 - (d[1][0] + 0.1) / (sum(d[1]) + 0.1),)
def construct_train_product():
    """Load product review records from file into jd.train_products,
    keeping only reviewers that appear in the known user id sets."""
    known_users = get_all_ids_from_file('user') | get_all_ids_from_file('test_user')
    collection = Connection().jd.train_products
    f = open(RAW_DATA_DIR + 'product_review.data')
    total = int(f.readline()[:-1])
    bar = progress_bar(total)
    for step in xrange(total):
        # Per-product record: id line, user-id line, review line.
        pid = f.readline()[:-1]
        reviewers = f.readline()[:-1].split(' ')
        reviewers = list(set(reviewers) & known_users)
        mentions = count_mentions(f.readline())
        collection.insert({'_id': pid, 'users': reviewers, 'mentions': mentions})
        bar.draw(step + 1)
def construct_test_set(attribute):
    """Write a balanced test set (plus a uid list) for *attribute*
    combining three feature spaces: mentions_0/products, secondary "_1"
    mentions, and review terms.  libSVM format, one user per line."""
    all_features = get_features(feature_file=feature_file_name)
    all_features_1 = get_features(feature_file=base_dir + '/features/mention_1.feature', existent_features=all_features)
    review_featuers = get_features(feature_file=base_dir + '/features/review.feature', existent_features=all_features_1)
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    print balance_params
    bar = progress_bar(collection.count())
    fout = open(RAW_DATA_DIR + 'iterate_label2trainset/%s_test.data' % attribute, 'w')
    uid_output = open(RAW_DATA_DIR + 'iterate_label2trainset/%s_test_uids.data' % attribute, 'w')
    for index, user in enumerate(collection.find()):
        try:
            label = user['profile'][attribute].index(1)
        except Exception as e:
            continue
        # Randomly sub-sample the majority class to balance labels.
        if random.random() > balance_params[label]:
            continue
        features = combine_features(user['mentions_0'], Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        for f, v in user['mentions_1_1'].items():
            f = f + '_1'  # suffix keys into the secondary feature space
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f], v))
        for f, v in Counter(user['review']).items():
            if f not in review_featuers:
                continue
            sorted_feature.append((review_featuers[f], v))
        if len(sorted_feature) == 0:
            continue
        fout.write('%d' % label)
        uid_output.write('%s\n' % user['_id'])
        # Sanity check: duplicate feature ids mean overlapping id ranges.
        keys = map(lambda d: d[0], sorted_feature)
        if not len(keys) == len(set(keys)):
            print Counter(keys).values()
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        for f in sorted_feature:
            fout.write(' %s:%f' % f)
        fout.write('\n')
        bar.draw(index + 1)
def construct_test_set(attribute):
    """Write a balanced multi_clf test set for *attribute*, combining
    mentions_0/product features, secondary "_1" mention features
    (currently disabled, see note below), and optional deepwalk
    embedding dimensions."""
    all_features = get_features(feature_file_name=feature_file_name)
    all_features_1 = get_features(feature_file_name=base_dir + '/features/mention_1.feature', start_index=max(all_features.values()) + 1)
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    print balance_params
    bar = progress_bar(collection.count())
    fout = open(RAW_DATA_DIR + 'multi_clf/%s_test.data' % attribute, 'w')
    for index, user in enumerate(collection.find()):
        try:
            label = user['profile'][attribute].index(1)
        except Exception as e:
            continue
        # Randomly sub-sample the majority class to balance labels.
        if random.random() > balance_params[label]:
            continue
        features = {}
        #features=user['mentions_0']
        #features=Counter(user['products'])
        features = combine_features(user['mentions_0'], Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        # NOTE(review): clearing mentions_1_1 makes the next loop a no-op;
        # presumably a deliberate switch to disable these features -- confirm.
        user['mentions_1_1'] = {}
        for f, v in user['mentions_1_1'].items():
            f = f + '_1'
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f], v))
        if 'user_product_vector_from_deepwalk' in user:
            #if False:
            # Embedding dims get ids above both mention feature spaces.
            start_index = max(all_features_1.values()) + 1
            for i, v in enumerate(user['user_product_vector_from_deepwalk']):
                v = abs(v)
                sorted_feature.append((i + start_index, v))
        if len(sorted_feature) == 0:
            continue
        fout.write('%d' % label)
        # Sanity check: duplicate feature ids mean overlapping id ranges.
        keys = map(lambda d: d[0], sorted_feature)
        if not len(keys) == len(set(keys)):
            print Counter(keys).values()
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        for f in sorted_feature:
            fout.write(' %s:%f' % f)
        fout.write('\n')
        bar.draw(index + 1)
def get_a_random_path_from_graph(graph, length):
    """Sample a weighted random walk of *length* nodes.

    The start node is drawn from the global node-weight distribution;
    each successor is drawn from the current node's edge weights.  A
    node with no adjacency re-seeds the walk from the global draw.
    """
    from small_utils.progress_bar import progress_bar
    current = weighted_random_select(graph.nodes_weight)
    path = [current]
    bar = progress_bar(length - 1)
    for step in xrange(length - 1):
        neighbours = graph[current]
        # Re-seed until we land on a node that has outgoing edges.
        while neighbours is None:
            current = weighted_random_select(graph.nodes_weight)
            neighbours = graph[current]
        current = weighted_random_select(neighbours)
        path.append(current)
        bar.draw(step)
    return path
def statistics(attribute, threshold=-1, feature_file_name=base_dir + '/features/mention.feature', show=False):
    """Per-feature [p0, p1] label distribution over test users for
    *attribute*.

    With show=False returns {feature: [p0, p1]}; with show=True prints
    the 50 most label-discriminative features instead.
    """
    import random
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    all_features = get_features(feature_file_name)
    bar = progress_bar(collection.count())
    distribute = dict([f, [0., 0.]] for f in all_features)
    labels_distribute = [0., 0.]
    for index, user in enumerate(collection.find()):
        try:
            label = user['profile'][attribute].index(1)
        except:
            continue
        #if random.random()>balance_params[label]:
        #    continue
        # Mention counts plus product purchase counts as features.
        features = dict(user['mentions'])
        products = Counter(user['products'])
        for p in products:
            features[p] = products[p]
        if len(features) < 10:
            continue  # skip users with too little signal
        for f in features:
            if f in distribute:
                distribute[f][label] += 1  #features[f]
        labels_distribute[label] += 1
        bar.draw(index)
    # Drop rare features (Py2: .keys() is a list snapshot, pop is safe).
    for f in distribute.keys():
        if sum(distribute[f]) < threshold:
            distribute.pop(f)
    print labels_distribute
    # Normalise per label, then per feature.
    for f in distribute:
        distribute[f][0] /= labels_distribute[0]
        distribute[f][1] /= labels_distribute[1]
    for f in distribute:
        s = sum(distribute[f])
        distribute[f][0] /= s
        distribute[f][1] /= s
    if not show:
        return distribute
    #distribute=filter(lambda d:d[1][0]<d[1][1], distribute.items())
    # Rank by how far the (smoothed) split deviates from 50/50.
    distribute = sorted(distribute.items(),
                        key=lambda d: abs(1 - 2 * (d[1][0] + 0.1) / (sum(d[1]) + 0.1)),
                        reverse=True)
    #distribute=sorted(distribute,key=lambda d:sum(d[1]), reverse=True)
    print ''
    for d in distribute[:50]:
        print '%s 0:%0.3f 1:%0.3f' % (
            d[0].encode('utf8'),
            (d[1][0] + 0.1) / (sum(d[1]) + 0.1),
            1 - (d[1][0] + 0.1) / (sum(d[1]) + 0.1),
        )
def test(attribute): from pymongo import Connection collection=Connection().jd.test_users bar=progress_bar(collection.count()) labels=dict() for index,user in enumerate(collection.find()): try: label=user['profile'][attribute].index(1) except: continue labels[user['_id']]=label bar.draw(index+1) if index>100000: break score,feature_distribute=statistics(labels,feature_file_name=base_dir+'/features/mention.feature',threshold=20) for f,v in sorted(score.items(),key=lambda d:d[1],reverse=True)[:50]: print f,'0:%0.2f 1:%0.2f'%tuple(feature_distribute[f]) print feature_distribute[u'同学']
def statistics(labels, feature_file_name, threshold,
               collection=Connection().jd.train_users):
    '''
    Build per-feature label distributions from `collection` restricted to
    users present in `labels` ({user_id: label}).

    Returns (score, feature_distribute):
      - feature_distribute: {feature: [p_0, ..., p_{L-1}]}, normalized first
        by per-label user counts, then to sum to 1; features with total count
        below `threshold` are dropped.
      - score: {feature: abs_score(distribution)}.

    NOTE(review): the default `collection` is evaluated once at import time
    (mutable/stateful default argument) — a live Connection is created on
    module import; confirm this is intended.
    '''
    #collection=Connection().jd.train_users
    label_dimention = max(labels.values()) + 1
    label_distribute = Counter(labels.values())
    # Dense list of per-label user counts, zero-filled for absent labels.
    label_distribute = [
        label_distribute[i] if i in label_distribute else 0
        for i in xrange(label_dimention)
    ]
    all_features = get_features(feature_file_name)
    bar = progress_bar(collection.count())
    feature_distribute = dict([f, [0.] * label_dimention] for f in all_features)
    for index, user in enumerate(collection.find()):
        try:
            label = labels[user['_id']]
        except:
            # User has no known label -> skip.
            continue
        features = combine_dict(user['mentions'], Counter(user['products']))
        for f in features:
            if f in feature_distribute:
                feature_distribute[f][label] += 1.0
        bar.draw(index)
    # Safe in Python 2: .keys() returns a list copy, so pop() is allowed.
    for f in feature_distribute.keys():
        s = 1.0 * sum(feature_distribute[f])
        if s == 0 or s < threshold:
            feature_distribute.pop(f)
            continue
        for i in xrange(label_dimention):
            # NOTE(review): raises ZeroDivisionError if some label in range
            # never occurs in `labels` — assumed not to happen; confirm.
            feature_distribute[f][i] /= label_distribute[i]
    # Second pass: normalize each surviving feature's vector to sum to 1.
    for f in feature_distribute.keys():
        s = 1.0 * sum(feature_distribute[f])
        for i in xrange(label_dimention):
            feature_distribute[f][i] /= s
    score = dict()
    for f, v in feature_distribute.items():
        #score[f]=eta_score(v)
        score[f] = abs_score(v)
    return score, feature_distribute
def construct_train_user(): from pyltp import Segmentor all_products = get_all_ids_from_file('product') collection = Connection().jd.train_users fname = RAW_DATA_DIR + 'user_review.data' f = open(fname) count = int(f.readline()[:-1]) print count bar = progress_bar(count) for i in xrange(count): uid = f.readline()[:-1] products = f.readline()[:-1].split(' ') products = list(set(products) & all_products) mentions = count_mentions(f.readline()) collection.insert({ '_id': uid, 'products': products, 'mentions': mentions }) bar.draw(i + 1)
def test(attribute): from pymongo import Connection collection = Connection().jd.test_users bar = progress_bar(collection.count()) labels = dict() for index, user in enumerate(collection.find()): try: label = user['profile'][attribute].index(1) except: continue labels[user['_id']] = label bar.draw(index + 1) if index > 100000: break score, feature_distribute = statistics(labels, feature_file_name=base_dir + '/features/mention.feature', threshold=20) for f, v in sorted(score.items(), key=lambda d: d[1], reverse=True)[:50]: print f, '0:%0.2f 1:%0.2f' % tuple(feature_distribute[f]) print feature_distribute[u'同学']
def analyze_feature_count(attribute): from small_utils.progress_bar import progress_bar from pymongo import Connection from collections import Counter collection=Connection().jd.test_users bar=progress_bar(collection.count()) x=[] y=[] labels=[] for index,user in enumerate(collection.find()): try: label=user['profile'][attribute].index(1) except: continue labels.append(label) x.append(len(user['products'])) y.append(len(user['mentions'].values())) bar.draw(index) f=open('./tmp.data','w') for i in xrange(len(labels)): f.write('%d %d %d\n'%(labels[i],x[i],y[i])) print Counter(labels)
def insert_review(collection, fname):
    '''
    Scan the review file and (currently) only accumulate the total character
    count of all reviews; the segmentation + DB update below the `continue`
    is dead code.

    File layout: first line is the user count; then, per user, three lines:
    uid, product ids, and the review text with '|&|' separators.
    '''
    from collections import Counter
    from pyltp import Segmentor
    f = open(fname)
    count = int(f.readline()[:-1])
    print count
    segmentor = Segmentor()
    segmentor.load('/home/adoni/cws.model')
    bar = progress_bar(count)
    review_count = 0
    for i in xrange(count):
        uid = f.readline()[:-1]
        products = f.readline()
        review = f.readline()[:-1].replace('|&|', ' ')
        review_count += len(review)
        # NOTE(review): this `continue` disables everything below — no
        # segmentation and no DB writes happen; presumably a temporary
        # measuring run. Remove it to re-enable the update path.
        continue
        review = [w for w in segmentor.segment(review)]
        collection.update({'_id': uid}, {'$set': {
            'review': review
        }}, safe=True)
        bar.draw(i + 1)
    print review_count
def statistics(labels,feature_file_name,threshold,collection=Connection().jd.train_users):
    '''
    Per-feature label distribution over `collection` for users in `labels`
    ({user_id: label}); returns (score, feature_distribute).

    feature_distribute maps feature -> normalized per-label vector (rare
    features below `threshold` removed); score maps feature -> abs_score.

    NOTE(review): the default `collection` argument is evaluated at import
    time (opens a Connection on module import) — confirm this is intended.
    '''
    #collection=Connection().jd.train_users
    label_dimention=max(labels.values())+1
    label_distribute=Counter(labels.values())
    # Dense per-label user counts, zero for labels that never occur.
    label_distribute=[label_distribute[i] if i in label_distribute else 0 for i in xrange(label_dimention)]
    all_features=get_features(feature_file_name)
    bar=progress_bar(collection.count())
    feature_distribute=dict([f,[0.]*label_dimention] for f in all_features)
    for index,user in enumerate(collection.find()):
        try:
            label=labels[user['_id']]
        except:
            # Unlabeled user -> skip.
            continue
        features=combine_dict(user['mentions'],Counter(user['products']))
        for f in features:
            if f in feature_distribute:
                feature_distribute[f][label]+=1.0
        bar.draw(index)
    # Python 2 .keys() returns a list, so popping while iterating is safe.
    for f in feature_distribute.keys():
        s=1.0*sum(feature_distribute[f])
        if s==0 or s<threshold:
            feature_distribute.pop(f)
            continue
        for i in xrange(label_dimention):
            # NOTE(review): ZeroDivisionError if a label in range has zero
            # users — assumed not to happen; confirm.
            feature_distribute[f][i]/=label_distribute[i]
    # Normalize each surviving feature vector to sum to 1.
    for f in feature_distribute.keys():
        s=1.0*sum(feature_distribute[f])
        for i in xrange(label_dimention):
            feature_distribute[f][i]/=s
    score=dict()
    for f,v in feature_distribute.items():
        #score[f]=eta_score(v)
        score[f]=abs_score(v)
    return score,feature_distribute
def remove_surrounding_nodes(fname): print 'Start' uids = [line.split(' ')[0] for line in open(fname)] uids = set(uids) if '/' in fname: out_file_name = '/'.join( fname.split( '/' )[:-1] ) + '/cleaned_' + fname.split('/')[-1] else: out_file_name = 'cleaned_' + fname fout = open(out_file_name, 'w') bar = progress_bar(len(uids)) for index, line in enumerate(open(fname)): #bar.draw(index+1) if index % 10000 == 0: bar.draw(index + 1) line = line.strip().split(' ') line = filter(lambda uid: uid in uids, line) if len(line) <= 1: continue fout.write(' '.join(line) + '\n')
def construct_train_set(labeled_features, training_count): ''' The format of labeled_feature_file is as the same as mallet ''' all_features = get_features(feature_file_name=feature_file_name) all_features_1 = get_features(feature_file_name=base_dir + '/features/mention_1.feature', start_index=max(all_features.values()) + 1) collection = Connection().jd.train_users bar = progress_bar(collection.count()) confidence = [] for index, user in enumerate(collection.find()): label_distributed = [1, 1] for f, value in combine_features(user['mentions'], Counter(user['products'])).items(): if f in labeled_features: label_distributed[0] *= labeled_features[f][0] * value label_distributed[1] *= labeled_features[f][1] * value s = 1.0 * sum(label_distributed) if not s == 0: label_distributed[0] /= s label_distributed[1] /= s if label_distributed[0] > label_distributed[1]: label = 0 elif label_distributed[0] < label_distributed[1]: label = 1 else: label = -1 features = {} #features=user['mentions_0'] #features=Counter(user['products']) features = combine_features(user['mentions_0'], Counter(user['products'])) sorted_feature = [] for f in features: if f not in all_features: continue sorted_feature.append((all_features[f], features[f])) user['mentions_1_1'] = {} for f, v in user['mentions_1_1'].items(): f = f + '_1' if f not in all_features_1: continue sorted_feature.append((all_features_1[f], v)) if 'user_product_vector_from_deepwalk' in user: #if False: start_index = max(all_features_1.values()) + 1 for i, v in enumerate(user['user_product_vector_from_deepwalk']): v = abs(v) sorted_feature.append((i + start_index, v)) keys = map(lambda d: d[0], sorted_feature) if not len(keys) == len(set(keys)): print Counter(keys).values() sorted_feature = sorted(sorted_feature, key=lambda d: d[0]) str_features = ' '.join(map(lambda f: '%s:%f' % f, sorted_feature)) confidence.append(( user['_id'], label, abs(label_distributed[0] - label_distributed[1]), str_features, 
sum(user['mentions'].values()), )) bar.draw(index + 1) confidence0 = filter(lambda d: d[1] == 0, confidence) confidence0 = sorted(confidence0, key=lambda d: d[2], reverse=True) confidence1 = filter(lambda d: d[1] == 1, confidence) confidence1 = sorted(confidence1, key=lambda d: d[2], reverse=True) confidence2 = filter(lambda d: d[1] == -1, confidence) confidence2 = sorted(confidence2, key=lambda d: d[4], reverse=True) dimention = min(len(confidence0), len(confidence1), training_count / 2) confidence0 = confidence0[:dimention] confidence1 = confidence1[:dimention] confidence2 = confidence2[:dimention] print len(confidence0), len(confidence1) if len(confidence0) == 0 or len(confidence1) == 0: return False labeled_train_data = open(RAW_DATA_DIR + 'multi_clf/labeled_train.data', 'w') for d in confidence0 + confidence1: labeled_train_data.write('%d %s\n' % (d[1], d[3])) unlabeled_train_data = StringIO.StringIO() labeled_train_data = open(RAW_DATA_DIR + 'multi_clf/unlabeled_train.data', 'w') for d in confidence0 + confidence1: unlabeled_train_data.write('%d %s\n' % (d[1], d[3])) return True
def construct_train_set(labeled_features,training_count): ''' The format of labeled_feature_file is as the same as mallet ''' all_features=get_features(feature_file_name=feature_file_name) all_features_1=get_features(feature_file_name=base_dir+'/features/mention_1.feature',start_index=max(all_features.values())+1) collection=Connection().jd.train_users bar=progress_bar(collection.count()) confidence=[] for index,user in enumerate(collection.find()): label_distributed=[1,1] for f,value in combine_features(user['mentions'],Counter(user['products'])).items(): if f in labeled_features: label_distributed[0]*=labeled_features[f][0]*value label_distributed[1]*=labeled_features[f][1]*value s=1.0*sum(label_distributed) if not s==0: label_distributed[0]/=s label_distributed[1]/=s if label_distributed[0]>label_distributed[1]: label=0 elif label_distributed[0]<label_distributed[1]: label=1 else: label=-1 features={} #features=user['mentions_0'] #features=Counter(user['products']) features=combine_features(user['mentions_0'],Counter(user['products'])) sorted_feature=[] for f in features: if f not in all_features: continue sorted_feature.append((all_features[f],features[f])) user['mentions_1_1']={} for f,v in user['mentions_1_1'].items(): f=f+'_1' if f not in all_features_1: continue sorted_feature.append((all_features_1[f],v)) if 'user_product_vector_from_deepwalk' in user: #if False: start_index=max(all_features_1.values())+1 for i,v in enumerate(user['user_product_vector_from_deepwalk']): v=abs(v) sorted_feature.append((i+start_index,v)) keys=map(lambda d:d[0], sorted_feature) if not len(keys)==len(set(keys)): print Counter(keys).values() sorted_feature=sorted(sorted_feature,key=lambda d:d[0]) str_features=' '.join(map(lambda f:'%s:%f'%f,sorted_feature)) confidence.append( (user['_id'], label, abs(label_distributed[0]-label_distributed[1]), str_features, sum(user['mentions'].values()), )) bar.draw(index+1) confidence0=filter(lambda d:d[1]==0,confidence) 
confidence0=sorted(confidence0,key=lambda d:d[2],reverse=True) confidence1=filter(lambda d:d[1]==1,confidence) confidence1=sorted(confidence1,key=lambda d:d[2],reverse=True) confidence2=filter(lambda d:d[1]==-1,confidence) confidence2=sorted(confidence2,key=lambda d:d[4],reverse=True) dimention=min(len(confidence0),len(confidence1),training_count/2) confidence0=confidence0[:dimention] confidence1=confidence1[:dimention] confidence2=confidence2[:dimention] print len(confidence0),len(confidence1) if len(confidence0)==0 or len(confidence1)==0: return False labeled_train_data=open(RAW_DATA_DIR+'multi_clf/labeled_train.data','w') for d in confidence0+confidence1: labeled_train_data.write('%d %s\n'%(d[1],d[3])) unlabeled_train_data=StringIO.StringIO() labeled_train_data=open(RAW_DATA_DIR+'multi_clf/unlabeled_train.data','w') for d in confidence0+confidence1: unlabeled_train_data.write('%d %s\n'%(d[1],d[3])) return True
def construct_test_set(attribute):
    '''
    Write iterate_label2trainset/<attribute>_test.data (libSVM lines) and the
    parallel <attribute>_test_uids.data for labeled test users, combining
    product, mention, review, mention_1 and mention_2 feature spaces (each
    file's ids chained after the previous via existent_features).
    '''
    product_features=get_features(feature_file=base_dir+'/features/product.feature')
    mention_features=get_features(feature_file=base_dir+'/features/mention.feature',existent_features=product_features)
    review_featuers=get_features(feature_file=base_dir+'/features/review.feature',existent_features=mention_features)
    mention_features_1=get_features(feature_file=base_dir+'/features/mention_1.feature',existent_features=review_featuers)
    mention_features_2=get_features(feature_file=base_dir+'/features/mention_2.feature',existent_features=mention_features_1)
    collection=Connection().jd.test_users
    balance_params=get_balance_params(attribute,collection)
    print 'Balance params: ',balance_params
    bar=progress_bar(collection.count())
    fout=open(RAW_DATA_DIR+'iterate_label2trainset/%s_test.data'%attribute,'w')
    uid_output=open(RAW_DATA_DIR+'iterate_label2trainset/%s_test_uids.data'%attribute,'w')
    for index,user in enumerate(collection.find()):
        try:
            label=user['profile'][attribute].index(1)
        except Exception as e:
            continue
        # Balancing is deliberately disabled — all labeled users are kept.
        #if random.random()>balance_params[label]:
        #    continue
        # NOTE(review): bare string literal below is a no-op visual marker.
        '============'
        x=[]
        #user['products']=[]
        for f,v in Counter(user['products']).items():
            if f not in product_features:
                continue
            x.append((product_features[f],v))
        #user['mentions_0']={}
        for f,v in user['mentions_0'].items():
            if f not in mention_features:
                continue
            x.append((mention_features[f],v))
        #user['review']=[]
        for f,v in Counter(user['review']).items():
            if f not in review_featuers:
                continue
            x.append((review_featuers[f],v))
        # NOTE(review): the two resets below disable the mentions_1 and
        # mentions_2 features (their loops iterate empty dicts) — presumably
        # an intentional experiment toggle; confirm before removing.
        user['mentions_1']={}
        for f,v in user['mentions_1'].items():
            f=f+'_1'
            if f not in mention_features_1:
                continue
            x.append((mention_features_1[f],v))
        user['mentions_2']={}
        for f,v in user['mentions_2'].items():
            if f not in mention_features_2:
                continue
            x.append((mention_features_2[f],v))
        x=sorted(x,key=lambda d:d[0])
        str_x=' '.join(map(lambda f:'%s:%f'%f,x))
        fout.write('%d %s\n'%(label,str_x))
        uid_output.write('%s\n'%(user['_id']))
        bar.draw(index+1)
def construct_train_set(attribute, training_count):
    '''
    Guess a label for every (non-test) train user with the LabelArbiter and
    write the most confident label-0/1 users to
    iterate_label2trainset/<attribute>_train.data (+ _uids), and the
    undecided (label -1) users to <attribute>_train_unlabel.data (+ _uids).

    BUG FIX: the arbiter was fed combine_dict(user['mentions_0'],
    Counter('products')) — Counter of the *literal string* 'products', i.e. a
    character histogram — instead of the user's product counts. The
    commented-out line in the original showed the intended call; it is now
    used. Output files are also closed explicitly.
    '''
    product_features = get_features(feature_file=base_dir + '/features/product.feature')
    mention_features = get_features(feature_file=base_dir + '/features/mention.feature', existent_features=product_features)
    review_featuers = get_features(feature_file=base_dir + '/features/review.feature', existent_features=mention_features)
    mention_features_1 = get_features(feature_file=base_dir + '/features/mention_1.feature', existent_features=review_featuers)
    mention_features_2 = get_features(feature_file=base_dir + '/features/mention_2.feature', existent_features=mention_features_1)
    test_uids = get_test_uids()
    labeled_feature_file = '%s/review_constraint_%s.constraints' % (labeled_feature_file_dir, attribute)
    label_arbiter = LabelArbiter(labeled_feature_file=labeled_feature_file)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    guess = []
    for index, user in enumerate(collection.find()):
        # Never train on users reserved for the test set.
        if user['_id'] in test_uids:
            continue
        # Fixed: use the user's actual product counts, not Counter('products').
        features = combine_dict(user['mentions_0'], Counter(user['products']))
        label, confidence = label_arbiter.arbitrate_label(features)
        x = []
        for f, v in Counter(user['products']).items():
            if f not in product_features:
                continue
            x.append((product_features[f], v))
        for f, v in user['mentions_0'].items():
            if f not in mention_features:
                continue
            x.append((mention_features[f], v))
        for f, v in Counter(user['review']).items():
            if f not in review_featuers:
                continue
            x.append((review_featuers[f], v))
        # NOTE(review): the two resets below disable mentions_1/mentions_2
        # features — kept as-is (experiment toggle); confirm before removing.
        user['mentions_1'] = {}
        for f, v in user['mentions_1'].items():
            f = f + '_1'
            if f not in mention_features_1:
                continue
            x.append((mention_features_1[f], v))
        user['mentions_2'] = {}
        for f, v in user['mentions_2'].items():
            if f not in mention_features_2:
                continue
            x.append((mention_features_2[f], v))
        x = sorted(x, key=lambda d: d[0])
        str_x = ' '.join(map(lambda f: '%s:%f' % f, x))
        guess.append((
            user['_id'],
            label,
            abs(confidence),
            str_x,
            sum(user['mentions'].values()),
        ))
        bar.draw(index + 1)
    data0 = sorted(filter(lambda d: d[1] == 0, guess), key=lambda d: d[2], reverse=True)
    data1 = sorted(filter(lambda d: d[1] == 1, guess), key=lambda d: d[2], reverse=True)
    # Undecided users are ranked by total mention volume instead.
    data2 = sorted(filter(lambda d: d[1] == -1, guess), key=lambda d: d[4], reverse=True)
    dimention = min(len(data0), len(data1), training_count / 2)
    data0 = data0[:dimention]
    data1 = data1[:dimention]
    data2 = data2[:dimention]
    fout = open(RAW_DATA_DIR + 'iterate_label2trainset/%s_train.data' % attribute, 'w')
    uid_output = open(RAW_DATA_DIR + 'iterate_label2trainset/%s_train_uids.data' % attribute, 'w')
    for d in data0 + data1:
        fout.write('%d %s\n' % (d[1], d[3]))
        uid_output.write('%s\n' % d[0])
    fout.close()
    uid_output.close()
    fout = open(RAW_DATA_DIR + 'iterate_label2trainset/%s_train_unlabel.data' % attribute, 'w')
    uid_output = open(RAW_DATA_DIR + 'iterate_label2trainset/%s_train_unlabel_uids.data' % attribute, 'w')
    for d in data2:
        fout.write('%d %s\n' % (d[1], d[3]))
        uid_output.write('%s\n' % d[0])
    fout.close()
    uid_output.close()
def construct_train_set(attribute,training_count): ''' The format of labeled_feature_file is as the same as mallet ''' all_features=get_features(feature_file=feature_file_name) all_features_1=get_features(feature_file=base_dir+'/features/mention_1.feature',existent_features=all_features) review_featuers=get_features(feature_file=base_dir+'/features/review.feature',existent_features=all_features_1) labeled_feature_file=open('%s/review_constraint_%s.constraints'%(labeled_feature_file_dir,attribute)) label_arbiter=LabelArbiter(labeled_feature_file='%s/review_constraint_%s.constraints'%(labeled_feature_file_dir,attribute)) labeled_features=dict() for line in labeled_feature_file: line=line[:-1].split(' ') labeled_features[line[0].decode('utf8')]=map(lambda d:float(d.split(':')[1]),line[1:]) collection=Connection().jd.train_users bar=progress_bar(collection.count()) confidence=[] for index,user in enumerate(collection.find()): label_distributed=[1,1] for f,value in combine_features(user['mentions'],Counter('products')).items(): if f in labeled_features: label_distributed[0]*=labeled_features[f][0]*value label_distributed[1]*=labeled_features[f][1]*value s=1.0*sum(label_distributed) if not s==0: label_distributed[0]/=s label_distributed[1]/=s label_distributed=label_arbiter.get_label_distribute(combine_features(user['mentions'],Counter('products'))) if label_distributed[0]>label_distributed[1]: label=0 elif label_distributed[0]<label_distributed[1]: label=1 else: label=-1 features=combine_features(user['mentions_0'],Counter(user['products'])) sorted_feature=[] for f in features: if f not in all_features: continue sorted_feature.append((all_features[f],features[f])) user['mentions_1_1']={} for f,v in user['mentions_1_1'].items(): f=f+'_1' if f not in all_features_1: continue sorted_feature.append((all_features_1[f],v)) for f,v in Counter(user['review']).items(): if f not in review_featuers: continue sorted_feature.append((review_featuers[f],v)) keys=map(lambda d:d[0], 
sorted_feature) if not len(keys)==len(set(keys)): print Counter(keys).values() sorted_feature=sorted(sorted_feature,key=lambda d:d[0]) str_features=' '.join(map(lambda f:'%s:%f'%f,sorted_feature)) confidence.append( (user['_id'], label, abs(label_distributed[0]-label_distributed[1]), str_features, sum(user['mentions'].values()), )) bar.draw(index+1) confidence0=filter(lambda d:d[1]==0,confidence) confidence0=sorted(confidence0,key=lambda d:d[2],reverse=True) confidence1=filter(lambda d:d[1]==1,confidence) confidence1=sorted(confidence1,key=lambda d:d[2],reverse=True) confidence2=filter(lambda d:d[1]==-1,confidence) confidence2=sorted(confidence2,key=lambda d:d[4],reverse=True) dimention=min(len(confidence0),len(confidence1),training_count/2) confidence0=confidence0[:dimention] confidence1=confidence1[:dimention] confidence2=confidence2[:dimention] fout=open(RAW_DATA_DIR+'iterate_label2trainset/%s_train.data'%attribute,'w') uid_output=open(RAW_DATA_DIR+'iterate_label2trainset/%s_train_uids.data'%attribute,'w') for d in confidence0+confidence1: fout.write('%d %s\n'%(d[1],d[3])) uid_output.write('%s\n'%d[0]) fout=open(RAW_DATA_DIR+'iterate_label2trainset/%s_train_unlabel.data'%attribute,'w') uid_output=open(RAW_DATA_DIR+'iterate_label2trainset/%s_train_unlabel_uids.data'%attribute,'w') for d in confidence2: fout.write('%d %s\n'%(d[1],d[3])) uid_output.write('%s\n'%d[0])
def construct_train_set(attribute,training_count): ''' The format of labeled_feature_file is as the same as mallet ''' all_features=get_features(feature_file_name=base_dir+'/features/mention.feature') labeled_feature_file=open('%s/review_constraint_%s.constraints'%(labeled_feature_file_dir,attribute)) labeled_features=dict() for line in labeled_feature_file: line=line[:-1].split(' ') labeled_features[line[0].decode('utf8')]=map(lambda d:float(d.split(':')[1]),line[1:]) collection=Connection().jd.train_users bar=progress_bar(collection.count()) confidence=[] for index,user in enumerate(collection.find()): features=dict(Counter(user['products'])) for m in user['mentions']: features[m]=user['mentions'][m] label_distributed=[1,1] for f,value in user['mentions'].items(): if f in labeled_features: label_distributed[0]*=labeled_features[f][0]*value label_distributed[1]*=labeled_features[f][1]*value s=1.0*sum(label_distributed) label_distributed[0]/=s label_distributed[1]/=s #print label_distributed #if abs(label_distributed[0]-label_distributed[1])<0.5: # continue if label_distributed[0]>label_distributed[1]: label=0 elif label_distributed[0]<label_distributed[1]: label=1 else: label=-1 sorted_feature=[] for f in features: if f not in all_features: continue sorted_feature.append((all_features[f],features[f])) sorted_feature=sorted(sorted_feature,key=lambda d:d[0]) str_features=' '.join(map(lambda f:'%s:%d'%f,sorted_feature)) confidence.append(( user['_id'], label, abs(label_distributed[0]-label_distributed[1]), str_features )) bar.draw(index+1) confidence=sorted(confidence,key=lambda d:d[2],reverse=True) confidence0=filter(lambda d:d[1]==0,confidence)[:training_count/2] confidence1=filter(lambda d:d[1]==1,confidence)[:training_count/2] confidence_unlabel=[] confidence_unlabel+=filter(lambda d:d[1]==-1,confidence) #confidence_unlabel+=filter(lambda d:d[1]==0,confidence)[training_count/2:training_count*5] #confidence_unlabel+=filter(lambda 
d:d[1]==1,confidence)[training_count/2:training_count*5] confidence_unlabel=confidence_unlabel[:5*training_count] print len(confidence0),len(confidence1) fout=open(self_training_file_dir+'labeled_train_%s.data'%attribute,'w') for d in set(confidence0+confidence1): fout.write('%d %s\n'%(d[1],d[3])) fout_unlabel=open(self_training_file_dir+'unlabeled_train_%s.data'%attribute,'w') for d in confidence_unlabel: fout_unlabel.write('%d %s\n'%(d[1],d[3]))