def output_graph_matrix(): from pymongo import Connection users = Connection().user_profilling.users graph = Connection().user_profilling.graph_embedding print graph.count() bar = get_progressive_bar(users.count()) x = [] y = [] finish_count = 0 uids = [] for user in users.find({'int_id': { '$exists': True }}, { 'information': 1, 'int_id': 1 }): finish_count += 1 print finish_count #bar.cursor.restore() #bar.draw(value=finish_count) user_embedding = graph.find_one({'_id': user['int_id']}) if user_embedding is None: print user_embedding continue gender = user['information']['gender'] if gender == 'f': y.append(0) else: y.append(1) x.append(user_embedding['embedding']) uids.append(user['information']['uid']) #dump_train_valid_test(x,y,'gender_graph.data') dump_user_vector(x, y, uids, 'user_graph_vector.data')
def insert_LINE_vector(file_name=RAW_DATA_DIR + 'normalize2.data'):
    """Read LINE embeddings from `file_name` and store each train/test user's
    vector into Mongo under ``user_product_vector_from_line``.

    File format: a header line "<count> <dimension>", then one line per id:
    "<id> <v1> <v2> ...".
    """
    vectors = dict()
    fin = open(file_name)
    line = fin.readline().strip().split(' ')
    count, dimention = int(line[0]), int(line[1])
    bar = progress_bar(count)
    # Load every embedding line into an in-memory id -> vector map.
    for index in xrange(count):
        line = fin.readline()
        line = line.strip().split(' ')
        vector = map(lambda d: float(d), line[1:])
        vectors[line[0]] = vector
        bar.draw(index + 1)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    for index, user in enumerate(collection.find()):
        if user['_id'] not in vectors:
            # NOTE(review): this stores a zero vector in the local map and then
            # skips the DB update — the zero vector is never written to Mongo.
            # Looks like the `continue` may be unintended; confirm.
            vectors[user['_id']] = [0.] * dimention
            continue
        collection.update(
            {'_id': user['_id']},
            {'$set': {
                'user_product_vector_from_line': vectors[user['_id']]
            }})
        bar.draw(index + 1)
    collection = Connection().jd.test_users
    bar = progress_bar(collection.count())
    for index, user in enumerate(collection.find()):
        # Test users with no embedding are simply skipped.
        if user['_id'] not in vectors:
            continue
        collection.update(
            {'_id': user['_id']},
            {'$set': {
                'user_product_vector_from_line': vectors[user['_id']]
            }})
        bar.draw(index + 1)
def output_graph_matrix(): from pymongo import Connection users=Connection().user_profilling.users graph=Connection().user_profilling.graph_embedding print graph.count() bar=get_progressive_bar(users.count()) x=[] y=[] finish_count=0 uids=[] for user in users.find({'int_id':{'$exists':True}},{'information':1,'int_id':1}): finish_count+=1 print finish_count #bar.cursor.restore() #bar.draw(value=finish_count) user_embedding=graph.find_one({'_id':user['int_id']}) if user_embedding is None: print user_embedding continue gender=user['information']['gender'] if gender=='f': y.append(0) else: y.append(1) x.append(user_embedding['embedding']) uids.append(user['information']['uid']) #dump_train_valid_test(x,y,'gender_graph.data') dump_user_vector(x,y,uids,'user_graph_vector.data')
class AllTest(unittest2.TestCase):
    """Integration tests for segment simplification (rdp / anglereduce)
    over road graphs stored in the local ``algolab-test`` database."""

    def setUp(self):
        # Fresh collections for every test; col0/col1 are inputs, col2 output.
        self.col0 = Connection("127.0.0.1", 27017)["algolab-test"]["rg0"]
        self.col1 = Connection("127.0.0.1", 27017)["algolab-test"]["rg1"]
        self.col2 = Connection("127.0.0.1", 27017)["algolab-test"]["rg2"]
        for col in (self.col0, self.col1, self.col2):
            col.drop()
        for key in (2, 5):
            create_rg(npoints[key], self.col0, distance_function=edist)
        for key in (2, 5, 3, 4):
            create_rg(npoints[key], self.col1, distance_function=edist)

    def test_rdp(self):
        for seg in S(self.col0).segments:
            create_rg(rdp(locs_for(seg, self.col0), 0), self.col2)
        self.assertEqual(self.col2.count(), 8)

    def test_rdp2(self):
        for seg in S(self.col1).segments:
            create_rg(rdp(locs_for(seg, self.col1), 0), self.col2)
        self.assertEqual(self.col2.count(), 11)

    def test_rdp3(self):
        for seg in S(self.col1).segments:
            create_rg(rdp(locs_for(seg, self.col1), 100000), self.col2)
        self.assertEqual(self.col2.count(), 8)

    def test_anglered(self):
        for seg in S(self.col1).segments:
            create_rg(anglereduce(locs_for(seg, self.col1), 1), self.col2)
        self.assertEqual(self.col2.count(), 8)

    def test_anglered2(self):
        for seg in S(self.col1).segments:
            create_rg(anglereduce(locs_for(seg, self.col1), 180), self.col2)
        self.assertEqual(self.col2.count(), 11)
def output_vector(entity_name):
    """Write one mention-count vector per entity of `entity_name` to
    ``<entity_name>_init_vec.data`` (header: "<count> <n_mentions>")."""
    collection = Connection().jd[entity_name]
    bar = progress_bar(collection.count())
    mentions = get_mentions()
    fout = open(RAW_DATA_DIR + '%s_init_vec.data' % entity_name, 'w')
    fout.write('%d %d\n' % (collection.count(), len(mentions)))
    for index, entity in enumerate(collection.find()):
        # Concatenate the distinct review texts attached to this entity.
        reviews = ' '.join(set(record[1] for record in entity['records']))
        counts = [reviews.count(m) for m in mentions]
        # Only emit entities with at least one non-zero count.
        if numpy.any(counts):
            fout.write('%s %s\n' % (entity['_id'],
                                    ' '.join(str(c) for c in counts)))
        bar.draw(index + 1)
def generate_name_feature():
    """Build a corpus of given names (surname character stripped) from user
    screen names and run document-frequency feature selection on it."""
    from pymongo import Connection
    lastnames = [
        name.replace('\n', '').decode('utf8') for name in open('./lastname')
    ]
    users = Connection().user_profilling.users
    bar = get_progressive_bar(users.count())
    corpus = []
    finish_count = 0
    y = []
    for user in users.find():
        # Keep the last token whose first character is a known surname.
        normal_name = ''
        for token in user['screen_name']:
            if token[0] in lastnames:
                normal_name = token
        # Skip users with no usable name ('' also fails the length check).
        if len(normal_name) < 2:
            continue
        corpus.append(normal_name[1:])
        finish_count += 1
        bar.cursor.restore()
        bar.draw(value=finish_count)
    feature_selection_df(corpus)
def output_user_user_propagate_vectors(order):
    """Dump the user-user graph propagation vector (at the given propagation
    `order`) for every weibo user that has a non-zero vector.

    Fix: removed the unreachable trailing
    ``return dump_train_valid_test(all_x, all_y, ...)`` — it sat after an
    unconditional ``return`` and referenced an undefined ``all_y``.
    """
    from pymongo import Connection
    all_x = []
    # Progress-bar bookkeeping.
    users = Connection().jd.weibo_users
    vectors = load_user_user_graph_propagate_vector(order)
    bar = progress_bar(users.count())
    finish_count = 0
    uids = []
    for user in users.find():
        try:
            vector = vectors[int(user['_id'])]
        except:
            # User missing from the propagation result (or non-numeric id).
            continue
        if not vector.any():
            continue
        #y=get_location_class(user['location'],key_map)
        all_x.append(vector)
        uids.append(user['_id'])
        finish_count += 1
        bar.draw(value=finish_count)
    all_x = numpy.array(all_x)
    dump_user_vector(all_x, uids, 'jd_user_user_propagate' + str(order))
def output_description_matrix():
    """Vectorize user descriptions (bag of words) and dump a gender-labeled
    train/valid/test split (1=male, 0=female)."""
    from sklearn.feature_extraction.text import CountVectorizer
    vectorizer = CountVectorizer(min_df=1)
    from pymongo import Connection
    users = Connection().user_profilling.users
    bar = get_progressive_bar(users.count())
    corpus, y = [], []
    finish_count = 0
    for user in users.find():
        info = user['information']
        if 'descriptions' not in info:
            continue
        corpus.append(get_str_description(info['descriptions']))
        finish_count += 1
        bar.cursor.restore()
        bar.draw(value=finish_count)
        y.append(1 if info['gender'] == 'm' else 0)
    matrix = vectorizer.fit_transform(corpus)
    dump_train_valid_test(matrix.toarray(), numpy.array(y),
                          'gender_description.data')
def output_goods_class_matrix(order=0):
    """One-hot encode each user's purchased item classes at the given class
    `order` and dump the per-user vectors."""
    from pymongo import Connection
    # Load the feature vocabulary for this class order: "name ..." per line.
    feature_map = {}
    lines = open('./features/item_class_order_%d.feature' % order).readlines()
    names = [line.decode('utf8').split(' ')[0] for line in lines]
    for position, name in enumerate(names):
        feature_map[name] = position
    # Progress-bar bookkeeping.
    users = Connection().jd.weibo_users
    bar = progress_bar(users.count())
    finish_count = 0
    all_x, uids = [], []
    for user in users.find():
        # NOTE(review): with the default order=0 this indexes item_class[-1]
        # (the last element) while reading the order-0 feature file — confirm
        # that is intended.
        features = [b['item_class'][order - 1] for b in user['behaviors']]
        vector = get_one_hot_vector(features, feature_map)
        if not vector.any():
            continue
        all_x.append(vector)
        uids.append(user['_id'])
        finish_count += 1
        bar.draw(value=finish_count)
    all_x = numpy.array(all_x)
    dump_user_vector(all_x, uids, 'jd_item_class_order_' + str(order))
    return
def output_shopping_tf_matrix(feature_length=3):
    """Build, per user, a vector of the `feature_length` highest purchase
    counts grouped by behavior timestamp, and dump the per-user vectors.

    Fix: the original ended with ``all_y=numpy.array(all_y)`` and
    ``dump_train_valid_test(all_x, all_y, ...)`` although ``all_y`` was never
    defined — a guaranteed NameError. Following the sibling ``output_*``
    functions, the vectors are dumped with their uids instead.
    """
    from pymongo import Connection
    all_x = []
    # Progress-bar bookkeeping.
    users = Connection().jd.weibo_users
    bar = progress_bar(users.count())
    finish_count = 0
    uids = []
    for user in users.find():
        vector = numpy.zeros((feature_length))
        # Frequency of behaviors per timestamp.
        tf = dict()
        for behavior in user['behaviors']:
            ts = behavior['timestamp']
            tf[ts] = tf.get(ts, 0) + 1
        # Require at least `feature_length` distinct timestamps.
        if len(tf) < feature_length:
            continue
        top = sorted(tf.iteritems(), key=lambda d: d[1], reverse=True)
        for i in range(0, feature_length):
            vector[i] = top[i][1]
        all_x.append(vector)
        uids.append(user['_id'])
        finish_count += 1
        bar.draw(value=finish_count)
    all_x = numpy.array(all_x)
    dump_user_vector(all_x, uids, 'jd_shopping_tf')
def output_sentence_embedding_matrix(file_name1, file_name2):
    """Look up each weibo user's doc2vec embedding (key ``USER_<jd_id>``)
    from `file_name1` and dump the vectors to `file_name2`."""
    from pymongo import Connection
    embedding = doc2vec_embedding(file_name1)
    #embedding=load_doc2vec_embedding(file_name1)
    # Progress-bar bookkeeping.
    users = Connection().jd.weibo_users
    bar = progress_bar(users.count())
    all_x, uids = [], []
    finish_count = 0
    for user in users.find():
        try:
            vector = embedding['USER_%d' % user['jd_id']]
        except:
            # No embedding for this user.
            continue
        #if y==-1:
        #    continue
        all_x.append(vector)
        uids.append(user['_id'])
        finish_count += 1
        bar.draw(value=finish_count)
    #return dump_train_valid_test(all_x, all_y, 'jd_user_embedding')
    #dump_user_vector(all_x, all_y, uids, 'jd_user_embedding_with_item_class')
    dump_user_vector(numpy.array(all_x), uids, file_name2)
def construct_test_set(attribute): all_features = get_features(feature_file_name=feature_file_name) collection = Connection().jd.test_users balance_params = get_balance_params(attribute, collection) print balance_params bar = progress_bar(collection.count()) fout = open(RAW_DATA_DIR + 'label2trainset/%s_test.data' % attribute, 'w') uid_output = open( RAW_DATA_DIR + 'label2trainset/%s_test_uids.data' % attribute, 'w') for index, user in enumerate(collection.find()): try: label = user['profile'][attribute].index(1) except Exception as e: continue if random.random() > balance_params[label]: continue features = user['mentions'] #features=Counter(user['products']) #features=combine_features(user['mentions'],Counter(user['products'])) sorted_feature = [] for f in features: if f not in all_features: continue sorted_feature.append((all_features[f], features[f])) if len(sorted_feature) == 0: continue fout.write('%d' % label) uid_output.write('%s\n' % user['_id']) sorted_feature = sorted(sorted_feature, key=lambda d: d[0]) for f in sorted_feature: fout.write(' %s:%f' % f) fout.write('\n') bar.draw(index + 1)
def construct_test_set(attribute): all_features = get_features(feature_file_name=base_dir + '/features/mention.feature') collection = Connection().jd.test_users balance_params = get_balance_params(attribute, collection) print balance_params bar = progress_bar(collection.count()) fout = open(self_training_file_dir + 'test_%s.data' % attribute, 'w') for index, user in enumerate(collection.find()): features = dict(Counter(user['products'])) for m in user['mentions']: features[m] = user['mentions'][m] try: label = user['profile'][attribute].index(1) except Exception as e: continue if random.random() > balance_params[label]: continue sorted_feature = [] for f in features: if f not in all_features: continue sorted_feature.append((all_features[f], features[f])) if len(sorted_feature) == 0: continue fout.write('%d' % label) sorted_feature = sorted(sorted_feature, key=lambda d: d[0]) for f in sorted_feature: fout.write(' %s:%d' % f) fout.write('\n') bar.draw(index + 1)
def construct_train_set(attribute, training_count):
    '''
    The format of labeled_feature_file is as the same as mallet.

    Soft-labels every train user from the per-feature label distributions in
    the review constraint file, then writes the resulting training lines and
    a parallel uid file under label2trainset/.
    NOTE(review): `training_count` is only used by commented-out truncation
    code and is currently ignored.
    '''
    all_features = get_features(feature_file_name=feature_file_name)
    labeled_feature_file = open('%s/review_constraint_%s.constraints' %
                                (labeled_feature_file_dir, attribute))
    # feature -> [weight_for_label0, weight_for_label1]
    labeled_features = dict()
    for line in labeled_feature_file:
        line = line[:-1].split(' ')
        labeled_features[line[0].decode('utf8')] = map(
            lambda d: float(d.split(':')[1]), line[1:])
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    confidence = []
    for index, user in enumerate(collection.find()):
        # Multiply per-mention label weights into an unnormalized 2-way score.
        label_distributed = [1, 1]
        for f, value in user['mentions'].items():
            if f in labeled_features:
                label_distributed[0] *= labeled_features[f][0] * value
                label_distributed[1] *= labeled_features[f][1] * value
        # Normalize to a distribution.
        s = 1.0 * sum(label_distributed)
        label_distributed[0] /= s
        label_distributed[1] /= s
        if label_distributed[0] > label_distributed[1]:
            label = 0
        elif label_distributed[0] < label_distributed[1]:
            label = 1
        else:
            # Tie: no usable evidence, skip the user.
            continue
        features = user['mentions']
        #features=Counter(user['products'])
        #features=combine_features(user['mentions'],Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        str_features = ' '.join(map(lambda f: '%s:%f' % f, sorted_feature))
        # Keep (uid, label, confidence margin, serialized features).
        confidence.append(
            (user['_id'], label,
             abs(label_distributed[0] - label_distributed[1]), str_features))
        bar.draw(index + 1)
    #confidence=sorted(confidence,key=lambda d:d[2],reverse=True)
    confidence0 = filter(lambda d: d[1] == 0, confidence)
    confidence1 = filter(lambda d: d[1] == 1, confidence)
    #dimention=min(len(confidence0),len(confidence1),training_count/2)
    #confidence0=confidence0#[:dimention]
    #confidence1=confidence1#[:dimention]
    print len(confidence0), len(confidence1)
    fout = open(RAW_DATA_DIR + 'label2trainset/%s_train.data' % attribute, 'w')
    uid_output = open(
        RAW_DATA_DIR + 'label2trainset/%s_train_uids.data' % attribute, 'w')
    #for d in confidence0+confidence1:
    for d in confidence:
        fout.write('%d %s\n' % (d[1], d[3]))
        uid_output.write('%s\n' % d[0])
def construct_all_data():
    '''
    The format of labeled_feature_file is as the same as mallet.
    Writes every train user as a label-0 line plus a parallel uid file
    under label2trainset/.
    '''
    all_features = get_features(feature_file_name=feature_file_name)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    fout = open(RAW_DATA_DIR + 'label2trainset/all_train.data', 'w')
    uid_output = open(RAW_DATA_DIR + 'label2trainset/all_train_uids.data', 'w')
    for index, user in enumerate(collection.find()):
        # Product counts plus mention weights (mentions win on duplicates).
        features = dict(Counter(user['products']))
        features.update(user['mentions'])
        label = 0
        fout.write('%d' % label)
        uid_output.write('%s\n' % user['_id'])
        pairs = sorted(
            ((all_features[f], features[f]) for f in features
             if f in all_features),
            key=lambda d: d[0])
        for pair in pairs:
            fout.write(' %s:%d' % pair)
        fout.write('\n')
        bar.draw(index + 1)
def construct_test_data(attribute):
    """Assemble [uid, label, sparse-feature-dict] test rows for `attribute`
    (mention/product plus review features) and output them for mallet."""
    collection = Connection().jd.test_users
    all_features = get_features(feature_file_name=feature_file_name)
    review_features = get_features(
        feature_file_name=base_dir + '/features/review.feature',
        start_index=max(all_features.values()) + 1)
    data = []
    bar = progress_bar(collection.count())
    for index, user in enumerate(collection.find()):
        uid = user['_id']
        features = combine_features(user['mentions'],
                                    Counter(user['products']))
        try:
            y = user['profile'][attribute].index(1)
        except:
            continue
        x = dict()
        for f in features:
            if f in all_features:
                x[all_features[f]] = features[f]
        for f, v in Counter(user['review']).items():
            if f in review_features:
                x[review_features[f]] = v
        data.append([uid, y, x])
        bar.draw(index + 1)
    #data=balance(data,target_index=1)
    output(RAW_DATA_DIR + 'mallet/mallet_test_%s.data' % attribute, data)
def construct_all_data():
    '''
    The format of labeled_feature_file is as the same as mallet.
    Writes every train user as a label-0 line (mention_1/product features
    plus suffixed mentions_1_1 features) under iterate_label2trainset/.
    '''
    all_features = get_features(feature_file_name=feature_file_name)
    all_features_1 = get_features(
        feature_file_name=base_dir + '/features/mention_1.feature',
        start_index=max(all_features.values()) + 1)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    fout = open(RAW_DATA_DIR + 'iterate_label2trainset/all_train.data', 'w')
    uid_output = open(
        RAW_DATA_DIR + 'iterate_label2trainset/all_train_uids.data', 'w')
    for index, user in enumerate(collection.find()):
        label = 0
        fout.write('%d' % label)
        uid_output.write('%s\n' % user['_id'])
        features = combine_features(user['mentions_1'],
                                    Counter(user['products']))
        pairs = []
        for f in features:
            if f in all_features:
                pairs.append((all_features[f], features[f]))
        for f, v in user['mentions_1_1'].items():
            # Second-level mentions carry a '_1' suffix in the feature map.
            suffixed = f + '_1'
            if suffixed in all_features_1:
                pairs.append((all_features_1[suffixed], v))
        for pair in sorted(pairs, key=lambda d: d[0]):
            fout.write(' %s:%d' % pair)
        fout.write('\n')
        bar.draw(index + 1)
def construct_test_data(attribute):
    """Collect [uid, label, feature-dict] rows for the mallet test file of
    `attribute`; users without a profile label are skipped."""
    collection = Connection().jd.test_users
    all_features = get_features(feature_file_name=feature_file_name)
    review_features = get_features(
        feature_file_name=base_dir + '/features/review.feature',
        start_index=max(all_features.values()) + 1)
    data = []
    bar = progress_bar(collection.count())
    for index, user in enumerate(collection.find()):
        uid = user['_id']
        features = combine_features(user['mentions'],
                                    Counter(user['products']))
        try:
            y = user['profile'][attribute].index(1)
        except:
            continue
        x = dict((all_features[f], features[f]) for f in features
                 if f in all_features)
        for f, v in Counter(user['review']).items():
            if f in review_features:
                x[review_features[f]] = v
        data.append([uid, y, x])
        bar.draw(index + 1)
    #data=balance(data,target_index=1)
    output(RAW_DATA_DIR + 'mallet/mallet_test_%s.data' % attribute, data)
def construct_train_data():
    """Build mallet training rows for train users not present in the test uid
    set; labels are drawn at random (0/1)."""
    import random
    all_features = get_features(feature_file_name=feature_file_name)
    review_features = get_features(
        feature_file_name=base_dir + '/features/review.feature',
        start_index=max(all_features.values()) + 1)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    data = []
    test_uids = get_test_uids()
    for index, user in enumerate(collection.find()):
        uid = user['_id']
        if uid in test_uids:
            continue
        features = combine_features(user['mentions'],
                                    Counter(user['products']))
        x = dict()
        for f in features:
            if f not in all_features:
                continue
            x[all_features[f]] = features[f]
        for f, v in Counter(user['review']).items():
            if f not in review_features:
                continue
            x[review_features[f]] = v
        # NOTE(review): label is random — looks like a placeholder; confirm
        # the downstream mallet step ignores it.
        y = random.randint(0, 1)
        data.append([uid, y, x])
        bar.draw(index + 1)
    output(RAW_DATA_DIR + 'mallet/mallet_train.data', data)
def construct_train_data():
    """Write the mallet training file from train users, skipping any uid that
    also appears in the test set; labels are random placeholders."""
    import random
    all_features = get_features(feature_file_name=feature_file_name)
    review_features = get_features(
        feature_file_name=base_dir + '/features/review.feature',
        start_index=max(all_features.values()) + 1)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    data = []
    uids = get_test_uids()
    for index, user in enumerate(collection.find()):
        if user['_id'] in uids:
            continue
        features = combine_features(user['mentions'],
                                    Counter(user['products']))
        x = dict((all_features[f], features[f]) for f in features
                 if f in all_features)
        review_counts = Counter(user['review'])
        for f in review_counts:
            if f in review_features:
                x[review_features[f]] = review_counts[f]
        data.append([user['_id'], random.randint(0, 1), x])
        bar.draw(index + 1)
    output(RAW_DATA_DIR + 'mallet/mallet_train.data', data)
def construct_test_set(attribute): all_features=get_features(feature_file_name=base_dir+'/features/mention.feature') collection=Connection().jd.test_users balance_params=get_balance_params(attribute,collection) print balance_params bar=progress_bar(collection.count()) fout=open(self_training_file_dir+'test_%s.data'%attribute,'w') for index,user in enumerate(collection.find()): features=dict(Counter(user['products'])) for m in user['mentions']: features[m]=user['mentions'][m] try: label=user['profile'][attribute].index(1) except Exception as e: continue if random.random()>balance_params[label]: continue sorted_feature=[] for f in features: if f not in all_features: continue sorted_feature.append((all_features[f],features[f])) if len(sorted_feature)==0: continue fout.write('%d'%label) sorted_feature=sorted(sorted_feature,key=lambda d:d[0]) for f in sorted_feature: fout.write(' %s:%d'%f) fout.write('\n') bar.draw(index+1)
def output_description_matrix():
    """Bag-of-words encode user descriptions and dump a gender-labeled
    train/valid/test split (male=1, female=0)."""
    from sklearn.feature_extraction.text import CountVectorizer
    vectorizer = CountVectorizer(min_df=1)
    from pymongo import Connection
    users = Connection().user_profilling.users
    bar = get_progressive_bar(users.count())
    corpus = []
    labels = []
    finish_count = 0
    for user in users.find():
        if 'descriptions' not in user['information']:
            continue
        corpus.append(
            get_str_description(user['information']['descriptions']))
        finish_count += 1
        bar.cursor.restore()
        bar.draw(value=finish_count)
        labels.append(1 if user['information']['gender'] == 'm' else 0)
    counts = vectorizer.fit_transform(corpus)
    dump_train_valid_test(counts.toarray(), numpy.array(labels),
                          'gender_description.data')
def construct_all_data():
    '''
    The format of labeled_feature_file is as the same as mallet.
    Dumps every train user as a label-0 line (mention_1/product features plus
    suffixed mentions_1_1 features) under mylabel2trainset/.
    '''
    all_features = get_features(feature_file_name=feature_file_name)
    all_features_1 = get_features(
        feature_file_name=base_dir + '/features/mention_1.feature',
        start_index=max(all_features.values()) + 1)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    fout = open(RAW_DATA_DIR + 'mylabel2trainset/all_train.data', 'w')
    uid_output = open(RAW_DATA_DIR + 'mylabel2trainset/all_train_uids.data',
                      'w')
    for index, user in enumerate(collection.find()):
        label = 0
        fout.write('%d' % label)
        uid_output.write('%s\n' % user['_id'])
        features = combine_features(user['mentions_1'],
                                    Counter(user['products']))
        sorted_feature = [(all_features[f], features[f]) for f in features
                          if f in all_features]
        for f, v in user['mentions_1_1'].items():
            key = f + '_1'
            if key in all_features_1:
                sorted_feature.append((all_features_1[key], v))
        sorted_feature.sort(key=lambda d: d[0])
        for pair in sorted_feature:
            fout.write(' %s:%d' % pair)
        fout.write('\n')
        bar.draw(index + 1)
def get_tf():
    """Count, per given-name character, how often it appears under each
    gender. Returns dict: char -> [female_count, male_count].

    Fix: the original filter read ``len(n)>3 and len(n)<2`` which, by
    operator precedence, is always False — so no name was ever skipped by
    length. The evident intent is to keep only names of 2-3 characters.
    """
    from helper import get_progressive_bar
    from pymongo import Connection
    users = Connection().user_profilling.users
    lastnames = [
        name.replace('\n', '').decode('utf8') for name in open('./lastname')
    ]
    bar = get_progressive_bar(users.count())
    finish_count = 0
    tf = dict()
    for user in users.find():
        name = user['screen_name']
        finish_count += 1
        for n in name:
            # Keep only 2-3 character names starting with a known surname.
            if n[0] not in lastnames or len(n) > 3 or len(n) < 2:
                continue
            if user['information']['gender'] == 'm':
                gender = 1
            else:
                gender = 0
            for w in n[1:]:
                if w not in tf:
                    tf[w] = [0, 0]
                tf[w][gender] += 1
        bar.cursor.restore()
        bar.draw(value=finish_count)
    return tf
def output_review_star_matrix(feature_length=1000):
    """One-hot encode each user's review star ratings (classes '0'-'5') and
    dump the per-user vectors.

    Fix: removed the unreachable trailing
    ``return dump_train_valid_test(all_x, all_y, ...)`` — it followed an
    unconditional ``return`` and referenced an undefined ``all_y``.
    NOTE(review): `feature_length` is unused; kept for interface
    compatibility.
    """
    from pymongo import Connection
    feature_map = {}
    for i in range(0, 6):
        feature_map[str(i)] = i
    # Progress-bar bookkeeping.
    users = Connection().jd.weibo_users
    bar = progress_bar(users.count())
    finish_count = 0
    all_x = []
    uids = []
    for user in users.find():
        features = [
            str(behavior['review']['review_stars'])
            for behavior in user['behaviors']
        ]
        if features == []:
            continue
        vector = get_one_hot_vector(features, feature_map)
        if not vector.any():
            continue
        #y=get_location_class(user['location'],key_map)
        all_x.append(vector)
        uids.append(user['_id'])
        finish_count += 1
        bar.draw(value=finish_count)
    all_x = numpy.array(all_x)
    dump_user_vector(all_x, uids, 'jd_review_star')
def output_name_matrix_of_two_words():
    """Build two-dimensional gender features from the two given-name
    characters of three-character names (surname + two chars) and dump a
    train/valid/test split.

    Fix: the original filter ``len(n)>3 and len(n)<3`` is always False by
    operator precedence, so over-long names slipped through. The body reads
    n[1] and n[2], so only names of exactly three characters are usable.
    """
    from helper import get_progressive_bar
    from pymongo import Connection
    users = Connection().user_profilling.users
    lastnames = [
        name.replace('\n', '').decode('utf8') for name in open('./lastname')
    ]
    bar = get_progressive_bar(users.count())
    finish_count = 0
    tf = pickle.load(open('./tf.data'))
    x = []
    y = []
    for user in users.find():
        name = user['screen_name']
        finish_count += 1
        if finish_count > 5000:
            break
        for n in name:
            # Only surname-prefixed names of exactly three characters.
            if n[0] not in lastnames or len(n) != 3:
                continue
            try:
                # Fraction of female occurrences for each name character.
                x0 = 1.0 * tf[n[1]][0] / sum(tf[n[1]])
                x1 = 1.0 * tf[n[2]][0] / sum(tf[n[2]])
            except:
                continue
            if user['information']['gender'] == 'm':
                y.append(1)
            else:
                y.append(0)
            x.append([x0, x1])
        bar.cursor.restore()
        bar.draw(value=finish_count)
    dump_train_valid_test(x, y, 'gender_name_simple.data')
def output_review_embedding_matrix():
    """Sum count-weighted mention word vectors over each user's review text
    and dump the resulting per-user embeddings."""
    from helper import get_mentions
    from pymongo import Connection
    from my_vector_reader import read_vectors
    # Progress-bar bookkeeping.
    users = Connection().jd.weibo_users
    bar = progress_bar(users.count())
    finish_count = 0
    all_x = []
    uids = []
    mentions = get_mentions()
    #review_vocab,review_embedding=read_vectors('/mnt/data1/adoni/jd_data/vectors/word_vectors.data','utf8')
    review_vocab, review_embedding = read_vectors(
        '../myword2vec/word_vectors.data', 'utf8')
    # Keep only mentions present in the word-vector vocabulary.
    mentions = [m for m in mentions if m in review_vocab]
    mention_embedding = [
        review_embedding[review_vocab.index(m)] for m in mentions
    ]
    vector_size = len(mention_embedding[0])
    for user in users.find():
        x = numpy.zeros(vector_size)
        review = ' '.join(b['review']['review_general']
                          for b in user['behaviors'])
        for position, mention in enumerate(mentions):
            x += review.count(mention) * mention_embedding[position]
        if not x.any():
            continue
        all_x.append(x)
        uids.append(user['_id'])
        finish_count += 1
        bar.draw(value=finish_count)
    dump_user_vector(numpy.array(all_x), uids, 'user_review_embedding')
def age_distribute(): from small_utils.progress_bar import progress_bar from pymongo import Connection from collections import Counter collection=Connection().jd.test_users weibo_collection=Connection().jd.weibo_users linked_jd_ids=dict() ages=[] for line in open('/mnt/data1/adoni/data/linked_uids.data'): linked_jd_ids[line[:-1].split(' ')[1]]=line.split(' ')[0] bar=progress_bar(collection.count()) for index,user in enumerate(collection.find()): if sum(user['profile']['age'])==0: continue weibo_id=linked_jd_ids[user['_id']] weibo_user=weibo_collection.find_one({'_id':weibo_id}) if weibo_user==None: continue age=2015-int(weibo_user['birthday'].split(u'年')[0]) if age>50 or age<10: continue ages.append(age) if age<30: user['profile']['age']=[1,0] else: user['profile']['age']=[0,1] collection.update({'_id':user['_id']},{'$set':{'profile':user['profile']}}) bar.draw(index) s=sum(Counter(ages).values()) ages=sorted(Counter(ages).items(),key=lambda d:d[0]) ss=0. for age in ages: ss+=age[1] print age[0],(ss)/s
def get_tf():
    """Tally given-name characters by gender over all users.

    Returns dict mapping character -> [female_count, male_count].

    Fix: ``len(n) > 3 and len(n) < 2`` can never be True (``and`` binds
    tighter than ``or``), so the length filter was dead; the intent is to
    keep only surname-prefixed names of length 2-3.
    """
    from helper import get_progressive_bar
    from pymongo import Connection
    users = Connection().user_profilling.users
    lastnames = [
        name.replace('\n', '').decode('utf8') for name in open('./lastname')
    ]
    bar = get_progressive_bar(users.count())
    finish_count = 0
    tf = dict()
    for user in users.find():
        finish_count += 1
        for n in user['screen_name']:
            # Skip names without a known surname or outside length 2-3.
            if n[0] not in lastnames or len(n) > 3 or len(n) < 2:
                continue
            gender = 1 if user['information']['gender'] == 'm' else 0
            for w in n[1:]:
                if w not in tf:
                    tf[w] = [0, 0]
                tf[w][gender] += 1
        bar.cursor.restore()
        bar.draw(value=finish_count)
    return tf
def output_simple_matrix(feature_length=10000):
    """One-hot (light) encode each user's purchased product ids, capped at
    `feature_length` features, dump the vectors, and return (matrix, uids)."""
    from pymongo import Connection
    from collections import Counter
    # Feature vocabulary: first `feature_length` products from the file.
    feature_map = {}
    lines = open('./features/product.feature').readlines()
    for i in range(0, len(lines)):
        if feature_length is not None and i >= feature_length:
            break
        feature_map[lines[i].decode('utf8').split(' ')[0]] = i
    # Progress-bar bookkeeping.
    users = Connection().jd.weibo_users
    bar = progress_bar(users.count())
    finish_count = 0
    all_x = []
    uids = []
    for user in users.find():
        features = [
            str(int(behavior['item'])) for behavior in user['behaviors']
        ]
        vector = get_one_hot_light_vector(features, feature_map)
        if len(vector) == 0:
            continue
        all_x.append(vector)
        uids.append(user['_id'])
        finish_count += 1
        bar.draw(value=finish_count)
    all_x = numpy.array(all_x)
    dump_user_vector(all_x, uids, 'jd_user_simple',
                     dimention=len(feature_map))
    return all_x, uids
def output_name_matrix_of_two_words():
    """Derive a 2-feature gender dataset from the two given-name characters
    of three-character screen-name tokens and dump it.

    Fix: the original condition ``len(n) > 3 and len(n) < 3`` is always
    False (precedence), leaving the length filter dead; since the body
    indexes n[1] and n[2], only names of exactly three characters qualify.
    """
    from helper import get_progressive_bar
    from pymongo import Connection
    users = Connection().user_profilling.users
    lastnames = [
        name.replace('\n', '').decode('utf8') for name in open('./lastname')
    ]
    bar = get_progressive_bar(users.count())
    finish_count = 0
    tf = pickle.load(open('./tf.data'))
    x = []
    y = []
    for user in users.find():
        finish_count += 1
        # Sample only the first 5000 users.
        if finish_count > 5000:
            break
        for n in user['screen_name']:
            if n[0] not in lastnames or len(n) != 3:
                continue
            try:
                # Female ratio of each given-name character.
                x0 = 1.0 * tf[n[1]][0] / sum(tf[n[1]])
                x1 = 1.0 * tf[n[2]][0] / sum(tf[n[2]])
            except:
                continue
            y.append(1 if user['information']['gender'] == 'm' else 0)
            x.append([x0, x1])
        bar.cursor.restore()
        bar.draw(value=finish_count)
    dump_train_valid_test(x, y, 'gender_name_simple.data')
def construct_mallet_data(profile_key):
    """Write balanced mallet-format lines ("<uid> <label> <word:count> ...")
    for users labeled on `profile_key`.

    Fix: the original iterated ``balanced_data``, which was only assigned in
    a commented-out line — a guaranteed NameError; it now iterates the
    result of ``balance`` directly.
    """
    from pymongo import Connection
    from my_progress_bar import progress_bar
    from collections import Counter
    users = Connection().jd.weibo_users
    bar = progress_bar(users.count())
    fout = open(MATRIXES_DIR + 'mallet/construced_data.mallet', 'w')
    data = []
    for index, user in enumerate(users.find()):
        try:
            label = user['profile'][profile_key].index(1)
        except:
            continue
        reviews = []
        for behavior in user['behaviors']:
            #reviews.append('Pro'+str(behavior['item']))
            reviews += behavior['parsed_review']['review_general']
        reviews = Counter(reviews)
        reviews = ' '.join(
            map(lambda word: '%s:%d' % (word, reviews[word]),
                reviews.keys()))
        line = '%s %d %s\n' % (user['_id'], label, reviews)
        data.append((label, line))
        bar.draw(index)
    balanced_data = balance(data, target_index=0)
    for label, line in balanced_data:
        fout.write(line.encode('utf8'))
def output_user_product_graph():
    """Write the bipartite user-product edge list ("<uid> <pid>" per line)
    for both train and test users to graph.data.

    Fix: the original never closed the output file; the two identical loops
    are also collapsed into one over both collections.
    """
    with open(RAW_DATA_DIR + 'graph.data', 'w') as fout:
        for collection in (Connection().jd.train_users,
                           Connection().jd.test_users):
            bar = progress_bar(collection.count())
            for index, user in enumerate(collection.find()):
                uid = user['_id']
                for pid in user['products']:
                    fout.write('%s %s\n' % (uid, pid))
                bar.draw(index + 1)
def get_train_user_products():
    """Return {user_id: {product_id: purchase_count}} over all train users."""
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    user_products = dict()
    for index, user in enumerate(collection.find()):
        #user_products[user['_id']]=user['mentions']
        user_products[user['_id']] = dict(Counter(user['products']))
        bar.draw(index)
    return user_products
def construct_test_set(attribute):
    """Write the balanced test set for `attribute` under
    iterate_label2trainset/, combining mentions_0/product, suffixed
    mentions_1_1, and review features; also writes a parallel uid file."""
    all_features = get_features(feature_file=feature_file_name)
    all_features_1 = get_features(feature_file=base_dir +
                                  '/features/mention_1.feature',
                                  existent_features=all_features)
    review_featuers = get_features(feature_file=base_dir +
                                   '/features/review.feature',
                                   existent_features=all_features_1)
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    print balance_params
    bar = progress_bar(collection.count())
    fout = open(
        RAW_DATA_DIR + 'iterate_label2trainset/%s_test.data' % attribute, 'w')
    uid_output = open(
        RAW_DATA_DIR + 'iterate_label2trainset/%s_test_uids.data' % attribute,
        'w')
    for index, user in enumerate(collection.find()):
        # Label = position of the 1 in the attribute's one-hot profile.
        try:
            label = user['profile'][attribute].index(1)
        except Exception as e:
            continue
        # Randomly downsample over-represented labels.
        if random.random() > balance_params[label]:
            continue
        features = combine_features(user['mentions_0'],
                                    Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        # Second-level mentions use a '_1' suffix in their feature map.
        for f, v in user['mentions_1_1'].items():
            f = f + '_1'
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f], v))
        for f, v in Counter(user['review']).items():
            if f not in review_featuers:
                continue
            sorted_feature.append((review_featuers[f], v))
        if len(sorted_feature) == 0:
            continue
        fout.write('%d' % label)
        uid_output.write('%s\n' % user['_id'])
        # Diagnostic: report duplicate feature indices across the three maps.
        keys = map(lambda d: d[0], sorted_feature)
        if not len(keys) == len(set(keys)):
            print Counter(keys).values()
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        for f in sorted_feature:
            fout.write(' %s:%f' % f)
        fout.write('\n')
        bar.draw(index + 1)
def statistics(attribute,
               threshold=-1,
               feature_file_name=base_dir + '/features/mention.feature',
               show=False):
    """Estimate, per feature, the label distribution of `attribute` over the
    test users. Returns {feature: [p(label0), p(label1)]} when `show` is
    False; otherwise prints the most label-discriminative features."""
    import random
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    all_features = get_features(feature_file_name)
    bar = progress_bar(collection.count())
    # feature -> [count under label 0, count under label 1]
    distribute = dict([f, [0., 0.]] for f in all_features)
    labels_distribute = [0., 0.]
    for index, user in enumerate(collection.find()):
        try:
            label = user['profile'][attribute].index(1)
        except:
            continue
        #if random.random()>balance_params[label]:
        #    continue
        # Mentions merged with product counts as one feature set.
        features = dict(user['mentions'])
        products = Counter(user['products'])
        for p in products:
            features[p] = products[p]
        # Skip users with too few features to be informative.
        if len(features) < 10:
            continue
        for f in features:
            if f in distribute:
                distribute[f][label] += 1  #features[f]
        labels_distribute[label] += 1
        bar.draw(index)
    # Drop rare features (Python 2 .keys() returns a list, so popping while
    # iterating is safe here).
    for f in distribute.keys():
        if sum(distribute[f]) < threshold:
            distribute.pop(f)
    print labels_distribute
    # Normalize by per-label user counts, then to a distribution per feature.
    for f in distribute:
        distribute[f][0] /= labels_distribute[0]
        distribute[f][1] /= labels_distribute[1]
    for f in distribute:
        s = sum(distribute[f])
        distribute[f][0] /= s
        distribute[f][1] /= s
    if not show:
        return distribute
    #distribute=filter(lambda d:d[1][0]<d[1][1], distribute.items())
    # Rank by how far the (smoothed) label-0 share is from 0.5.
    distribute = sorted(distribute.items(),
                        key=lambda d: abs(1 - 2 * (d[1][0] + 0.1) /
                                          (sum(d[1]) + 0.1)),
                        reverse=True)
    #distribute=sorted(distribute,key=lambda d:sum(d[1]), reverse=True)
    print ''
    for d in distribute[:50]:
        print '%s 0:%0.3f 1:%0.3f' % (
            d[0].encode('utf8'),
            (d[1][0] + 0.1) / (sum(d[1]) + 0.1),
            1 - (d[1][0] + 0.1) / (sum(d[1]) + 0.1),
        )
def output_mention_and_count(attribute): from pymongo import Connection from my_progress_bar import progress_bar from collections import Counter mentions=get_mentions() mentions=dict(zip(mentions,[[0,0,0] for mention in mentions])) users=Connection().jd.weibo_users bar=progress_bar(users.count()) values=[] for index,user in enumerate(users.find()): bar.draw(index) try: f=user['profile'][attribute].index(1) except: continue values.append(f) values=Counter(values) min_value=min(values.values()) for key in values: values[key]=min_value*1.0/values[key] print values bar=progress_bar(users.count()) for index,user in enumerate(users.find()): try: f=user['profile'][attribute].index(1) except: continue for behavior in user['behaviors']: for w in behavior['parsed_review']['review_general']: if w in mentions: mentions[w][f]+=1 bar.draw(index) mentions=sorted(mentions.items(),key=lambda d:sum(d[1]),reverse=True) print '' for m in mentions: if sum(m[1])<1000: break for i in range(len(m[1])): m[1][i]='%0.1f'%(m[1][i]*values[i]) print m[0].encode('utf8'),m[1]
def output_text_matrix_from_bag_of_words():
    """Build a bag-of-words matrix (x) and gender labels (y, m=1/f=0) for
    users with at least 50 usable statuses, then dump them with
    dump_user_vector."""
    from pymongo import Connection
    words = {}
    f = open('./word.feature').readlines()
    # vocabulary: word -> row index (slice strips the trailing newline)
    for i in range(0, len(f)):
        words[f[i].decode('utf8')[0:-1]] = i
    all_data_x = []
    all_data_y = []
    index = 0
    # progress-bar bookkeeping
    users = Connection().user_profilling.users
    total_count = users.count()
    bar = get_progressive_bar(total_count)
    finish_count = 0
    #for line in open('./users.data'):
    uids = []
    for user in users.find():
        #user=parse_user(line)
        correct_status = 0
        for status in user['statuses']:
            if is_not_good_status(status):
                continue
            else:
                correct_status += 1
        # require a minimum amount of usable text per user
        if correct_status < 50:
            continue
        length = []
        text = numpy.zeros((len(words)))
        for status in user['statuses']:
            if is_not_good_status(status):
                continue
            for word in status['text']:
                if word not in words:
                    continue
                text[words[word]] += 1.0
        # skip users whose statuses share no word with the vocabulary
        if not text.any():
            continue
        text_vector = text
        if user['information']['gender'] == 'm':
            all_data_y.append(1)
        else:
            all_data_y.append(0)
        all_data_x.append(text_vector)
        uids.append(user['information']['uid'])
        index += 1
        finish_count += 1
        bar.cursor.restore()
        bar.draw(value=finish_count)
    all_data_x = numpy.array(all_data_x)
    all_data_y = numpy.array(all_data_y)
    #dump_train_valid_test(all_data_x,all_data_y,'gender_text_bag_of_words.data')
    dump_user_vector(all_data_x, all_data_y, uids, 'user_text_bag_words.data')
def output_text_matrix_from_bag_of_words():
    """Dump bag-of-words vectors and gender labels (m=1/f=0) for users
    having at least 50 usable statuses."""
    from pymongo import Connection
    vocabulary = {}
    lines = open('./word.feature').readlines()
    for i in range(0, len(lines)):
        # strip the trailing newline; position in the file is the index
        vocabulary[lines[i].decode('utf8')[0:-1]] = i
    all_data_x = []
    all_data_y = []
    index = 0
    # progress-bar bookkeeping
    users = Connection().user_profilling.users
    bar = get_progressive_bar(users.count())
    finish_count = 0
    uids = []
    for user in users.find():
        usable = 0
        for status in user['statuses']:
            if not is_not_good_status(status):
                usable += 1
        if usable < 50:
            continue
        counts = numpy.zeros((len(vocabulary)))
        for status in user['statuses']:
            if is_not_good_status(status):
                continue
            for word in status['text']:
                if word in vocabulary:
                    counts[vocabulary[word]] += 1.0
        # no vocabulary hit at all -> nothing to learn from
        if not counts.any():
            continue
        all_data_y.append(1 if user['information']['gender'] == 'm' else 0)
        all_data_x.append(counts)
        uids.append(user['information']['uid'])
        index += 1
        finish_count += 1
        bar.cursor.restore()
        bar.draw(value=finish_count)
    all_data_x = numpy.array(all_data_x)
    all_data_y = numpy.array(all_data_y)
    dump_user_vector(all_data_x, all_data_y, uids, 'user_text_bag_words.data')
def output_all_shopping_items():
    """Count every purchased item across jd.weibo_users and write
    'item count' lines (most frequent first) to ./review.feature."""
    users = Connection().jd.weibo_users
    all_items = []
    bar = progress_bar(users.count())
    for index, user in enumerate(users.find()):
        for behavior in user['behaviors']:
            all_items.append(behavior['item'])
        bar.draw(index + 1)
    all_items = Counter(all_items)
    all_items = sorted(all_items.items(), key=lambda d: d[1], reverse=True)
    fout = open('./review.feature', 'w')
    for word in all_items:
        # BUG FIX: original used '%d %d', which raises TypeError for any
        # non-integer item id; '%s' handles both, matching the sibling
        # writers output_features and output_all_item_classes.
        fout.write('%s %d\n' % (word[0], word[1]))
def construct_test_set(attribute):
    """Build an SVM-light style test file for `attribute` under
    RAW_DATA_DIR/multi_clf/, from jd.test_users.

    Feature indices: base mention features, then mention_1 features, then
    (optionally) deepwalk embedding dimensions appended after those.
    """
    all_features = get_features(feature_file_name=feature_file_name)
    # mention_1 indices start right after the base feature indices
    all_features_1 = get_features(feature_file_name=base_dir +
                                  '/features/mention_1.feature',
                                  start_index=max(all_features.values()) + 1)
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    print balance_params
    bar = progress_bar(collection.count())
    fout = open(RAW_DATA_DIR + 'multi_clf/%s_test.data' % attribute, 'w')
    for index, user in enumerate(collection.find()):
        try:
            # label = position of the 1 in the one-hot profile vector
            label = user['profile'][attribute].index(1)
        except Exception as e:
            continue
        # down-sample over-represented labels to balance the classes
        if random.random() > balance_params[label]:
            continue
        features = {}
        #features=user['mentions_0']
        #features=Counter(user['products'])
        features = combine_features(user['mentions_0'],
                                    Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        # NOTE(review): this empties mentions_1_1, so the loop below is a
        # no-op -- looks like a deliberate feature-ablation switch; confirm.
        user['mentions_1_1'] = {}
        for f, v in user['mentions_1_1'].items():
            f = f + '_1'
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f], v))
        if 'user_product_vector_from_deepwalk' in user:
            #if False:
            # embedding dimensions occupy indices after all mention features
            start_index = max(all_features_1.values()) + 1
            for i, v in enumerate(user['user_product_vector_from_deepwalk']):
                v = abs(v)
                sorted_feature.append((i + start_index, v))
        if len(sorted_feature) == 0:
            continue
        fout.write('%d' % label)
        # sanity check: feature indices must be unique
        keys = map(lambda d: d[0], sorted_feature)
        if not len(keys) == len(set(keys)):
            print Counter(keys).values()
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        for f in sorted_feature:
            fout.write(' %s:%f' % f)
        fout.write('\n')
        bar.draw(index + 1)
def output_features(fname, key):
    """Write every distinct value of user[key] over jd.train_users, with
    its frequency, to `fname` ('feature count' per line, most frequent
    first)."""
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    features = []
    for index, user in enumerate(collection.find()):
        features += user[key]
        bar.draw(index + 1)
    features = sorted(Counter(features).items(),
                      key=lambda d: d[1],
                      reverse=True)
    # BUG FIX: the original opened `fname`, then re-opened a hard-coded
    # './features/review.feature' and wrote there, silently ignoring the
    # parameter and leaking the first handle.  Write to the requested file.
    fout = open(fname, 'w')
    for f in features:
        fout.write('%s %d\n' % (f[0].encode('utf8'), f[1]))
    fout.close()
def output_review_matrix(order,feature_length=1000): from pymongo import Connection feature_map1={} feature_map2={} index1=0 index2=0 for line in open('./features/review.feature'): line=line.decode('utf8') line=line.replace('\n','').replace('\r','').split(':')[1].split(',') for feature in line: feature_map1[feature]=index1 feature_map2[feature]=index2 index2+=1 index1+=1 print index1 print index2 all_x=[] index=0 #进度条相关参数 users=Connection().jd.weibo_users total_count=users.count() bar=progress_bar(total_count) finish_count=0 uids=[] for user in users.find(): features=[] for behavior in user['behaviors']: review=behavior['review'] review=review['review_title']+review['review_general'] for feature in feature_map1: if feature in review: features.append(feature) if features==[]: continue if order==1: vector=get_one_hot_vector(features, feature_map1) else: vector=get_one_hot_vector(features, feature_map2) #if sum(vector)<10: # continue #if not vector.any(): # continue #y=get_location_class(user['location'],key_map) all_x.append(vector) uids.append(user['_id']) index+=1 finish_count+=1 bar.draw(value=finish_count) all_x=numpy.array(all_x) dump_user_vector(all_x,uids,'jd_review%d'%order)
class CombineTest(unittest2.TestCase):
    """Tests for anglecombine() on small fixture road graphs."""

    def setUp(self):
        # fresh, empty test collections before every test
        self.col0 = Connection("127.0.0.1", 27017)["algolab-test"]["rg0"]
        self.col1 = Connection("127.0.0.1", 27017)["algolab-test"]["rg1"]
        self.col0.drop()
        self.col1.drop()

    def create_rg_for(self, datasets, col=None):
        # build a road graph from each named fixture point set
        for n in datasets:
            create_rg(npoints[n],
                      col if col else self.col0,
                      distance_function=edist)

    def test_simple_combine(self):
        self.create_rg_for([12, 13])
        anglecombine(self.col0, 20)
        self.assertEqual(self.col0.count(), 7)
        expected = {
            0: [1, 1],
            1: [2, 1],
            2: [3.0, 1.15],
            3: [4.0, 1.15],
            4: [5.0, 1.15],
            5: [6, 1],
            6: [7, 1],
        }
        actual = {n["_id"]: n["loc"] for n in self.col0.find()}
        self.assertDictContainsSubset(expected, actual)

    def test_combine_2switches(self):
        self.create_rg_for([12, 13, 15])
        anglecombine(self.col0, 20)
        self.assertEqual(self.col0.count(), 8)
        expected = {
            0: [1, 1],
            1: [2, 1],
            2: [3.0, 1.15],
            3: [4.0, 1.15],
            4: [5.0, 1.15],
            5: [6, 1],
            12: [4, 4],
            6: [7, 1],
        }
        actual = {n["_id"]: n["loc"] for n in self.col0.find()}
        self.assertDictContainsSubset(expected, actual)
def statistics_after_train(attribute,method,threshold=-1,feature_file_name=base_dir+'/features/mention.feature',show=False): import random labels=get_labels_after_train(attribute,method) print len(labels) collection=Connection().jd.train_users label_distribute=Counter(labels.values()) balance_params=dict() for label in label_distribute: balance_params[label]=1.0*min(label_distribute.values())/label_distribute[label] all_features=get_features(feature_file_name) bar=progress_bar(collection.count()) distribute=dict([f,[0.,0.]] for f in all_features) for index,user in enumerate(collection.find()): try: label=labels[user['_id']] except: continue #if random.random()>balance_params[label]: # continue features=dict(user['mentions']) products=Counter(user['products']) for p in products: features[p]=products[p] for f in features: if f in distribute: distribute[f][label]+=1 bar.draw(index) for f in distribute.keys(): if sum(distribute[f])<threshold: distribute.pop(f) print label_distribute for f in distribute: distribute[f][0]/=label_distribute[0] distribute[f][1]/=label_distribute[1] for f in distribute.keys(): s=sum(distribute[f]) if s==0: distribute.pop(f) continue distribute[f][0]/=s distribute[f][1]/=s if not show: return distribute #distribute=filter(lambda d:d[1][0]<d[1][1], distribute) distribute=sorted(distribute.items(),key=lambda d:max(d[1])/sum(d[1]), reverse=True) #distribute=sorted(distribute,key=lambda d:sum(d[1]), reverse=True) print '' for d in distribute[:50]: print '%s 0:%0.3f 1:%0.3f'%(d[0].encode('utf8'), (d[1][0]+0.1)/(sum(d[1])+0.1),1-(d[1][0]+0.1)/(sum(d[1])+0.1),)
def construct_test_set(attribute): all_features=get_features(feature_file_name=feature_file_name) all_features_1=get_features(feature_file_name=base_dir+'/features/mention_1.feature',start_index=max(all_features.values())+1) collection=Connection().jd.test_users balance_params=get_balance_params(attribute,collection) print balance_params bar=progress_bar(collection.count()) fout=open(RAW_DATA_DIR+'multi_clf/%s_test.data'%attribute,'w') for index,user in enumerate(collection.find()): try: label=user['profile'][attribute].index(1) except Exception as e: continue if random.random()>balance_params[label]: continue features={} #features=user['mentions_0'] #features=Counter(user['products']) features=combine_features(user['mentions_0'],Counter(user['products'])) sorted_feature=[] for f in features: if f not in all_features: continue sorted_feature.append((all_features[f],features[f])) user['mentions_1_1']={} for f,v in user['mentions_1_1'].items(): f=f+'_1' if f not in all_features_1: continue sorted_feature.append((all_features_1[f],v)) if 'user_product_vector_from_deepwalk' in user: #if False: start_index=max(all_features_1.values())+1 for i,v in enumerate(user['user_product_vector_from_deepwalk']): v=abs(v) sorted_feature.append((i+start_index,v)) if len(sorted_feature)==0: continue fout.write('%d'%label) keys=map(lambda d:d[0], sorted_feature) if not len(keys)==len(set(keys)): print Counter(keys).values() sorted_feature=sorted(sorted_feature,key=lambda d:d[0]) for f in sorted_feature: fout.write(' %s:%f'%f) fout.write('\n') bar.draw(index+1)
def construct_test_set(attribute): all_features=get_features(feature_file=feature_file_name) all_features_1=get_features(feature_file=base_dir+'/features/mention_1.feature',existent_features=all_features) review_featuers=get_features(feature_file=base_dir+'/features/review.feature',existent_features=all_features_1) collection=Connection().jd.test_users balance_params=get_balance_params(attribute,collection) print balance_params bar=progress_bar(collection.count()) fout=open(RAW_DATA_DIR+'iterate_label2trainset/%s_test.data'%attribute,'w') uid_output=open(RAW_DATA_DIR+'iterate_label2trainset/%s_test_uids.data'%attribute,'w') for index,user in enumerate(collection.find()): try: label=user['profile'][attribute].index(1) except Exception as e: continue if random.random()>balance_params[label]: continue features=combine_features(user['mentions_0'],Counter(user['products'])) sorted_feature=[] for f in features: if f not in all_features: continue sorted_feature.append((all_features[f],features[f])) for f,v in user['mentions_1_1'].items(): f=f+'_1' if f not in all_features_1: continue sorted_feature.append((all_features_1[f],v)) for f,v in Counter(user['review']).items(): if f not in review_featuers: continue sorted_feature.append((review_featuers[f],v)) if len(sorted_feature)==0: continue fout.write('%d'%label) uid_output.write('%s\n'%user['_id']) keys=map(lambda d:d[0], sorted_feature) if not len(keys)==len(set(keys)): print Counter(keys).values() sorted_feature=sorted(sorted_feature,key=lambda d:d[0]) for f in sorted_feature: fout.write(' %s:%f'%f) fout.write('\n') bar.draw(index+1)
def output_all_item_classes(order):
    """Count item classes at depth `order` across jd.weibo_users and write
    'class count' lines (most frequent first) to a feature file."""
    users = Connection().jd.weibo_users
    class_names = []
    bar = progress_bar(users.count())
    for index, user in enumerate(users.find()):
        for behavior in user['behaviors']:
            try:
                class_names.append(behavior['item_class'][order])
            except:
                # behaviors without a class at this depth are skipped
                continue
        bar.draw(index + 1)
    ranked = sorted(Counter(class_names).items(),
                    key=lambda d: d[1],
                    reverse=True)
    fout = open('./item_class_order_%d.feature' % order, 'w')
    for name, freq in ranked:
        fout.write('%s %d\n' % (name.encode('utf8'), freq))
def statistics(attribute,threshold=-1,feature_file_name=base_dir+'/features/mention.feature',show=False): import random collection=Connection().jd.test_users balance_params=get_balance_params(attribute,collection) all_features=get_features(feature_file_name) bar=progress_bar(collection.count()) distribute=dict([f,[0.,0.]] for f in all_features) labels_distribute=[0.,0.] for index,user in enumerate(collection.find()): try: label=user['profile'][attribute].index(1) except: continue #if random.random()>balance_params[label]: # continue features=dict(user['mentions']) products=Counter(user['products']) for p in products: features[p]=products[p] if len(features)<10: continue for f in features: if f in distribute: distribute[f][label]+=1#features[f] labels_distribute[label]+=1 bar.draw(index) for f in distribute.keys(): if sum(distribute[f])<threshold: distribute.pop(f) print labels_distribute for f in distribute: distribute[f][0]/=labels_distribute[0] distribute[f][1]/=labels_distribute[1] for f in distribute: s=sum(distribute[f]) distribute[f][0]/=s distribute[f][1]/=s if not show: return distribute #distribute=filter(lambda d:d[1][0]<d[1][1], distribute.items()) distribute=sorted(distribute.items(),key=lambda d:abs(1-2*(d[1][0]+0.1)/(sum(d[1])+0.1)), reverse=True) #distribute=sorted(distribute,key=lambda d:sum(d[1]), reverse=True) print '' for d in distribute[:50]: print '%s 0:%0.3f 1:%0.3f'%(d[0].encode('utf8'), (d[1][0]+0.1)/(sum(d[1])+0.1),1-(d[1][0]+0.1)/(sum(d[1])+0.1),)
def test(attribute): from pymongo import Connection collection=Connection().jd.test_users bar=progress_bar(collection.count()) labels=dict() for index,user in enumerate(collection.find()): try: label=user['profile'][attribute].index(1) except: continue labels[user['_id']]=label bar.draw(index+1) if index>100000: break score,feature_distribute=statistics(labels,feature_file_name=base_dir+'/features/mention.feature',threshold=20) for f,v in sorted(score.items(),key=lambda d:d[1],reverse=True)[:50]: print f,'0:%0.2f 1:%0.2f'%tuple(feature_distribute[f]) print feature_distribute[u'同学']
def output_name_matrix():
    """Vectorize users' given names with char n-grams (1-3) and dump the
    matrix together with gender labels (m=1, f=0)."""
    from sklearn.feature_extraction.text import CountVectorizer
    lastnames = [
        name.replace('\n', '').decode('utf8') for name in open('./lastname')
    ]
    vectorizer = CountVectorizer(analyzer='char_wb',
                                 ngram_range=(1, 3),
                                 min_df=1)
    from pymongo import Connection
    users = Connection().user_profilling.users
    bar = get_progressive_bar(users.count())
    corpus = []
    finish_count = 0
    uids = []
    y = []
    for user in users.find():
        #if finish_count>1000:
        #    break
        name = user['screen_name']
        normal_name = []
        # keep only parts whose first character is a known last name, with
        # that character stripped (NOTE(review): assumes screen_name is a
        # sequence of name strings -- confirm against the collection schema)
        for n in name:
            if n[0] in lastnames:
                normal_name.append(n[1:])
            else:
                continue
            #normal_name.append(n)
        corpus.append(' '.join(normal_name))
        finish_count += 1
        bar.cursor.restore()
        bar.draw(value=finish_count)
        if user['information']['gender'] == 'm':
            y.append(1)
        else:
            y.append(0)
        uids.append(user['information']['uid'])
    x = vectorizer.fit_transform(corpus)
    fe = vectorizer.get_feature_names()
    for f in fe:
        print f.encode('utf8')
    all_data_x = x.toarray()
    all_data_y = numpy.array(y)
    #dump_train_valid_test(all_data_x,all_data_y,'gender_name.data')
    dump_user_vector(all_data_x, all_data_y, uids, 'user_name_vector.data')
def plot():
    """Scatter-plot, per gender, the label-0 probability of the 2nd and
    3rd characters of users' names (from the pre-computed tf table)."""
    from matplotlib import pyplot as plt
    x_m = []
    y_m = []
    x_f = []
    y_f = []
    from helper import get_progressive_bar
    from pymongo import Connection
    users = Connection().user_profilling.users
    lastnames = [
        name.replace('\n', '').decode('utf8') for name in open('./lastname')
    ]
    bar = get_progressive_bar(users.count())
    finish_count = 0
    # tf: char -> [count_label0, count_label1, ...]
    tf = pickle.load(open('./tf.data'))
    for user in users.find():
        name = user['screen_name']
        finish_count += 1
        if finish_count > 5000:
            break
        for n in name:
            # BUG FIX: the original filter `len(n) > 3 and len(n) < 3` is
            # always False, so names of any length slipped through.  The
            # body indexes n[1] and n[2], so only length-3 names are valid.
            if n[0] not in lastnames or len(n) != 3:
                continue
            try:
                x = 1.0 * tf[n[1]][0] / sum(tf[n[1]])
                y = 1.0 * tf[n[2]][0] / sum(tf[n[2]])
            except:
                # characters missing from tf (or zero totals) are skipped
                continue
            if user['information']['gender'] == 'm':
                x_m.append(x)
                y_m.append(y)
            else:
                x_f.append(x)
                y_f.append(y)
        bar.cursor.restore()
        bar.draw(value=finish_count)
    plt.scatter(x_m, y_m, c='red', label='Male', alpha=0.3)
    plt.scatter(x_f, y_f, c='green', label='Female', alpha=0.3)
    plt.legend()
    plt.grid(True)
    plt.show()
def test(attribute):
    """Build gold labels for up to ~100k jd.test_users, then print the 50
    highest-scoring features reported by statistics()."""
    from pymongo import Connection
    collection = Connection().jd.test_users
    bar = progress_bar(collection.count())
    labels = dict()
    for index, user in enumerate(collection.find()):
        try:
            label = user['profile'][attribute].index(1)
        except:
            continue
        labels[user['_id']] = label
        bar.draw(index + 1)
        # cap the scan for speed
        if index > 100000:
            break
    # NOTE(review): statistics() is called here with a labels dict, unlike
    # the attribute-based variant elsewhere in this file -- confirm which
    # definition is in scope.
    score, feature_distribute = statistics(labels,
                                           feature_file_name=base_dir +
                                           '/features/mention.feature',
                                           threshold=20)
    for f, v in sorted(score.items(), key=lambda d: d[1], reverse=True)[:50]:
        print f, '0:%0.2f 1:%0.2f' % tuple(feature_distribute[f])
    print feature_distribute[u'同学']
def analyze_feature_count(attribute): from small_utils.progress_bar import progress_bar from pymongo import Connection from collections import Counter collection=Connection().jd.test_users bar=progress_bar(collection.count()) x=[] y=[] labels=[] for index,user in enumerate(collection.find()): try: label=user['profile'][attribute].index(1) except: continue labels.append(label) x.append(len(user['products'])) y.append(len(user['mentions'].values())) bar.draw(index) f=open('./tmp.data','w') for i in xrange(len(labels)): f.write('%d %d %d\n'%(labels[i],x[i],y[i])) print Counter(labels)
def construct_train_set(labeled_features, training_count): ''' The format of labeled_feature_file is as the same as mallet ''' all_features = get_features(feature_file_name=feature_file_name) all_features_1 = get_features(feature_file_name=base_dir + '/features/mention_1.feature', start_index=max(all_features.values()) + 1) collection = Connection().jd.train_users bar = progress_bar(collection.count()) confidence = [] for index, user in enumerate(collection.find()): label_distributed = [1, 1] for f, value in combine_features(user['mentions'], Counter(user['products'])).items(): if f in labeled_features: label_distributed[0] *= labeled_features[f][0] * value label_distributed[1] *= labeled_features[f][1] * value s = 1.0 * sum(label_distributed) if not s == 0: label_distributed[0] /= s label_distributed[1] /= s if label_distributed[0] > label_distributed[1]: label = 0 elif label_distributed[0] < label_distributed[1]: label = 1 else: label = -1 features = {} #features=user['mentions_0'] #features=Counter(user['products']) features = combine_features(user['mentions_0'], Counter(user['products'])) sorted_feature = [] for f in features: if f not in all_features: continue sorted_feature.append((all_features[f], features[f])) user['mentions_1_1'] = {} for f, v in user['mentions_1_1'].items(): f = f + '_1' if f not in all_features_1: continue sorted_feature.append((all_features_1[f], v)) if 'user_product_vector_from_deepwalk' in user: #if False: start_index = max(all_features_1.values()) + 1 for i, v in enumerate(user['user_product_vector_from_deepwalk']): v = abs(v) sorted_feature.append((i + start_index, v)) keys = map(lambda d: d[0], sorted_feature) if not len(keys) == len(set(keys)): print Counter(keys).values() sorted_feature = sorted(sorted_feature, key=lambda d: d[0]) str_features = ' '.join(map(lambda f: '%s:%f' % f, sorted_feature)) confidence.append(( user['_id'], label, abs(label_distributed[0] - label_distributed[1]), str_features, 
sum(user['mentions'].values()), )) bar.draw(index + 1) confidence0 = filter(lambda d: d[1] == 0, confidence) confidence0 = sorted(confidence0, key=lambda d: d[2], reverse=True) confidence1 = filter(lambda d: d[1] == 1, confidence) confidence1 = sorted(confidence1, key=lambda d: d[2], reverse=True) confidence2 = filter(lambda d: d[1] == -1, confidence) confidence2 = sorted(confidence2, key=lambda d: d[4], reverse=True) dimention = min(len(confidence0), len(confidence1), training_count / 2) confidence0 = confidence0[:dimention] confidence1 = confidence1[:dimention] confidence2 = confidence2[:dimention] print len(confidence0), len(confidence1) if len(confidence0) == 0 or len(confidence1) == 0: return False labeled_train_data = open(RAW_DATA_DIR + 'multi_clf/labeled_train.data', 'w') for d in confidence0 + confidence1: labeled_train_data.write('%d %s\n' % (d[1], d[3])) unlabeled_train_data = StringIO.StringIO() labeled_train_data = open(RAW_DATA_DIR + 'multi_clf/unlabeled_train.data', 'w') for d in confidence0 + confidence1: unlabeled_train_data.write('%d %s\n' % (d[1], d[3])) return True
def construct_train_set(attribute, training_count): ''' The format of labeled_feature_file is as the same as mallet ''' all_features = get_features(feature_file_name=feature_file_name) all_features_1 = get_features(feature_file_name=base_dir + '/features/mention_1.feature', start_index=max(all_features.values()) + 1) review_featuers = get_features( feature_file_name=base_dir + '/features/review.feature', start_index=max(all_features_1.values()) + 1) labeled_feature_file = open('%s/review_constraint_%s.constraints' % (labeled_feature_file_dir, attribute)) labeled_features = dict() for line in labeled_feature_file: line = line[:-1].split(' ') labeled_features[line[0].decode('utf8')] = map( lambda d: float(d.split(':')[1]), line[1:]) collection = Connection().jd.train_users bar = progress_bar(collection.count()) confidence = [] for index, user in enumerate(collection.find()): label_distributed = [1, 1] for f, value in combine_features(user['mentions'], Counter('products')).items(): if f in labeled_features: label_distributed[0] *= labeled_features[f][0] * value label_distributed[1] *= labeled_features[f][1] * value s = 1.0 * sum(label_distributed) if not s == 0: label_distributed[0] /= s label_distributed[1] /= s if label_distributed[0] > label_distributed[1]: label = 0 elif label_distributed[0] < label_distributed[1]: label = 1 else: label = -1 features = combine_features(user['mentions_0'], Counter(user['products'])) sorted_feature = [] for f in features: if f not in all_features: continue sorted_feature.append((all_features[f], features[f])) user['mentions_1_1'] = {} for f, v in user['mentions_1_1'].items(): f = f + '_1' if f not in all_features_1: continue sorted_feature.append((all_features_1[f], v)) for f, v in Counter(user['review']).items(): if f not in review_featuers: continue sorted_feature.append((review_featuers[f], v)) keys = map(lambda d: d[0], sorted_feature) if not len(keys) == len(set(keys)): print Counter(keys).values() sorted_feature = 
sorted(sorted_feature, key=lambda d: d[0]) str_features = ' '.join(map(lambda f: '%s:%f' % f, sorted_feature)) confidence.append(( user['_id'], label, abs(label_distributed[0] - label_distributed[1]), str_features, sum(user['mentions'].values()), )) bar.draw(index + 1) confidence0 = filter(lambda d: d[1] == 0, confidence) confidence0 = sorted(confidence0, key=lambda d: d[2], reverse=True) confidence1 = filter(lambda d: d[1] == 1, confidence) confidence1 = sorted(confidence1, key=lambda d: d[2], reverse=True) confidence2 = filter(lambda d: d[1] == -1, confidence) confidence2 = sorted(confidence2, key=lambda d: d[4], reverse=True) dimention = min(len(confidence0), len(confidence1), training_count / 2) confidence0 = confidence0[:dimention] confidence1 = confidence1[:dimention] confidence2 = confidence2[:dimention] fout = open(RAW_DATA_DIR + 'mylabel2trainset/%s_train.data' % attribute, 'w') uid_output = open( RAW_DATA_DIR + 'mylabel2trainset/%s_train_uids.data' % attribute, 'w') for d in confidence0 + confidence1: fout.write('%d %s\n' % (d[1], d[3])) uid_output.write('%s\n' % d[0]) fout = open( RAW_DATA_DIR + 'mylabel2trainset/%s_train_unlabel.data' % attribute, 'w') uid_output = open( RAW_DATA_DIR + 'mylabel2trainset/%s_train_unlabel_uids.data' % attribute, 'w') for d in confidence2: fout.write('%d %s\n' % (d[1], d[3])) uid_output.write('%s\n' % d[0])
def construct_train_set(attribute, training_count): ''' The format of labeled_feature_file is as the same as mallet ''' all_features = get_features(feature_file_name=base_dir + '/features/mention.feature') labeled_feature_file = open('%s/review_constraint_%s.constraints' % (labeled_feature_file_dir, attribute)) labeled_features = dict() for line in labeled_feature_file: line = line[:-1].split(' ') labeled_features[line[0].decode('utf8')] = map( lambda d: float(d.split(':')[1]), line[1:]) collection = Connection().jd.train_users bar = progress_bar(collection.count()) confidence = [] for index, user in enumerate(collection.find()): features = dict(Counter(user['products'])) for m in user['mentions']: features[m] = user['mentions'][m] label_distributed = [1, 1] for f, value in user['mentions'].items(): if f in labeled_features: label_distributed[0] *= labeled_features[f][0] * value label_distributed[1] *= labeled_features[f][1] * value s = 1.0 * sum(label_distributed) label_distributed[0] /= s label_distributed[1] /= s #print label_distributed #if abs(label_distributed[0]-label_distributed[1])<0.5: # continue if label_distributed[0] > label_distributed[1]: label = 0 elif label_distributed[0] < label_distributed[1]: label = 1 else: label = -1 sorted_feature = [] for f in features: if f not in all_features: continue sorted_feature.append((all_features[f], features[f])) sorted_feature = sorted(sorted_feature, key=lambda d: d[0]) str_features = ' '.join(map(lambda f: '%s:%d' % f, sorted_feature)) confidence.append( (user['_id'], label, abs(label_distributed[0] - label_distributed[1]), str_features)) bar.draw(index + 1) confidence = sorted(confidence, key=lambda d: d[2], reverse=True) confidence0 = filter(lambda d: d[1] == 0, confidence)[:training_count / 2] confidence1 = filter(lambda d: d[1] == 1, confidence)[:training_count / 2] confidence_unlabel = [] confidence_unlabel += filter(lambda d: d[1] == -1, confidence) #confidence_unlabel+=filter(lambda 
d:d[1]==0,confidence)[training_count/2:training_count*5] #confidence_unlabel+=filter(lambda d:d[1]==1,confidence)[training_count/2:training_count*5] confidence_unlabel = confidence_unlabel[:5 * training_count] print len(confidence0), len(confidence1) fout = open(self_training_file_dir + 'labeled_train_%s.data' % attribute, 'w') for d in set(confidence0 + confidence1): fout.write('%d %s\n' % (d[1], d[3])) fout_unlabel = open( self_training_file_dir + 'unlabeled_train_%s.data' % attribute, 'w') for d in confidence_unlabel: fout_unlabel.write('%d %s\n' % (d[1], d[3]))
def statistics_after_train(attribute,
                           method,
                           threshold=-1,
                           feature_file_name=base_dir +
                           '/features/mention.feature',
                           show=False):
    """Per-feature label distribution over jd.train_users, using the labels
    predicted by `method` after training.

    Returns {feature: [p0, p1]} when show is False; otherwise prints the
    50 most label-skewed features.
    """
    import random
    labels = get_labels_after_train(attribute, method)
    print len(labels)
    collection = Connection().jd.train_users
    label_distribute = Counter(labels.values())
    # down-weighting factors that would balance the predicted labels
    # (currently only computed; the sampling below is disabled)
    balance_params = dict()
    for label in label_distribute:
        balance_params[label] = 1.0 * min(
            label_distribute.values()) / label_distribute[label]
    all_features = get_features(feature_file_name)
    bar = progress_bar(collection.count())
    # feature -> [count with predicted label 0, count with label 1]
    distribute = dict([f, [0., 0.]] for f in all_features)
    for index, user in enumerate(collection.find()):
        try:
            label = labels[user['_id']]
        except:
            continue
        #if random.random()>balance_params[label]:
        #    continue
        features = dict(user['mentions'])
        products = Counter(user['products'])
        for p in products:
            features[p] = products[p]
        for f in features:
            if f in distribute:
                distribute[f][label] += 1
        bar.draw(index)
    # drop rare features (Python 2 keys() returns a list, so pop is safe)
    for f in distribute.keys():
        if sum(distribute[f]) < threshold:
            distribute.pop(f)
    print label_distribute
    for f in distribute:
        distribute[f][0] /= label_distribute[0]
        distribute[f][1] /= label_distribute[1]
    for f in distribute.keys():
        s = sum(distribute[f])
        # features never seen with either label carry no information
        if s == 0:
            distribute.pop(f)
            continue
        distribute[f][0] /= s
        distribute[f][1] /= s
    if not show:
        return distribute
    #distribute=filter(lambda d:d[1][0]<d[1][1], distribute)
    distribute = sorted(distribute.items(),
                        key=lambda d: max(d[1]) / sum(d[1]),
                        reverse=True)
    #distribute=sorted(distribute,key=lambda d:sum(d[1]), reverse=True)
    print ''
    for d in distribute[:50]:
        print '%s 0:%0.3f 1:%0.3f' % (
            d[0].encode('utf8'),
            (d[1][0] + 0.1) / (sum(d[1]) + 0.1),
            1 - (d[1][0] + 0.1) / (sum(d[1]) + 0.1),
        )