def doc2vec_embedding(file_name): import sys import gensim from pymongo import Connection users = Connection().jd.jd_users dimensionality_size = 200 window_size = 8 workers = 5 min_count = 5 # load sentences finish_count = 0 total_count = users.find({'got_review': True}).count() #total_count=users.count() sentences = [] print total_count old_review = '' for user in users.find({'got_review': True}): #for user in users.find(): if finish_count % 10000 == 0: sys.stdout.write("\r%f" % (finish_count * 1.0 / total_count)) sys.stdout.flush() finish_count += 1 content = [] for behavior in user['behaviors']: #content.append(str(behavior['item'])) #content.append(behavior['item_class'][0]) review = ' '.join(behavior['review']['parsed_review_general']) if review == old_review: continue old_review == review content += review.split() for ch in [' ', '\n', '\r', '\u3000']: while 1: try: content.remove(ch) except: break #print ' '.join(content) if len(content) < 10: continue sentence = gensim.models.doc2vec.LabeledSentence( words=content, labels=['USER_%d' % user['_id']]) sentences.append(sentence) print 'load corpus completed...' # train word2vc model = gensim.models.Doc2Vec(sentences, size=200, window=7, workers=20, min_count=3, sample=1e-3) model.save_word2vec_format('/mnt/data1/adoni/jd_data/vectors/' + file_name + '.data', binary=False) print 'embedding done' return model
def insert_LINE_vector(file_name=RAW_DATA_DIR + 'normalize2.data'):
    """Read LINE embeddings from a word2vec-style text file and store each
    vector on the matching train/test user document.

    File format: header line "<count> <dim>", then one line per vector:
    "<id> <v1> ... <vdim>".  Each vector is written to the user document
    under 'user_product_vector_from_line'.
    """
    vectors = dict()
    fin = open(file_name)
    line = fin.readline().strip().split(' ')
    # header: number of vectors and their dimensionality
    count, dimention = int(line[0]), int(line[1])
    bar = progress_bar(count)
    for index in xrange(count):
        line = fin.readline()
        line = line.strip().split(' ')
        vector = map(lambda d: float(d), line[1:])
        vectors[line[0]] = vector
        bar.draw(index + 1)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    for index, user in enumerate(collection.find()):
        if user['_id'] not in vectors:
            # NOTE(review): this zero vector is only stored in the local dict;
            # the `continue` skips the DB update, so missing train users are
            # never actually written back — confirm whether that is intended.
            vectors[user['_id']] = [0.] * dimention
            continue
        collection.update(
            {'_id': user['_id']},
            {'$set': {
                'user_product_vector_from_line': vectors[user['_id']]
            }})
        bar.draw(index + 1)
    collection = Connection().jd.test_users
    bar = progress_bar(collection.count())
    for index, user in enumerate(collection.find()):
        if user['_id'] not in vectors:
            continue  # test users without a vector are simply skipped
        collection.update(
            {'_id': user['_id']},
            {'$set': {
                'user_product_vector_from_line': vectors[user['_id']]
            }})
        bar.draw(index + 1)
class Application(tornado.web.Application):
    """Tornado application backed by the `vapour.urls` Mongo collection."""

    def __init__(self, handlers, **settings):
        tornado.web.Application.__init__(self, handlers, **settings)
        self.collection = Connection().vapour.urls
        self.templates = TemplateLookup(directories=["templates"])

    def get_link_by_id(self, id):
        """Fetch a single link document by its UUID string."""
        record = self.collection.find_one({'_id': uuid.UUID(id)})
        return fix_id(record)

    def get_links_by_tag(self, tag):
        """Case-insensitive regex search over the 'tags' field."""
        matcher = re.compile(tag, re.I)
        return fix_ids(self.collection.find({'tags': matcher}))

    def get_links_by_url(self, url):
        """Case-insensitive regex search over the stored URLs."""
        matcher = re.compile(url, re.I)
        return fix_ids(self.collection.find({'url': matcher}))

    def insert_link(self, url, desc, tags):
        """Store a new link document and return the insert result."""
        document = {
            '_id': uuid.uuid4(),
            'url': url,
            'desc': desc,
            'tags': tags,
            'added': datetime.datetime.utcnow(),
        }
        return self.collection.insert(document)
class Food(object):
    """Tiny wrapper around the `food.choices` Mongo collection."""

    def __init__(self):
        self.db = Connection()["food"]["choices"]

    def add(self, name):
        """Upsert a choice, lower-cased so lookups are case-insensitive."""
        normalized = str(name).lower()
        self.db.update({'name': normalized}, {'name': normalized}, upsert=True)

    def remove(self, name):
        """Delete a choice by its exact name."""
        self.db.remove({'name': name})

    def get_all(self):
        """Return every stored choice name; empty list on any DB error."""
        names = []
        try:
            for document in self.db.find():
                names.append(document['name'])
        except:
            pass
        return names

    def choose(self):
        """Pick one choice at random; "unknown" if empty or unreachable."""
        try:
            names = [document['name'] for document in self.db.find()]
            return names[random.randrange(len(names))]
        except:
            return "unknown"
def doc2vec_embedding(file_name): import sys import gensim from pymongo import Connection users=Connection().jd.jd_users dimensionality_size=200 window_size=8 workers=5 min_count=5 # load sentences finish_count=0 total_count=users.find({'got_review':True}).count() #total_count=users.count() sentences = [] print total_count old_review='' for user in users.find({'got_review':True}): #for user in users.find(): if finish_count%10000==0: sys.stdout.write("\r%f"%(finish_count*1.0/total_count)) sys.stdout.flush() finish_count+=1 content=[] for behavior in user['behaviors']: #content.append(str(behavior['item'])) #content.append(behavior['item_class'][0]) review=' '.join(behavior['review']['parsed_review_general']) if review==old_review: continue old_review==review content+=review.split() for ch in [' ','\n','\r','\u3000']: while 1: try: content.remove(ch) except: break #print ' '.join(content) if len(content)<10: continue sentence = gensim.models.doc2vec.LabeledSentence(words=content,labels=['USER_%d'%user['_id']]) sentences.append(sentence) print 'load corpus completed...' # train word2vc model = gensim.models.Doc2Vec(sentences,size=200,window=7, workers=20,min_count=3,sample=1e-3) model.save_word2vec_format('/mnt/data1/adoni/jd_data/vectors/'+file_name+'.data',binary=False) print 'embedding done' return model
def update_min_max_sum(entity):
    '''
    Used to insert min max and sum

    For each mention feature, scan every train user of the given entity and
    record the min, max and sum of that mention's values plus the number of
    users carrying it, then insert one document per mention into
    train_<entity>_mentions as {'_id': mention, 'distribute': [min, max, sum, user_count]}.
    '''
    collection = Connection().jd['train_%s_mentions' % entity]
    collection_user = Connection().jd['train_%ss' % entity]
    # one mention feature per line in the feature file
    mentions = [line[:-1].decode('utf8') for line in open('../features/mention.feature')]
    min_d = dict()
    max_d = dict()
    sum_d = dict()
    sum_u_d = dict()  # number of users having each mention
    for m in mentions:
        min_d[m] = float('inf')
        max_d[m] = -1
        sum_d[m] = 0
        sum_u_d[m] = 0
    for user in collection_user.find():
        for m in user['mentions']:
            # NOTE(review): raises KeyError if a user mention is missing from
            # the feature file — presumably the file covers all mentions.
            v = user['mentions'][m]
            if v < min_d[m]:
                min_d[m] = v
            if v > max_d[m]:
                max_d[m] = v
            sum_d[m] += v
            sum_u_d[m] += 1
    for m in mentions:
        collection.insert({'_id': m, 'distribute': [min_d[m], max_d[m], sum_d[m], sum_u_d[m]]})
def generate_name_feature():
    """Collect given-name substrings (screen name minus the surname) for
    users whose name starts with a known Chinese surname, then run
    feature_selection_df over the resulting corpus."""
    from pymongo import Connection
    lastnames = [
        name.replace('\n', '').decode('utf8') for name in open('./lastname')
    ]
    from pymongo import Connection  # NOTE(review): duplicate import, harmless
    users = Connection().user_profilling.users
    bar = get_progressive_bar(users.count())
    corpus = []
    finish_count = 0
    y = []  # NOTE(review): never appended to — labels are unused here
    for user in users.find():
        name = user['screen_name']
        normal_name = ''
        # presumably `name` iterates candidate name segments and the last
        # segment starting with a surname wins — TODO confirm with callers
        for n in name:
            if n[0] in lastnames:
                normal_name = n
            else:
                continue
        if normal_name == '':
            continue
        if len(normal_name) < 2:
            continue
        corpus.append(normal_name[1:])  # drop the leading surname character
        finish_count += 1
        bar.cursor.restore()
        bar.draw(value=finish_count)
    feature_selection_df(corpus)
class DaoSunPosition():
    """DAO for sun-position records stored in the `rdam.sunpos` collection."""

    def __init__(self):
        self.col = Connection()['rdam']['sunpos']
        self.bulk = []

    def create_datetime_index(self):
        """Index the datetime field to speed up range queries."""
        self.col.create_index('datetime')

    def persist(self, sunpos):
        """Queue a record for the next flush (no DB round-trip yet)."""
        self.bulk.append(sunpos)

    def flush(self):
        """Write all queued records in a single insert and clear the queue."""
        self.col.insert(self.bulk)
        self.bulk = []

    def find_within_time(self, start_date, end_date):
        """Return SunPosition objects strictly between the two datetimes."""
        query = {'$and': [{'datetime': {'$gt': start_date}},
                          {'datetime': {'$lt': end_date}}]}
        found = []
        for document in self.col.find(query):
            found.append(SunPosition(document['az'], document['el'],
                                     document['datetime']))
        return found
def construct_train_data():
    """Build mallet-style training rows [uid, label, feature_dict] for every
    train user not reserved for testing, and write them out.

    The label is a coin flip — presumably train rows are unlabeled here and
    get real labels downstream; TODO confirm.
    """
    import random
    all_features = get_features(feature_file_name=feature_file_name)
    # review features are indexed after the last base feature id
    review_features = get_features(feature_file_name=base_dir +
                                   '/features/review.feature',
                                   start_index=max(all_features.values()) + 1)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    data = []
    uids = get_test_uids()
    for index, user in enumerate(collection.find()):
        uid = user['_id']
        if uid in uids:
            continue  # never train on users reserved for testing
        features = combine_features(user['mentions'], Counter(user['products']))
        x = dict()
        for f in features:
            if f not in all_features:
                continue
            x[all_features[f]] = features[f]
        for f, v in Counter(user['review']).items():
            if f not in review_features:
                continue
            x[review_features[f]] = v
        y = random.randint(0, 1)  # placeholder label
        data.append([uid, y, x])
        bar.draw(index + 1)
    output(RAW_DATA_DIR + 'mallet/mallet_train.data', data)
def construct_test_data(attribute):
    """Build mallet-style test rows [uid, label, feature_dict] for users
    labeled with the given attribute; the label is the index of the 1 in
    the attribute's one-hot profile vector."""
    collection = Connection().jd.test_users
    all_features = get_features(feature_file_name=feature_file_name)
    # review features are indexed after the last base feature id
    review_features = get_features(feature_file_name=base_dir +
                                   '/features/review.feature',
                                   start_index=max(all_features.values()) + 1)
    data = []
    bar = progress_bar(collection.count())
    for index, user in enumerate(collection.find()):
        uid = user['_id']
        features = combine_features(user['mentions'], Counter(user['products']))
        try:
            # index of the 1 in the one-hot vector; skip unlabeled users
            y = user['profile'][attribute].index(1)
        except:
            continue
        x = dict()
        for f in features:
            if f not in all_features:
                continue
            x[all_features[f]] = features[f]
        for f, v in Counter(user['review']).items():
            if f not in review_features:
                continue
            x[review_features[f]] = v
        data.append([uid, y, x])
        bar.draw(index + 1)
    #data=balance(data,target_index=1)
    output(RAW_DATA_DIR + 'mallet/mallet_test_%s.data' % attribute, data)
def construct_test_data(attribute):
    """Build mallet-style test rows [uid, label, feature_dict] for users
    labeled with the given attribute; the label is the index of the 1 in
    the attribute's one-hot profile vector."""
    collection = Connection().jd.test_users
    all_features = get_features(feature_file_name=feature_file_name)
    # review features are indexed after the last base feature id
    review_features = get_features(feature_file_name=base_dir +
                                   '/features/review.feature',
                                   start_index=max(all_features.values()) + 1)
    data = []
    bar = progress_bar(collection.count())
    for index, user in enumerate(collection.find()):
        uid = user['_id']
        features = combine_features(user['mentions'], Counter(user['products']))
        try:
            # index of the 1 in the one-hot vector; skip unlabeled users
            y = user['profile'][attribute].index(1)
        except:
            continue
        x = dict()
        for f in features:
            if f not in all_features:
                continue
            x[all_features[f]] = features[f]
        for f, v in Counter(user['review']).items():
            if f not in review_features:
                continue
            x[review_features[f]] = v
        data.append([uid, y, x])
        bar.draw(index + 1)
    #data=balance(data,target_index=1)
    output(RAW_DATA_DIR + 'mallet/mallet_test_%s.data' % attribute, data)
def construct_train_data():
    """Build mallet-style training rows [uid, label, feature_dict] for every
    train user not reserved for testing, and write them out.

    The label is a coin flip — presumably train rows are unlabeled here and
    get real labels downstream; TODO confirm.
    """
    import random
    all_features = get_features(feature_file_name=feature_file_name)
    # review features are indexed after the last base feature id
    review_features = get_features(feature_file_name=base_dir +
                                   '/features/review.feature',
                                   start_index=max(all_features.values()) + 1)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    data = []
    uids = get_test_uids()
    for index, user in enumerate(collection.find()):
        uid = user['_id']
        if uid in uids:
            continue  # never train on users reserved for testing
        features = combine_features(user['mentions'], Counter(user['products']))
        x = dict()
        for f in features:
            if f not in all_features:
                continue
            x[all_features[f]] = features[f]
        for f, v in Counter(user['review']).items():
            if f not in review_features:
                continue
            x[review_features[f]] = v
        y = random.randint(0, 1)  # placeholder label
        data.append([uid, y, x])
        bar.draw(index + 1)
    output(RAW_DATA_DIR + 'mallet/mallet_train.data', data)
def output_graph_matrix():
    """Collect graph-embedding vectors and gender labels (0=f, 1=m) for
    users that have an int_id, then dump them keyed by weibo uid."""
    from pymongo import Connection
    users = Connection().user_profilling.users
    graph = Connection().user_profilling.graph_embedding
    print graph.count()
    bar = get_progressive_bar(users.count())
    x = []
    y = []
    finish_count = 0
    uids = []
    # only users that were mapped to an integer graph id
    for user in users.find({'int_id': {
            '$exists': True
    }}, {
            'information': 1,
            'int_id': 1
    }):
        finish_count += 1
        print finish_count
        #bar.cursor.restore()
        #bar.draw(value=finish_count)
        user_embedding = graph.find_one({'_id': user['int_id']})
        if user_embedding is None:
            print user_embedding
            continue  # no embedding computed for this graph id
        gender = user['information']['gender']
        if gender == 'f':
            y.append(0)
        else:
            y.append(1)
        x.append(user_embedding['embedding'])
        uids.append(user['information']['uid'])
    #dump_train_valid_test(x,y,'gender_graph.data')
    dump_user_vector(x, y, uids, 'user_graph_vector.data')
def update_user_id():
    """Map weibo string uids to integer graph ids (from id_map.txt) and store
    each user's int_id back into Mongo; finally print overlap statistics."""
    GRAPH_DATA_DIR = '/mnt/data1/weibo_graph/'
    id_map_file = open(GRAPH_DATA_DIR + 'id_map.txt')
    uids = dict()
    total_count = 107628903  # known line count of id_map.txt (for the bar)
    finish_count = 0
    #bar=get_progressive_bar(total_count=total_count)
    for line in id_map_file:
        # each line: "<string_uid> <int_id>"
        line = line.replace('\n', '').split(' ')
        uids[line[0]] = line[1]
        finish_count += 1
        #bar.cursor.restore()
        #bar.draw(value=finish_count)
    #uids=set(uids)
    from pymongo import Connection
    users = Connection().user_profilling.users
    count = 0
    finish_count = 0
    u = set()
    for user in users.find({}, {'uid': True}):
        finish_count += 1
        uid = user['uid']
        u.add(uid)
        try:
            int_id = uids[uid]
        except Exception as e:
            continue  # user not present in the graph id map
        users.update({'_id': user['_id']}, {'$set': {'int_id': int_id}})
        #bar.cursor.restore()
        #bar.draw(value=finish_count)
    # report how many Mongo users were covered by the id map
    uids = set(uids.keys())
    together = uids & u
    print len(together)
    print len(uids)
    print len(u)
def output_review_star_matrix(feature_length=1000):
    """Build one-hot review-star (0-5) vectors per JD weibo user and dump
    them keyed by user id.

    `feature_length` is accepted for interface compatibility but unused —
    the star scale is fixed at 6 buckets (0..5).
    """
    from pymongo import Connection
    feature_map = {}
    for i in range(0, 6):
        feature_map[str(i)] = i
    all_x = []
    index = 0
    # progress-bar bookkeeping
    users = Connection().jd.weibo_users
    total_count = users.count()
    bar = progress_bar(total_count)
    finish_count = 0
    uids = []
    for user in users.find():
        features = []
        for behavior in user['behaviors']:
            feature = str(behavior['review']['review_stars'])
            features.append(feature)
        if features == []:
            continue  # user has no reviewed behaviors at all
        vector = get_one_hot_vector(features, feature_map)
        if not vector.any():
            continue  # nothing mapped into the star buckets
        all_x.append(vector)
        uids.append(user['_id'])
        index += 1
        finish_count += 1
        bar.draw(value=finish_count)
    all_x = numpy.array(all_x)
    dump_user_vector(all_x, uids, 'jd_review_star')
    # FIX: removed the unreachable `return dump_train_valid_test(all_x, all_y,
    # 'jd_review_star')` that followed the bare return — it referenced an
    # undefined name `all_y` and could never execute.
    return
def output_user_user_propagate_vectors(order):
    """Dump, per weibo user, the user-user-graph propagation vector of the
    given order (loaded via load_user_user_graph_propagate_vector)."""
    from pymongo import Connection
    all_x = []
    index = 0
    # progress-bar bookkeeping
    users = Connection().jd.weibo_users
    vectors = load_user_user_graph_propagate_vector(order)
    total_count = users.count()
    bar = progress_bar(total_count)
    finish_count = 0
    uids = []
    for user in users.find():
        try:
            vector = vectors[int(user['_id'])]
        except:
            continue  # no propagated vector for this user
        if not vector.any():
            continue  # all-zero vectors carry no signal
        all_x.append(vector)
        uids.append(user['_id'])
        index += 1
        finish_count += 1
        bar.draw(value=finish_count)
    all_x = numpy.array(all_x)
    dump_user_vector(all_x, uids, 'jd_user_user_propagate' + str(order))
    # FIX: removed the unreachable `return dump_train_valid_test(all_x, all_y,
    # 'jd_user_user_propagate')` that followed the bare return — it referenced
    # an undefined name `all_y` and could never execute.
    return
def output_sentence_embedding_matrix(file_name1, file_name2):
    """Train a doc2vec model (doc2vec_embedding(file_name1)), look up each
    weibo user's 'USER_<jd_id>' document vector, and dump the matrix under
    file_name2."""
    from pymongo import Connection
    all_x = []
    index = 0
    embedding = doc2vec_embedding(file_name1)
    #embedding=load_doc2vec_embedding(file_name1)
    # progress-bar bookkeeping
    users = Connection().jd.weibo_users
    total_count = users.count()
    bar = progress_bar(total_count)
    finish_count = 0
    uids = []
    count_male = 0  # NOTE(review): never updated — leftover from an older version
    for user in users.find():
        try:
            vector = embedding['USER_%d' % user['jd_id']]
        except:
            continue  # user has no trained document vector
        #if y==-1:
        #    continue
        all_x.append(vector)
        uids.append(user['_id'])
        index += 1
        finish_count += 1
        bar.draw(value=finish_count)
    all_x = numpy.array(all_x)
    #return dump_train_valid_test(all_x, all_y, 'jd_user_embedding')
    #dump_user_vector(all_x, all_y, uids, 'jd_user_embedding_with_item_class')
    dump_user_vector(all_x, uids, file_name2)
def output_goods_class_matrix(order=0):
    """One-hot encode, per user, the item classes (at the given hierarchy
    order) of everything they bought, and dump the matrix.

    NOTE(review): behaviors are indexed with `order-1`, so the default
    order=0 reads item_class[-1] (the last level) while the feature file is
    item_class_order_0 — confirm this off-by-one is intended.
    """
    from pymongo import Connection
    feature_map = {}
    f = open('./features/item_class_order_%d.feature' % order).readlines()
    tmp_feature = []
    for index, line in enumerate(f):
        # first token of each feature-file line is the class name
        tmp_feature.append(line.decode('utf8').split(' ')[0])
    for index, f in enumerate(tmp_feature):
        feature_map[f] = index
    all_x = []
    index = 0
    # progress-bar bookkeeping
    users = Connection().jd.weibo_users
    total_count = users.count()
    bar = progress_bar(total_count)
    finish_count = 0
    uids = []
    for user in users.find():
        features = []
        behaviors = user['behaviors']
        for behavior in behaviors:
            feature = behavior['item_class'][order - 1]
            features.append(feature)
        vector = get_one_hot_vector(features, feature_map)
        if not vector.any():
            continue  # no recognized item class for this user
        all_x.append(vector)
        uids.append(user['_id'])
        index += 1
        finish_count += 1
        bar.draw(value=finish_count)
    all_x = numpy.array(all_x)
    dump_user_vector(all_x, uids, 'jd_item_class_order_' + str(order))
    return
def output_simple_matrix(feature_length=10000):
    """Sparse one-hot encode each user's purchased product ids against the
    first `feature_length` products of the ranked feature file and dump the
    vectors.  Returns (all_x, uids)."""
    from pymongo import Connection
    from collections import Counter
    feature_map = {}
    f = open('./features/product.feature').readlines()
    for i in range(0, len(f)):
        # keep only the first feature_length products from the ranked file
        if feature_length is not None and i >= feature_length:
            break
        feature_map[f[i].decode('utf8').split(' ')[0]] = i
    all_x = []
    index = 0
    # progress-bar bookkeeping
    users = Connection().jd.weibo_users
    total_count = users.count()
    bar = progress_bar(total_count)
    finish_count = 0
    uids = []
    for user in users.find():
        features = []
        for behavior in user['behaviors']:
            feature = str(int(behavior['item']))
            features.append(feature)
        vector = get_one_hot_light_vector(features, feature_map)
        if len(vector) == 0:
            continue  # none of the user's products made it into the map
        all_x.append(vector)
        uids.append(user['_id'])
        index += 1
        finish_count += 1
        bar.draw(value=finish_count)
    all_x = numpy.array(all_x)
    dump_user_vector(all_x, uids, 'jd_user_simple', dimention=len(feature_map))
    return all_x, uids
def output_shopping_tf_matrix(feature_length=3):
    """Build, per user, a vector of the `feature_length` largest purchase
    counts per timestamp (how many purchases happened on the user's busiest
    shopping occasions), and dump the vectors keyed by user id.

    Users with fewer than `feature_length` distinct timestamps are skipped.
    """
    from pymongo import Connection
    all_x = []
    index = 0
    # progress-bar bookkeeping
    users = Connection().jd.weibo_users
    total_count = users.count()
    bar = progress_bar(total_count)
    finish_count = 0
    uids = []
    for user in users.find():
        vector = numpy.zeros((feature_length))
        tf = dict()
        # count purchases per timestamp
        for behavior in user['behaviors']:
            try:
                tf[behavior['timestamp']] += 1
            except KeyError:
                tf[behavior['timestamp']] = 1
        if len(tf) < feature_length:
            continue
        # keep the feature_length largest counts, in descending order
        tf = sorted(tf.iteritems(), key=lambda d: d[1], reverse=True)
        for i in range(0, feature_length):
            vector[i] = tf[i][1]
        all_x.append(vector)
        uids.append(user['_id'])
        index += 1
        finish_count += 1
        bar.draw(value=finish_count)
    all_x = numpy.array(all_x)
    # FIX: the original ended with `all_y = numpy.array(all_y)` and
    # `return dump_train_valid_test(all_x, all_y, 'jd_user_simple')`, but no
    # labels are collected anywhere in this function, so `all_y` was
    # undefined and the call always raised NameError.  Dump the unlabeled
    # user vectors instead, matching the sibling output_* helpers.
    dump_user_vector(all_x, uids, 'jd_shopping_tf')
    return
def age_distribute():
    """Binarize test users' age profile ([<30, >=30], relative to 2015) from
    the linked weibo birthday, write it back to Mongo, and print the
    cumulative age distribution."""
    from small_utils.progress_bar import progress_bar
    from pymongo import Connection
    from collections import Counter
    collection = Connection().jd.test_users
    weibo_collection = Connection().jd.weibo_users
    linked_jd_ids = dict()
    ages = []
    # linked_uids.data lines: "<weibo_id> <jd_id>"; build jd -> weibo map
    for line in open('/mnt/data1/adoni/data/linked_uids.data'):
        linked_jd_ids[line[:-1].split(' ')[1]] = line.split(' ')[0]
    bar = progress_bar(collection.count())
    for index, user in enumerate(collection.find()):
        if sum(user['profile']['age']) == 0:
            continue  # user carries no age label at all
        weibo_id = linked_jd_ids[user['_id']]
        weibo_user = weibo_collection.find_one({'_id': weibo_id})
        if weibo_user == None:
            continue
        # birthday starts with "<year>年"; age measured against 2015
        age = 2015 - int(weibo_user['birthday'].split(u'年')[0])
        if age > 50 or age < 10:
            continue  # drop implausible ages
        ages.append(age)
        if age < 30:
            user['profile']['age'] = [1, 0]
        else:
            user['profile']['age'] = [0, 1]
        collection.update({'_id': user['_id']},
                          {'$set': {'profile': user['profile']}})
        bar.draw(index)
    # print the cumulative distribution function of the observed ages
    s = sum(Counter(ages).values())
    ages = sorted(Counter(ages).items(), key=lambda d: d[0])
    ss = 0.
    for age in ages:
        ss += age[1]
        print age[0], (ss) / s
def output_name_matrix_of_two_words():
    """For users whose screen name is a surname plus exactly two given-name
    characters, build a 2-d feature [tf-ratio(char1), tf-ratio(char2)] from
    the character/gender frequency table in ./tf.data, label by gender
    (1=m, 0=f), and dump a train/valid/test split."""
    from helper import get_progressive_bar
    from pymongo import Connection
    users = Connection().user_profilling.users
    lastnames = [name.replace('\n', '').decode('utf8') for name in open('./lastname')]
    bar = get_progressive_bar(users.count())
    finish_count = 0
    tf = pickle.load(open('./tf.data'))  # char -> [count_gender0, count_gender1]
    x = []
    y = []
    for user in users.find():
        name = user['screen_name']
        finish_count += 1
        if finish_count > 5000:
            break  # cap the sample size
        for n in name:
            # FIX: the original guard was `len(n) > 3 and len(n) < 3`, which is
            # always False, so names of any length slipped through.  The code
            # reads n[1] and n[2], i.e. it needs surname + exactly two
            # characters, hence len(n) must be 3.
            if n[0] not in lastnames or len(n) != 3:
                continue
            try:
                x0 = 1.0 * tf[n[1]][0] / sum(tf[n[1]])
                x1 = 1.0 * tf[n[2]][0] / sum(tf[n[2]])
            except:
                continue  # character missing from the frequency table
            if user['information']['gender'] == 'm':
                y.append(1)
            else:
                y.append(0)
            x.append([x0, x1])
        bar.cursor.restore()
        bar.draw(value=finish_count)
    dump_train_valid_test(x, y, 'gender_name_simple.data')
def output_graph_matrix():
    """Collect graph-embedding vectors and gender labels (0=f, 1=m) for
    users that have an int_id, then dump them keyed by weibo uid."""
    from pymongo import Connection
    users = Connection().user_profilling.users
    graph = Connection().user_profilling.graph_embedding
    print graph.count()
    bar = get_progressive_bar(users.count())
    x = []
    y = []
    finish_count = 0
    uids = []
    # only users that were mapped to an integer graph id
    for user in users.find({'int_id': {'$exists': True}},
                           {'information': 1, 'int_id': 1}):
        finish_count += 1
        print finish_count
        #bar.cursor.restore()
        #bar.draw(value=finish_count)
        user_embedding = graph.find_one({'_id': user['int_id']})
        if user_embedding is None:
            print user_embedding
            continue  # no embedding computed for this graph id
        gender = user['information']['gender']
        if gender == 'f':
            y.append(0)
        else:
            y.append(1)
        x.append(user_embedding['embedding'])
        uids.append(user['information']['uid'])
    #dump_train_valid_test(x,y,'gender_graph.data')
    dump_user_vector(x, y, uids, 'user_graph_vector.data')
def get_tf():
    """Count, per given-name character, how often it appears for each
    gender; returns {char: [count_f, count_m]} (index 0 for 'f', 1 for 'm').

    Only screen-name segments that start with a known surname and are 2-3
    characters long (surname + 1-2 given characters) are counted.
    """
    from helper import get_progressive_bar
    from pymongo import Connection
    users = Connection().user_profilling.users
    lastnames = [name.replace('\n', '').decode('utf8') for name in open('./lastname')]
    bar = get_progressive_bar(users.count())
    finish_count = 0
    tf = dict()
    for user in users.find():
        name = user['screen_name']
        finish_count += 1
        for n in name:
            # FIX: the original guard used `len(n) > 3 and len(n) < 2`, which
            # is always False and therefore filtered nothing; the intent is to
            # keep only segments of length 2 or 3 (surname + 1-2 characters),
            # i.e. reject when too long OR too short.
            if n[0] not in lastnames or len(n) > 3 or len(n) < 2:
                continue
            if user['information']['gender'] == 'm':
                gender = 1
            else:
                gender = 0
            for w in n[1:]:
                if w not in tf:
                    tf[w] = [0, 0]
                tf[w][gender] += 1
        bar.cursor.restore()
        bar.draw(value=finish_count)
    return tf
def update_all():
    """Rebuild/extend the inverted index: for every tag document, strip '.'
    and '$' from each word and append the document's key to that word's
    id list in `reddit.inverted_index`."""
    tags = Connection()["reddit"]["tags"]
    index = Connection()["reddit"]["inverted_index"]
    invalid = ['.', '$']  # characters Mongo does not allow in key names
    for tag in tags.find():
        for key in tag.keys():
            if key != "_id":
                word_list = tag[key]
                for w in word_list:
                    # sanitize the word before using it as an index key
                    for i in invalid:
                        if i in w:
                            w = w.replace(i, '')
                    row = index.find_one({"key": w})
                    if not row:
                        # first occurrence: create the posting list
                        index.insert({"key": w, "ids": [key]})
                    else:
                        # existing word: append this document key to its list
                        print "Updating", w
                        print row, row["ids"]
                        lst = list(row["ids"])
                        print lst, key
                        lst.append(key)
                        new_row = {"key": w, "ids": lst}
                        print new_row
                        index.update({"key": w}, new_row)
def output_review_embedding_matrix():
    """Represent each user as the count-weighted sum of the word vectors of
    the mention terms occurring in their review text, then dump the matrix
    keyed by user id."""
    from helper import get_mentions
    from pymongo import Connection
    from my_vector_reader import read_vectors
    all_x = []
    # progress-bar bookkeeping
    users = Connection().jd.weibo_users
    bar = progress_bar(users.count())
    finish_count = 0
    uids = []
    mentions = get_mentions()
    #review_vocab,review_embedding=read_vectors('/mnt/data1/adoni/jd_data/vectors/word_vectors.data','utf8')
    review_vocab, review_embedding = read_vectors('../myword2vec/word_vectors.data', 'utf8')
    # keep only mentions that have a trained word vector
    mentions = filter(lambda d: d in review_vocab, mentions)
    mention_embedding = map(lambda x: review_embedding[review_vocab.index(x)], mentions)
    vector_size = len(mention_embedding[0])
    for user in users.find():
        x = numpy.zeros(vector_size)
        # concatenate all of the user's raw review text
        review = ' '.join(map(lambda d: d['review']['review_general'], user['behaviors']))
        for index, mention in enumerate(mentions):
            count = review.count(mention)
            x += count * mention_embedding[index]
        if not x.any():
            continue  # no mention occurred in this user's reviews
        all_x.append(x)
        uids.append(user['_id'])
        finish_count += 1
        bar.draw(value=finish_count)
    all_x = numpy.array(all_x)
    dump_user_vector(all_x, uids, 'user_review_embedding')
def output_description_matrix():
    """Bag-of-words encode users' profile descriptions with gender labels
    (1=m, 0=f) and dump a train/valid/test split."""
    from sklearn.feature_extraction.text import CountVectorizer
    vectorizer = CountVectorizer(min_df=1)
    from pymongo import Connection
    users = Connection().user_profilling.users
    bar = get_progressive_bar(users.count())
    corpus = []
    finish_count = 0
    y = []
    for user in users.find():
        if 'descriptions' not in user['information']:
            continue  # user never wrote a description
        description = user['information']['descriptions']
        corpus.append(get_str_description(description))
        finish_count += 1
        bar.cursor.restore()
        bar.draw(value=finish_count)
        if user['information']['gender'] == 'm':
            y.append(1)
        else:
            y.append(0)
    # fit the vocabulary and vectorize the whole corpus in one pass
    x = vectorizer.fit_transform(corpus)
    all_data_x = x.toarray()
    all_data_y = numpy.array(y)
    dump_train_valid_test(all_data_x, all_data_y, 'gender_description.data')
def construct_all_data():
    '''
    The format of labeled_feature_file is as the same as mallet

    Writes every train user as one sparse-feature line
    "<label> <fid>:<val> ..." (label fixed to 0) plus a parallel file of
    uids, combining mentions_1/product features with mentions_1_1 features
    (suffixed '_1' and indexed after the base features).
    '''
    all_features = get_features(feature_file_name=feature_file_name)
    all_features_1 = get_features(feature_file_name=base_dir +
                                  '/features/mention_1.feature',
                                  start_index=max(all_features.values()) + 1)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    fout = open(RAW_DATA_DIR + 'iterate_label2trainset/all_train.data', 'w')
    uid_output = open(RAW_DATA_DIR + 'iterate_label2trainset/all_train_uids.data', 'w')
    for index, user in enumerate(collection.find()):
        label = 0  # placeholder label; real labels are assigned downstream
        fout.write('%d' % label)
        uid_output.write('%s\n' % user['_id'])
        features = combine_features(user['mentions_1'], Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        for f, v in user['mentions_1_1'].items():
            f = f + '_1'  # distinguish second-pass mention features
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f], v))
        # feature ids must be in ascending order in the output format
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        for f in sorted_feature:
            fout.write(' %s:%d' % f)
        fout.write('\n')
        bar.draw(index + 1)
def construct_test_set(attribute):
    """Write a class-balanced sparse-feature test file for the given profile
    attribute; each line is "<label> <fid>:<val> ...".  Balance is enforced
    by randomly down-sampling with the rates from get_balance_params."""
    all_features = get_features(feature_file_name=base_dir + '/features/mention.feature')
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    print balance_params
    bar = progress_bar(collection.count())
    fout = open(self_training_file_dir + 'test_%s.data' % attribute, 'w')
    for index, user in enumerate(collection.find()):
        # product counts plus mention weights form the raw feature dict
        features = dict(Counter(user['products']))
        for m in user['mentions']:
            features[m] = user['mentions'][m]
        try:
            label = user['profile'][attribute].index(1)
        except Exception as e:
            continue  # user not labeled for this attribute
        if random.random() > balance_params[label]:
            continue  # down-sample the over-represented class
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        if len(sorted_feature) == 0:
            continue  # nothing survived feature filtering
        fout.write('%d' % label)
        # feature ids must be in ascending order in the output format
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        for f in sorted_feature:
            fout.write(' %s:%d' % f)
        fout.write('\n')
        bar.draw(index + 1)
def output_name_matrix_of_two_words():
    """For users whose screen name is a surname plus exactly two given-name
    characters, build a 2-d feature [tf-ratio(char1), tf-ratio(char2)] from
    the character/gender frequency table in ./tf.data, label by gender
    (1=m, 0=f), and dump a train/valid/test split."""
    from helper import get_progressive_bar
    from pymongo import Connection
    users = Connection().user_profilling.users
    lastnames = [
        name.replace('\n', '').decode('utf8') for name in open('./lastname')
    ]
    bar = get_progressive_bar(users.count())
    finish_count = 0
    tf = pickle.load(open('./tf.data'))  # char -> [count_gender0, count_gender1]
    x = []
    y = []
    for user in users.find():
        name = user['screen_name']
        finish_count += 1
        if finish_count > 5000:
            break  # cap the sample size
        for n in name:
            # FIX: the original guard was `len(n) > 3 and len(n) < 3`, which is
            # always False, so names of any length slipped through.  The code
            # reads n[1] and n[2], i.e. it needs surname + exactly two
            # characters, hence len(n) must be 3.
            if n[0] not in lastnames or len(n) != 3:
                continue
            try:
                x0 = 1.0 * tf[n[1]][0] / sum(tf[n[1]])
                x1 = 1.0 * tf[n[2]][0] / sum(tf[n[2]])
            except:
                continue  # character missing from the frequency table
            if user['information']['gender'] == 'm':
                y.append(1)
            else:
                y.append(0)
            x.append([x0, x1])
        bar.cursor.restore()
        bar.draw(value=finish_count)
    dump_train_valid_test(x, y, 'gender_name_simple.data')
def get_features_and_labels(attribute):
    """Collect product-count features and one-hot labels for labeled test
    users, then randomly down-sample so each label value is (in expectation)
    equally represented.  Returns (features, labels) dicts keyed by uid."""
    from collections import Counter
    import random
    collection = Connection().jd.test_users
    features = dict()
    labels = dict()
    values = []
    for user in collection.find():
        #if attribute=='kids':
        #    for w in [u'男朋友',u'女朋友',u'孩子',u'宝宝',u'儿子',u'女儿']:
        #        if w in user['mentions']:
        #            user['mentions'].pop(w)
        #if len(user['mentions'])==0:
        #    continue
        if len(user['products']) == 0:
            continue  # no purchase features to learn from
        if sum(user['profile'][attribute]) == 0:
            continue  # unlabeled for this attribute
        features[user['_id']] = dict(Counter(user['products']))
        labels[user['_id']] = numpy.array(user['profile'][attribute],
                                          dtype='float32')
        values.append(str(labels[user['_id']]))
    # acceptance probability per label value = min_class_size / class_size
    values = Counter(values)
    min_value = 1.0 * min(values.values())
    for key in values:
        values[key] = min_value / values[key]
    # .keys() materializes a list in Python 2, so popping while looping is safe
    for uid in features.keys():
        if random.random() > values[str(labels[uid])]:
            features.pop(uid)
            labels.pop(uid)
    return features, labels
def construct_mallet_data(profile_key):
    """Write labeled, class-balanced mallet lines
    "<uid> <label> <word>:<count> ..." built from each user's parsed review
    words, for users labeled with the given profile key."""
    from pymongo import Connection
    from my_progress_bar import progress_bar
    from collections import Counter
    users = Connection().jd.weibo_users
    bar = progress_bar(users.count())
    fout = open(MATRIXES_DIR + 'mallet/construced_data.mallet', 'w')
    data = []
    for index, user in enumerate(users.find()):
        try:
            # index of the 1 in the one-hot profile vector; skip unlabeled users
            label = user['profile'][profile_key].index(1)
        except:
            continue
        reviews = []
        for behavior in user['behaviors']:
            reviews += behavior['parsed_review']['review_general']
        # word:count pairs over all of the user's review words
        reviews = Counter(reviews)
        reviews = ' '.join(map(lambda word: '%s:%d' % (word, reviews[word]),
                               reviews.keys()))
        line = '%s %d %s\n' % (user['_id'], label, reviews)
        data.append((label, line))
    data = balance(data, target_index=0)
    # FIX: the loop below iterated an undefined name `balanced_data`
    # (NameError at runtime); the balanced list is `data`.
    for label, line in data:
        fout.write(line.encode('utf8'))
    bar.draw(index)
def get_all_uids():
    """Return every weibo user _id as a UTF-8 encoded string."""
    from pymongo import Connection
    collection = Connection().jd.weibo_users
    return [doc['_id'].encode('utf8') for doc in collection.find({}, {'_id': 1})]
def construct_test_set(attribute):
    """Write a class-balanced sparse-feature test file for the given profile
    attribute; each line is "<label> <fid>:<val> ...".  Balance is enforced
    by randomly down-sampling with the rates from get_balance_params."""
    all_features = get_features(feature_file_name=base_dir +
                                '/features/mention.feature')
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    print balance_params
    bar = progress_bar(collection.count())
    fout = open(self_training_file_dir + 'test_%s.data' % attribute, 'w')
    for index, user in enumerate(collection.find()):
        # product counts plus mention weights form the raw feature dict
        features = dict(Counter(user['products']))
        for m in user['mentions']:
            features[m] = user['mentions'][m]
        try:
            label = user['profile'][attribute].index(1)
        except Exception as e:
            continue  # user not labeled for this attribute
        if random.random() > balance_params[label]:
            continue  # down-sample the over-represented class
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        if len(sorted_feature) == 0:
            continue  # nothing survived feature filtering
        fout.write('%d' % label)
        # feature ids must be in ascending order in the output format
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        for f in sorted_feature:
            fout.write(' %s:%d' % f)
        fout.write('\n')
        bar.draw(index + 1)
def construct_all_data():
    '''
    The format of labeled_feature_file is as the same as mallet

    Writes every train user as one sparse-feature line
    "<label> <fid>:<val> ..." (label fixed to 0) plus a parallel file of
    uids, combining mentions_1/product features with mentions_1_1 features
    (suffixed '_1' and indexed after the base features).
    '''
    all_features = get_features(feature_file_name=feature_file_name)
    all_features_1 = get_features(feature_file_name=base_dir +
                                  '/features/mention_1.feature',
                                  start_index=max(all_features.values()) + 1)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    fout = open(RAW_DATA_DIR + 'mylabel2trainset/all_train.data', 'w')
    uid_output = open(RAW_DATA_DIR + 'mylabel2trainset/all_train_uids.data', 'w')
    for index, user in enumerate(collection.find()):
        label = 0  # placeholder label; real labels are assigned downstream
        fout.write('%d' % label)
        uid_output.write('%s\n' % user['_id'])
        features = combine_features(user['mentions_1'],
                                    Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        for f, v in user['mentions_1_1'].items():
            f = f + '_1'  # distinguish second-pass mention features
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f], v))
        # feature ids must be in ascending order in the output format
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        for f in sorted_feature:
            fout.write(' %s:%d' % f)
        fout.write('\n')
        bar.draw(index + 1)
def get_tf():
    """Count, per given-name character, how often it appears for each
    gender; returns {char: [count_f, count_m]} (index 0 for 'f', 1 for 'm').

    Only screen-name segments that start with a known surname and are 2-3
    characters long (surname + 1-2 given characters) are counted.
    """
    from helper import get_progressive_bar
    from pymongo import Connection
    users = Connection().user_profilling.users
    lastnames = [
        name.replace('\n', '').decode('utf8') for name in open('./lastname')
    ]
    bar = get_progressive_bar(users.count())
    finish_count = 0
    tf = dict()
    for user in users.find():
        name = user['screen_name']
        finish_count += 1
        for n in name:
            # FIX: the original guard used `len(n) > 3 and len(n) < 2`, which
            # is always False and therefore filtered nothing; the intent is to
            # keep only segments of length 2 or 3 (surname + 1-2 characters),
            # i.e. reject when too long OR too short.
            if n[0] not in lastnames or len(n) > 3 or len(n) < 2:
                continue
            if user['information']['gender'] == 'm':
                gender = 1
            else:
                gender = 0
            for w in n[1:]:
                if w not in tf:
                    tf[w] = [0, 0]
                tf[w][gender] += 1
        bar.cursor.restore()
        bar.draw(value=finish_count)
    return tf
def insert_age_vector():
    """Derive a two-bucket age vector from each weibo user's birthday and
    store it in profile['age']: [0,0] for unknown/implausible years,
    [1,0] for birth years before 1987, [0,1] otherwise.  Prints the final
    distribution of assigned vectors."""
    from collections import Counter
    users = Connection().jd.weibo_users
    all_vec = []
    for user in users.find():
        profile = user['profile']
        if user['birthday'] is None:
            # no birthday recorded at all
            age_vec = [0, 0]
            profile['age'] = age_vec
            users.update({'_id': user['_id']}, {'$set': {'profile': profile}})
            continue
        if u'年' not in user['birthday']:
            # birthday string has no year component
            age_vec = [0, 0]
            profile['age'] = age_vec
            users.update({'_id': user['_id']}, {'$set': {'profile': profile}})
            continue
        # take everything before the year marker as the birth year
        age = user['birthday']
        age = age[0:age.find(u'年')]
        if len(age) < 4:
            age = '19' + age  # two-digit years are assumed to be 19xx
        age = int(age)
        if age < 1950 or age > 2010:
            # implausible birth year
            age_vec = [0, 0]
            profile['age'] = age_vec
            users.update({'_id': user['_id']}, {'$set': {'profile': profile}})
            continue
        if age < 1987:
            age_vec = [1, 0]
        else:
            age_vec = [0, 1]
        profile['age'] = age_vec
        users.update({'_id': user['_id']}, {'$set': {'profile': profile}})
        all_vec.append(str(age_vec))
    print Counter(all_vec)
def update_user_id():
    """Map weibo string uids to integer graph ids (from id_map.txt) and store
    each user's int_id back into Mongo; finally print overlap statistics."""
    GRAPH_DATA_DIR = '/mnt/data1/weibo_graph/'
    id_map_file = open(GRAPH_DATA_DIR + 'id_map.txt')
    uids = dict()
    total_count = 107628903  # known line count of id_map.txt (for the bar)
    finish_count = 0
    #bar=get_progressive_bar(total_count=total_count)
    for line in id_map_file:
        # each line: "<string_uid> <int_id>"
        line = line.replace('\n', '').split(' ')
        uids[line[0]] = line[1]
        finish_count += 1
        #bar.cursor.restore()
        #bar.draw(value=finish_count)
    #uids=set(uids)
    from pymongo import Connection
    users = Connection().user_profilling.users
    count = 0
    finish_count = 0
    u = set()
    for user in users.find({}, {'uid': True}):
        finish_count += 1
        uid = user['uid']
        u.add(uid)
        try:
            int_id = uids[uid]
        except Exception as e:
            continue  # user not present in the graph id map
        users.update({'_id': user['_id']}, {'$set': {'int_id': int_id}})
        #bar.cursor.restore()
        #bar.draw(value=finish_count)
    # report how many Mongo users were covered by the id map
    uids = set(uids.keys())
    together = uids & u
    print len(together)
    print len(uids)
    print len(u)
def construct_all_data():
    '''
    The format of labeled_feature_file is as the same as mallet

    Writes every train user as one sparse-feature line
    "<label> <fid>:<val> ..." (label fixed to 0) plus a parallel file of
    uids, using mention weights merged over product counts.
    '''
    all_features = get_features(feature_file_name=feature_file_name)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    fout = open(RAW_DATA_DIR + 'label2trainset/all_train.data', 'w')
    uid_output = open(RAW_DATA_DIR + 'label2trainset/all_train_uids.data', 'w')
    for index, user in enumerate(collection.find()):
        # product counts plus mention weights form the raw feature dict
        features = dict(Counter(user['products']))
        for m in user['mentions']:
            features[m] = user['mentions'][m]
        label = 0  # placeholder label; real labels are assigned downstream
        fout.write('%d' % label)
        uid_output.write('%s\n' % user['_id'])
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        # feature ids must be in ascending order in the output format
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        for f in sorted_feature:
            fout.write(' %s:%d' % f)
        fout.write('\n')
        bar.draw(index + 1)
def construct_train_set(attribute, training_count):
    '''
    The format of labeled_feature_file is as the same as mallet
    '''
    # Builds a weakly-labeled train set: each user's label distribution is
    # the product of per-feature constraint probabilities weighted by the
    # mention value, normalized, then argmax'd into a 0/1 label.
    # NOTE(review): `training_count` is currently unused — the size-capping
    # code below is commented out; confirm whether that is intentional.
    all_features = get_features(feature_file_name=feature_file_name)
    labeled_feature_file = open('%s/review_constraint_%s.constraints' %
                                (labeled_feature_file_dir, attribute))
    labeled_features = dict()
    for line in labeled_feature_file:
        # Each line: '<feature> 0:<p0> 1:<p1> ...' -> list of probabilities.
        line = line[:-1].split(' ')
        labeled_features[line[0].decode('utf8')] = map(
            lambda d: float(d.split(':')[1]), line[1:])
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    confidence = []
    for index, user in enumerate(collection.find()):
        # Start from a uniform (unnormalized) distribution over the 2 labels.
        label_distributed = [1, 1]
        for f, value in user['mentions'].items():
            if f in labeled_features:
                label_distributed[0] *= labeled_features[f][0] * value
                label_distributed[1] *= labeled_features[f][1] * value
        # NOTE(review): if both products underflow/multiply to 0, s == 0 and
        # the divisions below raise ZeroDivisionError — TODO confirm inputs
        # guarantee at least one positive constraint product.
        s = 1.0 * sum(label_distributed)
        label_distributed[0] /= s
        label_distributed[1] /= s
        if label_distributed[0] > label_distributed[1]:
            label = 0
        elif label_distributed[0] < label_distributed[1]:
            label = 1
        else:
            # Tie: no confident label, skip the user.
            continue
        features = user['mentions']
        #features=Counter(user['products'])
        #features=combine_features(user['mentions'],Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        str_features = ' '.join(map(lambda f: '%s:%f' % f, sorted_feature))
        # Keep (uid, label, confidence margin, serialized features).
        confidence.append(
            (user['_id'], label,
             abs(label_distributed[0] - label_distributed[1]),
             str_features))
        bar.draw(index + 1)
    #confidence=sorted(confidence,key=lambda d:d[2],reverse=True)
    confidence0 = filter(lambda d: d[1] == 0, confidence)
    confidence1 = filter(lambda d: d[1] == 1, confidence)
    #dimention=min(len(confidence0),len(confidence1),training_count/2)
    #confidence0=confidence0#[:dimention]
    #confidence1=confidence1#[:dimention]
    print len(confidence0), len(confidence1)
    # NOTE(review): output handles are never closed; rely on interpreter exit.
    fout = open(RAW_DATA_DIR + 'label2trainset/%s_train.data' % attribute, 'w')
    uid_output = open(
        RAW_DATA_DIR + 'label2trainset/%s_train_uids.data' % attribute, 'w')
    #for d in confidence0+confidence1:
    for d in confidence:
        fout.write('%d %s\n' % (d[1], d[3]))
        uid_output.write('%s\n' % d[0])
def construct_test_set(attribute): all_features = get_features(feature_file_name=feature_file_name) collection = Connection().jd.test_users balance_params = get_balance_params(attribute, collection) print balance_params bar = progress_bar(collection.count()) fout = open(RAW_DATA_DIR + 'label2trainset/%s_test.data' % attribute, 'w') uid_output = open( RAW_DATA_DIR + 'label2trainset/%s_test_uids.data' % attribute, 'w') for index, user in enumerate(collection.find()): try: label = user['profile'][attribute].index(1) except Exception as e: continue if random.random() > balance_params[label]: continue features = user['mentions'] #features=Counter(user['products']) #features=combine_features(user['mentions'],Counter(user['products'])) sorted_feature = [] for f in features: if f not in all_features: continue sorted_feature.append((all_features[f], features[f])) if len(sorted_feature) == 0: continue fout.write('%d' % label) uid_output.write('%s\n' % user['_id']) sorted_feature = sorted(sorted_feature, key=lambda d: d[0]) for f in sorted_feature: fout.write(' %s:%f' % f) fout.write('\n') bar.draw(index + 1)
def update_min_max_sum(entity):
    '''
    Used to insert min max and sum

    For every mention listed in ../features/mention.feature, scan all
    train users and record [min, max, sum, user-count] of its value, then
    insert one document per mention into train_<entity>_mentions.

    Fix: a user mention that is absent from the feature file used to raise
    KeyError on min_d[m]; such mentions are now skipped.
    '''
    collection = Connection().jd['train_%s_mentions' % entity]
    collection_user = Connection().jd['train_%ss' % entity]
    mentions = [
        line[:-1].decode('utf8')
        for line in open('../features/mention.feature')
    ]
    min_d = dict()
    max_d = dict()
    sum_d = dict()
    sum_u_d = dict()
    for m in mentions:
        min_d[m] = float('inf')
        max_d[m] = -1
        sum_d[m] = 0
        sum_u_d[m] = 0
    for user in collection_user.find():
        for m in user['mentions']:
            if m not in min_d:
                # Mention not in the feature vocabulary: ignore instead of
                # crashing with KeyError.
                continue
            v = user['mentions'][m]
            if v < min_d[m]:
                min_d[m] = v
            if v > max_d[m]:
                max_d[m] = v
            sum_d[m] += v
            sum_u_d[m] += 1
    for m in mentions:
        collection.insert({
            '_id': m,
            'distribute': [min_d[m], max_d[m], sum_d[m], sum_u_d[m]]
        })
def output_description_matrix():
    """Build a bag-of-words matrix from user self-descriptions.

    Users without a description are skipped; the gender field ('m' -> 1,
    else 0) supplies the label.  The dense matrix and label array are
    handed to dump_train_valid_test under 'gender_description.data'.
    """
    from sklearn.feature_extraction.text import CountVectorizer
    from pymongo import Connection
    vectorizer = CountVectorizer(min_df=1)
    users = Connection().user_profilling.users
    bar = get_progressive_bar(users.count())
    corpus = []
    labels = []
    done = 0
    for user in users.find():
        info = user['information']
        if 'descriptions' not in info:
            continue
        corpus.append(get_str_description(info['descriptions']))
        done += 1
        bar.cursor.restore()
        bar.draw(value=done)
        labels.append(1 if info['gender'] == 'm' else 0)
    matrix = vectorizer.fit_transform(corpus)
    dump_train_valid_test(matrix.toarray(), numpy.array(labels),
                          'gender_description.data')
def get_features_and_labels(attribute):
    """Collect product-count features and one-hot labels for test users,
    then down-sample so every label value keeps roughly the same count.

    Returns (features, labels): two dicts keyed by user _id, where
    features[uid] maps product -> count and labels[uid] is a float32
    numpy array of the profile attribute vector.
    """
    from collections import Counter
    import random
    collection = Connection().jd.test_users
    features = dict()
    labels = dict()
    label_strings = []
    for user in collection.find():
        # Users with no products or no label information are useless here.
        if len(user['products']) == 0:
            continue
        if sum(user['profile'][attribute]) == 0:
            continue
        uid = user['_id']
        features[uid] = dict(Counter(user['products']))
        labels[uid] = numpy.array(user['profile'][attribute],
                                  dtype='float32')
        label_strings.append(str(labels[uid]))
    counts = Counter(label_strings)
    smallest = 1.0 * min(counts.values())
    # Keep-probability per label value: rarest class keeps everything.
    keep_prob = dict((key, smallest / counts[key]) for key in counts)
    for uid in list(features.keys()):
        if random.random() > keep_prob[str(labels[uid])]:
            features.pop(uid)
            labels.pop(uid)
    return features, labels
def test_calling_delete_on_a_message_returned_removes_it_from_mongodb(
        self):
    # Regression test: Message.delete() on a message returned by
    # Queue.read() must remove the backing document from MongoDB, so a
    # subsequent read() sees an empty queue.
    collection = Connection().karait_test.queue_test
    queue = Queue(database='karait_test', queue='queue_test')
    queue.write(Message({'foo': 1}))
    # Sanity check: the write landed in the raw collection.
    self.assertEqual(1, collection.find({}).count())
    queue.read()[0].delete()
    self.assertEqual(0, len(queue.read()))
def output_user_product_graph():
    """Dump the bipartite user-product edge list ('uid pid' per line)
    for both train and test users into RAW_DATA_DIR/graph.data."""
    fout = open(RAW_DATA_DIR + 'graph.data', 'w')
    # Train edges first, then test edges, exactly as before.
    for collection in (Connection().jd.train_users,
                       Connection().jd.test_users):
        bar = progress_bar(collection.count())
        for index, user in enumerate(collection.find()):
            for pid in user['products']:
                fout.write('%s %s\n' % (user['_id'], pid))
            bar.draw(index + 1)
def get_train_user_products():
    """Return {user_id: {product_id: count}} over all train users.

    Fix: the progress bar was drawn with `index` (0-based), so it never
    reached 100%; every sibling loop in this file uses `index + 1`.
    """
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    user_products = dict()
    for index, user in enumerate(collection.find()):
        user_products[user['_id']] = dict(Counter(user['products']))
        #user_products[user['_id']]=user['mentions']
        bar.draw(index + 1)
    return user_products
def f(l_min, l_max):
    """Compute and store the 2-Selmer data for curves whose level lies in
    [l_min, l_max) and which have no 'sel2' field yet.

    Fix: the parameters l_min/l_max were ignored — the query used the
    globals level_min/level_max, so every call scanned the same range.

    Relies on module globals: address, user, password, selmer2, max_time.
    """
    from pymongo import Connection
    C = Connection(address).research
    C.authenticate(user, password)
    C = C.ellcurves
    query = {'level': {'$gte': l_min, '$lt': l_max},
             'sel2': {'$exists': False}}
    for v in C.find(query):
        # SECURITY: eval() on a DB-stored Weierstrass equation string —
        # acceptable only because the DB is trusted/internal.
        sel2 = selmer2(eval(v['weq']), max_time)
        C.update({'_id': v['_id']}, {'$set': {'sel2': sel2}})
def get_companies_list(mongo_host, start_idx=None):
    """Return a cursor over company ids, ascending by _id.

    When `start_idx` is truthy, only ids >= start_idx are returned,
    allowing a scan to resume from a checkpoint.
    """
    collection = Connection(mongo_host)['crunch']['company']
    criteria = {'_id': {'$gte': start_idx}} if start_idx else {}
    return collection.find(criteria, {'_id': 1}).sort([
        ('_id', pymongo.ASCENDING),
    ])
def construct_test_set(attribute):
    # NOTE(review): this shadows the earlier construct_test_set defined in
    # this file — only the later definition survives at import time.
    # Builds the iterate-label test set: merges three feature spaces
    # (mentions_0+products, mentions_1_1 suffixed '_1', review words) into
    # one indexed feature line per user, class-balanced by balance_params.
    all_features = get_features(feature_file=feature_file_name)
    all_features_1 = get_features(feature_file=base_dir +
                                  '/features/mention_1.feature',
                                  existent_features=all_features)
    review_featuers = get_features(feature_file=base_dir +
                                   '/features/review.feature',
                                   existent_features=all_features_1)
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    print balance_params
    bar = progress_bar(collection.count())
    fout = open(
        RAW_DATA_DIR + 'iterate_label2trainset/%s_test.data' % attribute,
        'w')
    uid_output = open(
        RAW_DATA_DIR + 'iterate_label2trainset/%s_test_uids.data' % attribute,
        'w')
    for index, user in enumerate(collection.find()):
        try:
            # Label = position of the 1 in the one-hot profile vector.
            label = user['profile'][attribute].index(1)
        except Exception as e:
            continue
        # Down-sample over-represented classes.
        if random.random() > balance_params[label]:
            continue
        features = combine_features(user['mentions_0'],
                                    Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        # Second-level mentions get a '_1' suffix before vocabulary lookup.
        for f, v in user['mentions_1_1'].items():
            f = f + '_1'
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f], v))
        for f, v in Counter(user['review']).items():
            if f not in review_featuers:
                continue
            sorted_feature.append((review_featuers[f], v))
        if len(sorted_feature) == 0:
            continue
        fout.write('%d' % label)
        uid_output.write('%s\n' % user['_id'])
        # Diagnostic: duplicate feature indices indicate overlapping
        # vocabularies between the three feature spaces.
        keys = map(lambda d: d[0], sorted_feature)
        if not len(keys) == len(set(keys)):
            print Counter(keys).values()
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        for f in sorted_feature:
            fout.write(' %s:%f' % f)
        fout.write('\n')
        bar.draw(index + 1)
def attribute_statistics(attribute): from collections import Counter print attribute collection = Connection().jd.test_users profiles = [] for user in collection.find(): if sum(user['profile'][attribute]) > 0: profiles.append(str(user['profile'][attribute])) print len(profiles) print Counter(profiles)
def get_train_uids():
    # WARNING(review): despite the name, this function DELETES every
    # document from jd.train_users (it collects all _ids, then issues a
    # delete_one per id) and returns nothing.  Rename or confirm this
    # destructive behavior is intended before calling it.
    collection = Connection().jd.train_users
    uids = set()
    for user in collection.find():
        uids.add(user['_id'])
    # Re-obtain the same collection (redundant, kept as-is).
    collection = Connection().jd.train_users
    bar = progress_bar(len(uids))
    for index, uid in enumerate(uids):
        collection.delete_one({'_id': uid})
        bar.draw(index + 1)
def statistics(attribute,
               threshold=-1,
               feature_file_name=base_dir + '/features/mention.feature',
               show=False):
    # Computes per-feature label-conditional frequencies over test users:
    # distribute[f] ends up as [P(label0|f), P(label1|f)] after two
    # normalization passes (first by label totals, then to sum to 1).
    # With show=False the dict is returned; with show=True the 50 most
    # label-discriminative features are printed instead (no return value).
    import random
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    all_features = get_features(feature_file_name)
    bar = progress_bar(collection.count())
    distribute = dict([f, [0., 0.]] for f in all_features)
    labels_distribute = [0., 0.]
    for index, user in enumerate(collection.find()):
        try:
            label = user['profile'][attribute].index(1)
        except:
            continue
        #if random.random()>balance_params[label]:
        #    continue
        # Merge mentions and product counts into one feature dict;
        # products overwrite mentions on key collision.
        features = dict(user['mentions'])
        products = Counter(user['products'])
        for p in products:
            features[p] = products[p]
        # Skip sparse users: too few features to be informative.
        if len(features) < 10:
            continue
        for f in features:
            if f in distribute:
                distribute[f][label] += 1  #features[f]
        labels_distribute[label] += 1
        bar.draw(index)
    # Drop features seen fewer than `threshold` times overall.
    for f in distribute.keys():
        if sum(distribute[f]) < threshold:
            distribute.pop(f)
    print labels_distribute
    # Normalize counts by per-label totals, then per-feature to sum to 1.
    for f in distribute:
        distribute[f][0] /= labels_distribute[0]
        distribute[f][1] /= labels_distribute[1]
    for f in distribute:
        s = sum(distribute[f])
        distribute[f][0] /= s
        distribute[f][1] /= s
    if not show:
        return distribute
    #distribute=filter(lambda d:d[1][0]<d[1][1], distribute.items())
    # Sort by how far the (smoothed) label split deviates from 50/50.
    distribute = sorted(distribute.items(),
                        key=lambda d: abs(1 - 2 * (d[1][0] + 0.1) /
                                          (sum(d[1]) + 0.1)),
                        reverse=True)
    #distribute=sorted(distribute,key=lambda d:sum(d[1]), reverse=True)
    print ''
    for d in distribute[:50]:
        print '%s 0:%0.3f 1:%0.3f' % (
            d[0].encode('utf8'),
            (d[1][0] + 0.1) / (sum(d[1]) + 0.1),
            1 - (d[1][0] + 0.1) / (sum(d[1]) + 0.1),
        )
def f(l_min, l_max):
    """Compute and store a_p tables for first-isogeny curves whose level
    lies in [l_min, l_max) and which have no 'ap' field yet.

    Fix: the parameters l_min/l_max were ignored — the query used the
    globals level_min/level_max, so every call scanned the same range.

    Relies on module globals: address, user, password, pari, P.
    """
    from pymongo import Connection
    C = Connection(address).research
    C.authenticate(user, password)
    C = C.ellcurves
    query = {'level': {'$gte': l_min, '$lt': l_max},
             'number': 1,
             'ap': {'$exists': False}}
    for v in C.find(query):
        # Build the curve in pari and tabulate a_p for each prime in P.
        E = pari('ellinit(%s,1)' % v['weq'])
        ap = dict([(str(p), int(E.ellap(p))) for p in P])
        C.update({'_id': v['_id']}, {'$set': {'ap': ap}})
def get_word_count(): collection = Connection().jd.train_users count = dict() for user in collection.find(): for m, v in user['mentions'].items(): if m in count: count[m] += v else: count[m] = v for m, v in sorted(count.items(), key=lambda d: d[1]): print m, v
def count_occur(words): from pymongo import Connection occur=dict([(w, 0) for w in words]) collection=Connection().jd.train_users keys=set(occur.keys()) for user in collection.find(): for w in set(user['mentions'].keys())&keys: if w in occur: occur[w]+=1 for w in words: v=occur[w] print '%s\n(%0.2f\\%%)'%(w.encode('utf8'),100.0*v/100000)
def count_attribute(attribute): from pymongo import Connection from collections import Counter collection=Connection().jd.test_users a=[] for user in collection.find(): try: label=user['profile'][attribute].index(1) except: continue a.append(label) print Counter(a),len(a)