def output_graph_matrix(): from pymongo import Connection users=Connection().user_profilling.users graph=Connection().user_profilling.graph_embedding print graph.count() bar=get_progressive_bar(users.count()) x=[] y=[] finish_count=0 uids=[] for user in users.find({'int_id':{'$exists':True}},{'information':1,'int_id':1}): finish_count+=1 print finish_count #bar.cursor.restore() #bar.draw(value=finish_count) user_embedding=graph.find_one({'_id':user['int_id']}) if user_embedding is None: print user_embedding continue gender=user['information']['gender'] if gender=='f': y.append(0) else: y.append(1) x.append(user_embedding['embedding']) uids.append(user['information']['uid']) #dump_train_valid_test(x,y,'gender_graph.data') dump_user_vector(x,y,uids,'user_graph_vector.data')
def output_graph_matrix(): from pymongo import Connection users = Connection().user_profilling.users graph = Connection().user_profilling.graph_embedding print graph.count() bar = get_progressive_bar(users.count()) x = [] y = [] finish_count = 0 uids = [] for user in users.find({'int_id': { '$exists': True }}, { 'information': 1, 'int_id': 1 }): finish_count += 1 print finish_count #bar.cursor.restore() #bar.draw(value=finish_count) user_embedding = graph.find_one({'_id': user['int_id']}) if user_embedding is None: print user_embedding continue gender = user['information']['gender'] if gender == 'f': y.append(0) else: y.append(1) x.append(user_embedding['embedding']) uids.append(user['information']['uid']) #dump_train_valid_test(x,y,'gender_graph.data') dump_user_vector(x, y, uids, 'user_graph_vector.data')
def output_name_matrix_of_two_words():
    """Build a two-feature gender dataset from three-character names.

    For each user name of the form <lastname><c1><c2>, the features are
    the index-0 (female) frequency ratios of c1 and c2 taken from the
    term-frequency table in ./tf.data; the label is 1 for male, 0
    otherwise.  Stops after 5000 users and dumps the result with
    dump_train_valid_test.
    """
    from helper import get_progressive_bar
    from pymongo import Connection
    users = Connection().user_profilling.users
    lastnames = [name.replace('\n', '').decode('utf8')
                 for name in open('./lastname')]
    bar = get_progressive_bar(users.count())
    finish_count = 0
    tf = pickle.load(open('./tf.data'))
    x = []
    y = []
    for user in users.find():
        name = user['screen_name']
        finish_count += 1
        if finish_count > 5000:
            break
        for n in name:
            # BUG FIX: the original test was `len(n)>3 and len(n)<3`,
            # which is always False, so the length filter never applied.
            # The code below reads n[1] and n[2], so only 3-character
            # names (lastname + two given-name characters) are usable.
            if n[0] not in lastnames or len(n) != 3:
                continue
            try:
                x0 = 1.0 * tf[n[1]][0] / sum(tf[n[1]])
                x1 = 1.0 * tf[n[2]][0] / sum(tf[n[2]])
            except (KeyError, ZeroDivisionError):
                # character not in the tf table, or zero total count;
                # narrowed from a bare `except:` that hid every error
                continue
            if user['information']['gender'] == 'm':
                y.append(1)
            else:
                y.append(0)
            x.append([x0, x1])
        bar.cursor.restore()
        bar.draw(value=finish_count)
    dump_train_valid_test(x, y, 'gender_name_simple.data')
def output_description_matrix():
    """Bag-of-words matrix over user description text, labelled by gender
    (m=1, else 0), dumped with dump_train_valid_test.
    """
    from sklearn.feature_extraction.text import CountVectorizer
    vectorizer = CountVectorizer(min_df=1)
    from pymongo import Connection
    users = Connection().user_profilling.users
    bar = get_progressive_bar(users.count())
    corpus = []
    labels = []
    done = 0
    for user in users.find():
        info = user['information']
        if 'descriptions' not in info:
            continue
        corpus.append(get_str_description(info['descriptions']))
        done += 1
        bar.cursor.restore()
        bar.draw(value=done)
        labels.append(1 if info['gender'] == 'm' else 0)
    counts = vectorizer.fit_transform(corpus)
    dump_train_valid_test(counts.toarray(), numpy.array(labels),
                          'gender_description.data')
def get_tf():
    """Count, for every given-name character, how many female and male
    users carry it.

    Only names whose first character is a known last name and whose total
    length is 2-3 characters are counted.  Returns a dict mapping each
    given-name character to a two-element list: index 0 = female count,
    index 1 = male count.
    """
    from helper import get_progressive_bar
    from pymongo import Connection
    users = Connection().user_profilling.users
    lastnames = [name.replace('\n', '').decode('utf8')
                 for name in open('./lastname')]
    bar = get_progressive_bar(users.count())
    finish_count = 0
    tf = dict()
    for user in users.find():
        name = user['screen_name']
        finish_count += 1
        for n in name:
            # BUG FIX: the original test was `len(n)>3 and len(n)<2`,
            # which is always False, so the length filter never applied.
            # The intent is to keep only 2-3 character names
            # (lastname + 1-2 given-name characters).
            if n[0] not in lastnames or len(n) > 3 or len(n) < 2:
                continue
            if user['information']['gender'] == 'm':
                gender = 1
            else:
                gender = 0
            for w in n[1:]:
                if w not in tf:
                    tf[w] = [0, 0]
                tf[w][gender] += 1
        bar.cursor.restore()
        bar.draw(value=finish_count)
    return tf
def output_description_matrix():
    """Vectorize user descriptions into term counts and dump a gender
    classification dataset (m=1, else 0) via dump_train_valid_test.
    """
    from sklearn.feature_extraction.text import CountVectorizer
    vectorizer = CountVectorizer(min_df=1)
    from pymongo import Connection
    users = Connection().user_profilling.users
    bar = get_progressive_bar(users.count())
    corpus = []
    finish_count = 0
    y = []
    for user in users.find():
        # skip users that never filled in a description
        if 'descriptions' not in user['information']:
            continue
        description = user['information']['descriptions']
        corpus.append(get_str_description(description))
        finish_count += 1
        bar.cursor.restore()
        bar.draw(value=finish_count)
        if user['information']['gender'] == 'm':
            y.append(1)
        else:
            y.append(0)
    sparse_counts = vectorizer.fit_transform(corpus)
    all_data_x = sparse_counts.toarray()
    all_data_y = numpy.array(y)
    dump_train_valid_test(all_data_x, all_data_y, 'gender_description.data')
def output_name_matrix_of_two_words():
    """Two-feature gender dataset from <lastname><c1><c2> names.

    Feature i is tf[c_i][0] / sum(tf[c_i]) — the female-frequency ratio of
    the i-th given-name character per ./tf.data.  Label: m=1, else 0.
    Processes at most 5000 users, then dumps with dump_train_valid_test.
    """
    from helper import get_progressive_bar
    from pymongo import Connection
    users = Connection().user_profilling.users
    lastnames = [name.replace('\n', '').decode('utf8')
                 for name in open('./lastname')]
    bar = get_progressive_bar(users.count())
    finish_count = 0
    tf = pickle.load(open('./tf.data'))
    feats = []
    labels = []
    for user in users.find():
        finish_count += 1
        if finish_count > 5000:
            break
        for n in user['screen_name']:
            # BUG FIX: originally `len(n)>3 and len(n)<3` — an always-False
            # condition, so names of the wrong length slipped through and
            # were only rejected via the except below.  n[1] and n[2] are
            # read, hence exactly 3 characters are required.
            if n[0] not in lastnames or len(n) != 3:
                continue
            try:
                ratio_first = 1.0 * tf[n[1]][0] / sum(tf[n[1]])
                ratio_second = 1.0 * tf[n[2]][0] / sum(tf[n[2]])
            except (KeyError, ZeroDivisionError):
                # unseen character or empty counts; bare except narrowed
                continue
            labels.append(1 if user['information']['gender'] == 'm' else 0)
            feats.append([ratio_first, ratio_second])
        bar.cursor.restore()
        bar.draw(value=finish_count)
    dump_train_valid_test(feats, labels, 'gender_name_simple.data')
def get_tf():
    """Build a per-character gender frequency table from user names.

    Returns {character: [female_count, male_count]} over the given-name
    part (everything after the last-name character) of each screen name.
    """
    from helper import get_progressive_bar
    from pymongo import Connection
    users = Connection().user_profilling.users
    lastnames = [name.replace('\n', '').decode('utf8')
                 for name in open('./lastname')]
    bar = get_progressive_bar(users.count())
    finish_count = 0
    tf = dict()
    for user in users.find():
        finish_count += 1
        for n in user['screen_name']:
            # BUG FIX: originally `len(n)>3 and len(n)<2`, an always-False
            # test that disabled the length filter.  Intended: keep only
            # names of 2-3 characters (lastname + 1-2 given characters).
            if n[0] not in lastnames or not (2 <= len(n) <= 3):
                continue
            gender = 1 if user['information']['gender'] == 'm' else 0
            for w in n[1:]:
                if w not in tf:
                    tf[w] = [0, 0]
                tf[w][gender] += 1
        bar.cursor.restore()
        bar.draw(value=finish_count)
    return tf
def output_text_matrix_from_bag_of_words():
    """Bag-of-words matrix over the ./word.feature vocabulary for users
    with at least 50 usable statuses, labelled by gender (m=1, else 0),
    dumped with dump_user_vector.

    Cleanup: removed the unused locals `length` and `index` and the
    redundant alias `text_vector = text` from the original.
    """
    from pymongo import Connection
    # word -> column index, from the fixed feature vocabulary
    words = {}
    for i, line in enumerate(open('./word.feature').readlines()):
        words[line.decode('utf8')[0:-1]] = i  # [0:-1] strips the newline
    all_data_x = []
    all_data_y = []
    uids = []
    # progress-bar bookkeeping
    users = Connection().user_profilling.users
    total_count = users.count()
    bar = get_progressive_bar(total_count)
    finish_count = 0
    #for line in open('./users.data'):
    for user in users.find():
        #user=parse_user(line)
        # require at least 50 usable statuses per user
        correct_status = 0
        for status in user['statuses']:
            if not is_not_good_status(status):
                correct_status += 1
        if correct_status < 50:
            continue
        text = numpy.zeros((len(words)))
        for status in user['statuses']:
            if is_not_good_status(status):
                continue
            for word in status['text']:
                if word in words:
                    text[words[word]] += 1.0
        if not text.any():
            # no vocabulary word appeared at all; skip this user
            continue
        if user['information']['gender'] == 'm':
            all_data_y.append(1)
        else:
            all_data_y.append(0)
        all_data_x.append(text)
        uids.append(user['information']['uid'])
        finish_count += 1
        bar.cursor.restore()
        bar.draw(value=finish_count)
    all_data_x = numpy.array(all_data_x)
    all_data_y = numpy.array(all_data_y)
    #dump_train_valid_test(all_data_x,all_data_y,'gender_text_bag_of_words.data')
    dump_user_vector(all_data_x, all_data_y, uids, 'user_text_bag_words.data')
def output_text_matrix_from_bag_of_words():
    """Dump a per-user word-count matrix (vocabulary = ./word.feature)
    with gender labels (m=1, else 0) via dump_user_vector.

    Users need >= 50 usable statuses and at least one in-vocabulary word.
    Cleanup: dropped the dead locals `length`/`index` and the pointless
    `text_vector = text` alias present in the original.
    """
    from pymongo import Connection
    vocabulary = {}
    feature_lines = open('./word.feature').readlines()
    for col, raw in enumerate(feature_lines):
        vocabulary[raw.decode('utf8')[0:-1]] = col  # drop trailing newline
    features = []
    labels = []
    uids = []
    # progress-bar bookkeeping
    users = Connection().user_profilling.users
    bar = get_progressive_bar(users.count())
    finish_count = 0
    #for line in open('./users.data'):
    for user in users.find():
        #user=parse_user(line)
        usable = sum(1 for s in user['statuses'] if not is_not_good_status(s))
        if usable < 50:
            continue
        counts = numpy.zeros((len(vocabulary)))
        for status in user['statuses']:
            if is_not_good_status(status):
                continue
            for word in status['text']:
                if word in vocabulary:
                    counts[vocabulary[word]] += 1.0
        if not counts.any():
            continue  # nothing from the vocabulary in this user's text
        labels.append(1 if user['information']['gender'] == 'm' else 0)
        features.append(counts)
        uids.append(user['information']['uid'])
        finish_count += 1
        bar.cursor.restore()
        bar.draw(value=finish_count)
    features = numpy.array(features)
    labels = numpy.array(labels)
    #dump_train_valid_test(all_data_x,all_data_y,'gender_text_bag_of_words.data')
    dump_user_vector(features, labels, uids, 'user_text_bag_words.data')
def output_text_matrix_from_vector():
    """Build per-user text vectors from pretrained word embeddings and
    dump them with dump_user_vector (gender label: m=1, else 0).

    Users need >= 50 usable statuses; the per-user vector comes from
    get_text_vector_for_nn over the embeddings of all their words.
    Cleanup: removed the unused locals `word_count`, `length`, `index`
    and the unused exception binding from the original.
    """
    from pymongo import Connection
    users = Connection().user_profilling.users
    word_vectors = get_vectors('/mnt/data1/adoni/word_vectors.bin')
    all_data_x = []
    all_data_y = []
    uids = []
    # progress-bar bookkeeping; the total is a fixed estimate
    total_count = 20000
    bar = get_progressive_bar(total_count)
    finish_count = 0
    for user in users.find():
        # require at least 50 usable statuses per user
        correct_status = 0
        for status in user['statuses']:
            if not is_not_good_status(status):
                correct_status += 1
        if correct_status < 50:
            continue
        text = []
        for status in user['statuses']:
            if is_not_good_status(status):
                continue
            for word in status['text']:
                try:
                    text.append(word_vectors[word])
                except Exception:
                    # word missing from the embedding vocabulary
                    continue
        text_vector = get_text_vector_for_nn(text, window_size=2)
        if text_vector is None:
            continue
        if user['information']['gender'] == 'm':
            all_data_y.append(1)
        else:
            all_data_y.append(0)
        all_data_x.append(text_vector)
        uids.append(user['information']['uid'])
        finish_count += 1
        bar.cursor.restore()
        bar.draw(value=finish_count)
    all_data_x = numpy.array(all_data_x)
    all_data_y = numpy.array(all_data_y)
    #dump_train_valid_test(all_data_x,all_data_y,'gender_text_vector.data')
    dump_user_vector(all_data_x, all_data_y, uids, 'user_text_vectors.data')
def output_text_matrix_from_vector():
    """Dump embedding-based per-user text vectors and gender labels
    (m=1, else 0) via dump_user_vector.

    Cleanup: the original declared `word_count`, `length` and `index`
    without using them, and bound the caught exception to an unused `e`;
    all removed.
    """
    from pymongo import Connection
    users = Connection().user_profilling.users
    word_vectors = get_vectors('/mnt/data1/adoni/word_vectors.bin')
    features = []
    labels = []
    uids = []
    # progress-bar bookkeeping; fixed estimate rather than users.count()
    bar = get_progressive_bar(20000)
    finish_count = 0
    for user in users.find():
        usable = sum(1 for s in user['statuses'] if not is_not_good_status(s))
        if usable < 50:
            continue  # need at least 50 usable statuses
        embeddings = []
        for status in user['statuses']:
            if is_not_good_status(status):
                continue
            for word in status['text']:
                try:
                    embeddings.append(word_vectors[word])
                except Exception:
                    continue  # out-of-vocabulary word
        user_vector = get_text_vector_for_nn(embeddings, window_size=2)
        if user_vector is None:
            continue
        labels.append(1 if user['information']['gender'] == 'm' else 0)
        features.append(user_vector)
        uids.append(user['information']['uid'])
        finish_count += 1
        bar.cursor.restore()
        bar.draw(value=finish_count)
    features = numpy.array(features)
    labels = numpy.array(labels)
    #dump_train_valid_test(all_data_x,all_data_y,'gender_text_vector.data')
    dump_user_vector(features, labels, uids, 'user_text_vectors.data')
def output_name_matrix(): from sklearn.feature_extraction.text import CountVectorizer lastnames = [ name.replace('\n', '').decode('utf8') for name in open('./lastname') ] vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(1, 3), min_df=1) from pymongo import Connection users = Connection().user_profilling.users bar = get_progressive_bar(users.count()) corpus = [] finish_count = 0 uids = [] y = [] for user in users.find(): #if finish_count>1000: # break name = user['screen_name'] normal_name = [] for n in name: if n[0] in lastnames: normal_name.append(n[1:]) else: continue #normal_name.append(n) corpus.append(' '.join(normal_name)) finish_count += 1 bar.cursor.restore() bar.draw(value=finish_count) if user['information']['gender'] == 'm': y.append(1) else: y.append(0) uids.append(user['information']['uid']) x = vectorizer.fit_transform(corpus) fe = vectorizer.get_feature_names() for f in fe: print f.encode('utf8') all_data_x = x.toarray() all_data_y = numpy.array(y) #dump_train_valid_test(all_data_x,all_data_y,'gender_name.data') dump_user_vector(all_data_x, all_data_y, uids, 'user_name_vector.data')
def plot():
    """Scatter-plot per-character female-frequency ratios of the first and
    second given-name characters: red = male users, green = female users.

    Uses the tf table from ./tf.data and at most 5000 users.
    """
    from matplotlib import pyplot as plt
    x_m = []
    y_m = []
    x_f = []
    y_f = []
    from helper import get_progressive_bar
    from pymongo import Connection
    users = Connection().user_profilling.users
    lastnames = [name.replace('\n', '').decode('utf8')
                 for name in open('./lastname')]
    bar = get_progressive_bar(users.count())
    finish_count = 0
    tf = pickle.load(open('./tf.data'))
    for user in users.find():
        name = user['screen_name']
        finish_count += 1
        if finish_count > 5000:
            break
        for n in name:
            # BUG FIX: the original test was `len(n)>3 and len(n)<3`,
            # which is always False; n[1] and n[2] are read below, so
            # only 3-character names are usable.
            if n[0] not in lastnames or len(n) != 3:
                continue
            try:
                x = 1.0 * tf[n[1]][0] / sum(tf[n[1]])
                y = 1.0 * tf[n[2]][0] / sum(tf[n[2]])
            except (KeyError, ZeroDivisionError):
                # unseen character or empty counts; bare except narrowed
                continue
            if user['information']['gender'] == 'm':
                x_m.append(x)
                y_m.append(y)
            else:
                x_f.append(x)
                y_f.append(y)
        bar.cursor.restore()
        bar.draw(value=finish_count)
    plt.scatter(x_m, y_m, c='red', label='Male', alpha=0.3)
    plt.scatter(x_f, y_f, c='green', label='Female', alpha=0.3)
    plt.legend()
    plt.grid(True)
    plt.show()
def output_name_matrix(): from sklearn.feature_extraction.text import CountVectorizer lastnames=[name.replace('\n','').decode('utf8') for name in open('./lastname')] vectorizer = CountVectorizer(analyzer='char_wb',ngram_range=(1,3),min_df=1) from pymongo import Connection users=Connection().user_profilling.users bar=get_progressive_bar(users.count()) corpus=[] finish_count=0 uids=[] y=[] for user in users.find(): #if finish_count>1000: # break name=user['screen_name'] normal_name=[] for n in name: if n[0] in lastnames: normal_name.append(n[1:]) else: continue #normal_name.append(n) corpus.append(' '.join(normal_name)) finish_count+=1 bar.cursor.restore() bar.draw(value=finish_count) if user['information']['gender']=='m': y.append(1) else: y.append(0) uids.append(user['information']['uid']) x = vectorizer.fit_transform(corpus) fe=vectorizer.get_feature_names() for f in fe: print f.encode('utf8') all_data_x=x.toarray() all_data_y=numpy.array(y) #dump_train_valid_test(all_data_x,all_data_y,'gender_name.data') dump_user_vector(all_data_x,all_data_y,uids,'user_name_vector.data')
def plot():
    """Plot (first-char ratio, second-char ratio) points per user, using
    the female-frequency ratios from ./tf.data; male users in red,
    female users in green.  Caps the scan at 5000 users.
    """
    from matplotlib import pyplot as plt
    male_x = []
    male_y = []
    female_x = []
    female_y = []
    from helper import get_progressive_bar
    from pymongo import Connection
    users = Connection().user_profilling.users
    lastnames = [name.replace('\n', '').decode('utf8')
                 for name in open('./lastname')]
    bar = get_progressive_bar(users.count())
    finish_count = 0
    tf = pickle.load(open('./tf.data'))
    for user in users.find():
        finish_count += 1
        if finish_count > 5000:
            break
        for n in user['screen_name']:
            # BUG FIX: originally `len(n)>3 and len(n)<3`, an always-False
            # condition that disabled the length filter; exactly three
            # characters are needed since n[1] and n[2] are indexed.
            if n[0] not in lastnames or len(n) != 3:
                continue
            try:
                first_ratio = 1.0 * tf[n[1]][0] / sum(tf[n[1]])
                second_ratio = 1.0 * tf[n[2]][0] / sum(tf[n[2]])
            except (KeyError, ZeroDivisionError):
                continue  # unseen character or empty counts
            if user['information']['gender'] == 'm':
                male_x.append(first_ratio)
                male_y.append(second_ratio)
            else:
                female_x.append(first_ratio)
                female_y.append(second_ratio)
        bar.cursor.restore()
        bar.draw(value=finish_count)
    plt.scatter(male_x, male_y, c='red', label='Male', alpha=0.3)
    plt.scatter(female_x, female_y, c='green', label='Female', alpha=0.3)
    plt.legend()
    plt.grid(True)
    plt.show()