def split_linkedin_dump():
    """Split the raw LinkedIn CSV dump into one file per profile.

    Reads D:\\result.csv line by line, skipping the first `skip` records
    (resume offset from earlier runs). Each record whose line starts with a
    quoted profile id is written to C:\\data\\linkedin\\<id>; the id and line
    length are recorded in per-run log and id-map files. Pure I/O side
    effects; returns nothing.
    """
    skip = 2100000  # records already handled by previous runs of this script
    count = 0
    log = codecs.open("C:\\data\\log" + str(skip) + ".txt", 'w', encoding="utf-8")
    id_map = codecs.open("C:\\data\\idmap" + str(skip) + ".txt", 'w', encoding="utf-8")
    linkedin_dump = codecs.open('D:\\result.csv', encoding="utf-8")
    # BUGFIX: was initialised to "" -- a str has no .write(), so the first
    # line not starting with '"' crashed; None lets us guard explicitly.
    out = None
    try:
        next(linkedin_dump)  # discard the CSV header row
        for line in linkedin_dump:
            if count < skip:
                count += 1
                if count % 10000 == 0:
                    print(count)  # skip-phase progress heartbeat
                continue
            print(str(count) + ':' + str(len(line)))
            log.write(str(count) + ' ' + str(len(line)))
            if line[0] == '"':
                x = line.find('",')  # end of the quoted profile id field
                log.write(str(count) + ' ' + line[1:x] + '\n')
                verbose.debug(str(count) + ' ' + line[1:x])
                id_map.write(str(count) + ' ' + line[1:x] + '\n')
                count += 1
                try:
                    if out is not None:
                        out.close()  # BUGFIX: previous profile file was never closed
                    # sanitise the id into a usable filename (strip quotes,
                    # drop any ?query suffix)
                    out = codecs.open(
                        "C:\\data\\linkedin\\"
                        + line[1:x].strip().replace('"', " ").split('?')[0],
                        'w', encoding="utf-8")
                except Exception as e:
                    # Bad filename: record the failure and skip the record
                    # instead of writing it into the previous profile's file.
                    print(e)
                    log.write("[EXCEPTION]" + str(count) + ":" + line + '\n')
                    out = None
                    continue
                out.write(line[x:])
            else:
                # Line without a leading quoted id; log it for inspection.
                log.write("[EXCEPTION]" + str(count) + ":" + line + '\n')
    finally:
        # BUGFIX: none of these handles were closed before.
        for f in (out, log, id_map, linkedin_dump):
            if f is not None:
                f.close()
def plot_hindex():
    """Plot a histogram built from the pickled 'lenin_data' rank values.

    NOTE(review): several lines are leftovers from the stock matplotlib
    histogram demo (mu/sigma, the 'Smarts'/'IQ' labels, the fixed
    [40, 160, 0, 0.03] axes) and do not describe this data set. Also,
    `bins` from plt.hist has 51 edges while `y` has len(sranks) entries,
    so plt.plot(bins, y, ...) will raise unless the lengths coincide --
    confirm before relying on this plot.
    """
    import numpy as np
    import matplotlib.mlab as mlab
    import matplotlib.pyplot as plt
    # hard-coded developer-machine path to the pickled evaluation data
    dump = open("E:\\My Projects\\Eclipse Workspace\\CrossLinking\\src\\preprocess\\lenin_data")
    data = pickle.load(dump)
    ranks = []
    verbose.debug("data loaded")
    for d in data:
        ranks.append(d["rank"])
    sranks = sorted(ranks)
    mu, sigma = 100, 15  # demo leftovers; unused below
    # x = mu + sigma*np.random.randn(10000)
    x = range(0, len(sranks))
    # the histogram of the data
    # (normed= is the pre-deprecation spelling of density=)
    n, bins, patches = plt.hist(x, 50, normed=1, facecolor="green", alpha=0.75)
    # add a 'best fit' line
    y = sranks # mlab.normpdf( bins, mu, sigma)
    l = plt.plot(bins, y, "r--", linewidth=1)
    plt.xlabel("Smarts")
    plt.ylabel("Probability")
    plt.title(r"$\mathrm{Histogram\ of\ IQ:}\ \mu=100,\ \sigma=15$")
    plt.axis([40, 160, 0, 0.03])
    plt.grid(True)
    verbose.debug("show")
    plt.show()
def check_werid_url():
    """Collect AMiner ids of labeled records whose LinkedIn url is a 'view' url.

    Scans mongo.db["lenin_label_data"] for positively labeled (flag == '1'),
    high-ranked (rank < 40000) items whose stored LinkedIn url contains
    'view', logging each AMiner id as it is found.

    Returns:
        list: the matching AMiner ids. (The original built this list and
        discarded it; returning it is backward-compatible.)
    """
    col = mongo.db["lenin_label_data"]
    aid = []
    for item in col.find():
        # 'view'-style urls are the odd url form being hunted here
        if ('view' in item['linkedin']
                and item['rank'] < 40000
                and item['flag'] == '1'):
            verbose.debug(item['aminer'])
            aid.append(item['aminer'])
    return aid
def check_alsoview_in_db():
    """Flag each 'also_view' neighbour of the first 10 profiles as crawled or not.

    For every profile (limit=10), looks each also-viewed url up in
    mongo.person_profiles, stamps alsoview['crawled'] True/False
    accordingly, and saves the modified profile back to the collection.
    """
    index = 0
    count = 0  # running total of also-view neighbours found in the collection
    for item in mongo.person_profiles.find(limit=10):
        verbose.index(index)
        index += 1
        if 'also_view' in item:  # has_key() is the deprecated spelling of `in`
            for alsoview in item['also_view']:
                x = mongo.person_profiles.find({"_id": alsoview['url']})
                if x.count() > 0:
                    alsoview['crawled'] = True
                    # BUGFIX: count was never incremented, so the debug line
                    # below always reported 0.
                    count += 1
                    verbose.debug(alsoview['url'])
                    verbose.debug(count)
                else:
                    alsoview['crawled'] = False
        # persist the crawled flags (presumably intended per profile --
        # the collapsed original's indentation is ambiguous here)
        mongo.person_profiles.save(item)
def check_if_labeled_data_exist():
    """Find labeled records whose LinkedIn profile is missing from the dump.

    For each labeled_data item, checks temp_person_profiles first by _id
    (the stored linkedin id) and then by raw url; records found by neither
    lookup are written one-per-line to the file 'urls'.

    Returns:
        list: the missing urls. (The original discarded this list;
        returning it is backward-compatible.)
    """
    col = mongo.db["labeled_data"]
    col61 = mongo.db['temp_person_profiles']
    not_in_db = []
    index = 0  # (removed unused locals `ids` and `count` from the original)
    for item in col.find():
        verbose.index(index, 1000)
        index += 1
        # primary lookup by profile id, then fall back to the raw url
        if col61.find({"_id": item['linkedin']}).count() == 0:
            if col61.find({'url': item['url']}).count() == 0:
                verbose.debug("not in")
                verbose.debug(item['linkedin'])
                not_in_db.append(item['url'])
    out = codecs.open('urls', 'w', 'utf-8')
    try:
        for u in not_in_db:
            out.write(u + '\n')
    finally:
        out.close()  # BUGFIX: close even if a write fails
    return not_in_db
def plot_labeled_linkedin_degree():
    """Histogram the in-dump 'also viewed' degree of every labeled profile.

    A neighbour counts toward a profile's degree when its url exists in
    temp_person_profiles or, failing that, in
    temp_alsoview_person_profiles; neighbours found only via the fallback
    are tracked in `recovered`. Shows a 10-bin green histogram.
    """
    profiles = mongo.db["temp_person_profiles"]
    degrees = []
    recovered = []  # linkedin_ids present only in the also-view collection
    for profile in profiles.find():
        if "also_view" not in profile:
            degrees.append(0)
            continue
        neighbour_count = 0
        for neighbour in profile["also_view"]:
            hit = mongo.db["temp_person_profiles"].find({"_id": neighbour["url"]})
            if hit.count() > 0:
                neighbour_count += 1
                continue
            fallback = mongo.db["temp_alsoview_person_profiles"].find(
                {"_id": neighbour["url"]})
            if fallback.count() > 0:
                neighbour_count += 1
                recovered.append(neighbour["linkedin_id"])
                verbose.debug(profile["_id"])
        degrees.append(neighbour_count)
    plt.hist(degrees, 10, facecolor="green")
    plt.show()
def plot_labeled_linkedin_degree():
    """Plot a histogram of in-dump 'also view' degrees of labeled profiles.

    NOTE(review): functional duplicate of the identically named definition
    earlier in this file (the two differ only in quote style). Being
    defined later, this copy shadows the first at import time -- consider
    removing one of them.
    """
    col = mongo.db['temp_person_profiles']
    degree = []  # per-profile count of also-view neighbours found in the db
    missed = []  # linkedin_ids found only via the also-view collection
    for item in col.find():
        if item.has_key('also_view'):
            deg = 0
            for i in item['also_view']:
                # first try the main labeled-profile collection
                x = mongo.db['temp_person_profiles'].find({'_id':i['url']})
                if x.count()>0:
                    deg +=1
                else:
                    # fall back to profiles crawled through also-view links
                    x = mongo.db['temp_alsoview_person_profiles'].find({'_id':i['url']})
                    if x.count()>0:
                        deg +=1
                        missed.append(i['linkedin_id'])
                        verbose.debug(item['_id'])
            degree.append(deg)
        else:
            # profile has no also-view list at all
            degree.append(0)
    plt.hist(degree,10, facecolor='green')
    plt.show()
def compare():
    """Collect and pickle id/url lists for matched AMiner-LinkedIn records.

    Pass 1: walk aminer_linkedin_1123, gathering every AMiner id and
    LinkedIn url (records missing either side are logged) and pickling the
    two lists to disk. Pass 2: walk the positively labeled pairs in
    aminer_linkedin_labeled_1124. Pass 3: reload the pickled 'lenin_data'
    set and collect its aminer/linkedin columns.

    Returns:
        tuple: (labeled_aid, labeled_lid, xaid, xlid). (The original built
        these lists and discarded them; returning them is
        backward-compatible.)
    """
    col = mongo.db["aminer_linkedin_1123"]
    aid = []
    lid = []
    index = 0
    for item in col.find():
        if index % 1000 == 0:
            print(index)  # progress heartbeat
        index += 1
        if 'aminer' not in item:
            verbose.debug('aminer ' + item['name'])
        else:
            for a in item['aminer']:
                aid.append(a['id'])
        if 'linkedin' not in item:
            verbose.debug('linkedin ' + item['name'])
        else:
            for l in item['linkedin']:
                lid.append(l['url'])
    # BUGFIX: the dump files were never closed/flushed; `with` guarantees it.
    with open('aminer_linkedin_1123_aminer_id', 'w') as dump_aid:
        pickle.dump(aid, dump_aid)
    with open('aminer_linkedin_1123_linkedin_url', 'w') as dump_lid:
        pickle.dump(lid, dump_lid)

    col = mongo.db['aminer_linkedin_labeled_1124']
    labeled_aid = []
    labeled_lid = []
    # a pair counts as labeled-positive when any of the three match flags is set
    labeled_data = col.find({'$or': [{'labels.homepage_match': True},
                                     {'labels.domain_match': True},
                                     {'labels.aff_match': True}]})
    index = 0
    print("start")
    for item in labeled_data:
        if index % 1000 == 0:
            print(index)
        index += 1
        verbose.debug(item['name'] + ' ' + str(item['_id']))
        labeled_aid.append(item['aminer']['id'])
        labeled_lid.append(item['linkedin']['url'])
    # (removed: a throwaway list of str(labeled_aid) entries that was built
    # and never used)
    with open("E:\\My Projects\\Eclipse Workspace\\CrossLinking\\src\\preprocess\\lenin_data") as dump:
        data = pickle.load(dump)
    xaid = []
    xlid = []
    for person in data:
        xaid.append(person['aminer'])
        xlid.append(person['linkedin'])
    return labeled_aid, labeled_lid, xaid, xlid