def load_data(aminer, linkedin): mysql = Mysql() mongo = Mongo() import pickle aminer = pickle.load(open("D:\\Users\\chenwei\\script\\aminer_two")) linkedin = pickle.load(open("D:\\Users\\chenwei\\script\\linkedin_two_filter")) print aminer.number_of_nodes() print linkedin.number_of_nodes() ids = [] profiles = [] type = [] index= 0 for i in aminer.nodes(): verbose.index(index) index+=1 ids.append(int(i)) profile = "" try: profile = mysql.get_person_aminer_profile(i) except Exception,e: print e try: print i except Exception,e: print e
def get_profile_similarity():
    """Compute text-similarity measures for every labeled aminer/linkedin pair.

    Reads 'aminer_profile_str' and 'linkedin_profile_str' from each document
    in the 'labeled_data' collection, computes edit-distance, n-gram, cosine
    and word-overlap measures, and writes them back as a 'similarity'
    sub-document (with a 'valid' flag that is False when either string is
    empty). Per-item failures are printed and skipped so one bad document
    does not abort the whole pass.
    """
    col = mongo.db['labeled_data']
    for idx, item in enumerate(col.find()):
        verbose.index(idx)
        try:
            aminer = item['aminer_profile_str']
            linkedin = item['linkedin_profile_str']
            sim = {}
            sim['levenshtein'] = levenshtein.distance_levenshtein_distance(aminer, linkedin)
            sim['levenshtein_ratio'] = levenshtein.distance_levenshtein_ratio(aminer, linkedin)
            sim['jaro'] = levenshtein.distance_jaro(aminer, linkedin)
            sim['jaro_winkler'] = levenshtein.distance_levenshtein_jaro_winkler(aminer, linkedin)
            sim['unigrams'] = levenshtein.distance_unigrams_same(aminer, linkedin)
            sim['bigrams'] = levenshtein.distance_bigrams_same(aminer, linkedin)
            sim['trigrams'] = levenshtein.distance_trigrams_same(aminer, linkedin)
            sim['cosine'] = levenshtein.distance_cosine_measure(aminer, linkedin)
            sim['words_in_common'] = levenshtein.number_of_words_in_common(aminer, linkedin)
            total_len = len(aminer) + len(linkedin)
            # BUG FIX: with two empty strings the old division raised
            # ZeroDivisionError, the exception handler fired and the item was
            # never saved; define the ratio as 0.0 in that case instead.
            sim['words_in_common_ratio'] = (
                float(sim['words_in_common']) / total_len if total_len else 0.0
            )
            sim['valid'] = not (aminer == "" or linkedin == "")
            item['similarity'] = sim
            col.save(item)
        except Exception as e:  # modernized from Py2-only 'except Exception, e'
            print(e)
            print(item['_id'])
def get_labeled_data_linkedin_profile():
    """Attach the raw linkedin profile string to every labeled_data document.

    For each item in 'labeled_data', looks up the profile text via
    mongo.get_person_linkedin_profile using the item's 'linkedin' id and
    saves it back under 'linkedin_profile_str'.
    """
    col = mongo.db['labeled_data']
    for idx, doc in enumerate(col.find()):
        verbose.index(idx)
        doc['linkedin_profile_str'] = mongo.get_person_linkedin_profile(doc['linkedin'])
        col.save(doc)
def get_labeled_data_aminer_profile():
    """Attach the raw aminer profile string to every labeled_data document.

    For each item in 'labeled_data', fetches the profile text from MySQL via
    mysql.get_person_aminer_profile using the item's 'aminer' id and saves it
    back under 'aminer_profile_str'.
    """
    col = mongo.db['labeled_data']
    for idx, doc in enumerate(col.find()):
        verbose.index(idx)
        doc['aminer_profile_str'] = mysql.get_person_aminer_profile(doc['aminer'])
        col.save(doc)
def plot_profile_similarity():
    """Plot every stored similarity measure across all valid labeled pairs.

    Collects each measure from the 'similarity' sub-documents in
    'labeled_data' (keyed by document _id), then writes one line plot
    ('<measure>.png') and one 100-bin histogram ('<measure>-hist.png')
    per measure.
    """
    measures = [
        "levenshtein", "levenshtein_ratio", "jaro", "jaro_winkler",
        "unigrams", "bigrams", "trigrams", "cosine",
        "words_in_common", "words_in_common_ratio",
    ]
    col = mongo.db["labeled_data"]
    sim = {m: {} for m in measures}
    for idx, item in enumerate(col.find()):
        verbose.index(idx)
        similarity = item.get("similarity")
        # BUG FIX: the 'valid' flag is written inside the 'similarity'
        # sub-document (see get_profile_similarity), not on the item itself;
        # the old top-level has_key("valid") check never matched, so no data
        # was ever collected for plotting.
        if similarity and similarity.get("valid"):
            for m in measures:
                # BUG FIX: "cosine" previously copied the "trigrams" value.
                sim[m][item["_id"]] = similarity[m]
    for m in measures:
        plt.figure(figsize=(30, 10))
        plt.xlabel(m)
        plt.ylabel("similarity")
        plt.plot(list(sim[m].values()))
        plt.savefig(m + ".png")
        plt.close()
    for m in measures:
        plt.figure(figsize=(20, 10))
        plt.xlabel(m)
        plt.hist(list(sim[m].values()), 100, facecolor="green")
        plt.savefig(m + "-hist.png")
        plt.close()
def check_urls():
    """Gather the 'url' field of every document in 'labeled_data' and
    'temp_person_profiles'.

    NOTE(review): the collected lists are local and not returned; only the
    verbose.index progress output is observable from outside.
    """
    labeled = mongo.db["labeled_data"]
    temp = mongo.db['temp_person_profiles']
    l_urls = []
    for idx, doc in enumerate(labeled.find()):
        verbose.index(idx, 1000)
        l_urls.append(doc['url'])
    t_urls = [doc['url'] for doc in temp.find()]
def get_alsoview_url():
    """Write to the file 'urls' the linkedin_id of every also_view entry
    whose target profile is not yet in 'temp_alsoview_person_profiles'.

    Documents in the collection are keyed by profile url as _id, so an
    also_view entry's 'url' is looked up directly against _id. The output
    list is de-duplicated with set() before writing, one id per line.
    """
    col = mongo.db['temp_alsoview_person_profiles']
    urls = []
    for index, item in enumerate(col.find()):
        verbose.index(index)
        for al in item.get('also_view', []):  # modernized from has_key()
            if col.find({'_id': al['url']}).count() == 0:
                urls.append(al['linkedin_id'])
    # Context manager guarantees the file is closed even if a write fails.
    with open('urls', 'w') as out:
        for u in set(urls):
            out.write(u + '\n')
def check_alsoview_in_db():
    """Flag also_view entries of the first 10 person_profiles documents as
    crawled or not, depending on whether the referenced url exists in the
    collection (as _id), then save each document.

    NOTE(review): the original source had its line structure flattened; this
    reconstruction saves once per document after its also_view entries are
    updated — confirm against version control that save() was not intended
    only inside the not-found branch.
    """
    count = 0  # kept for the verbose.debug output below
    for index, item in enumerate(mongo.person_profiles.find(limit=10)):
        verbose.index(index)
        for alsoview in item.get('also_view', []):  # modernized from has_key()
            hit = mongo.person_profiles.find({"_id": alsoview['url']})
            if hit.count() > 0:
                alsoview['crawled'] = True
                verbose.debug(alsoview['url'])
                verbose.debug(count)
            else:
                alsoview['crawled'] = False
        mongo.person_profiles.save(item)
def plot_profile_similarity():
    """Plot every stored similarity measure across all valid labeled pairs.

    Collects each measure from the 'similarity' sub-documents in
    'labeled_data' (keyed by document _id), then writes one line plot
    ('<measure>.png') and one 100-bin histogram ('<measure>-hist.png')
    per measure.
    """
    measures = ['levenshtein', 'levenshtein_ratio', 'jaro', 'jaro_winkler',
                'unigrams', 'bigrams', 'trigrams', 'cosine',
                'words_in_common', 'words_in_common_ratio']
    col = mongo.db['labeled_data']
    sim = {m: {} for m in measures}
    for idx, item in enumerate(col.find()):
        verbose.index(idx)
        similarity = item.get('similarity')
        # BUG FIX: 'valid' is stored inside the 'similarity' sub-document
        # (see get_profile_similarity), not on the item; the old top-level
        # has_key('valid') check never matched, so nothing was plotted.
        if similarity and similarity.get('valid'):
            for m in measures:
                # BUG FIX: 'cosine' previously copied the 'trigrams' value.
                sim[m][item['_id']] = similarity[m]
    for m in measures:
        plt.figure(figsize=(30, 10))
        plt.xlabel(m)
        plt.ylabel("similarity")
        plt.plot(list(sim[m].values()))
        plt.savefig(m + ".png")
        plt.close()
    for m in measures:
        plt.figure(figsize=(20, 10))
        plt.xlabel(m)
        plt.hist(list(sim[m].values()), 100, facecolor='green')
        plt.savefig(m + "-hist.png")
        plt.close()
def plot_duplicated_names():
    """Count duplicate candidates per name in 'aminer_linkedin_1123'.

    For each document, sums the lengths of its 'aminer' and 'linkedin'
    candidate lists (missing keys count as 0), then builds a frequency
    table mapping duplicate-count -> number of names with that count.

    Returns:
        dict: the {count: frequency} table (the original computed it and
        silently discarded it; returning it is backward-compatible).
    """
    dup = {}
    for index, item in enumerate(mongo.db["aminer_linkedin_1123"].find()):
        verbose.index(index)
        # .get with a default replaces the deprecated has_key() checks
        count = len(item.get("aminer", [])) + len(item.get("linkedin", []))
        dup[item["name"]] = count
    dup_frq = {}
    for c in dup.values():
        dup_frq[c] = dup_frq.get(c, 0) + 1
    return dup_frq
def plot_duplicated_names():
    """Count duplicate candidates per name in 'aminer_linkedin_1123'.

    For each document, sums the lengths of its 'aminer' and 'linkedin'
    candidate lists (missing keys count as 0), then builds a frequency
    table mapping duplicate-count -> number of names with that count.

    Returns:
        dict: the {count: frequency} table (the original computed it and
        silently discarded it; returning it is backward-compatible).
    """
    dup = {}
    for index, item in enumerate(mongo.db['aminer_linkedin_1123'].find()):
        verbose.index(index)
        # .get with a default replaces the deprecated has_key() checks
        count = len(item.get('aminer', [])) + len(item.get('linkedin', []))
        dup[item['name']] = count
    dup_frq = {}
    for c in dup.values():
        dup_frq[c] = dup_frq.get(c, 0) + 1
    return dup_frq
def check_if_labeled_data_exist():
    """Find labeled pairs whose linkedin profile is absent from
    'temp_person_profiles' and write their urls to the file 'urls'.

    A labeled_data item counts as missing only when neither its 'linkedin'
    id (matched against _id) nor its 'url' field matches any document in
    'temp_person_profiles'. Output is written UTF-8, one url per line.
    """
    col = mongo.db["labeled_data"]
    col61 = mongo.db['temp_person_profiles']
    not_in_db = []
    for index, item in enumerate(col.find()):
        verbose.index(index, 1000)
        if col61.find({"_id": item['linkedin']}).count() == 0:
            # Fall back to matching by url before declaring it missing.
            if col61.find({'url': item['url']}).count() == 0:
                verbose.debug("not in")
                verbose.debug(item['linkedin'])
                not_in_db.append(item['url'])
    # Context manager guarantees the file is closed even if a write fails.
    with codecs.open('urls', 'w', 'utf-8') as out:
        for u in not_in_db:
            out.write(u + '\n')
def get_profile_similarity():
    """Compute text-similarity measures for every labeled aminer/linkedin pair.

    Reads both profile strings from each 'labeled_data' document, normalizes
    them with levenshtein.string_transform, computes the similarity measures
    and writes them back as a 'similarity' sub-document (with a 'valid' flag
    that is False when either string is empty). Per-item failures are printed
    and skipped.
    """
    col = mongo.db["labeled_data"]
    for idx, item in enumerate(col.find()):
        verbose.index(idx)
        try:
            aminer = item["aminer_profile_str"]
            linkedin = item["linkedin_profile_str"]
            print(aminer)
            print(linkedin)
            aminer, linkedin = levenshtein.string_transform(aminer, linkedin)
            print(aminer)
            print(linkedin)
            sim = {}
            sim["levenshtein"] = levenshtein.distance_levenshtein_distance(aminer, linkedin)
            sim["levenshtein_ratio"] = levenshtein.distance_levenshtein_ratio(aminer, linkedin)
            sim["jaro"] = levenshtein.distance_jaro(aminer, linkedin)
            sim["jaro_winkler"] = levenshtein.distance_levenshtein_jaro_winkler(aminer, linkedin)
            sim["unigrams"] = levenshtein.distance_unigrams_same(aminer, linkedin)
            sim["bigrams"] = levenshtein.distance_bigrams_same(aminer, linkedin)
            sim["trigrams"] = levenshtein.distance_trigrams_same(aminer, linkedin)
            sim["cosine"] = levenshtein.distance_cosine_measure(aminer, linkedin)
            sim["words_in_common_number"], sim["words_in_common"] = levenshtein.number_of_words_in_common(
                aminer, linkedin
            )
            total_len = len(aminer) + len(linkedin)
            # BUG FIX: the ratio must be built from the numeric count; the old
            # code applied float() to sim["words_in_common"], which the unpack
            # above fills with the non-numeric second return value, so the
            # conversion raised and the item was silently skipped.
            # (Presumably number_of_words_in_common returns (count, words) —
            # TODO confirm against the levenshtein helper.) Also guard the
            # division when both strings are empty.
            sim["words_in_common_ratio"] = (
                float(sim["words_in_common_number"]) / total_len if total_len else 0.0
            )
            sim["valid"] = not (aminer == "" or linkedin == "")
            item["similarity"] = sim
            col.save(item)
        except Exception as e:  # modernized from Py2-only 'except Exception, e'
            print(e)
            print(item["_id"])
index+=1 ids.append(int(i)) profile = "" try: profile = mysql.get_person_aminer_profile(i) except Exception,e: print e try: print i except Exception,e: print e profiles.append(UnicodeDammit(profile).markup) type.append(0) index=0 for i in linkedin.nodes(): verbose.index(index) index+=1 ids.append(i) profile = "" try: profile = mongo.get_person_linkedin_profile(i) except Exception,e: print e try: print i except Exception,e: print e profiles.append(profile) type.append(1) vectorizer = CountVectorizer(stop_words='english') transformer = TfidfTransformer()