Example #1
# Imports needed by this excerpt; mysql (Mysql), mongo (Mongo), verbose and
# levenshtein are project-local helpers assumed to be defined elsewhere.
import codecs
import matplotlib.pyplot as plt

def load_data(aminer, linkedin):
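    # Load the pickled AMiner and LinkedIn graphs (the two parameters are
    # immediately overwritten by the pickled files), fetch the raw profile text
    # for every node (MySQL for AMiner, MongoDB for LinkedIn) and collect ids,
    # profile strings and a source label (0 = AMiner, 1 = LinkedIn).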
    mysql = Mysql()
    mongo = Mongo()
    import pickle
    aminer = pickle.load(open("D:\\Users\\chenwei\\script\\aminer_two", "rb"))
    linkedin = pickle.load(open("D:\\Users\\chenwei\\script\\linkedin_two_filter", "rb"))
    print aminer.number_of_nodes()
    print linkedin.number_of_nodes()
    ids = []
    profiles = []
    type = []
    
    index= 0
    for i in aminer.nodes():
        verbose.index(index)
        index+=1
        ids.append(int(i))
        profile = ""
        try:
            profile = mysql.get_person_aminer_profile(i)
        except Exception,e:
            print e
            try:
                print i
            except Exception,e:
                print e
def get_profile_similarity():
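    # For every labeled pair, compute a set of string-similarity measures
    # between the AMiner and the LinkedIn profile string and store them in the
    # document under 'similarity'; pairs with an empty profile are flagged as
    # not valid.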
    col = mongo.db['labeled_data']
    idx = 0
    for item in col.find():
        verbose.index(idx)
        idx+=1
        try:
            aminer = item['aminer_profile_str']
            linkedin = item['linkedin_profile_str']
            sim = {}
            sim['levenshtein'] = levenshtein.distance_levenshtein_distance(aminer, linkedin)
            sim['levenshtein_ratio'] = levenshtein.distance_levenshtein_ratio(aminer, linkedin)
            sim['jaro'] = levenshtein.distance_jaro(aminer, linkedin)
            sim['jaro_winkler'] = levenshtein.distance_levenshtein_jaro_winkler(aminer, linkedin)
            sim['unigrams'] = levenshtein.distance_unigrams_same(aminer, linkedin)
            sim['bigrams'] = levenshtein.distance_bigrams_same(aminer, linkedin)
            sim['trigrams'] = levenshtein.distance_trigrams_same(aminer, linkedin)
            sim['cosine'] = levenshtein.distance_cosine_measure(aminer, linkedin)
            sim['words_in_common'] = levenshtein.number_of_words_in_common(aminer, linkedin)
            sim['words_in_common_ratio'] = float(sim['words_in_common'])/(len(aminer)+len(linkedin))
            sim['valid'] = True
            if aminer == "" or linkedin == "":
                sim['valid'] = False
            item['similarity'] = sim
            
            col.save(item)
        except Exception, e:
            print e
            print item['_id']
def get_labeled_data_linkedin_profile():
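    # Attach the raw LinkedIn profile string (from MongoDB) to every labeled document.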
    index = 0
    col = mongo.db['labeled_data']
    for item in col.find():
        verbose.index(index)
        index+=1
        item['linkedin_profile_str']=mongo.get_person_linkedin_profile(item['linkedin'])
        col.save(item)
def get_labeled_data_aminer_profile():
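    # Attach the raw AMiner profile string (from MySQL) to every labeled document.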
    index = 0
    col = mongo.db['labeled_data']
    for item in col.find():
        verbose.index(index)
        index+=1
        item['aminer_profile_str']=mysql.get_person_aminer_profile(item['aminer'])
        col.save(item)
def plot_profile_similarity():
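    # Collect the stored similarity values of all valid labeled pairs and write
    # a line plot (<measure>.png) and a histogram (<measure>-hist.png) per measure.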
    col = mongo.db["labeled_data"]
    idx = 0
    sim = {}
    sim["levenshtein"] = {}
    sim["levenshtein_ratio"] = {}
    sim["jaro"] = {}
    sim["jaro_winkler"] = {}
    sim["unigrams"] = {}
    sim["bigrams"] = {}
    sim["trigrams"] = {}
    sim["cosine"] = {}
    sim["words_in_common"] = {}
    sim["words_in_common_ratio"] = {}
    for item in col.find():
        verbose.index(idx)
        idx += 1
        if item.has_key("similarity"):
            if item.has_key("valid"):
                if item["valid"] == True:
                    sim["levenshtein"][item["_id"]] = item["similarity"]["levenshtein"]
                    sim["levenshtein_ratio"][item["_id"]] = item["similarity"]["levenshtein_ratio"]
                    sim["jaro"][item["_id"]] = item["similarity"]["jaro"]
                    sim["jaro_winkler"][item["_id"]] = item["similarity"]["jaro_winkler"]
                    sim["unigrams"][item["_id"]] = item["similarity"]["unigrams"]
                    sim["bigrams"][item["_id"]] = item["similarity"]["bigrams"]
                    sim["trigrams"][item["_id"]] = item["similarity"]["trigrams"]
                    sim["cosine"][item["_id"]] = item["similarity"]["trigrams"]
                    sim["words_in_common"][item["_id"]] = item["similarity"]["words_in_common"]
                    sim["words_in_common_ratio"][item["_id"]] = item["similarity"]["words_in_common_ratio"]
    measures = [
        "levenshtein",
        "levenshtein_ratio",
        "jaro",
        "jaro_winkler",
        "unigrams",
        "bigrams",
        "trigrams",
        "cosine",
        "words_in_common",
        "words_in_common_ratio",
    ]
    lines = []
    for s in measures:
        fig = plt.figure(figsize=(30, 10))
        plt.xlabel(s)
        plt.ylabel("similarity")
        plt.plot(sim[s].values())
        plt.savefig(s + ".png")
        plt.close()

    for s in measures:
        fig = plt.figure(figsize=(20, 10))
        plt.xlabel(s)
        plt.hist(sim[s].values(), 100, facecolor="green")
        plt.savefig(s + "-hist.png")
        plt.close()
def check_urls():
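    # Collect the profile URLs of the labeled data and of the temporary profile
    # collection into two lists.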
    labeled = mongo.db["labeled_data"]
    temp = mongo.db['temp_person_profiles']
    l_urls = []
    t_urls = []
    index = 0
    count = 0
    for item in labeled.find():
        verbose.index(index, 1000)
        index+=1
        l_urls.append(item['url'])
    for item in temp.find():
        t_urls.append(item['url'])
def get_alsoview_url():
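    # Write the LinkedIn ids of 'also viewed' profiles that are not yet present
    # in temp_alsoview_person_profiles to the file 'urls'.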
    urls = []
    index = 0
    for item in mongo.db['temp_alsoview_person_profiles'].find():
        verbose.index(index)
        index+=1
        if item.has_key('also_view'):
            for al in item['also_view']:
                x = mongo.db['temp_alsoview_person_profiles'].find({'_id':al['url']})
                if x.count()==0:
                    urls.append(al['linkedin_id'])
    out = open('urls','w')
    for u in set(urls):
        out.write(u+'\n')
    out.close()
def check_alsoview_in_db():
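    # For the first 10 profiles, mark every 'also viewed' entry as crawled or
    # not crawled, depending on whether the linked profile already exists in
    # person_profiles.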
    index = 0
    count = 0
    for item in mongo.person_profiles.find(limit=10):
        verbose.index(index)
        index+=1
        if item.has_key('also_view'):
            for alsoview in item['also_view']:
                x = mongo.person_profiles.find({"_id":alsoview['url']})
                if x.count()>0:
                    alsoview['crawled']=True
                    verbose.debug(alsoview['url'])
                    verbose.debug(count)
                else:
                    alsoview['crawled']=False
            mongo.person_profiles.save(item)
def plot_duplicated_names():
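    # Count per name how many AMiner and LinkedIn candidates exist and build a
    # frequency table of those counts; the plotting itself is not part of this excerpt.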
    dup = {}
    index = 0
    for item in mongo.db["aminer_linkedin_1123"].find():
        verbose.index(index)
        index += 1
        count = 0
        if item.has_key("aminer"):
            count += len(item["aminer"])
        if item.has_key("linkedin"):
            count += len(item["linkedin"])
        dup[item["name"]] = count

    dup_frq = {}
    for i in dup:
        if not dup_frq.has_key(dup[i]):
            dup_frq[dup[i]] = 0
        dup_frq[dup[i]] += 1
def check_if_labeled_data_exist():
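    # Write the URLs of labeled profiles that can be found neither by _id nor by
    # URL in temp_person_profiles to the file 'urls'.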
    col = mongo.db["labeled_data"]
    col61 = mongo.db['temp_person_profiles']
    not_in_db = []
    ids = []
    index = 0
    count = 0
    for item in col.find():
        verbose.index(index, 1000)
        index+=1
        query = col61.find({"_id":item['linkedin']})
        if query.count() == 0:
            url_q = col61.find({'url':item['url']})
            if url_q.count() == 0:
                verbose.debug("not in")
                verbose.debug(item['linkedin'])
                not_in_db.append(item['url'])
    out = codecs.open('urls','w','utf-8')
    for u in not_in_db:
        out.write(u+'\n')
    out.close()
def get_profile_similarity():
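    # Second variant: additionally runs both profile strings through
    # levenshtein.string_transform before computing the similarity measures.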
    col = mongo.db["labeled_data"]
    idx = 0
    for item in col.find():
        verbose.index(idx)
        idx += 1
        try:
            aminer = item["aminer_profile_str"]
            linkedin = item["linkedin_profile_str"]
            print aminer
            print linkedin
            aminer, linkedin = levenshtein.string_transform(aminer, linkedin)
            print aminer
            print linkedin
            sim = {}
            sim["levenshtein"] = levenshtein.distance_levenshtein_distance(aminer, linkedin)
            sim["levenshtein_ratio"] = levenshtein.distance_levenshtein_ratio(aminer, linkedin)
            sim["jaro"] = levenshtein.distance_jaro(aminer, linkedin)
            sim["jaro_winkler"] = levenshtein.distance_levenshtein_jaro_winkler(aminer, linkedin)
            sim["unigrams"] = levenshtein.distance_unigrams_same(aminer, linkedin)
            sim["bigrams"] = levenshtein.distance_bigrams_same(aminer, linkedin)
            sim["trigrams"] = levenshtein.distance_trigrams_same(aminer, linkedin)
            sim["cosine"] = levenshtein.distance_cosine_measure(aminer, linkedin)
            sim["words_in_common_number"], sim["words_in_common"] = levenshtein.number_of_words_in_common(
                aminer, linkedin
            )
            sim["words_in_common_ratio"] = float(sim["words_in_common"]) / (len(aminer) + len(linkedin))
            sim["valid"] = True
            if aminer == "" or linkedin == "":
                sim["valid"] = False
            item["similarity"] = sim

            col.save(item)
        except Exception, e:
            print e
            print item["_id"]
Example #14
        index+=1
        ids.append(int(i))
        profile = ""
        try:
            profile = mysql.get_person_aminer_profile(i)
        except Exception,e:
            print e
            try:
                print i
            except Exception,e:
                print e
        profiles.append(UnicodeDammit(profile).markup)
        type.append(0)
    index=0
    for i in linkedin.nodes():
        verbose.index(index)
        index+=1
        ids.append(i)
        profile = ""
        try:
            profile = mongo.get_person_linkedin_profile(i)
        except Exception,e:
            print e
            try:
                print i
            except Exception,e:
                print e
        profiles.append(profile)
        type.append(1)
    vectorizer = CountVectorizer(stop_words='english')
    transformer = TfidfTransformer()
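
# The excerpt stops right after the vectorizer and the TF-IDF transformer are
# created. A minimal sketch of how the two are typically chained in
# scikit-learn, assuming `profiles` holds the profile strings collected above
# (replaced here by a small placeholder list so the sketch runs on its own):
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

profiles = ["first profile text", "second profile text"]  # placeholder for the strings built above

vectorizer = CountVectorizer(stop_words='english')
transformer = TfidfTransformer()
counts = vectorizer.fit_transform(profiles)  # sparse term-count matrix, one row per profile
tfidf = transformer.fit_transform(counts)    # TF-IDF weighted version of the same matrix
# `tfidf` can then be fed into a similarity computation or a classifier,
# together with the `type` labels (0 = AMiner, 1 = LinkedIn).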