Example #1
import codecs
import pickle

import matplotlib.pyplot as plt

import mongo    # project-specific MongoDB wrapper (assumed importable)
import verbose  # project-specific logging helper (assumed importable)


def split_linkedin_dump():
    skip = 2100000
    count = 0
    log = codecs.open("C:\\data\\log" + str(skip) + ".txt", 'w', encoding="utf-8")
    id_map = codecs.open("C:\\data\\idmap" + str(skip) + ".txt", 'w', encoding="utf-8")
    linkedin_dump = codecs.open('D:\\result.csv', encoding="utf-8")
    out = None
    next(linkedin_dump)  # skip the CSV header row
    for line in linkedin_dump:
        x = 0
        # Fast-forward past the first `skip` records.
        if count < skip:
            count += 1
            if count % 10000 == 0:
                print(count)
            continue
        print(str(count) + ':' + str(len(line)))
        log.write(str(count) + ' ' + str(len(line)))
        if line[0] == '"':
            # A new record starts with its quoted profile id; open a fresh
            # output file named after the id (query string stripped).
            x = line.find('",')
            log.write(str(count) + ' ' + line[1:x] + '\n')
            verbose.debug(str(count) + ' ' + line[1:x])
            id_map.write(str(count) + ' ' + line[1:x] + '\n')
            count += 1
            try:
                out = codecs.open("C:\\data\\linkedin\\" + line[1:x].strip().replace('"', " ").split('?')[0], 'w', encoding="utf-8")
            except Exception as e:
                print(e)
        else:
            log.write("[EXCEPTION]" + str(count) + ":" + line + '\n')
        if out is not None:  # guard: no output file exists until the first record header
            out.write(line[x:])
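# A minimal sketch of the id extraction used above, on one hypothetical
# input line (the real dump's row format is assumed from the parsing code):
def _demo_extract_id():
    line = '"jane-doe?trk=pub",rest-of-record\n'
    x = line.find('",')
    profile_id = line[1:x].strip().replace('"', ' ').split('?')[0]
    print(profile_id)  # -> jane-doe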
def plot_hindex():
    import matplotlib.pyplot as plt

    # pickle.load needs the file opened in binary mode
    dump = open("E:\\My Projects\\Eclipse Workspace\\CrossLinking\\src\\preprocess\\lenin_data", 'rb')
    data = pickle.load(dump)
    verbose.debug("data loaded")
    ranks = [d["rank"] for d in data]

    sranks = sorted(ranks)

    # Histogram of the rank distribution (`normed=1` was removed from
    # matplotlib; `density=True` is the current equivalent). The original
    # histogrammed range(len(sranks)), i.e. the index positions, which was
    # a leftover from the matplotlib demo this was copied from.
    n, bins, patches = plt.hist(sranks, 50, density=True, facecolor="green", alpha=0.75)

    plt.xlabel("Rank")
    plt.ylabel("Probability")
    plt.grid(True)

    verbose.debug("show")
    plt.show()
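# The demo code this was based on overlaid a normal best-fit line via
# mlab.normpdf, which no longer exists in matplotlib; a sketch of the same
# overlay with scipy (assumes scipy is installed), to be called with the
# values produced by plt.hist above:
def _overlay_normal_fit(sranks, bins):
    import numpy as np
    import matplotlib.pyplot as plt
    from scipy.stats import norm
    # Fit a normal curve to the observed ranks and draw it over the histogram.
    mu, sigma = np.mean(sranks), np.std(sranks)
    plt.plot(bins, norm.pdf(bins, mu, sigma), "r--", linewidth=1)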
def check_weird_url():
    col = mongo.db["lenin_label_data"]
    aid = []
    for item in col.find():
        # Collect aminer ids of highly ranked, positively labeled records
        # whose linkedin field still contains a raw 'view' URL.
        if 'view' in item['linkedin']:
            if item['rank'] < 40000 and item['flag'] == '1':
                verbose.debug(item['aminer'])
                aid.append(item['aminer'])
    return aid
def check_alsoview_in_db():
    index = 0
    count = 0
    for item in mongo.person_profiles.find(limit=10):
        verbose.index(index)
        index += 1
        if 'also_view' in item:
            # Mark each also-viewed profile with whether it has been crawled.
            for alsoview in item['also_view']:
                x = mongo.person_profiles.find({"_id": alsoview['url']})
                if x.count() > 0:
                    alsoview['crawled'] = True
                    count += 1
                    verbose.debug(alsoview['url'])
                    verbose.debug(count)
                else:
                    alsoview['crawled'] = False
            mongo.person_profiles.save(item)
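# Cursor.count() and Collection.save() were removed in pymongo 4; if
# mongo.person_profiles is a standard pymongo collection (an assumption
# about the project wrapper), the same update could be written as:
def _mark_crawled_pymongo4(item):
    for alsoview in item['also_view']:
        # count_documents replaces the removed Cursor.count()
        alsoview['crawled'] = mongo.person_profiles.count_documents(
            {"_id": alsoview['url']}) > 0
    # replace_one replaces the removed Collection.save()
    mongo.person_profiles.replace_one({"_id": item["_id"]}, item)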
def check_if_labeled_data_exist():
    col = mongo.db["labeled_data"]
    col61 = mongo.db['temp_person_profiles']
    not_in_db = []
    index = 0
    for item in col.find():
        verbose.index(index, 1000)
        index += 1
        # A labeled record is missing if neither its linkedin id nor its
        # url is present in the temp profile collection.
        query = col61.find({"_id": item['linkedin']})
        if query.count() == 0:
            url_q = col61.find({'url': item['url']})
            if url_q.count() == 0:
                verbose.debug("not in")
                verbose.debug(item['linkedin'])
                not_in_db.append(item['url'])
    out = codecs.open('urls', 'w', 'utf-8')
    for u in not_in_db:
        out.write(u + '\n')
    out.close()
def plot_labeled_linkedin_degree():
    col = mongo.db["temp_person_profiles"]
    degree = []
    missed = []
    for item in col.find():
        if "also_view" in item:
            # Degree = number of also-viewed profiles already present in
            # either the main or the fallback collection.
            deg = 0
            for i in item["also_view"]:
                x = mongo.db["temp_person_profiles"].find({"_id": i["url"]})
                if x.count() > 0:
                    deg += 1
                else:
                    x = mongo.db["temp_alsoview_person_profiles"].find({"_id": i["url"]})
                    if x.count() > 0:
                        deg += 1
                        missed.append(i["linkedin_id"])
                        verbose.debug(item["_id"])
            degree.append(deg)
        else:
            degree.append(0)
    plt.hist(degree, 10, facecolor="green")
    plt.show()
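# Each also_view entry above costs one round trip; with plain pymongo (again
# an assumption about the wrapper) the main-collection lookups could be
# batched into a single $in query (the fallback collection would be handled
# the same way):
def _degree_batched(item):
    urls = [i["url"] for i in item["also_view"]]
    # One query instead of len(urls) separate find() calls.
    found = set(d["_id"] for d in mongo.db["temp_person_profiles"]
                .find({"_id": {"$in": urls}}, {"_id": 1}))
    return sum(1 for u in urls if u in found)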
def compare():
    col = mongo.db["aminer_linkedin_1123"]
    aid = []
    lid = []
    index = 0
    for item in col.find():
        if index % 1000 == 0:
            print(index)
        index += 1
        if 'aminer' not in item:
            verbose.debug('aminer ' + item['name'])
        else:
            for a in item['aminer']:
                aid.append(a['id'])
        if 'linkedin' not in item:
            verbose.debug('linkedin ' + item['name'])
        else:
            for l in item['linkedin']:
                lid.append(l['url'])
    # pickle dumps need binary mode
    dump_aid = open('aminer_linkedin_1123_aminer_id', 'wb')
    pickle.dump(aid, dump_aid)
    dump_lid = open('aminer_linkedin_1123_linkedin_url', 'wb')
    pickle.dump(lid, dump_lid)
    col = mongo.db['aminer_linkedin_labeled_1124']
    labeled_aid = []
    labeled_lid = []
    # Records that matched on homepage, domain, or affiliation.
    labeled_data = col.find({'$or': [{'labels.homepage_match': True},
                                     {'labels.domain_match': True},
                                     {'labels.aff_match': True}]})
    index = 0
    print("start")
    for item in labeled_data:
        if index % 1000 == 0:
            print(index)
        index += 1
        verbose.debug(item['name'] + ' ' + str(item['_id']))
        labeled_aid.append(item['aminer']['id'])
        labeled_lid.append(item['linkedin']['url'])
        
    x = [str(a) for a in labeled_aid]

    # pickle.load needs binary mode
    dump = open("E:\\My Projects\\Eclipse Workspace\\CrossLinking\\src\\preprocess\\lenin_data", 'rb')
    data = pickle.load(dump)
    xaid = []
    xlid = []
    for person in data:
        xaid.append(person['aminer'])
        xlid.append(person['linkedin'])
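    # A hypothetical closing step (not in the original): report the overlap
    # between the labeled pairs and the lenin_data dump.
    print(len(set(x) & set(str(a) for a in xaid)), 'aminer ids in both')
    print(len(set(labeled_lid) & set(xlid)), 'linkedin urls in both')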