def main():
    """Loop over the credential files and collect search results per credential."""
    cred = ['amanuel']
    connections_dict = dict()
    total_profiles_list = []
    for name in cred:
        credential_filename = "credentials_" + name + ".json"
        application = authenticate(credential_filename)
        # Run the keyword search with this credential
        profile_results = search(application, name, "Mathematician")
        # Accumulate the results across credentials
        total_profiles_list.append(profile_results)
        # Connection retrieval is currently disabled
        # ds_connections = retrieve_connections(application, name)
        # connections_dict[name] = ds_connections
    # Persist every profile retrieved
    total_out_file = "./data/total_profiles_math" + month + day + year + ".pkl"
    utils.savepickle(total_profiles_list, total_out_file)
def main():
    """Loop over the credential files and collect search results per credential."""
    cred = ['amanuel']
    connections_dict = dict()
    total_profiles_list = []
    for name in cred:
        credential_filename = "credentials_" + name + ".json"
        application = authenticate(credential_filename)
        # Run the keyword search with this credential
        profile_results = search(application, name, "Mathematician")
        # Accumulate the results across credentials
        total_profiles_list.append(profile_results)
        # Connection retrieval is currently disabled
        # ds_connections = retrieve_connections(application, name)
        # connections_dict[name] = ds_connections
    # Persist every profile retrieved
    total_out_file = "./data/total_profiles_math" + month + day + year + ".pkl"
    utils.savepickle(total_profiles_list, total_out_file)
def build_skills_ds(db, collection): ''' Builds a matrix with all the skills per profile''' print "Building skills Df" # The method below fails when it # comes to name the columns if it's nicer # skills_list= collection.distinct('skills') # columns = [] # for i in range(len(skills_list)): # print skills_list[i] # columns.append(skills_list[i]) # pdb.set_trace() skill_set = set() skill_full_list = [] index= [] # Build columns and index to create the dataframe cursor = collection.find({}, {"_id":0, "skills":1, "id":1}) for results in cursor: # pdb.set_trace() if "skills" in results: for skill in (results['skills']): skill_set.add(skill) skill_full_list.append(skill) index.append(results['id']) print len(skill_set) df_skills = pd.DataFrame(index=index, columns=skill_set).fillna(0) # print df_skills cursor_profile = collection.find({}) # Here I will set the value of for profile in cursor_profile: if 'skills' in profile: user_id = profile['id'] skill_list = profile['skills'] # Parse the skill list: for skill in skill_list: df_skills.ix[user_id, skill] = 1 # Save the matrix in a pickle date_string = utils.get_date_string() out_file_matrix = './results/skills_matrix_'+date_string+'.pkl' utils.savepickle(df_skills, out_file_matrix) # pdb.set_trace() return df_skills, skill_full_list
def retrieve_connections(applicaiton, name): data_scienctist_connections = [] outfile = "./data/"+name+"_connections_list.pkl" connections = applicaiton.get_connections(selectors=['id', 'first-name', 'last-name', 'headline', 'summary', \ 'location', 'distance', 'num-connections', 'skills',\ 'public-profile-url', 'date-of-birth', 'courses', 'specialties',\ 'educations', 'positions']) print connections.keys() # Save the file first and in case do the processing later utils.savepickle(connections, outfile) connections = utils.readpickle(outfile) for connection in connections['values']: found = False # Here I have the single value in the connection # Now I want just to return the connection if it has "data scient" if 'headline' in connection: if 'data scientist' in connection['headline'].lower(): found = True print connection['firstName'] , connection['lastName'] print "Data Scientist in Headline" if 'positions' in connection: # print connection['positions'] positions_num = connection['positions']['_total'] for i in range(int (positions_num)): position = connection['positions']['values'][i] if 'data scientist' in position['title'].lower(): found = True print connection['firstName'] , connection['lastName'] print "Data Scientist in a position" if 'summary' in connection: if 'data scientist' in connection['summary'].lower(): found = True print connection['firstName'] , connection['lastName'] print "Data Scientist in Summary" if found: data_scienctist_connections.append(connection) # Save the data scientist connections outfile_conn = "./data/"+name+"_data_science_connections"+month+day+year+".pkl" utils.savepickle(data_scienctist_connections, outfile_conn) return data_scienctist_connections
def build_skills_ds(db, collection): ''' Builds a matrix with all the skills per profile''' print "Building skills Df" # The method below fails when it # comes to name the columns if it's nicer # skills_list= collection.distinct('skills') # columns = [] # for i in range(len(skills_list)): # print skills_list[i] # columns.append(skills_list[i]) # pdb.set_trace() skill_set = set() skill_full_list = [] index = [] # Build columns and index to create the dataframe cursor = collection.find({}, {"_id": 0, "skills": 1, "id": 1}) for results in cursor: # pdb.set_trace() if "skills" in results: for skill in (results['skills']): skill_set.add(skill) skill_full_list.append(skill) index.append(results['id']) print len(skill_set) df_skills = pd.DataFrame(index=index, columns=skill_set).fillna(0) # print df_skills cursor_profile = collection.find({}) # Here I will set the value of for profile in cursor_profile: if 'skills' in profile: user_id = profile['id'] skill_list = profile['skills'] # Parse the skill list: for skill in skill_list: df_skills.ix[user_id, skill] = 1 # Save the matrix in a pickle date_string = utils.get_date_string() out_file_matrix = './results/skills_matrix_' + date_string + '.pkl' utils.savepickle(df_skills, out_file_matrix) # pdb.set_trace() return df_skills, skill_full_list
def retrieve_connections(applicaiton, name): data_scienctist_connections = [] outfile = "./data/" + name + "_connections_list.pkl" connections = applicaiton.get_connections(selectors=['id', 'first-name', 'last-name', 'headline', 'summary', \ 'location', 'distance', 'num-connections', 'skills',\ 'public-profile-url', 'date-of-birth', 'courses', 'specialties',\ 'educations', 'positions']) print connections.keys() # Save the file first and in case do the processing later utils.savepickle(connections, outfile) connections = utils.readpickle(outfile) for connection in connections['values']: found = False # Here I have the single value in the connection # Now I want just to return the connection if it has "data scient" if 'headline' in connection: if 'data scientist' in connection['headline'].lower(): found = True print connection['firstName'], connection['lastName'] print "Data Scientist in Headline" if 'positions' in connection: # print connection['positions'] positions_num = connection['positions']['_total'] for i in range(int(positions_num)): position = connection['positions']['values'][i] if 'data scientist' in position['title'].lower(): found = True print connection['firstName'], connection['lastName'] print "Data Scientist in a position" if 'summary' in connection: if 'data scientist' in connection['summary'].lower(): found = True print connection['firstName'], connection['lastName'] print "Data Scientist in Summary" if found: data_scienctist_connections.append(connection) # Save the data scientist connections outfile_conn = "./data/" + name + "_data_science_connections" + month + day + year + ".pkl" utils.savepickle(data_scienctist_connections, outfile_conn) return data_scienctist_connections
def search(application, name, keywords): ''' Retrieves the profiles cotnaing keywords using the credetial of a user ''' log = True search_results = application.search_profile(selectors=[{'people': ['first-name', 'last-name', 'id']}], params={'keywords': keywords, 'start':0, 'count':25}) # Saves the results in pickle file if log: outfile = "./data/search_results_"+name+"_"+month+day+year+".pkl" utils.savepickle(search_results, outfile) total_people_count = int(search_results['people']['_total']) pagination = int(search_results['people']['_count']) start = int(search_results['people']['_start']) print "Found %d results" %total_people_count, pagination # Computes the number of loops to be executed if total_people_count % pagination==0: calls = total_people_count/pagination else: calls = total_people_count/pagination+1 print calls full_results=[] for i in range(calls): # Slowdown the requests # Just in case sleep(0.5) profile_list=[] count = pagination results = application.search_profile(selectors=[{'people': ['first-name', 'last-name', 'id']}], params={'keywords': keywords , 'start':start, 'count':count}) profile_list = parse_search_results(results) profile_details_list=[] for profile in profile_list: profile_id = profile['id'] if profile_id != 'private': print profile_id # TO DO: add all the possible fields profile_details = application.get_profile(member_id = profile_id, \ selectors=['id', 'first-name', 'last-name', 'headline', 'summary', \ 'location', 'distance', 'num-connections', 'skills',\ 'public-profile-url', 'date-of-birth', 'courses', 'specialties',\ 'educations', 'positions']) print profile_details profile_details_list.append(profile_details) full_results.append(profile_details) outfile = "./data/"+name+"_profiles_"+str(start)+"_"+month+day+year+".pkl" utils.savepickle(profile_details_list, outfile) # Increase the start point start+=pagination # Save the full_list of results outfile = "./data/"+name+"_full_profile_list"+month+day+year+".pkl" 
utils.savepickle(full_results, outfile) return full_results
def enhance_profiles(profile_file, search_label, data_scientist_label): ''' Add the infromation missing in the profile retrieved form the API and stores the new profiles in a new list''' enhanced_profiles = [] profile_list = utils.readpickle(profile_file) for profile in profile_list: processed = False user_id = profile['id'] pub_profile_file = './data/full_profiles/' + user_id + "_profile.json" # Check if file exists # Add the additional labels information profile['search_label'] = search_label profile['label'] = data_scientist_label try: with open(pub_profile_file): pub_profile = json.load(open(pub_profile_file)) if 'educations' not in profile: print "Missing education" # Open the json file and look for education if 'education' in pub_profile: print "Found education in pub profile file" # Add education profile['educations'] = pub_profile['education'] profile['added_education'] = True else: print "Education not found in public profile" if 'skills' not in profile: print "Missing skills" # Open the json file and look for education if 'skills' in pub_profile: print "Found skills in pub profile file" # Add education profile['skills'] = pub_profile['skills'] profile['added_skills'] = True else: print "Skills not found in public profile" if 'specialties' not in profile: print "Missing specialties" # Open the json file and look for education if 'specialties' in pub_profile: print "Found specialties in pub profile file" # Add education profile['specialties'] = pub_profile['specialties'] profile['added_specialties'] = True else: print "Specialties not found in public profile" print profile except: print("Public profile file not found") # Add the profile to the new profilesle enhanced_profiles.append(profile) # Save the pickle with new profile out_file_enh_profiles = './data/enhanced_profiles/math_enchanced_total_unique_profiles_' + day + month + year + '.pkl' utils.savepickle(enhanced_profiles, out_file_enh_profiles)
# Combines several lists of profiles into one # Lists just keeping the uniqe user_id cred = ['mine', 'motoki', 'henry'] total_unique_profiles = [] unique_users = set() total_profiles = 0 for name in cred: profile_filename = "./data/" + name + "_full_profile_list1192013.pkl" profile_list = utils.readpickle(profile_filename) for profile in profile_list: total_profiles += 1 user_id = profile['id'] firstName = profile['firstName'] lastName = profile['lastName'] user = (firstName, lastName) # print user_id if user not in unique_users: # Add to unqie profiles unique_users.add(user) total_unique_profiles.append(profile) else: print "user exists" print user # Save the pickle out_tot_profiles = 'data/total_unique_profile_math_list.pkl' utils.savepickle(total_unique_profiles, out_tot_profiles) print "Total Profiles: %d, Unique profiles %d, %d" % ( total_profiles, len(unique_users), len(total_unique_profiles))
def enhance_profiles(profile_file, search_label, data_scientist_label): ''' Add the infromation missing in the profile retrieved form the API and stores the new profiles in a new list''' enhanced_profiles=[] profile_list = utils.readpickle(profile_file) for profile in profile_list: processed = False user_id = profile['id'] pub_profile_file = './data/full_profiles/'+user_id+"_profile.json" # Check if file exists # Add the additional labels information profile['search_label'] = search_label profile['label'] = data_scientist_label try: with open(pub_profile_file): pub_profile = json.load(open(pub_profile_file)) if 'educations' not in profile: print "Missing education" # Open the json file and look for education if 'education' in pub_profile: print "Found education in pub profile file" # Add education profile['educations'] = pub_profile['education'] profile['added_education'] = True else: print "Education not found in public profile" if 'skills' not in profile: print "Missing skills" # Open the json file and look for education if 'skills' in pub_profile: print "Found skills in pub profile file" # Add education profile['skills'] = pub_profile['skills'] profile['added_skills'] = True else: print "Skills not found in public profile" if 'specialties' not in profile: print "Missing specialties" # Open the json file and look for education if 'specialties' in pub_profile: print "Found specialties in pub profile file" # Add education profile['specialties'] = pub_profile['specialties'] profile['added_specialties'] = True else: print "Specialties not found in public profile" print profile except: print("Public profile file not found") # Add the profile to the new profilesle enhanced_profiles.append(profile) # Save the pickle with new profile out_file_enh_profiles = './data/enhanced_profiles/math_enchanced_total_unique_profiles_'+day+month+year+'.pkl' utils.savepickle(enhanced_profiles, out_file_enh_profiles)
def search(application, name, keywords): ''' Retrieves the profiles cotnaing keywords using the credetial of a user ''' log = True search_results = application.search_profile(selectors=[{ 'people': ['first-name', 'last-name', 'id'] }], params={ 'keywords': keywords, 'start': 0, 'count': 25 }) # Saves the results in pickle file if log: outfile = "./data/search_results_" + name + "_" + month + day + year + ".pkl" utils.savepickle(search_results, outfile) total_people_count = int(search_results['people']['_total']) pagination = int(search_results['people']['_count']) start = int(search_results['people']['_start']) print "Found %d results" % total_people_count, pagination # Computes the number of loops to be executed if total_people_count % pagination == 0: calls = total_people_count / pagination else: calls = total_people_count / pagination + 1 print calls full_results = [] for i in range(calls): # Slowdown the requests # Just in case sleep(0.5) profile_list = [] count = pagination results = application.search_profile(selectors=[{ 'people': ['first-name', 'last-name', 'id'] }], params={ 'keywords': keywords, 'start': start, 'count': count }) profile_list = parse_search_results(results) profile_details_list = [] for profile in profile_list: profile_id = profile['id'] if profile_id != 'private': print profile_id # TO DO: add all the possible fields profile_details = application.get_profile(member_id = profile_id, \ selectors=['id', 'first-name', 'last-name', 'headline', 'summary', \ 'location', 'distance', 'num-connections', 'skills',\ 'public-profile-url', 'date-of-birth', 'courses', 'specialties',\ 'educations', 'positions']) print profile_details profile_details_list.append(profile_details) full_results.append(profile_details) outfile = "./data/" + name + "_profiles_" + str( start) + "_" + month + day + year + ".pkl" utils.savepickle(profile_details_list, outfile) # Increase the start point start += pagination # Save the full_list of results outfile = "./data/" + 
name + "_full_profile_list" + month + day + year + ".pkl" utils.savepickle(full_results, outfile) return full_results
# Combines several lists of profiles into one # Lists just keeping the uniqe user_id cred = ['mine', 'motoki', 'henry'] total_unique_profiles = [] unique_users = set() total_profiles = 0; for name in cred: profile_filename = "./data/"+name+"_full_profile_list1192013.pkl" profile_list = utils.readpickle(profile_filename) for profile in profile_list: total_profiles+=1 user_id = profile['id'] firstName = profile['firstName'] lastName = profile['lastName'] user = (firstName, lastName) # print user_id if user not in unique_users: # Add to unqie profiles unique_users.add(user) total_unique_profiles.append(profile) else: print "user exists" print user # Save the pickle out_tot_profiles = 'data/total_unique_profile_math_list.pkl' utils.savepickle(total_unique_profiles, out_tot_profiles) print "Total Profiles: %d, Unique profiles %d, %d" %(total_profiles, len(unique_users) , len(total_unique_profiles))