def __init__(self):
    """Load cached project and user data from the rm.CACHE directory.

    NOTE(review): this def sits at module level and duplicates
    Recommender.__init__ below — it looks like a stray paste; confirm
    it is dead code before removing.
    """
    print('Initializing Recommender..')
    cache_dir = rm.CACHE
    self.data_retriever = DataRetriever(cache_dir)
    self.project_data = self.data_retriever.parseProjectData()
    self.user_data, self.user_follower_map = self.data_retriever.parseUserFollowers()
    # No default factory supplied, so this behaves like a plain dict.
    self.language_proj = defaultdict()
class Recommender():
    """Recommend GitHub projects matching a user's languages and interests.

    Projects are pre-filtered by programming language, filtered by category
    (area of interest), ranked by classifier probability, and — for larger
    result sets — re-ranked within tiers by the project owner's PageRank.
    """

    def __init__(self):
        """Load cached project and user data from the rm.CACHE directory."""
        print('Initializing Recommender..')
        directory_name = rm.CACHE
        self.data_retriever = DataRetriever(directory_name)
        self.project_data = self.data_retriever.parseProjectData()
        self.user_data, self.user_follower_map = self.data_retriever.parseUserFollowers()
        # Maps language name -> collection of project full names; replaced by a
        # pickled dict in build_project_features. (No default factory is set,
        # so this defaultdict behaves like a plain dict.)
        self.language_proj = defaultdict()

    def get_languages(self):
        """Return {url-safe name: language} for every known language.

        Spaces are replaced with '$' so the names survive URL routing.
        """
        return {lang.replace(' ', '$'): lang for lang in self.language_proj}

    def get_aoi(self):
        """Return the list of project categories (areas of interest)."""
        return self.categories

    def build_project_features(self):
        """Load (or regenerate) the per-project feature data.

        Side effects: sets self.project_vector, self.categories,
        self.user_ranking, self.language_proj and self.difficulty_score.
        """
        try:
            with open(rm.NB_PROB, 'rb') as f:
                print('Reading probabilities from: %s' % rm.NB_PROB)
                self.project_vector = pickle.load(f)
                self.categories = pickle.load(f)
            print('done.')
            print('#Projects: %d' % len(self.project_vector))
            print('#Categories: %d' % len(self.categories))
        # Was a bare `except:` (also caught KeyboardInterrupt/SystemExit);
        # any missing/corrupt cache file now falls through to a rebuild.
        except Exception:
            print('Generating a new Naive Bayes classifier')
            self.project_vector_builder = ProjectVectorBuilder(self.project_data)
            self.project_vector = self.project_vector_builder.build_projects_vector()
            self.categories = list(self.project_vector_builder.nb.clf.classes_)
            # Dump both objects into one stream (the original re-opened the
            # file in append mode for the second dump — same bytes, two opens).
            with open(rm.NB_PROB, 'wb') as f:
                pickle.dump(self.project_vector, f)
                pickle.dump(self.categories, f)
        self.user_ranking = pagerank(self.user_data)
        with open(os.path.join(rm.CACHE, 'lang_to_projects.p'), 'rb') as f:
            self.language_proj = pickle.load(f)
        with open(os.path.join(rm.CACHE, 'new_LOC.p'), 'rb') as f:
            self.difficulty_score = pickle.load(f)

    def _attach_owner_info(self, desc, rank_by_user):
        """Annotate *desc* in place with owner login, owner PageRank and the
        first contributor's login/url ('' when a project has no contributors)."""
        data = self.project_data[desc['full_name']]
        owner_login = data['owner']['login']
        desc['owner'] = owner_login
        # O(1) dict lookup (was a linear list.index search per project);
        # owners absent from the ranking get a PageRank of 0.
        desc['page_rank_of_owner'] = rank_by_user.get(owner_login, 0)
        contributors = data['contributors']
        if contributors:
            desc['contributors'] = contributors[0]['login']
            desc['contributors_url'] = contributors[0]['html_url']
        else:
            desc['contributors'] = ''
            desc['contributors_url'] = ''

    def _rerank_tiers(self, ranked):
        """Split the probability-ranked list into three tiers (first half,
        next 30%, last 20%) and re-sort each tier by the owner's PageRank.

        Fixes two defects of the original: the +1 slice offsets dropped one
        project at each tier boundary, and the tiers were concatenated as
        (tier3, tier1, tier2) instead of best-first order.
        """
        n = len(ranked)
        by_pagerank = lambda k: k['page_rank_of_owner']
        first = sorted(ranked[:n // 2], key=by_pagerank, reverse=True)
        second = sorted(ranked[n // 2:n * 4 // 5], key=by_pagerank, reverse=True)
        third = sorted(ranked[n * 4 // 5:], key=by_pagerank, reverse=True)
        return first + second + third

    def recommend_projects(self, languages, area_interest, difficulty):
        """Return a list of project-description dicts, best match first.

        languages     -- iterable of language names used to pre-filter projects
        area_interest -- iterable of categories a project's category must be in
        difficulty    -- accepted for API compatibility; currently unused

        Each returned dict carries at least 'prob', 'category', 'html_url',
        'full_name', 'owner', 'page_rank_of_owner', 'contributors' and
        'contributors_url'.
        """
        print('Calling recommender')
        # Union of all projects written in any of the requested languages.
        candidates = set()
        for language in languages:
            # .get() tolerates unknown languages (the original raised KeyError).
            candidates |= set(self.language_proj.get(language, ()))
        rank_by_user = dict(self.user_ranking)  # owner login -> PageRank score
        matches = []
        for name in candidates:
            desc = self.project_vector.get(name)
            if desc is None or desc['category'] not in area_interest:
                continue
            data = self.project_data[name]
            desc['html_url'] = data['html_url']
            desc['full_name'] = data['full_name']
            self._attach_owner_info(desc, rank_by_user)
            matches.append(desc)
        # Best classifier probability first; owner info does not affect this
        # sort, so attaching it before sorting is equivalent to the original.
        matches.sort(key=lambda k: k['prob'], reverse=True)
        if len(matches) > 10:
            return self._rerank_tiers(matches)
        return matches