def check_users_with_top1000_repo(users, repos=None):
    # note: the users argument is ignored; the user list is reloaded from the db
    users = find_all('users')
    if repos is None:
        repos_count = {repo['name']: 0 for repo in find_all('top1000_repos')}
    else:
        repos_count = {repo: 0 for repo in repos}
    for i, user in enumerate(users):
        print(f"{i}/{len(users)} checking user {user['name']}")
        if len(user) == 1:  # skip users whose starred repos have not been crawled yet
            print("not crawled yet, ignore")
            continue
        for repo in user['repos']:
            if repo in repos_count:
                repos_count[repo] += 1
    # sort repos by star count, descending
    repos_count = {
        k: v
        for k, v in sorted(
            repos_count.items(), key=lambda item: item[1], reverse=True)
    }
    for repo in repos_count:
        print(f"{repo}: {repos_count[repo]}")
    return repos_count
def predict(username):
    '''Return the top 10 predicted repos for the given user.'''
    repos = get_user_starred_repo(username)
    positive_set, negative_set = DirectParseData(repo2idx(repos), 838)
    predict_repos = idx2repo(
        GetTopKRecommend(
            path='./predict/weight/model_MAP_0.10040_np_20_epoch_32_d_64',
            all_u=1168,
            all_i=838,
            dim=64,
            epoch=256,
            ratio=20,
            top_k=10,
            positive_set=positive_set,
            negative_set=negative_set))
    predict_detail = find_all(
        col_name='top1000_repos_detail',
        target={"full_name": {"$in": predict_repos}},
        field_filter={
            '_id': 0,
            'full_name': 1,
            'description': 1,
            'language': 1,
            'stargazers_count': 1
        })
    return predict_detail
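# A minimal usage sketch for predict() above, not part of the original module:
# 'octocat' is a placeholder username, and the loop assumes find_all() returns an
# iterable of dicts limited to the projected fields (full_name, description,
# language, stargazers_count).
if __name__ == '__main__':
    for detail in predict('octocat'):
        print(f"{detail['full_name']} [{detail['language']}] "
              f"({detail['stargazers_count']} stars): {detail['description']}")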
def concurrent_crawl_repo_of_users(max_workers, mode='token'):
    users = find_all('users')
    # keep only users whose starred repos have not been crawled yet (only a 'name' field)
    users = [user for user in users if len(user) == 1]
    print(f'crawl user num: {len(users)}......')
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for user in users:
            executor.submit(insert_user_starred_repo, user, mode)
def crawl_top_users():
    users = find_all('users')
    # keep only users whose starred repos have not been crawled yet
    users = [user for user in users if len(user) == 1]
    for i, user in enumerate(users):
        print(f'crawling {i+1}/{len(users)}...')
        repos = get_user_starred_repo(user['name'])
        print(f"finished {user['name']}, total: {len(repos)}, updating db")
        update_one('users', {'name': user['name']}, {'repos': repos})
def init_users_to_db(users):
    users_in_db = find_all('users', field_filter={'_id': 0, 'repos': 0})
    users_in_db = [user['name'] for user in users_in_db]
    new_users = [user for user in users if user not in users_in_db]
    print(
        f"{len(users) - len(new_users)} users already in db, inserting {len(new_users)} users"
    )
    if len(new_users) > 0:
        insert_many('users', [{'name': user} for user in new_users])
def concurrent_crawl_top1000_repo_details(max_workers, mode='token'):
    repos = find_all('top1000_repos')
    exist_repos = find_all('top1000_repos_detail',
                           field_filter={
                               '_id': 0,
                               'full_name': 1
                           })
    exist_repo_names = [
        exist_repo['full_name'].lower() for exist_repo in exist_repos
    ]
    # skip repos whose details have already been crawled
    repo_names = [
        repo['name'] for repo in repos
        if repo['name'].lower() not in exist_repo_names
    ]
    print(f'{len(repo_names)} repos left to crawl')
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for repo_name in repo_names:
            executor.submit(insert_repo_detail, repo_name, mode)
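# A possible end-to-end crawl flow stitched from the helpers above; this ordering
# is an assumption for illustration, not part of the original code. seed_users is
# expected to be a list of GitHub usernames.
def crawl_pipeline(seed_users, max_workers=8):
    init_users_to_db(seed_users)                         # register new user names in the db
    concurrent_crawl_repo_of_users(max_workers)          # crawl each pending user's starred repos
    concurrent_crawl_top1000_repo_details(max_workers)   # crawl details for uncrawled top-1000 repos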
def check_user_star_in_repos(users, repos=None):
    if repos is None:
        repos = [repo['name'] for repo in find_all('top1000_repos')]
    # keep only users that already have their starred repos crawled
    users = [user for user in users if len(user) == 2]
    for user in users:
        # keep only the starred repos that appear in the given repo list
        user['repos'] = [repo for repo in user['repos'] if repo in repos]
    users.sort(key=lambda item: len(item['repos']), reverse=True)
    for user in users:
        print(f"{user['name']} : {len(user['repos'])}")
def generate_dataset(user_threshold=20, repos_threshold=50):
    users = find_all('users')
    top1000_repos = [repo['name'] for repo in find_all('top1000_repos')]
    users = filter_users(users, top1000_repos, user_threshold)
    repos = filter_repos(users, top1000_repos, repos_threshold)
    users = filter_users(users, repos, user_threshold)
    # reverse each user's starred repos so they are ordered from earliest to latest
    for user in users:
        user['repos'].reverse()
    print(
        f'user > {user_threshold} : {len(users)}, repos > {repos_threshold} : {len(repos)}'
    )
    item_index = list(range(len(repos)))
    user_index = list(range(len(users)))
    item2id = {repo: idx for repo, idx in zip(repos, item_index)}
    user2id = {user['name']: idx for user, idx in zip(users, user_index)}
    dataset = []
    for user in users:
        dataset.append({
            "UserId": user2id[user['name']],
            # store the user's item indices as a space-separated string
            "ItemId": str([item2id[repo] for repo in user['repos']])[1:-1].replace(',', '')
        })
    df = pd.DataFrame(dataset)
    df.to_csv('dataset.csv', index=False)
    df = pd.DataFrame([{'repo': repo, 'idx': idx} for repo, idx in zip(repos, item_index)])
    df.to_csv('repo2idx.csv', index=False)
    df = pd.DataFrame([{'user': user['name'], 'idx': idx} for user, idx in zip(users, user_index)])
    df.to_csv('user2idx.csv', index=False)
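# A small sketch (not part of the original module) of reading back the CSVs written
# by generate_dataset(); assumes pandas is importable as pd, as in the functions
# above. The ItemId column holds each user's item indices as a space-separated
# string, and repo2idx.csv maps those indices back to repo names.
def load_dataset_example():
    repo_map = pd.read_csv('repo2idx.csv')
    idx2name = dict(zip(repo_map['idx'], repo_map['repo']))
    for _, row in pd.read_csv('dataset.csv').iterrows():
        item_ids = [int(tok) for tok in str(row['ItemId']).split()]
        print(row['UserId'], [idx2name[i] for i in item_ids])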
def random_get_repos(n=10):
    df = pd.read_csv('repo2idx.csv')
    items = list(df['repo'])
    return find_all(col_name='top1000_repos_detail',
                    target={"full_name": {"$in": sample(items, n)}},
                    field_filter={
                        '_id': 0,
                        'full_name': 1,
                        'description': 1,
                        'language': 1,
                        'stargazers_count': 1
                    })
def k_means():
    dataList = list(database.find_all())
    uuids = []
    counters = []
    for data in dataList:
        # count how many times each event type occurred for this uuid
        counter = Counter()
        uuids.append(data['uuid'])
        for event in data['events']:
            counter[event['name']] += 1
        counters.append(counter)
    # build fixed-length vectors over a consistent ordering of event names,
    # since kcluster expects a rectangular data matrix
    event_names = sorted({name for counter in counters for name in counter})
    vectors = [[counter[name] for name in event_names] for counter in counters]
    labels, error, nfound = Pycluster.kcluster(vectors, 3)
    # map each uuid to its plain-int cluster label
    classes = [int(label) for label in labels]
    return dict(zip(uuids, classes))
def main():
    # call the AI module to get predictions for systolic/diastolic BP, oxygen, and pulse
    p = predictor()
    p.train()
    predictedData = p.predict(5000)

    # pull the latest readings from the DB and parse them into a graphable format
    timeData = []
    pulseData = []
    bpSysData = []
    bpDiaData = []
    oxygenData = []
    for record in database.find_all():
        p.appendData(record)
        timeData.append(record['createAt'])
        pulseData.append(record['pulse'])
        bpSysData.append(record['bloodPreSys'])
        bpDiaData.append(record['bloodPreDia'])
        oxygenData.append(record['bloodOx'])

    # retrain on the full data and predict 5000 time units into the future
    p.train()
    predictedData = p.predict(5000)

    alert = {
        'bloodPressureSys': globals.bp_sys_flag,
        'bloodPressureDia': globals.bp_dia_flag,
        'pulse': globals.pulse_flag,
        'oxygenContent': globals.oxygen_flag
    }
    data = {
        'time': timeData,
        'pulse': pulseData,
        'bloodPressureSys': bpSysData,
        'bloodPressureDia': bpDiaData,
        'oxygenContent': oxygenData
    }
    AIdata = {
        'AIpulse': predictedData[0],
        'AIbloodPressure': predictedData[1],
        'AIoxygenContent': predictedData[2]
    }
    return render_template('display.html',
                           title='Display',
                           alert=alert,
                           data=data,
                           AIdata=AIdata)
def get(self):
    dataList = list(database.find_all())
    return json_dump(dataList)
def get(self):
    rows = find_all()
    response = HTTPResponse()
    response.write(rows)
    return response
def __handle_get_all(reply):
    reply["type"] = "SUCCESS"
    reply["data"] = database.find_all()
def on_get(self, req, resp):
    resp.status = falcon.HTTP_200
    rows = find_all()
    resp.body = rows
def find():
    rows = find_all()
    resp = Response(None, status=200, mimetype='application/json')
    resp.data = rows
    return resp