def getDeveloper_Contribute(data):
    """Record per-week contribution statistics for every contributor of a repo.

    Fetches GitHub's ``/stats/contributors`` endpoint (top-99 committers) for
    the repo described by *data* and, for each contributor, stores three
    JSON-encoded dicts keyed by week timestamp (additions, deletions, commit
    counts) on the matching Repo_Developer_info row.

    Errors of any kind are logged via log.set_log rather than propagated.
    """
    try:
        repo_qs = github.models.Repo_Base_Info.objects.filter(repo_id=data['id'])
        stats_url = data['url'] + "/stats/contributors"
        stats_body = requests.get(stats_url, headers=headers).text
        stats = json.loads(stats_body)
        if not stats:
            return
        for entry in stats:
            author_id = entry['author']['id']
            user_qs = github.models.User_Org_Info.objects.filter(user_id=author_id)
            adds, dels, commits = {}, {}, {}
            for week in entry['weeks']:
                # Only dict-shaped week records carry the expected w/a/d/c keys.
                if isinstance(week, dict):
                    week_ts = week['w']
                    adds[week_ts] = week['a']
                    dels[week_ts] = week['d']
                    commits[week_ts] = week['c']
            adds_json = json.dumps(adds, ensure_ascii=False, encoding='UTF-8')
            dels_json = json.dumps(dels, ensure_ascii=False, encoding='UTF-8')
            commits_json = json.dumps(commits, ensure_ascii=False, encoding='UTF-8')
            github.models.Repo_Developer_info.objects \
                .filter(Q(repo=repo_qs) & Q(user=user_qs)) \
                .update(user_commit_count_perweek=commits_json,
                        user_add_count_perweek=adds_json,
                        user_del_count_perweek=dels_json)
    except BaseException as e:
        log.set_log(e)
def parse(self, html):
    """Upsert one page of GitHub issues into Repo_Issue_info.

    *html* is the decoded JSON payload of the GitHub issues API: a list of
    issue dicts.  Existing rows (matched by issue id + number) have their
    mutable fields refreshed; unknown issues are inserted, creating the
    author's User_Org_Info row on demand first.
    """
    for item in html:  # iterate issue records
        try:
            # GitHub lists pull requests in the issues API too; those
            # entries carry a 'pull_request' key and must be excluded.
            # ('in' replaces py2-only dict.has_key — same behavior.)
            if 'pull_request' in item:
                continue
            issue_id = item['id']
            issue_number = item['number']
            # Encode the textual state as the integer the DB stores:
            # 0 = open, 1 = closed, 2 = anything else.
            state = item['state']
            if state == 'open':
                issue_state = 0
            elif state == 'closed':
                issue_state = 1
            else:
                issue_state = 2
            issue_create_time = item['created_at']
            issue_update_time = item['updated_at']
            issue_close_time = item['closed_at']
            issue_comment_count = item['comments']
            issue_user_type = item['author_association']
            issue_user_id = item['user']['id']
            # Make sure the issue author exists locally; fetch on demand.
            user = github.models.User_Org_Info.objects.filter(
                user_id=issue_user_id)
            if not user:
                downloadBaseInfo.getUserInfo(item['user'])
                user = github.models.User_Org_Info.objects.filter(
                    user_id=issue_user_id)
            repo = github.models.Repo_Base_Info.objects.filter(
                repo_id=self.data['id'])
            try:
                issues = github.models.Repo_Issue_info.objects.filter(
                    Q(issue_id=issue_id) & Q(issue_number=issue_number))
                if issues and len(issues) > 0:
                    # Bind the instance once instead of re-indexing the
                    # queryset per assignment, then refresh mutable fields.
                    existing = issues[0]
                    existing.issue_state = issue_state
                    existing.issue_update_time = issue_update_time
                    existing.issue_close_time = issue_close_time
                    existing.issue_comment_count = issue_comment_count
                    existing.save()
                else:
                    issues_new = github.models.Repo_Issue_info.objects.create(
                        repo=repo[0], user=user[0], issue_id=issue_id,
                        issue_number=issue_number,
                        issue_create_time=issue_create_time,
                        issue_update_time=issue_update_time,
                        issue_close_time=issue_close_time,
                        issue_comment_count=issue_comment_count,
                        issue_user_type=issue_user_type,
                        issue_state=issue_state)
                    issues_new.save()
            except BaseException as ex:
                # Was a silent `pass`; at least record the DB failure so
                # lost issues are diagnosable (matches the outer handler).
                log.set_log(ex)
        except BaseException as e:
            log.set_log(e)
def run(self):
    """Worker loop for the pull-request crawler thread.

    Repeatedly takes a page number off pageQueue, fetches that page of the
    repo's pull requests (all states, 100 per page), and pushes any
    non-empty JSON payload onto dataQueue.  Stops when CRAWL_EXIT is set or
    pageQueue is exhausted (Queue.Empty falls through to the log handler).
    """
    try:
        while not CRAWL_EXIT:
            page_no = self.pageQueue.get(False)
            # Strip the "{/number}" template suffix from pulls_url.
            base = self.data["pulls_url"][:-9]
            url = base + "?state=all&per_page=100&page=" + str(page_no)
            payload = json.loads(requests.get(url, headers=headers).text)
            # Skip empty pages so consumers only ever see real data.
            if payload:
                self.dataQueue.put(payload)
    except BaseException as e:
        log.set_log(e)
def run(self):
    """Worker loop for the contributor/commit crawler thread.

    Takes page numbers off pageQueue and fetches contributor pages
    (including anonymous ones) for the repo in self.data.  For each
    contributor it ensures a User_Org_Info row exists (anonymous
    contributors get a synthesised numeric id from random digits plus a
    millisecond timestamp), then reads the first and last pages of that
    contributor's commits to record first/last commit times and the commit
    count on Repo_Developer_info.  Also accumulates the shared global
    commit_total_count under `lock`.

    NOTE(review): reconstructed formatting of a whitespace-mangled original;
    statement nesting around the final upsert was inferred — verify against
    version history.  Relies on Python 2 (`has_key`, `urllib2`, integer `/`).
    """
    try:
        repo_id = self.data['id']
        global commit_total_count
        #repo_owner = github.models.Repo_Base_Info.objects.get(repo_id=repo_id)  # fetch repo row
        # Strategy: query contributors first, iterate them for basic data,
        # then use commits?author= to look up each one's individual commits.
        #contributor_url = self.data['contributors_url']+"?anon=1"
        #flag = True
        while not CRAWL_EXIT:
            page = self.pageQueue.get(False)
            contributor_url = self.data['contributors_url'] + "?anon=1&per_page=100&page=" + str(page)
            contributor_html = requests.get(contributor_url, headers=headers).text
            contributor_info = json.loads(contributor_html)
            if (contributor_info):
                for i in range(len(contributor_info)):  # iterate contributor records
                    #time.sleep(1)
                    v = contributor_info[i]
                    contributor_contributions = v['contributions']
                    # Named (non-anonymous) contributor: has an 'id' key.
                    if(contributor_info[i].has_key('id')):
                        contributor_id = v['id']
                        contributor_name = v['login']
                        contributor_avatar_url = v['avatar_url']
                        contributor_type = v['type']
                        try:
                            contributor_user = github.models.User_Org_Info.objects.get(user_id=contributor_id)  # look up existing user row
                        except github.models.User_Org_Info.DoesNotExist:
                            user_new = github.models.User_Org_Info.objects.create(user_id=contributor_id, user_name=contributor_name, avatar_url=contributor_avatar_url, user_type=contributor_type)
                            user_new.save()
                        contributor_commit_user = contributor_name
                    # Anonymous contributor: only email/name are available.
                    else:
                        contributor_email = v['email']
                        contributor_fullname = v['name']
                        contributor_type = v['type']
                        try:
                            contributor_user = github.models.User_Org_Info.objects.get(email_url=contributor_email)  # look up by email
                            contributor_id = contributor_user.user_id
                        except github.models.User_Org_Info.DoesNotExist:
                            # Synthesise an id: '9' + 5 random digits + last
                            # 9 digits of a millisecond timestamp.
                            random_id = ''.join(str(random.choice(range(10))) for _ in range(5))  # random digits
                            t = time.time()  # current timestamp
                            time_stamp =str((int(round(t * 1000))))
                            time_stamp = time_stamp[-9:]
                            contributor_id = '9'+random_id+time_stamp
                            user_new = github.models.User_Org_Info.objects.create(user_id = contributor_id, email_url=contributor_email, user_fullname=contributor_fullname, user_type=contributor_type)
                            user_new.save()
                        contributor_commit_user = contributor_email
                    # Re-query as querysets for the developer-row lookup; a
                    # commented-out check used to skip unchanged counts.
                    contributor_user = github.models.User_Org_Info.objects.filter(user_id=contributor_id)
                    contributor_repo = github.models.Repo_Base_Info.objects.filter(repo_id=repo_id)
                    developer = github.models.Repo_Developer_info.objects.filter(
                        Q(repo=contributor_repo) & Q(user=contributor_user))  # does the user/org row exist?
                    #if(developer and developer[0].user_commit_count == contributor_contributions):
                    #    continue
                    # Read the first and last commit pages to obtain the
                    # last and first commit times respectively.
                    commit_user_url_first = self.data["commits_url"][0:-6] + "?author=" + contributor_commit_user
                    commit_user_req = urllib2.Request(commit_user_url_first, headers=headers)
                    commit_user_res = urllib2.urlopen(commit_user_req)
                    commit_user_html = commit_user_res.read()
                    commit_user_info = json.loads(commit_user_html)
                    if(commit_user_info and len(commit_user_info)>0 ):
                        if(commit_user_info[0].has_key('commit')):
                            user_last_update_time = commit_user_info[0]['commit']['author']['date']
                        #if (developer):
                        #    user_creat_time = developer[0].user_creat_time
                        #else:
                        # Page count at 30 commits per page (py2 integer
                        # division; remainder bumps the count by one).
                        commit_page = contributor_contributions/30
                        commit_page_yu = contributor_contributions%30
                        if(commit_page_yu!=0):
                            commit_page = commit_page+1
                        if(commit_page==0):  # only a single page of commits
                            if (commit_user_info[len(commit_user_info)-1].has_key('commit')):
                                user_creat_time = commit_user_info[len(commit_user_info)-1]['commit']['author']['date']
                            lock.acquire()
                            commit_total_count += len(commit_user_info)
                            lock.release()
                        else:
                            commit_user_url_last = self.data["commits_url"][0:-6] + "?author=" + contributor_commit_user + "&page=" + str(commit_page)
                            commit_user_last_req = urllib2.Request(commit_user_url_last, headers=headers)
                            commit_user_last_res = urllib2.urlopen(commit_user_last_req)
                            commit_user_last_html = commit_user_last_res.read()
                            commit_user_last_info = json.loads(commit_user_last_html)
                            if (commit_user_last_info and len(commit_user_last_info) > 0):
                                if (commit_user_last_info[len(commit_user_last_info) - 1].has_key('commit')):
                                    user_creat_time = commit_user_last_info[len(commit_user_last_info) - 1]['commit']['author']['date']
                                lock.acquire()
                                commit_total_count +=(commit_page-1)*30 + len(commit_user_last_info)
                                lock.release()
                        # Upsert the developer row.
                        try:
                            developer = github.models.Repo_Developer_info.objects.get(
                                Q(repo = contributor_repo) & Q(user=contributor_user))  # does the row exist?
                            developer.user_creat_time = user_creat_time
                            developer.user_last_update_time = user_last_update_time
                            developer.user_commit_count = contributor_contributions
                            developer.save()
                        except github.models.Repo_Developer_info.DoesNotExist:
                            developer_new = github.models.Repo_Developer_info.objects.create(repo=contributor_repo[0], user=contributor_user[0], user_creat_time=user_creat_time, user_last_update_time=user_last_update_time, user_commit_count=contributor_contributions)
                            developer_new.save()
            # end of contributor loop
            # The thread named 'tr2' additionally prepares the issue URL.
            # NOTE(review): issue_url is assigned but never used here —
            # possibly dead code or handled elsewhere; confirm.
            if(threading.current_thread().getName()=='tr2'):
                issue_url = self.data["issues_url"][0:-9]+"?filter=all"
    except BaseException as e:
        log.set_log(e)
# HTML scaffolding wrapped around every served page.
pre = '<html><body>\n<h1>Yorg</h1>\n'
post = '\n</body>\n</html>'

def bld_page(page):
    # Wrap raw page content in the shared HTML skeleton.
    return pre + page + post

# Pre-render the static pages once at import time (the raw page bodies are
# defined earlier in the file, outside this view).
emptypage = bld_page(emptypage)
activationpage = bld_page(activationpage)
resetpage = bld_page(resetpage)
resetpage_ok = bld_page(resetpage_ok)
resetpage_ko = bld_page(resetpage_ko)
set_log('yorg_server_web')

class RequestHandler(BaseHTTPRequestHandler):
    # HTTP handler backed by the 'yorg' database frontend.

    def __init__(self, request, client_address, server):
        # One DBFrontend per request — BaseHTTPRequestHandler instantiates a
        # handler per connection, so this opens the DB on every request.
        self.db = DBFrontend('yorg')
        BaseHTTPRequestHandler.__init__(self, request, client_address, server)

    def do_GET(self):
        # Resolve the requested path to a page; 404 when nothing matches.
        parsed_path = urlparse(self.path)
        # NOTE(review): self.bld_page takes (path, query) — different arity
        # from the module-level bld_page(page) above; presumably the class
        # defines its own bld_page outside this view — confirm.
        page = self.bld_page(parsed_path.path, parsed_path.query)
        if not page:
            self.send_error(404)
            return
        self.send_response(200)
        self.send_header('Content-Type', 'text/html')
        # NOTE(review): method appears truncated here — no end_headers() or
        # body write is visible in this chunk.
from log import set_log, Logger

if __name__ == '__main__':
    # Install the project-wide logger before anything else runs.
    set_log(Logger())
    # TODO main.py
    # TODO run project
    # TODO MQTT Subscriber