def run(self):
    """Drain the work queue, storing every page of reactions per issue.

    Each work item is a dict providing "number", "owner" and "repo".  For
    every item the GitHub issue-reactions endpoint is paged through until an
    empty page is returned.  Every non-empty page is dumped to a JSON file
    below ``base_path`` via ``base.generate_file`` and each reaction is
    inserted into the ``github_reaction`` table (one commit per row).

    The method returns, after closing the cursor and the connection, as soon
    as the queue reports ``Queue.Empty``.  Every item taken from the queue is
    acknowledged with ``task_done()`` exactly once, including when its
    processing fails unexpectedly, so ``q.join()`` cannot hang on it.
    """
    # get db connection
    db = connectMysqlDB(config, autocommit=False)
    cur = db.cursor()

    while True:
        fetched = False  # True once an item has really been taken
        try:
            print("")
            work = self.q.get(timeout=0)
            fetched = True
            print("the number of work in queue: " + str(self.q.qsize()))

            number = work["number"]
            owner = work["owner"]
            repo = work["repo"]
            page = 1
            total = 0  # number of reactions seen for this issue

            # every issue has several pages of reactions
            while True:
                # get a suitable token and combine header
                github_token = get_token()
                headers = {
                    'User-Agent': 'Mozilla/5.0',
                    'Authorization': 'token ' + github_token,
                    'Content-Type': 'application/json',
                    'method': 'GET',
                    'Accept': 'application/vnd.github.squirrel-girl-preview+json'
                }

                # combine url
                url = ("https://api.github.com/repos/" + owner + "/" + repo +
                       "/issues" + "/" + str(number) + "/reactions")
                url = url + "?page=" + str(page)
                print("url is: " + url)

                try:
                    # request data and parse response; the response object
                    # is always closed so the socket is not leaked
                    req = urllib2.Request(url=url, headers=headers)
                    response = urllib2.urlopen(req)
                    try:
                        result = json.loads(response.read().decode("utf-8"))
                    finally:
                        response.close()

                    length = len(result)
                    total += length
                    if length == 0:
                        # an empty page marks the end of the listing
                        print("finish, comment " + str(number) +
                              " has reactions: " + str(total))
                        break

                    # write file
                    json_str = json.dumps(result)
                    filename = (base_path + "/" + owner + "&" + repo + "/" +
                                str(number) + "/" + str(page) + ".json")
                    flag = base.generate_file(filename, json_str)
                    if flag is True:
                        print("create file successfully: " + filename)
                    elif flag is False:
                        print("file is already existed: " + filename)
                    else:
                        # str() keeps the message printable even when the
                        # helper hands back a non-string error value
                        print("create file failed: " + str(flag) +
                              " filename: " + filename)
                        # NOTE(review): the same page is requested again
                        # without any limit; a persistent file error never
                        # ends.  Kept as in the original - confirm intent.
                        continue

                    page += 1

                    # handle response json data
                    for entry in result:
                        # "user" may be absent or null; treat both as
                        # "no login" instead of aborting the whole page
                        user = entry.get("user") or {}
                        insert_dict = {
                            "id": entry.get("id"),
                            "user_login": user.get("login"),
                            "created_at": entry.get("created_at"),
                            "content": entry.get("content"),
                        }
                        print("insert info: " + str(insert_dict))

                        # insert data to database table; a failing row is
                        # reported and must not stop the remaining rows
                        try:
                            cur.execute(
                                "insert into github_reaction "
                                "(id, number, user_login, owner_login, repo, created_at, flag, content) "
                                "values (%s, %s, %s, %s, %s, %s, %s, %s)",
                                (insert_dict["id"], number,
                                 insert_dict["user_login"], owner, repo,
                                 base.time_handler(insert_dict["created_at"]),
                                 1, insert_dict["content"]))
                            db.commit()
                        except Exception as e:
                            print(str(e))

                except urllib2.HTTPError as e:
                    print(str(e) + " error with this page: " + url)
                    if e.code in (404, 410):
                        # the issue is missing or gone: retrying the same
                        # url can never succeed, so give this item up
                        break
                    # other codes (mainly 403, sometimes 503) are retried
                    # with whatever token get_token() hands out next
                except Exception as e:
                    print(str(e) + " error with this page: " + url)

        except Queue.Empty:
            cur.close()
            db.close()
            return
        except Exception as e:
            # unexpected error, don't interrupt the program
            print(str(e) + "qiubing")
        finally:
            if fetched:
                self.q.task_done()
def run(self):
    """Fetch all contributor pages of one repository and store them.

    A single work item (a dict providing "repo_id", "owner" and "repo") is
    taken from the queue.  The GitHub contributors endpoint (``anon=true``)
    is paged through until an empty page is returned.  Every non-empty page
    is dumped to a JSON file below ``base_path`` via ``base.generate_file``
    and each contributor is inserted into the ``github_contributor`` table
    (one commit per row).

    The cursor and the connection are closed on every way out of the method.

    Raises:
        Queue.Empty: when no work item is available; the initial ``get`` is
            not guarded.
    """
    work = self.q.get(timeout=0)
    print("the number of work in queue: " + str(self.q.qsize()))

    repo_id = work["repo_id"]
    owner = work["owner"]
    repo = work["repo"]
    page = 1
    total = 0  # number of contributors seen for this repository

    # get db connection
    db = connectMysqlDB(config, autocommit=False)
    cur = db.cursor()

    try:
        while True:
            print("")
            try:
                # get a suitable token and combine header
                github_token = get_token()
                headers = {
                    'User-Agent': 'Mozilla/5.0',
                    'Authorization': 'token ' + github_token,
                    'Content-Type': 'application/json',
                    'method': 'GET',
                    'Accept': 'application/vnd.github.squirrel-girl-preview+json'
                }

                # combine url, notice: per page is 30
                url = ("https://api.github.com/repos/" + owner + "/" + repo +
                       "/contributors")
                url = url + "?anon=true" + "&page=" + str(page)
                print("url is: " + url)

                try:
                    # request data and parse response; the response object
                    # is always closed so the socket is not leaked
                    req = urllib2.Request(url=url, headers=headers)
                    response = urllib2.urlopen(req)
                    try:
                        body = response.read().decode("utf-8")
                    finally:
                        response.close()

                    # an empty body (e.g. a 204 reply) is an empty listing;
                    # json.loads("") would raise on every retry otherwise
                    result = json.loads(body) if body.strip() else []

                    # judge response info empty
                    length = len(result)
                    total += length
                    if length == 0:
                        print("finish & the sum of issue of pull request is: " +
                              str(total))
                        self.q.task_done()
                        return

                    # write file
                    json_str = json.dumps(result)
                    filename = (base_path + "/" + owner + "&" + repo + "&" +
                                str(repo_id) + "/" + str(page) + ".json")
                    flag = base.generate_file(filename, json_str)
                    if flag is True:
                        print("create file successfully: " + filename)
                    elif flag is False:
                        print("file is already existed: " + filename)
                    else:
                        # str() keeps the message printable even when the
                        # helper hands back a non-string error value
                        print("create file failed: " + str(flag) +
                              " filename: " + filename)
                        # NOTE(review): the same page is requested again
                        # without any limit - kept as in the original.
                        continue

                    page += 1

                    # handle response json data
                    for entry in result:
                        insert_dict = {
                            "id": entry.get("id"),
                            "login": entry.get("login"),
                            "type": entry.get("type"),
                            "contributions": entry.get("contributions"),
                        }
                        print("insert info: " + str(insert_dict))

                        # insert data to database table; if one insert
                        # fails, report it and keep on with the next row
                        try:
                            cur.execute(
                                "insert into github_contributor "
                                "(id, login, owner_login, repo_id, repo, type, contributions) "
                                "values (%s, %s, %s, %s, %s, %s, %s)",
                                (insert_dict["id"], insert_dict["login"],
                                 owner, repo_id, repo, insert_dict["type"],
                                 insert_dict["contributions"]))
                            db.commit()
                        except Exception as e:
                            print(str(e))

                except urllib2.HTTPError as e:
                    print(str(e.code) + " error with this page: " + url)
                    if e.code == 404:
                        # the repository cannot be found: requesting the
                        # same url again can never succeed, so finish here
                        self.q.task_done()
                        return
                    # mainly 403, sometimes 503: token rate limit
                    self.q.put(work)  # put into the queue again
                    # set sleep time for that token
                    sleep_time_tokens[github_token] = time.time()
                    # NOTE(review): the item is re-queued AND this loop goes
                    # on retrying the same page, so the work may be handled
                    # twice.  Kept as in the original - confirm which of the
                    # two is intended.
            except Exception as e:
                # unexpected error, don't interrupt the program
                print(str(e))
    finally:
        # close the db connection on every exit path
        cur.close()
        db.close()