def update_github_user_table():
    """Sync github_user.has_sponsors_listing from downloaded listing JSON.

    Reads every file under paths.github_all_user_sponsor_listing_info and,
    per user, sets has_sponsors_listing TRUE when the JSON shows a sponsors
    listing, FALSE otherwise.
    """
    # get db connection (explicit commit after each update)
    db = base.connectMysqlDB(config, autocommit=False)
    cur = db.cursor()
    filenames = base.read_all_filename_none_path(
        paths.github_all_user_sponsor_listing_info)
    for filename in filenames:
        # read data from file
        file_path = paths.github_all_user_sponsor_listing_info + "/" + filename
        text = base.get_info_from_file(file_path)
        if text is False:
            logging.fatal("file not existed: " + file_path)
            continue
        obj = json.loads(text)
        logging.info("read file: " + file_path)
        user = obj["data"]["user"]
        # BUG FIX: the original tested `["hasSponsorsListing"] == True`,
        # i.e. a literal one-element list, which is always False; test the
        # field on the user object instead.
        has_listing = user["sponsorsListing"] is not None or \
            user.get("hasSponsorsListing") is True
        try:
            # parameterized so login values cannot break or inject SQL
            # (the original interpolated the login into the statement)
            cur.execute(
                "update github_user set has_sponsors_listing=%s "
                "where login=%s",
                (has_listing, user["login"]))
            db.commit()
        except Exception as e:
            logging.fatal(e)
    cur.close()
    db.close()
def get_all_user_earliest_maintainer_time(
        month, sql=sql.all_user_earliest_maintainer_time):
    # Log, for every user, a time window centered on that user's earliest
    # maintainer (sponsorship) time: [earliest - month, earliest + month].
    # `month` is an offset added to/subtracted from a unix timestamp, so it
    # is presumably a duration in seconds despite the name -- TODO confirm.
    # NOTE(review): the default argument is read from the `sql` module at
    # import time, and the parameter then shadows that module in the body.
    # create database connection
    db = base.connectMysqlDB(config)
    cur = db.cursor()
    # get get_all_user_earliest_maintainer_time
    sql1 = sql
    cur.execute(sql1)
    items = cur.fetchall()
    logging.info(str(items))
    # generate time interval
    # each entry: [login, start_time_str, earliest_datetime, end_time_str]
    users_time_interval = []
    for item in items:
        start_time = base.timestamp_to_time(
            base.datetime_to_timestamp(item[1]) - month)
        end_time = base.timestamp_to_time(
            base.datetime_to_timestamp(item[1]) + month)
        users_time_interval.append([item[0], start_time, item[1], end_time])
    # generate activity change
    # NOTE(review): `users_acticity_change` (sic) is never populated, and
    # item[1]/item[3] are already strings produced by timestamp_to_time yet
    # are passed through timestamp_to_time again -- verify these conversions
    # are intended before relying on the logged values.
    users_acticity_change = []
    for item in users_time_interval:
        first_time = base.timestamp_to_time(item[1])
        mid_time = base.timestamp_to_time(item[2])
        last_time = base.timestamp_to_time(item[3])
        logging.info("first_time: " + first_time + ", mid_time: " + mid_time +
                     ", last_time: " + last_time)
    # close this database connection
    cur.close()
    db.close()
def writeUserRepository(path, sql):
    """Collect every login returned by `sql`, queue them up, and process the
    queue with THREAD_NUM writeUserRepositoryThread workers."""
    global base_path
    base_path = path
    workQueue = Queue.Queue()
    # one short-lived connection just to read the task list
    db = base.connectMysqlDB(config)
    cur = db.cursor()
    cur.execute(sql)
    unhandled_tasks = [{"login": row[0]} for row in cur.fetchall()]
    logging.info("finish reading database")
    logging.info("%d tasks left for handling" % (len(unhandled_tasks)))
    cur.close()
    db.close()
    if not unhandled_tasks:
        logging.warn("finish")
        return
    for task in unhandled_tasks:
        workQueue.put_nowait(task)
    for _ in range(THREAD_NUM):
        writeUserRepositoryThread(workQueue).start()
    workQueue.join()
    logging.info("finish")
def run(self):
    """Worker: for each queued login, load every JSON file under
    base_path/<login> and insert its sponsorshipsAsMaintainer edges into
    github_sponsorships_as_maintainer."""
    while not self.q.empty():
        work = self.q.get(timeout=0)
        logging.info("the number of work in queue: " + str(self.q.qsize()))
        login = work["login"]
        # get db connection
        db = base.connectMysqlDB(config, autocommit=False)
        cur = db.cursor()
        try:
            directory = base_path + "/" + login
            files = os.listdir(directory)
            for file in files:
                file_path = directory + "/" + file
                text = base.get_info_from_file(file_path)
                if text is False:
                    logging.warn("file not existed: " + file_path)
                    continue
                obj = json.loads(text)
                logging.info("read file: " + file_path)
                count = 1
                for edge in obj["data"]["user"][
                        "sponsorshipsAsMaintainer"]["edges"]:
                    if edge["node"]["privacyLevel"] == "PRIVATE":
                        # private sponsorship: sponsor identity unavailable
                        cur.execute(
                            "insert into github_sponsorships_as_maintainer "
                            "(login, flag, created_at) "
                            "values (%s, %s, %s)",
                            (obj["data"]["user"]["login"], base.flag2,
                             base.time_handler(edge["node"]["createdAt"])))
                    else:
                        # flag0 when the sponsor entity carries a "company"
                        # key, flag1 otherwise
                        if "company" in edge["node"]["sponsorEntity"]:
                            flag = base.flag0
                        else:
                            flag = base.flag1
                        cur.execute(
                            "insert into github_sponsorships_as_maintainer "
                            "(login, sponsor_login, flag, created_at) "
                            "values (%s, %s, %s, %s)",
                            (obj["data"]["user"]["login"],
                             edge["node"]["sponsorEntity"]["login"], flag,
                             base.time_handler(edge["node"]["createdAt"])))
                    db.commit()
                    logging.info("the " + str(count) + "th record in file: " +
                                 file_path)
                    count += 1
        except Exception as e:
            logging.fatal(e)
        finally:
            # BUG FIX: task_done/close originally ran only on success, so an
            # exception leaked the connection and stalled q.join() forever.
            self.q.task_done()
            cur.close()
            db.close()
def run(self):
    """Worker: for each queued login, read base_path/<login>.json and insert
    the user's profile row into github_user."""
    while not self.q.empty():
        work = self.q.get()
        logging.info("the number of work in queue: " + str(self.q.qsize()))
        login = work["login"]
        # get db connection
        db = base.connectMysqlDB(config, autocommit=False)
        cur = db.cursor()
        # read data from file
        file = base_path + "/" + login + ".json"
        text = base.get_info_from_file(file)
        if text is False:
            logging.warn("file not existed: " + file)
        else:
            obj = json.loads(text)
            logging.info("writing login data: " + login)
            user = obj["data"]["user"]
            # DEDUP FIX: the two original branches were byte-identical except
            # for the final "1"/"0" has_sponsors_listing value.
            has_listing = "1" if user["hasSponsorsListing"] is True else "0"
            cur.execute(
                "insert into github_user "
                "(database_id, login, name, email,spon_maintainer_count,"
                " spon_sponsor_count, created_at, updated_at, has_sponsors_listing) "
                "values (%s, %s, %s, %s, %s, %s, %s, %s, %s)",
                (user["databaseId"], user["login"], user["name"],
                 user["email"],
                 user["sponsorshipsAsMaintainer"]["totalCount"],
                 user["sponsorshipsAsSponsor"]["totalCount"],
                 base.time_handler(user["createdAt"]),
                 base.time_handler(user["updatedAt"]), has_listing))
            db.commit()
            logging.info(login +
                         " ~~~~~~~~~ data commit into dababase success!!")
        self.q.task_done()
        cur.close()
        db.close()
def run(self):
    """Worker: for each queued login, read base_path/<login>.json and insert
    the user's sponsors listing into github_sponsor_listing (skipping users
    without a listing and rows already present)."""
    while not self.q.empty():
        work = self.q.get(timeout=0)
        logging.info("the number of work in queue: " + str(self.q.qsize()))
        login = work["login"]
        # get db connection
        db = base.connectMysqlDB(config, autocommit=False)
        cur = db.cursor()
        try:
            file = base_path + "/" + login + ".json"
            text = base.get_info_from_file(file)
            if text is False:
                logging.warn("file not existed: " + file)
            else:
                obj = json.loads(text)
                listing = obj["data"]["user"]["sponsorsListing"]
                if listing is None:
                    logging.info("user: " + login + " don't create sponsors")
                else:
                    # SECURITY FIX: parameterized; the original interpolated
                    # the login into the SELECT string.
                    cur.execute(
                        "SELECT * FROM github_sponsor_listing WHERE login=%s",
                        (login,))
                    items = cur.fetchall()
                    if len(items) == 1:
                        logging.info("user: " + login +
                                     " had been inserted into database!")
                    else:
                        cur.execute(
                            "insert into github_sponsor_listing "
                            "(login, slug, name, tiers_total_count, created_at, short_description) "
                            "values (%s, %s, %s, %s, %s, %s)",
                            (obj["data"]["user"]["login"], listing["slug"],
                             listing["name"],
                             listing["tiers"]["totalCount"],
                             base.time_handler(listing["createdAt"]),
                             listing["shortDescription"]))
                        db.commit()
                        logging.info(
                            login +
                            " ~~~~~~~~~ data commit into dababase success!!"
                        )
        except Exception as e:
            logging.fatal(e)
            return
        finally:
            # BUG FIX: cleanup originally ran only on success; the failing
            # path returned with the task not done and the connection open.
            self.q.task_done()
            cur.close()
            db.close()
def run(self):
    """Worker: for each queued login, read base_path/<login>.json and insert
    every sponsor-listing tier into github_sponsor_listing_tiers."""
    while not self.q.empty():
        work = self.q.get(timeout=0)
        logging.info("the number of work in queue: " + str(self.q.qsize()))
        login = work["login"]
        # get db connection
        db = base.connectMysqlDB(config, autocommit=False)
        cur = db.cursor()
        try:
            file = base_path + "/" + login + ".json"
            text = base.get_info_from_file(file)
            if text is False:
                logging.warn("file not existed: " + file)
            else:
                obj = json.loads(text)
                listing = obj["data"]["user"]["sponsorsListing"]
                if listing is not None:
                    logging.info(login + " ~~~~~~~~~ has " +
                                 str(listing["tiers"]["totalCount"]) +
                                 " tiers")
                    count = 1
                    for edge in listing["tiers"]["edges"]:
                        cur.execute(
                            "insert into github_sponsor_listing_tiers "
                            "(login, slug, monthly_price_in_cents, monthly_price_in_dollars, name, created_at, updated_at, description) "
                            "values (%s, %s, %s, %s, %s, %s, %s, %s)",
                            (obj["data"]["user"]["login"], listing["slug"],
                             edge["node"]["monthlyPriceInCents"],
                             edge["node"]["monthlyPriceInDollars"],
                             edge["node"]["name"],
                             base.time_handler(edge["node"]["createdAt"]),
                             base.time_handler(edge["node"]["updatedAt"]),
                             edge["node"]["description"]))
                        db.commit()
                        count += 1
                else:
                    logging.warn("login: " + login +
                                 " don't have sponsor_listing")
                    logging.warn("sponsor_listing: " + str(obj))
        except Exception as e:
            # BUG FIX: the original logged the same exception twice
            # (logging.fatal then logging.error); one record is enough.
            logging.fatal(e)
        finally:
            # BUG FIX: task_done/close originally ran only on success,
            # leaking the connection and stalling q.join() on error.
            self.q.task_done()
            cur.close()
            db.close()
def run(self):
    """Worker: for each queued login, read every file under base_path/<login>
    and insert the user's commit comments into github_commit_comment."""
    while not self.q.empty():
        work = self.q.get(timeout=0)
        logging.info("the number of work in queue: " + str(self.q.qsize()))
        login = work["login"]
        # get db connection
        db = base.connectMysqlDB(config, autocommit=False)
        cur = db.cursor()
        try:
            directory = base_path + "/" + login
            files = base.read_all_filename_in_directory(directory)
            for file in files:
                text = base.get_info_from_file(file)
                if text is False:
                    logging.warn("file not existed: " + file)
                    continue
                obj = json.loads(text)
                logging.info("read file: " + file)
                count = 1
                # result pages past the last one carry no "edges" key
                if "edges" not in obj["data"]["user"]["commitComments"]:
                    continue
                for node in obj["data"]["user"]["commitComments"]["edges"]:
                    logging.info("the " + str(count) + "th record in file: " +
                                 file)
                    # the referenced commit may have been deleted upstream
                    if node["node"]["commit"] is not None:
                        oid = node["node"]["commit"]["oid"]
                    else:
                        oid = ""
                    cur.execute(
                        "insert into github_commit_comment "
                        "(comm_database_id, login, created_at, updated_at, body, commit_oid) "
                        "values (%s, %s, %s, %s, %s, %s)",
                        (node["node"]["databaseId"],
                         obj["data"]["user"]["login"],
                         base.time_handler(node["node"]["createdAt"]),
                         base.time_handler(node["node"]["updatedAt"]),
                         node["node"]["body"], oid))
                    db.commit()
                    count += 1
        except Exception as e:
            logging.fatal(e)
            return
        finally:
            # BUG FIX: cleanup/task_done originally ran only on success;
            # an exception leaked the connection and hung q.join().
            self.q.task_done()
            cur.close()
            db.close()
def update_github_user_flag(login, flag):
    """Set github_user.flag = `flag` for the row matching `login`.

    The statement is parameterized, so login/flag values can no longer
    break or inject into the SQL.
    """
    # create database connection
    db = base.connectMysqlDB(config)
    cur = db.cursor()
    # BUG FIX: the original concatenated flag and login into the SQL string
    # (injection-prone) and then called fetchall() on an UPDATE, which
    # produces no result set; the log message also claimed "reading".
    cur.execute("update github_user set flag=%s where login=%s",
                (flag, login))
    logging.info("finish updating database")
    # close this database connection
    cur.close()
    db.close()
def updateGithubSponsorshipsAsMaintainer(login, flag):
    """Set github_sponsorships_as_maintainer.flag = `flag` for rows whose
    sponsor_login matches `login`."""
    # create database connection
    db = base.connectMysqlDB(config)
    cur = db.cursor()
    # BUG FIX: the original did `"... " + flag + " ..."`, which raises
    # TypeError for non-string flags, built the SQL by concatenation
    # (injection-prone), and called fetchall() on an UPDATE, which
    # produces no result set.
    cur.execute(
        "update github_sponsorships_as_maintainer "
        "set flag=%s where sponsor_login=%s", (flag, login))
    logging.info("finish updating database")
    # close this database connection
    cur.close()
    db.close()
def run(self):
    # Worker thread body: drain the shared queue; for each login, read every
    # file under base_path/<login> and insert the user's pull-request-review
    # contributions into github_user_pr_review.
    while not self.q.empty():
        work = self.q.get(timeout=0)
        logging.info("the number of work in queue: " + str(self.q.qsize()))
        login = work["login"]
        # get db connection
        db = base.connectMysqlDB(config, autocommit=False)
        cur = db.cursor()
        # read data from file
        try:
            directory = base_path + "/" + login
            files = base.read_all_filename_in_directory(directory)
            for file in files:
                text = base.get_info_from_file(file)
                if text is False:
                    logging.warn("file not existed: " + file)
                else:
                    obj = json.loads(text)
                    logging.info("read file: " + file)
                    count = 1
                    for node in obj["data"]["user"][
                            "contributionsCollection"][
                                "pullRequestReviewContributions"]["edges"]:
                        try:
                            # maybe happen duplicate key when insert data
                            # (hence "insert ignore"); per-row errors are
                            # logged and the loop continues
                            cur.execute(
                                "insert ignore into github_user_pr_review "
                                "(pr_database_id, login, created_at, body) "
                                "values (%s, %s, %s, %s)",
                                (node["node"]["pullRequestReview"]
                                 ["databaseId"], node["node"]
                                 ["pullRequestReview"]["author"]["login"],
                                 base.time_handler(
                                     node["node"]["pullRequestReview"]
                                     ["createdAt"]), node["node"]
                                 ["pullRequestReview"]["body"]))
                            db.commit()
                            # logging.info("the " + str(count) + "th record in file: " + file)
                        except Exception as e:
                            logging.error(e)
                        count += 1
            # NOTE(review): task_done/close run only on the success path; an
            # exception in the outer try skips them, leaking the connection
            # and stalling q.join() -- consider moving them to a finally.
            self.q.task_done()
            cur.close()
            db.close()
        except Exception as e:
            logging.fatal(e)
            return
def updateGithubUserFlag(login, flag):
    """Set github_user.flag = `flag` for the row matching `login`."""
    # create database connection
    db = base.connectMysqlDB(config)
    cur = db.cursor()
    # BUG FIX: `"... " + flag + " ..."` raises TypeError for int flags (the
    # log below already needed str(flag)), the SQL was built by
    # concatenation (injection-prone), and fetchall() after an UPDATE
    # returns nothing.
    cur.execute("update github_user set flag=%s where login=%s",
                (flag, login))
    logging.info("update successfully! login: " + login + ", flag: " +
                 str(flag))
    # close this database connection
    cur.close()
    db.close()
def run(self):
    """Worker: for each queued login, read every contributions-calendar file
    under base_path/<login> and insert per-day contribution counts into
    github_user_commits_per_day."""
    while not self.q.empty():
        work = self.q.get(timeout=0)
        logging.info("the number of work in queue: " + str(self.q.qsize()))
        login = work["login"]
        # get db connection
        db = base.connectMysqlDB(config, autocommit=False)
        cur = db.cursor()
        try:
            directory = base_path + "/" + login
            files = os.listdir(directory)
            for file in files:
                file_path = directory + "/" + file
                text = base.get_info_from_file(file_path)
                logging.info("login: " + login + ", being handle file: " +
                             file_path)
                if text is False:
                    logging.warn("file not existed: " + file_path)
                    continue
                obj = json.loads(text)
                count = 1
                for week in obj["data"]["user"]["contributionsCollection"][
                        "contributionCalendar"]["weeks"]:
                    for day in week["contributionDays"]:
                        # insert ignore: reruns may revisit the same days
                        cur.execute(
                            "insert ignore into github_user_commits_per_day "
                            "(login, date, weekday, contribution_count) "
                            "values (%s, %s, %s, %s)",
                            (obj["data"]["user"]["login"], day["date"],
                             day["weekday"], day["contributionCount"]))
                        db.commit()
                    count += 1
        except Exception as e:
            # BUG FIX: the original concatenated the exception object
            # directly (`"..." + e`), raising TypeError inside the handler;
            # convert explicitly.
            logging.fatal("login: " + login + ", fatal info: " + str(e))
            return
        finally:
            # BUG FIX: cleanup/task_done originally ran only on success.
            self.q.task_done()
            cur.close()
            db.close()
def insert_user_from_json_file():
    """Load sponsorsListing_notnull.json (a list of {"login": ...} objects)
    and insert each login into init_user."""
    # read all the users; `with` closes the handle (the original leaked it)
    with open('sponsorsListing_notnull.json', 'r') as load_f:
        load_list = json.load(load_f)
    # get db connection
    db = base.connectMysqlDB(config, autocommit=False)
    cur = db.cursor()
    # `record` replaces the original name `dict`, which shadowed the builtin
    for record in load_list:
        logging.info(record["login"])
        try:
            # SECURITY FIX: parameterized; the original concatenated the
            # login straight into the INSERT statement.
            cur.execute("insert into init_user (login) values (%s)",
                        (record["login"],))
            db.commit()
        except Exception as e:
            logging.fatal(e)
    cur.close()
    db.close()
def insert_user_from_txt_file():
    """Read users_with_sponsorList.txt (one login per line) and insert each
    login into init_user."""
    # read all the users; `with` closes the handle (the original leaked it)
    with open('users_with_sponsorList.txt', 'r') as f:
        logins = f.read().strip().split("\n")
    # get db connection
    db = base.connectMysqlDB(config, autocommit=False)
    cur = db.cursor()
    for username in logins:
        logging.info(username)
        try:
            # SECURITY FIX: parameterized; the original concatenated the
            # login straight into the INSERT statement.
            cur.execute("insert into init_user (login) values (%s)",
                        (username,))
            db.commit()
        except Exception as e:
            logging.fatal(e)
    cur.close()
    db.close()
def analyze_nums_change(username, sql, compare_name):
    """Compare an activity count before vs after a user's earliest
    sponsorship and render the two sums as a bar chart.

    `sql` must contain three %s placeholders: (username, window_start,
    window_end). `compare_name` becomes the chart title.
    """
    times = get_time_range(username)
    first_time = base.timestamp_to_time(times[0])
    mid_time = base.timestamp_to_time(times[1])
    last_time = base.timestamp_to_time(times[2])
    logging.info("first_time: " + first_time + ", mid_time: " + mid_time +
                 ", last_time: " + last_time)
    # create database connection
    db = base.connectMysqlDB(config)
    cur = db.cursor()
    # get commit sum between first_time and mid_time
    sql1 = sql % (username, first_time, mid_time)
    cur.execute(sql1)
    items = cur.fetchall()
    sum1 = items[0][0]
    # get commit sum between mid_time and last_time
    # BUG FIX: the original used `end_time` here (not defined in this
    # function -- at best a module global in a different time format) while
    # the locally computed `last_time` went unused; the symmetric upper
    # bound of the second window is last_time.
    sql2 = sql % (username, mid_time, last_time)
    cur.execute(sql2)
    items = cur.fetchall()
    sum2 = items[0][0]
    # close this database connection
    cur.close()
    db.close()
    # draw picture
    x1 = ["before sponsoring"]
    y1 = [sum1]
    x2 = ["after sponsoring"]
    y2 = [sum2]
    plt.bar(x1, y1, color='g', align='center')
    plt.bar(x2, y2, color='b', align='center')
    plt.title(compare_name)
    plt.show()
    logging.info("sum1: " + str(sum1) + ", sum2: " + str(sum2))
def get_time_range(username):
    """Return [first_time, mid_time, last_time] unix timestamps centered on
    the user's earliest sponsorship: mid is that sponsorship's time, last is
    derived from the module-level `end_time`, and first mirrors last around
    mid (so the before/after windows are symmetric).

    Raises: whatever the DB layer raised, after logging it. The original
    swallowed the exception and then crashed later with NameError on
    `earliest_sponsor_time`.
    """
    try:
        # create database connection
        db = base.connectMysqlDB(config)
        cur = db.cursor()
        # SECURITY FIX: parameterized; the original interpolated the login
        # into the WHERE clause.
        sql = ("select min(created_at) "
               "from github_sponsorships_as_maintainer "
               "where login=%s")
        cur.execute(sql, (username,))
        times = cur.fetchall()
        earliest_sponsor_time = base.datetime_to_timestamp(times[0][0])
        # close this database connection
        cur.close()
        db.close()
    except Exception as e:
        logging.error("get_time_range failed")
        logging.error(e)
        raise
    last_time = base.time_string_to_timestamp(end_time)
    first_time = 2 * earliest_sponsor_time - last_time
    mid_time = earliest_sponsor_time
    return [first_time, mid_time, last_time]
def run(self):
    # Worker thread body: drain the shared queue; for each login, read every
    # file under base_path/<login> and insert the user's outgoing
    # sponsorships into github_sponsorships_as_sponsor.
    # NOTE: Python 2 code (print statement below).
    while not self.q.empty():
        work = self.q.get(timeout=0)
        logging.info("the number of work in queue: " + str(self.q.qsize()))
        login = work["login"]
        # get db connection
        db = base.connectMysqlDB(config, autocommit=False)
        cur = db.cursor()
        # read data from file
        try:
            directory = base_path + "/" + login
            files = os.listdir(directory)
            for file in files:
                file_path = directory + "/" + file
                text = base.get_info_from_file(file_path)
                if text is False:
                    logging.warn("file not existed: " + file_path)
                    continue
                obj = json.loads(text)
                print "read file: " + file_path
                count = 1
                # This github user received sponsorships but never sponsored
                # anyone else; such placeholder rows (flag4, login == sponsor
                # login) are still written into
                # github_sponsorships_as_sponsor so they can be filtered out
                # later.
                if len(obj["data"]["user"]["sponsorshipsAsSponsor"]
                       ["edges"]) == 0:
                    logging.warn("the user " + login +
                                 " doesn't sponsor others")
                    cur.execute(
                        "insert into github_sponsorships_as_sponsor "
                        "(login, sponsor_login, flag) "
                        "values (%s, %s, %s)", (login, login,
                                                str(base.flag4)))
                    db.commit()
                    continue
                for edge in obj["data"]["user"]["sponsorshipsAsSponsor"][
                        "edges"]:
                    if edge["node"]["privacyLevel"] == "PRIVATE":
                        # private sponsorship: target identity unavailable
                        logging.info("the " + str(count) +
                                     "th record is private in file: " +
                                     file_path)
                        count += 1
                        continue
                    else:
                        # NOTE(review): `slug` is the second "-"-separated
                        # token of the sponsorable's listing slug and is
                        # inserted into the *login* column, while the full
                        # listing slug goes into the *slug* column -- confirm
                        # this mapping is intended (flag hard-coded to "3").
                        slug = edge["node"]["sponsorable"][
                            "sponsorsListing"]["slug"].split("-")[1]
                        cur.execute(
                            "insert into github_sponsorships_as_sponsor "
                            "(login, slug, sponsor_login, flag, created_at) "
                            "values (%s, %s, %s, %s, %s)",
                            (slug, edge["node"]["sponsorable"]
                             ["sponsorsListing"]["slug"],
                             obj["data"]["user"]["login"], str(3),
                             base.time_handler(edge["node"]["createdAt"])))
                        db.commit()
                        logging.info("the " + str(count) +
                                     "th record in file: " + file_path)
                        count += 1
            self.q.task_done()
            cur.close()
            db.close()
        except Exception as e:
            logging.fatal(e)