def main():
    """Tag every user matching the China condition and record the total.

    Each ``user`` document selected by ``reg_china_condition()`` is passed to
    ``add_chinese``; the running count is printed every 500 users. The final
    count is stored in ``research_result`` under ``type == "chinese_count"``.
    Prints an error message and does nothing else when no database handle is
    available.
    """
    dm_db = DMDatabase()
    db = dm_db.getDB()
    if not db:
        print("Cannot connect to database")
        return

    val = reg_china_condition()
    count = 0
    for count, item in enumerate(db["user"].find(val), 1):
        add_chinese(db, item)
        if count % 500 == 0:
            print(count)

    # TODO make it a lib
    # One upsert replaces the former find_one + update/insert pair: it creates
    # {"type": "chinese_count", "total_count": count} when the record is
    # missing and otherwise only overwrites total_count, without the window
    # between the existence check and the write.
    db["research_result"].update(
        {"type": "chinese_count"},
        {"$set": {"total_count": count}},
        upsert=True)
def main():
    """Connect to the database and run ``report_lang`` against it.

    Prints an error message instead when no database handle is available.
    """
    db = DMDatabase().getDB()
    if not db:
        print("Cannot connect to database")
        return
    report_lang(db)
def _init_commit_check_task(query, users, start, end):
    """Register one "get_commit_check" loop task covering ``users``."""
    task = DMTask()
    task.init("github", {
        "name": "get_commit_check",
        "action_type": "loop",
        "query": str(query),
        "users": users,
        "start": start,
        "end": end
    })


def init_users_thread_by_query(db, query):
    """Create "get_commit_check" tasks for the users matching ``query``.

    Logins are read from ``user_contributor_result2`` and grouped into batches
    of ``pieces`` users; each batch becomes one "github" loop task whose
    ``start``/``end`` are the batch's positions offset by 800000.

    Args:
        db: database handle to query. When ``None`` a handle is obtained from
            ``DMDatabase`` (the previous code always did this and ignored the
            argument).
        query: MongoDB filter applied to ``user_contributor_result2``; its
            ``str()`` form is stored on every task.

    Returns:
        None.
    """
    pieces = 100
    offset = 800000
    if db is None:
        db = DMDatabase().getDB()

    seen = 0
    batch = []
    for seen, item in enumerate(db["user_contributor_result2"].find(query), 1):
        batch.append(item["login"])
        if len(batch) == pieces:
            _init_commit_check_task(query, batch, offset + seen - pieces,
                                    offset + seen)
            batch = []

    if batch:
        # Trailing partial batch. "end" used to be ``800000 + 1`` (a typo for
        # ``+ i``), which put it below "start" once 100 users had been seen.
        _init_commit_check_task(query, batch, offset + seen - len(batch),
                                offset + seen)
    return
def fix_add_count_id_created_at_int():
    """Backfill ``created_at_int``, ``id`` and ``count`` on ``followers`` docs.

    Walks the ``user`` collection in id windows of ``gap``. For every user
    that has a ``followers`` document missing any of the three fields,
    ``created_at_int`` and ``id`` are copied from the user record; ``count``
    is filled from the user's ``followers`` value only when it is absent.
    The window index is printed after each window.
    """
    db = DMDatabase().getDB()
    #2730627
    gap = 1000
    start = 0
    # end id is now set to 10300000
    end = 10300
    followers = db["followers"]
    for window in range(start, end):
        id_range = {"$gte": window * gap, "$lt": (window + 1) * gap}
        for user in db["user"].find({"id": id_range}):
            selector = {"login": user["login"]}
            existing = followers.find_one(selector)
            if not existing:
                continue
            has_count = "count" in existing
            if "created_at_int" in existing and "id" in existing and has_count:
                # Already complete, nothing to backfill.
                continue
            fields = {
                "created_at_int": user["created_at_int"],
                "id": user["id"]
            }
            if not has_count:
                fields["count"] = user["followers"]
            followers.update(selector, {"$set": fields})
        print(window)
def top_fork():
    """Collect contributors for the 1000 most-forked repositories.

    Repositories that already carry ``contributors_count`` are skipped. For
    the rest, ``top_get_contributors`` is called; unless it reports
    ``error == 1`` the contributor list is inserted into ``contributors`` and
    the repository document receives its ``contributors_count``.
    """
    db = DMDatabase().getDB()
    num = 1000
    most_forked = db["repositories"].find().sort(
        "forks_count", pymongo.DESCENDING).limit(num)
    for repo in most_forked:
        if "contributors_count" in repo:
            continue
        full_name = repo["full_name"]
        repo_id = repo["id"]
        ret_val = top_get_contributors(db, full_name, repo_id)
        if ret_val["error"] == 1:
            # Failed lookups are silently skipped.
            continue
        contributors = ret_val["val"]
        count = len(contributors)
        db["contributors"].insert({
            "full_name": full_name,
            "id": repo_id,
            "contributors": contributors,
            "count": count,
            "update_date": datetime.datetime.utcnow()
        })
        db["repositories"].update(
            {"full_name": full_name, "id": repo_id},
            {"$set": {"contributors_count": count}})
        print("insert " + full_name + "with " + str(count))
def fix_add_login_one_by_one():
    """Repair ``user_repos`` documents that have no ``id`` field.

    Repeatedly fetches one such document and looks its login up in ``user``:

    * user missing -> every ``user_repos`` document with that login is removed;
    * same login as the one just repaired -> treated as a duplicate and all
      ``user_repos`` documents with that login are removed;
    * otherwise -> ``created_at_int``, ``id`` and ``count`` are set from the
      user record (``created_at_int`` is derived from ``created_at`` when the
      user lacks it).

    Returns once no document without ``id`` remains. Progress is printed
    every 1000 handled documents.
    """
    db = DMDatabase().getDB()
    #2730627
    user_repos = db["user_repos"]
    handled = 0
    last_login = "<null>"
    while True:
        pending = user_repos.find_one({"id": {"$exists": False}})
        if not pending:
            print('exit')
            return

        handled += 1
        login = pending["login"]
        user = db["user"].find_one({"login": login})
        if not user:
            print(login + " is not found")
            # if we get followers and not sync with user, this problem happens
            user_repos.remove({"login": login})
        elif last_login == login:
            print(login + " dup, removed")
            user_repos.remove({"login": login})
        else:
            print(login + " updated ")
            if "created_at_int" in user:
                created_at_int = user["created_at_int"]
            else:
                created_at_int = date_string_to_int(user["created_at"])
            user_repos.update({"login": login}, {
                "$set": {
                    "created_at_int": created_at_int,
                    "id": user["id"],
                    "count": user["public_repos"]
                }
            })
            last_login = login

        if handled % 1000 == 0:
            print(handled)
def fix_user_loop():
    """Add integer timestamps to ``user`` documents, one id window at a time.

    Scans 1050 windows of 10000 ids. Users without ``created_at_int`` get
    ``created_at_int`` and ``updated_at_int`` computed from their
    ``created_at`` / ``updated_at`` strings via ``date_string_to_int``.
    Prints the number of windows completed after each one, then "Finish".
    """
    db = DMDatabase().getDB()
    total = 1050
    gap_num = 10000
    users = db["user"]
    for window in range(total):
        lower = window * gap_num
        upper = (window + 1) * gap_num
        for user in users.find({"id": {"$gte": lower, "$lt": upper}}):
            if "created_at_int" in user:
                continue
            created_at_string = user["created_at"]
            updated_at_string = user["updated_at"]
            stamps = {
                "created_at_int": date_string_to_int(created_at_string),
                "updated_at_int": date_string_to_int(updated_at_string)
            }
            users.update({"login": user["login"]}, {"$set": stamps})
        print(window + 1)
    print("Finish")
def main():
    """Connect to the database and run ``calculate_months`` against it.

    Prints an error message instead when no database handle is available.
    """
    db = DMDatabase().getDB()
    if not db:
        print("Cannot connect to database")
        return
    calculate_months(db)
def main(type):
    """Dispatch one commit-check action selected by ``type``.

    Sets a 300 second default socket timeout, prints the remaining API call
    budget reported by ``DMSharedUsers`` and, when a database handle is
    available, runs the action for ``type``:

    * ``"user"`` / ``"user_repos"`` -- commit repos for one hard-coded login;
    * ``"query"`` -- commit repos for users with ``contributor_repos >= 200``;
    * ``"init_task"`` -- create tasks for ``100 <= contributor_repos < 200``;
    * ``"run_task"`` -- ``run_free_task(db, 60)``;
    * ``"un_user"`` / ``"un_repo"`` -- resume unfinished users / repos.

    Any other value, or a missing database, results in no action.
    """
    socket.setdefaulttimeout(300)
    remaining = DMSharedUsers().getRemaining()
    print("Account has " + str(remaining) + " API calls")

    db = DMDatabase().getDB()
    if not db:
        return

    if type == "user":
        get_commit_repos_by_user(db, "******")
    elif type == "user_repos":
        get_commit_repos_by_user_repos(db, "******")
    elif type == "query":
        get_commit_repos_by_query(db, {"contributor_repos": {"$gte": 200}})
    elif type == "init_task":
        init_users_thread_by_query(
            db, {"contributor_repos": {"$gte": 100, "$lt": 200}})
    elif type == "run_task":
        run_free_task(db, 60)
    elif type == "un_user":
        get_unfinished_users(db)
    elif type == "un_repo":
        get_unfinished_repos(db)
def get_unfinished_repos(db):
    """Queue a worker for every repository of a still-running commit check.

    Reads the running "get_commit_check" tasks, gathers the ``repo_list`` of
    each task user that has no entry in ``commit_check_meta_result``,
    de-duplicates the repository names, drops "GITenberg/" repositories and
    those already present in ``commit_check_meta_result``, then appends one
    ``myThread2`` per remaining repository to ``user_thread`` and calls
    ``run_task()``.

    Args:
        db: database handle holding the result collections.
    """
    client = DMDatabase().getClient()
    running = client["task"]["github"].find({
        "status": "running",
        "name": "get_commit_check"
    })
    finished = db["commit_check_meta_result"]

    candidates = []
    for task in running:
        for login in task["users"]:
            if finished.find_one({"login": login}):
                continue
            contributor = db["user_contributor_result2"].find_one(
                {"login": login})
            if contributor:
                candidates += contributor["repo_list"]

    unique_names = list(set([entry["full_name"] for entry in candidates]))

    repos = []
    for name in unique_names:
        if name.startswith("GITenberg/"):
            continue
        if not finished.find_one({"full_name": name}):
            repos.append(name)

    for repo in repos:
        user_thread.append(myThread2(db, repo))
    print(str(len(repos)) + " task received, start to run them!")
    run_task()
def init_commit_task_by_user():
    """Create a commit task for every repository of one hard-coded user.

    Looks the login up in ``user_contributor_result2`` and calls
    ``init_commit_task`` with the ``full_name`` of each entry in its
    ``repo_list``. Does nothing when the user has no record.
    """
    db = DMDatabase().getDB()
    user = "******"
    record = db["user_contributor_result2"].find_one({"login": user})
    if not record:
        return
    for repo in record["repo_list"]:
        init_commit_task(db, repo["full_name"])
def main():
    """Run ``fork_org`` for a fixed set of thresholds.

    ``fork_org(db, num)`` is called for 100, 200, 1000, 10 and 50, in that
    order. The connection error is printed only when no database handle is
    available; the ``else`` was missing before, so the message was emitted
    even on success.
    """
    dm_db = DMDatabase()
    db = dm_db.getDB()
    if db:
        for num in (100, 200, 1000, 10, 50):
            fork_org(db, num)
    else:
        print("Cannot connect to database")
def main():
    """Call ``report_forks(db, num)`` for every ``num`` from 0 to 75999.

    Prints an error message instead when no database handle is available.
    """
    db = DMDatabase().getDB()
    if not db:
        print("Cannot connect to database")
        return
    for num in range(76000):
        report_forks(db, num)
def main():
    """Run ``report_active`` for both activity counters.

    Reports "active_count" first, then "active_count_3_month". Prints an error
    message instead when no database handle is available.
    """
    db = DMDatabase().getDB()
    if not db:
        print("Cannot connect to database")
        return
    for field in ("active_count", "active_count_3_month"):
        report_active(db, field)
def main_int():
    """Run ``merge_month`` for month 201412.

    Prints an error message instead when no database handle is available.
    """
    db = DMDatabase().getDB()
    if not db:
        print("Cannot connect to database")
        return
    # Alternative entry points that were disabled here:
    # print_user_info(db, "initlove")
    # calculate_months_int(db)
    merge_month(db, 201412)
def init_repo_single_task(login):
    """Create a single "get_repos" task for one user.

    The task's ``start`` is the login and its ``end`` is the user's ``id``.
    Prints "user not found" when the login has no ``user`` document.

    Args:
        login: login name to look up in the ``user`` collection.
    """
    db = DMDatabase().getDB()
    task = DMTask()
    user = db["user"].find_one({"login": login})
    if not user:
        print("user not found")
        return
    task.init("github", {
        "name": "get_repos",
        "action_type": "single",
        "start": login,
        "end": user["id"]
    })
def main(type):
    """Run the repository report selected by ``type``.

    Sets a 300 second default socket timeout, then calls
    ``get_top_commit_repos`` for ``"top"`` or ``get_dup_repos`` for ``"dup"``.
    Any other value, or a missing database handle, results in no action.
    """
    socket.setdefaulttimeout(300)
    db = DMDatabase().getDB()
    if not db:
        return
    if type == "top":
        get_top_commit_repos(db)
    elif type == "dup":
        get_dup_repos(db)
def init_event_task():
    """Create one "get_events" loop task per window of 1000 ids.

    Windows ``[i * gap, (i + 1) * gap)`` are generated for ``i`` in
    ``[start, end)``, i.e. ids 0 up to 10300000.
    """
    # TODO: 1000 is system defined, maybe add to DMTask? or config file?
    gap = 1000
    start = 0
    # end id is now set to 10300000
    end = 10300
    # NOTE(review): the handle is not used below; the call is kept because
    # whether obtaining it has side effects cannot be told from here.
    db = DMDatabase().getDB()
    for window in range(start, end):
        DMTask().init("github", {
            "name": "get_events",
            "action_type": "loop",
            "start": window * gap,
            "end": (window + 1) * gap
        })
def main():
    """Run ``generate_lang`` for every language listed in ``./lang.txt``.

    Each line is stripped; blank lines and lines starting with ``#`` are
    ignored. The file is opened with a context manager so the handle is
    released even if ``generate_lang`` raises (it was never closed before).
    Prints an error message instead when no database handle is available.
    """
    dm_db = DMDatabase()
    db = dm_db.getDB()
    if not db:
        print("Cannot connect to database")
        return

    with open("./lang.txt", "r") as fo:
        for raw_line in fo:
            line = raw_line.strip()
            if not line or line.startswith('#'):
                continue
            generate_lang(db, line)
def main():
    """Crawl followers level by level from one hard-coded seed login.

    Seeds level 0 with ``init_beginer`` and then calls ``get_followers`` for
    levels 1 through 9, stopping early as soon as a level returns 0. Prints
    an error message instead when no client is available.
    """
    dm_db = DMDatabase()
    client = dm_db.getClient()
    db = dm_db.getDB()
    if not client:
        print("Cannot connect to database")
        return

    login = "******"
    init_beginer(client, login, 0)
    level = 1
    while level < 10:
        if get_followers(client, db, level) == 0:
            print("No more, exit")
            break
        level += 1
def fix_add_created_at_int():
    """Copy each user's ``created_at_int`` onto its ``event`` document.

    Walks ``user`` in id windows of 1000, from window 7866 up to (not
    including) 9000, updating the ``event`` document with the same login.
    The window index is printed after each window.
    """
    db = DMDatabase().getDB()
    #2730627
    gap = 1000
    start = 7866
    # end id is now set to 10300000
    end = 9000
    events = db["event"]
    for window in range(start, end):
        id_range = {"$gte": window * gap, "$lt": (window + 1) * gap}
        for user in db["user"].find({"id": id_range}):
            events.update(
                {"login": user["login"]},
                {"$set": {"created_at_int": user["created_at_int"]}})
        print(window)
def main():
    """Run ``report_small_by_repo`` for the openstack/horizon repository.

    Prints an error message instead when no client is available.
    """
    client = DMDatabase().getClient()
    if not client:
        print("Cannot connect to database")
        return

    # user_name is a database,
    # repo_name is the collection that stores the repos, and
    # "user" is another collection that stores contributors' info.
    user_name = "openstack"
    full_name = "openstack/horizon"  # not referenced below
    repo_name = "horizon"
    # generate_small_by_repo(client, user_name, repo_name)
    report_small_by_repo(client, user_name, repo_name)
def main():
    """List users with at least 299 events and count the organizations.

    For every ``event`` document with ``count >= 299`` the matching ``user``
    (by ``id``) is looked up; its login is printed and it is counted when its
    ``type`` is "Organization". The organization total is printed at the end.
    Prints an error message instead when no database handle is available.
    """
    db = DMDatabase().getDB()
    if not db:
        print("Cannot connect to database")
        return

    org_count = 0
    for event in db["event"].find({"count": {"$gte": 299}}):
        user = db["user"].find_one({"id": event["id"]})
        if not user:
            continue
        print(user["login"])
        if user["type"] == "Organization":
            org_count += 1
    print("Org is " + str(org_count))
def resolve_contributors_30():
    """Re-fetch contributors for repos whose stored count is a multiple of 30.

    Builds a ``GithubContributors`` worker on a test task, then for each
    count 30, 60, ... 8970 collects the matching ``contributors`` documents
    and calls ``get_repo_contributors`` for each of them.
    """
    db = DMDatabase().getDB()
    task1 = DMTask()
    task1.init_test("github", {
        "name": "fake-contributors",
        "action_type": "loop",
        "start": 0,
        "end": 1000
    })
    e1 = GithubContributors(task1)
    for num in range(1, 300):
        # Materialise the matches before processing, as the original did,
        # so the cursor is fully read before any re-fetch runs.
        matches = [(doc["full_name"], doc["id"])
                   for doc in db["contributors"].find({"count": num * 30})]
        for full_name, repo_id in matches:
            e1.get_repo_contributors(full_name, repo_id)
def test():
    """Ad-hoc check that prints the size of an ``event`` query result.

    The query asks for ``1000 <= id < 200`` limited to 20 documents and
    prints the cursor's ``count()`` (or 'res is none' if the cursor were
    ``None``), then returns.
    """
    db = DMDatabase().getDB()
    id_range = {"$gte": 1000, "$lt": 200}
    res = db["event"].find({"id": id_range}).limit(20)
    if res is not None:
        print(res.count())
    else:
        print('res is none')
    return

    # NOTE(review): everything below sits after an unconditional return and
    # never runs; it is kept as found.
    task1 = DMTask()
    val = {
        "name": "fake-event",
        "action_type": "loop",
        "start": 6001000,
        "end": 6005000
    }
    task1.init_test("github", val)
    e1 = GithubEvent(task1)
    e1.runTask()
    task1.remove()
def get_unfinished_users(db):
    """Queue a worker for every unfinished user of a running commit check.

    Reads the running "get_commit_check" tasks. Users already present in
    ``commit_check_meta_result`` only produce an "exist" message; for every
    other user a ``myThread1`` is appended to ``user_thread``. Finally
    ``run_task()`` is called.

    Args:
        db: database handle holding ``commit_check_meta_result``.
    """
    client = DMDatabase().getClient()
    running = client["task"]["github"].find({
        "status": "running",
        "name": "get_commit_check"
    })

    users = []
    for task in running:
        for login in task["users"]:
            if db["commit_check_meta_result"].find_one({"login": login}):
                print("exist")
            else:
                users.append(login)

    for user in users:
        user_thread.append(myThread1(db, user))
    print(str(len(users)) + " task received, start to run them!")
    run_task()
def fix_user():
    """Add integer timestamps to every user that lacks ``updated_at_int``.

    ``created_at_int`` and ``updated_at_int`` are computed from the
    ``created_at`` / ``updated_at`` strings via ``date_string_to_int``. The
    running number of fixed users is printed after each update, followed by
    "Finish".
    """
    db = DMDatabase().getDB()
    users = db["user"]
    missing = users.find({"updated_at_int": {"$exists": False}})
    for fixed, user in enumerate(missing, 1):
        created_at_string = user["created_at"]
        updated_at_string = user["updated_at"]
        stamps = {
            "created_at_int": date_string_to_int(created_at_string),
            "updated_at_int": date_string_to_int(updated_at_string)
        }
        users.update({"login": user["login"]}, {"$set": stamps})
        print(fixed)
    print("Finish")
def fix_add_login_one_by_one():
    """Clean up and repair ``followers`` documents.

    Phase 1 removes, one at a time, every document that has no ``login``.
    Phase 2 repeatedly fetches one document without ``id`` and looks its
    login up in ``user``:

    * user missing -> every ``followers`` document with that login is removed;
    * same login as the one just repaired -> treated as a duplicate and all
      ``followers`` documents with that login are removed;
    * otherwise -> ``created_at_int``, ``id`` and ``count`` are set from the
      user record.

    Returns once no document without ``id`` remains. Progress is printed
    every 1000 handled documents.
    """
    db = DMDatabase().getDB()
    #2730627
    followers = db["followers"]

    removed = 0
    orphan = followers.find_one({"login": {"$exists": False}})
    while orphan:
        removed += 1
        followers.remove({"_id": orphan["_id"]})
        print(str(removed) + " removed")
        orphan = followers.find_one({"login": {"$exists": False}})
    print("login all exists")

    handled = 0
    last_login = "<null>"
    while True:
        pending = followers.find_one({"id": {"$exists": False}})
        if not pending:
            print('exit')
            return

        handled += 1
        login = pending["login"]
        user = db["user"].find_one({"login": login})
        if not user:
            print(login + " is not found")
            # if we get followers and not sync with user, this problem happens
            followers.remove({"login": login})
        elif last_login == login:
            print(login + " dup, removed")
            followers.remove({"login": login})
        else:
            print(login + " updated ")
            followers.update({"login": login}, {
                "$set": {
                    "created_at_int": user["created_at_int"],
                    "id": user["id"],
                    "count": user["followers"]
                }
            })
            last_login = login

        if handled % 1000 == 0:
            print(handled)
def resolve_event_errors():
    """Re-check "get_repositories" tasks that failed at least 10 times.

    For each such task a new task with the same ``start``/``end`` is
    initialised and ``GithubRepositories.error_check()`` is run on it; the
    values it returns are summed and the total is printed.
    """
    client = DMDatabase().getClient()
    failing = client["task"]["github"].find({
        "name": "get_repositories",
        "error_count": {"$gte": 10}
    })

    count = 0
    for failed in failing:
        task = DMTask()
        task.init("github", {
            "name": "get_repositories",
            "action_type": "loop",
            "start": failed["start"],
            "end": failed["end"]
        })
        count += GithubRepositories(task).error_check()
    print(str(count) + " errors solved")
def main():
    """Count the active users in ``chinese`` and record the total.

    The filter comes from ``active_user(db)``; the resulting count is stored
    in ``research_result`` under ``type == "chinese_active_count"``. Prints
    an error message instead when no database handle is available.
    """
    dm_db = DMDatabase()
    db = dm_db.getDB()
    if not db:
        print("Cannot connect to database")
        return

    val = active_user(db)
    count = db["chinese"].find(val).count()

    # TODO make it a lib
    # One upsert replaces the former find_one + update/insert pair: it creates
    # {"type": "chinese_active_count", "total_count": count} when the record
    # is missing and otherwise only overwrites total_count, without the
    # window between the existence check and the write.
    db["research_result"].update(
        {"type": "chinese_active_count"},
        {"$set": {"total_count": count}},
        upsert=True)