def import_to_database(json_dict, credentials_file):
    """Save scraped user/repo/file/vulnerability records via the ``sd`` helpers.

    Expected layout of ``json_dict`` (as read by the code below)::

        {username: {reponame: {"created_at": ..., "size": ...,
                               "pushed_at": ..., "contributors_url": ...,
                               "description": ..., "url": ...,
                               "forks_url": ..., "stargazers": ...,
                               "forks": ...,
                               "files": [{filename: [{"line": ...,
                                                      "code_sample": ...},
                                                     ...]},
                                         ...]}}}

    Args:
        json_dict: nested mapping described above.
        credentials_file: passed unchanged to ``sd.get_connection``.

    Raises:
        KeyError: if a repository dict or a data point lacks one of the
            keys listed above.
    """
    db_conn = sd.get_connection(credentials_file)
    try:
        # .items() (rather than .iteritems()) works on both Python 2 and 3.
        for username, repos in json_dict.items():
            sd.save_user_data(db_conn, username)
            for reponame, repo_dict in repos.items():
                repo_id = sd.save_repo_data(
                    db_conn,
                    reponame,
                    repo_dict["created_at"],
                    username,
                    repo_dict["size"],
                    repo_dict["pushed_at"],
                    repo_dict["url"],
                    repo_dict["forks_url"],
                    repo_dict["contributors_url"],
                    repo_dict["description"],
                    repo_dict["stargazers"],
                    repo_dict["forks"],
                )
                for file_entry in repo_dict["files"]:
                    for filename, datapoints in file_entry.items():
                        # The fourth argument is always an empty string here.
                        file_id = sd.save_file_data(
                            db_conn, filename, repo_id, "")
                        for datapoint in datapoints:
                            sd.save_vulnerability_data(
                                db_conn,
                                file_id,
                                datapoint["line"],
                                datapoint["code_sample"],
                            )
    finally:
        # Release the connection even when a save call or key lookup fails;
        # the connection is local to this function, so nobody else needs it.
        sd.close_connection(db_conn)
def test(credentials_file='mysqlcreds-throwaway.csv'):
    """Smoke-test the ``sd`` save helpers by writing one record of each kind.

    Saves a user, a repository owned by that user, a contributor link, a
    file in the repository and one vulnerability entry for that file, then
    closes the connection.

    Args:
        credentials_file: passed unchanged to ``sd.get_connection``.
    """
    con = sd.get_connection(credentials_file)
    try:
        username = "******"
        sd.save_user_data(con, username, "*****@*****.**")

        # The same date is used for both date arguments of save_repo_data.
        today = datetime.date.today()
        repo_id = sd.save_repo_data(
            con, "test_repo", today, username, 2400, today)
        sd.save_repo_contributor_data(con, username, repo_id)

        # NOTE(review): the original comment says save_file_data also takes
        # optional date and author parameters -- confirm against save_data.
        file_id = sd.save_file_data(con, "test_file.c", repo_id, "")
        sd.save_vulnerability_data(
            con, file_id, 24, "code sample;",
            "vulnerability description/regex")
    finally:
        # Close the connection even if one of the save calls raises.
        sd.close_connection(con)
import load_data as L
import save_data as S
import get_info as G
import tr_te_split as T
import extract_feature as E

# Load the '100k' data set and read its user, movie and rating tables.
mv_lens_100k = L.load_data('100k')
user_info = mv_lens_100k.load_user_info()
movie_info = mv_lens_100k.load_movie_info()
rating_info = mv_lens_100k.load_ratings()

# Disabled: year/age lookups (the only use of G in this script).
# year_info = G.get_year_info(movie_info)
# age_info = G.get_age_info(user_info)

# Build the feature data, split it into two parts, and save both.
data = E.extract_feature(user_info, movie_info, rating_info)
tr_data, te_data = T.tr_te_split(data)
S.save_user_data(tr_data, te_data)
else: #get the stragglers rows = sd.select_many_query(db_conn, "select repo_id, contributors_url, owner_name from gh_repo where repo_id not in (select repo_id from gh_repo_contributors) order by repo_id") header = {'Authorization': 'token ' + token} for row in rows: repo_id = row[0]; if (repo_id % 10 == 0): print "repo_id ", repo_id query_url = row[1]; owner_name = row[2]; try: r = requests.get(query_url, headers=header) item = json.loads(r.text or r.content) for thing in item: contributions = thing['contributions'] username = thing['login'] sd.save_user_data(db_conn, username); sd.save_repo_contributor_data(db_conn, username, repo_id, contributions); headers = r.headers; ratelimit_remaining = int(headers['x-ratelimit-remaining']) reset_time = int(headers['x-ratelimit-reset']) if (ratelimit_remaining % 10 == 0): print "ratelimit_remaining ", ratelimit_remaining if ratelimit_remaining == 0: print "napping for ", reset_time util.nap(reset_time) except:# ValueError, requests.exceptions.ConnectionError: print "error: ", sys.exc_info()[0] print "skipping repo: ", repo_id
) header = {'Authorization': 'token ' + token} for row in rows: repo_id = row[0] if (repo_id % 10 == 0): print "repo_id ", repo_id query_url = row[1] owner_name = row[2] try: r = requests.get(query_url, headers=header) item = json.loads(r.text or r.content) for thing in item: contributions = thing['contributions'] username = thing['login'] sd.save_user_data(db_conn, username) sd.save_repo_contributor_data(db_conn, username, repo_id, contributions) headers = r.headers ratelimit_remaining = int(headers['x-ratelimit-remaining']) reset_time = int(headers['x-ratelimit-reset']) if (ratelimit_remaining % 10 == 0): print "ratelimit_remaining ", ratelimit_remaining if ratelimit_remaining == 0: print "napping for ", reset_time util.nap(reset_time) except: # ValueError, requests.exceptions.ConnectionError: print "error: ", sys.exc_info()[0] print "skipping repo: ", repo_id