Ejemplo n.º 1
0
def a(db_conn, vuln_list):

  query_base = \
"select sum(vulns) / count(distinct repo_id) from \
(select r.repo_id, count(distinct rc.username), count(distinct v.vuln_id) vulns \
from gh_repo r inner join \
gh_repo_contributors rc on r.repo_id = rc.repo_id \
inner join gh_file f on r.repo_id = f.repo_id \
left join gh_vuln v on f.file_id = v.file_id \
where (v.vuln_desc in (" + '\'' + '\', \''.join(vuln_list) + '\'' + ") or v.vuln_desc is null)\
and r.repo_size >= %s and r.repo_size < %s \
group by r.repo_id "
  t1 = 5
  t2 = 10
  avg_vulns_under = []
  ra_under = []
  avg_vulns_mid = []
  ra_mid = []
  avg_vulns_over = []
  ra_over = []
  for i in range(0,1000,100):
    rows = sd.select_many_query(db_conn, query_base +\
" having count(distinct rc.username) <= %s) a", [i, i+100, t1])
    if rows[0]:
      ra_under.append(i);
      print "appending ", i
      avg_vulns_under.append(rows[0][0])
    
    rows = sd.select_many_query(db_conn, query_base +\
" having count(distinct rc.username) > %s \
and count(distinct rc.username) <= %s) a", [i, i+100, t1, t2])
    if rows[0]:
      ra_mid.append(i);
      print "appending ", i
      avg_vulns_mid.append(rows[0][0])
    
    rows = sd.select_many_query(db_conn, query_base +\
" having count(distinct rc.username) > %s) a", [i, i+100, t2])
    if rows[0]:
      ra_over.append(i);
      print "appending ", i
      avg_vulns_over.append(rows[0][0])
  l_under, = plt.plot(ra_under, avg_vulns_under, label="under " + str(t1) + ", ".join(vuln_list))
  l_mid, =  plt.plot(ra_mid, avg_vulns_mid, label="between " + str(t1) + " and " + str(t2) + ", ".join(vuln_list))
  l_over, = plt.plot(ra_over, avg_vulns_over, label="over " + str(t2) + ", ".join(vuln_list))
  plt.legend(handles=[l_under, l_mid, l_over])#, ["over", "under"])
  return [ra_under, avg_vulns_under, ra_mid, avg_vulns_mid, ra_over, avg_vulns_over , [l_under, l_mid, l_over]]
Ejemplo n.º 2
0
def get_analysis(output_filename, vuln_list, alt_list):
  # get database connection (local for now)
  outfile = open(output_filename, "wb");
  id_types = ["r.repo_id", "f.file_id", "v.vuln_id"]
  id_names = ["number of repos", "number of files", "total occurrences"]
  fieldnames = ["vulnerability"] + id_names
  writer = csv.DictWriter(outfile, fieldnames=fieldnames);
  writer.writeheader();

  total_list = vuln_list + alt_list
  db_conn = sd.get_connection("mysqlcreds-local-analysis.csv");
  for vuln in total_list:# vuln_list + alt_list:
    tmp_result = {"vulnerability": vuln};
    for i in range(0, len(id_types)):
#get number of distinct {repo, file, vuln} containing a particular vulnerability
#don't escape the id type
      rows = sd.select_many_query(db_conn, \
" select count(distinct " + id_types[i] + ")  \
 from \
   gh_repo r  \
   inner join gh_file f on f.repo_id = r.repo_id  \
   inner join gh_vuln v on f.file_id = v.file_id  \
   where vuln_desc = %s ", \
   [vuln]);
      for row in rows:
        tmp_result[id_names[i]] = row[0];
    writer.writerow(tmp_result)

  #Next, we'll want to look into co-occurrence of different types of vulnerabilities
  for i in range(0, len(total_list)):
    for j in range(i+1, len(total_list)):
      rows = sd.select_many_query(db_conn, \
"  select count(distinct r.repo_id) \
  from gh_repo r \
  inner join gh_file f1 on r.repo_id = f1.repo_id \
  inner join gh_vuln v1 on f1.file_id = v1.file_id \
  inner join gh_file f2 on r.repo_id = f2.repo_id \
  inner join gh_vuln v2 on f2.file_id = v2.file_id \
  and v1.vuln_desc = %s \
  and v2.vuln_desc = %s ", [total_list[i], total_list[j]]);
      for row in rows:
        print total_list[i], ", ", total_list[j], ", ", row[0]
Ejemplo n.º 3
0
def get_repo_cuser_histogram1():
#contributors per repo
  db_conn = sd.get_connection("mysqlcreds-local-analysis.csv");
  rows = sd.select_many_query(db_conn, \
  "select repo_id, count(distinct username) from gh_repo_contributors group by repo_id order by count(distinct username);")
  contributors = []
  for row in rows:
    contributors.append(row[1])

  n, bins, patches = plt.hist(contributors, 50)
  l = plt.plot(bins)
  plt.show()
  plt.clf()
Ejemplo n.º 4
0
def get_graphs(vuln_list, alt_list):
  db_conn = sd.get_connection("mysqlcreds-local-analysis.csv");
  #b(db_conn, vuln_list)
  #for vuln in vuln_list:
    #b(db_conn, [vuln])
  c1(db_conn, ["strcpy"], ["strncpy"])
  c1(db_conn, ["strcpy"], ["strlcpy"])
  

  #also correlation between repo size and vuln types/frequency
  #trends over time? not sure how to measure. maybe time last pushed
  #also correlation between number of contributors and vulnerability content

#okay, other analysis we'd like to do:
#ummmm....shit this is not going well
# we could look at the number of contributors vs. vuln count/vuln types - that would be a nice thing to graph
  #rows = sd.select_many_query(db_conn, \
  rows = sd.select_many_query(db_conn, \
"select r.repo_id, count(distinct rc.username), count(distinct v.vuln_id) \
from gh_repo r inner join \
gh_repo_contributors rc on r.repo_id = rc.repo_id \
inner join gh_file f on r.repo_id = f.repo_id \
inner join gh_vuln v on f.file_id = v.file_id \
where v.vuln_desc in (%s) \
group by r.repo_id \
", ['\'' + '\', \''.join(vuln_list) + '\''])



  #rows = sd.select_many_query(db_conn, \
  print \
"select r.repo_id, count(distinct rc.username), count(distinct v.vuln_desc) \
from gh_repo r inner join \
gh_repo_contributors rc on r.repo_id = rc.repo_id \
inner join gh_file f on r.repo_id = f.repo_id \
inner join gh_vuln v on f.file_id = v.file_id \
where v.vuln_desc in (%s) \
group by r.repo_id" % '\'' + '\', \''.join(vuln_list) + '\''
#group by r.repo_id", '\'' + '\', \''.join(vuln_list) + '\'')

# could look at proportion of repos a user contributes to that have vulns vs no-vulns

# create a view that has gh_vuln with the vulnerabilities, and another one that has the safe versions
  print \
"select rcv.username, count(distinct rcv.repo_id), count(distinct rcnv.repo_id) \
from gh_repo_contributors rcv \
inner join \
gh_repo_contributors rcnv on rcv.username = rcnv.username \
where rcnv.repo_id not in (select f.repo_id from gh_file f inner join gh_vuln v on f.file_id = v.vuln_id where v.vuln_desc in (%s)) \
and rcv.repo_id in (select f.repo_id from gh_file f inner join gh_vuln v on f.file_id = v.vuln_id where v.vuln_desc in (%s)) \
" % '\'' + '\', \''.join(vuln_list) + '\'', '\'' + '\', \''.join(vuln_list) + '\'' 
Ejemplo n.º 5
0
def execute_query_over_range(db_conn, query, args):
  avg = []
  stddev = []
  x_axis = []
  for i in range(0,1000,100):
    rows = sd.select_many_query(db_conn, query, [i, i+100] + args)
    #query_base + " having count(distinct rc.username) <= %s) a", args)#[i, i+100, t1])
    vulncount = []
    for row in rows:
      vulncount.append(row[1])
    avg.append(np.average(vulncount))
    stddev.append(np.std(vulncount))
    x_axis.append(i)
  #n, bins, patches = plt.hist(vulncount, 50)
  #l = plt.plot(bins)
  #plt.show()
  #plt.clf()
  return [avg, stddev, x_axis]
Ejemplo n.º 6
0
def get_repo_cuser_histogram2():
  db_conn = sd.get_connection("mysqlcreds-local-analysis.csv");
# get number of repos with each number of different contributors
  rows = sd.select_many_query(db_conn, \
"select count(distinct repo_id), num_contribs from ( select repo_id, count(distinct username) num_contribs from gh_repo_contributors group by repo_id order by count(distinct username) ) a group by num_contribs")
  repos = []
  contributing_users = []
  for i in range(0, len(rows)):
    repos.append(rows[i][0])
    contributing_users.append(rows[i][1])
    print rows[i][0], rows[i][1]
  print "3-10", np.sum(repos[2:10])
  print "11-20", np.sum(repos[10:20])
  print "21-30", np.sum(repos[20:31])
  print ">1", np.sum(repos[1:-1])
#  plt.plot(repos_contributed_to, users)
#  plt.show()
  return[repos, contributing_users]
Ejemplo n.º 7
0
def get_user_crepo_histogram():
# get number of users who have each number of different repos they contribute to
  db_conn = sd.get_connection("mysqlcreds-local-analysis.csv");
  rows = sd.select_many_query(db_conn, \
"select count(distinct username), repo_contribs from ( select username, count(distinct repo_id) repo_contribs from gh_repo_contributors group by username order by count(distinct repo_id) ) a group by repo_contribs");
  users = []
  repos_contributed_to = []
  for i in range(0, len(rows)):
    users.append(rows[i][0])
    repos_contributed_to.append(rows[i][1])
    print rows[i][0], rows[i][1]
#  plt.plot(repos_contributed_to, users)
  print "3-10", np.sum(users[2:10])
  print "11-20", np.sum(users[10:20])
  print "21-30", np.sum(users[20:-1])
  print ">1", np.sum(users[1:-1])
#  plt.show()
  return[users, repos_contributed_to]
Ejemplo n.º 8
0
def get_analysis_threshold(vuln_list, alt_list, threshconds):
  db_conn = sd.get_connection("mysqlcreds-local-analysis.csv");
#  for vuln in vuln_list + alt_list:
#    tmp_result = {"vulnerability": vuln};
#    for thresh in threshconds:
#      rows = sd.select_many_query(db_conn, \
#" select count(distinct " + "repo_id" + ")  \
# from \
#   (select r.repo_id \
#   from gh_repo r  \
#   inner join gh_file f on f.repo_id = r.repo_id  \
#   inner join gh_vuln v on f.file_id = v.file_id  \
#   inner join gh_repo_contributors rc on r.repo_id = rc.repo_id \
#   where vuln_desc = %s \
#   group by r.repo_id \
#   having count(distinct rc.username) " + thresh + \
#   ") a", \
#   [vuln]);
#      for row in rows:
#        print vuln, row[0], thresh

  #Next, we'll want to look into co-occurrence of different types of vulnerabilities
  total_list = vuln_list + alt_list;
  for i in range(0, len(total_list)):
    for j in range(i+1, len(total_list)):
      for thresh in threshconds:
        rows = sd.select_many_query(db_conn, \
  "  select count(distinct repo_id) \
    from \
    (select distinct rc.repo_id from \
    gh_repo r \
    inner join gh_file f1 on r.repo_id = f1.repo_id \
    inner join gh_vuln v1 on f1.file_id = v1.file_id \
    inner join gh_file f2 on r.repo_id = f2.repo_id \
    inner join gh_vuln v2 on f2.file_id = v2.file_id \
    inner join gh_repo_contributors rc on r.repo_id = rc.repo_id \
    and v1.vuln_desc = %s \
    and v2.vuln_desc = %s \
    group by rc.repo_id \
    having count(distinct rc.username) " + thresh +\
    ") a", \
    [total_list[i], total_list[j]]);
        for row in rows:
          print total_list[i], ", ", total_list[j], ", ", row[0], ", ", thresh
Ejemplo n.º 9
0
import save_data as sd
import requests
import json
import util
import sys

in_order = False;
token = os.environ.get('P_TOKEN', None)

#db_conn = sd.get_connection(credentials_file='mysqlcreds-remote.csv')
db_conn = sd.get_connection(credentials_file='mysqlcreds-analysis.csv')
rows = None;
if (in_order):
	rows = sd.select_many_query(db_conn, "select repo_id, contributors_url, owner_name from gh_repo where repo_id >= (select coalesce(max(repo_id), 0) from gh_repo_contributors) order by repo_id")
else: #get the stragglers
	rows = sd.select_many_query(db_conn, "select repo_id, contributors_url, owner_name from gh_repo where repo_id not in (select repo_id from gh_repo_contributors) order by repo_id")

header = {'Authorization': 'token ' + token}

for row in rows:
	repo_id = row[0];
	if (repo_id % 10 == 0):
		print "repo_id ", repo_id
	query_url = row[1];
	owner_name = row[2];
	try:
		r = requests.get(query_url, headers=header)
		item = json.loads(r.text or r.content)
		for thing in item:
			contributions = thing['contributions']
			username = thing['login']
Ejemplo n.º 10
0
import save_data as sd
import requests
import json
import util
import sys

in_order = False
token = os.environ.get('P_TOKEN', None)

#db_conn = sd.get_connection(credentials_file='mysqlcreds-remote.csv')
db_conn = sd.get_connection(credentials_file='mysqlcreds-analysis.csv')
rows = None
if (in_order):
    rows = sd.select_many_query(
        db_conn,
        "select repo_id, contributors_url, owner_name from gh_repo where repo_id >= (select coalesce(max(repo_id), 0) from gh_repo_contributors) order by repo_id"
    )
else:  #get the stragglers
    rows = sd.select_many_query(
        db_conn,
        "select repo_id, contributors_url, owner_name from gh_repo where repo_id not in (select repo_id from gh_repo_contributors) order by repo_id"
    )

header = {'Authorization': 'token ' + token}

for row in rows:
    repo_id = row[0]
    if (repo_id % 10 == 0):
        print "repo_id ", repo_id
    query_url = row[1]
    owner_name = row[2]