def generateSimilarityMatrix(cfg, features):
    """Build the (pair x feature) similarity matrix for labelled GH/SO user
    pairs and write it to data/<with|without_tags>/s.mtx in Matrix Market format.

    :param cfg: database configuration passed to getDbConnection
    :param features: list of attribute names; each must have a
        similarities_among_<attr> table in the database
    """
    print("\n===========\nRUNNING generateSimilarityMatrix()\n===========\n")
    con, cur = getDbConnection(cfg)

    # Load labelled (GH user, SO user) pairs; the ORDER BY keeps row indices
    # stable across runs so the matrix lines up with other consumers.
    cur.execute('''
        select g_id, s_id from labeled_data order by g_id, s_id
    ''')
    pairs = [(x[0], x[1]) for x in cur.fetchall()]
    # O(1) row lookup instead of pairs.index() (O(n)) inside the loop below.
    pair_index = {p: i for i, p in enumerate(pairs)}

    S = lil_matrix((len(pairs), len(features)))
    with_tags = False
    for col, attr in enumerate(features):
        print("Processing {} similarity".format(attr))
        if attr == "tags":
            with_tags = True
        # Table names cannot be bound as SQL parameters; attr comes from the
        # caller-supplied feature list, not user input.
        cur.execute('''
            select g_id, s_id, similarity from similarities_among_%s
        ''' % attr)
        for c in cur.fetchall():
            row = pair_index.get((c[0], c[1]))
            if row is not None:
                S[row, col] = c[2]

    file_append = "with_tags" if with_tags else "without_tags"
    print("Writing similarity matrix to file")
    root_dir = os.path.join(os.path.dirname(__file__), "../")
    io.mmwrite(root_dir + 'data/{}/s.mtx'.format(file_append), S)

    print("Closing connection")
    cur.close()
    con.close()
    print("=========End generateSimilarityMatrix()========")
def runSqlScripts(cfg, train_size, test_size):
    """Run the database setup SQL scripts (table creation, user/project
    mapping, train/test view), substituting the requested dataset sizes.

    :param cfg: database configuration passed to getDbConnection
    :param train_size: total training-set size; halved for the view's
        <train_size> placeholder (half positive, half negative samples)
    :param test_size: test-set size for the <test_size> placeholder
    """
    print("\n===========\nRUNNING runSqlScripts()\n===========\n")
    con, cur = getDbConnection(cfg)
    root_dir = os.path.join(os.path.dirname(__file__), "../")

    def _run_script(filename, replacements=None):
        # Read a script, substitute optional placeholders, execute and commit.
        print("Running {} script".format(filename))
        with open(root_dir + "sqlscripts/" + filename, 'r') as r:
            query = r.read()
        for placeholder, value in (replacements or {}).items():
            query = query.replace(placeholder, value)
        cur.execute(query)
        con.commit()

    _run_script("createTables.sql")
    _run_script("user_project_dec_mapping.sql")
    _run_script("createTrainAndTestDatasetView.sql", {
        "<train_size>": str(train_size // 2),
        "<test_size>": str(test_size),
    })

    cur.close()
    con.close()
    print("train_size: {}. test_size: {}\n".format(str(train_size), str(test_size)))
    print("========End runSqlScripts()=======")
def generatePredictionsCsvFile(cfg, file_append):
    """Dump all rows of the predictions table, joined with their true labels
    (where present in labeled_data_test), to data/<file_append>/predictions_all.csv.

    :param cfg: database configuration passed to getDbConnection
    :param file_append: data sub-directory, e.g. "with_tags" or "without_tags"
    """
    print("\n===========\nRUNNING generatePredictionsCsvFile()\n===========\n")
    con, cur = getDbConnection(cfg)
    cur.execute('''
        SELECT p.g_id, p.s_id, l.label, p.model, p.pred, p.proba
        FROM predictions p
        LEFT JOIN labeled_data_test l
        ON p.g_id = l.g_id and p.s_id = l.s_id
    ''')
    # BUG FIX: root_dir was a free name here (NameError unless a module-level
    # root_dir exists); derive it the same way the sibling functions do.
    root_dir = os.path.join(os.path.dirname(__file__), "../")
    with open(root_dir + 'data/{}/predictions_all.csv'.format(file_append), 'w') as w:
        w.write("g_id,s_id,true_label,model,predicted_label,probability\n")
        for row in cur.fetchall():
            w.write(",".join(str(r) for r in row) + "\n")
    cur.close()
    con.close()
def generateDateSimilarity(cfg, redoSimilarity=False):
    """Populate similarities_among_dates with computeDateSim() applied to the
    account creation dates of every labelled GH/SO user pair.

    :param cfg: database configuration passed to getDbConnection
    :param redoSimilarity: when True, wipe and regenerate the table; when
        False, skip the work if the table already has rows
    """
    print("\n===========\nRUNNING generateDateSimilarity()\n===========\n")
    con, cur = getDbConnection(cfg)

    # check if done before
    if redoSimilarity:
        cur.execute('delete from similarities_among_dates')
        con.commit()
    else:
        cur.execute('select g_id from similarities_among_dates limit 1')
        if cur.fetchall():
            print("similarities_among_dates has already been generated")
            # BUG FIX: close the cursor/connection before the early return
            # instead of leaking them.
            cur.close()
            con.close()
            return
    print("created table similarities_among_dates")

    ### Load user pairs with date
    cur.execute('''
        select distinct g.id, s.id, g.created_at::date, s.creation_date
        from users g, labeled_data l, so_users s
        where g.created_at is not null and g.id = l.g_id
          and s.creation_date is not null and s.id = l.s_id
    ''')
    pairs = cur.fetchall()
    for p in pairs:
        sim = computeDateSim(p[2], p[3])
        cur.execute('''
            insert into similarities_among_dates values (%s, %s, %s)
        ''', (p[0], p[1], sim))
    con.commit()
    cur.close()
    con.close()
    print("=============End=============")
def generateTagsSimilarity(cfg, redoSimilarity=False):
    """Populate similarities_among_tags with the cosine similarity between
    trigram vectors of each user's tag list (GH project languages vs SO tags).

    Users with no tags get a zero vector (similarity 0 against anything).

    :param cfg: database configuration passed to getDbConnection
    :param redoSimilarity: when True, wipe and regenerate; otherwise skip if
        the table already has rows
    """
    print("\n===========\nRUNNING generateTagsSimilarity()\n===========\n")
    con, cur = getDbConnection(cfg)

    # check if done before
    if redoSimilarity:
        cur.execute('delete from similarities_among_tags')
        con.commit()
    else:
        cur.execute('select g_id from similarities_among_tags limit 1')
        if cur.fetchall():
            print("similarities_among_tags has already been generated")
            # BUG FIX: close before the early return instead of leaking the connection.
            cur.close()
            con.close()
            return
    print("created table similarities_among_tags")

    # GH user tags
    gh_user_tags = getGHUserTags(cur, "labeled_data")
    cur.execute('select distinct lower(language) from projects where language is not null')
    tags = set(r[0] for r in cur.fetchall())
    print("{} tags to be considered for our work are now loaded".format(len(tags)))

    # SO user tags
    so_user_tags = getSOUserTags(cur, tags, "labeled_data")

    users_tags = []
    users_tags.extend(gh_user_tags.values())
    users_tags.extend(so_user_tags.values())
    print("Combined tags list. Total: {} (with duplicates)".format(len(users_tags)))

    name_trigrams, trigrams = buildTrigram(set(users_tags))
    vectors = vectorizeNamesByTrigram(trigrams, name_trigrams)
    print("\nConstructed vectors for vectorizing tag-list by trigram. "
          "length of vectors dict: {}".format(len(vectors)))

    # similarities between tags
    n_errors = 0
    len_trigrams = len(trigrams)
    # Hoisted: the all-zero vector is the same for every tag-less user.
    zero_vec = np.zeros((1, len_trigrams))
    cur.execute('select distinct g_id, s_id from labeled_data')
    for p in cur.fetchall():
        try:
            gv_key = gh_user_tags.get(p[0])
            gv = np.array(vectors[gv_key]).reshape(1, -1) if gv_key is not None else zero_vec
            sv_key = so_user_tags.get(p[1])
            sv = np.array(vectors[sv_key]).reshape(1, -1) if sv_key is not None else zero_vec
        except Exception as ex:
            # A tag list missing from the vectors dict; count and skip the pair.
            n_errors += 1
            print("\tError: {}".format(ex))
            continue
        sim = cosine_similarity(gv, sv)
        cur.execute('''
            insert into similarities_among_tags values (%s, %s, %s)
        ''', (p[0], p[1], sim[0][0]))

    print("similarities_among_tags processed. {} names not found in vectors dictionary".format(n_errors))
    print("Close connection")
    con.commit()
    cur.close()
    con.close()
    print("=======End=======")
def makePrediction(cfg, model, features, n_samples, file_append=None, delete_old_data=True, save_to_file=False):
    """Compute feature similarities for the test pairs (one thread per
    feature), run the trained classifier, and store predictions in the
    predictions table (optionally also a TSV file).

    :param cfg: database configuration passed to getDbConnection
    :param model: model name, e.g. 'rf' or 'knn' (used for the model file
        and the predictions.model column)
    :param features: feature/attribute names; column order of the matrix
    :param n_samples: test candidate count; the matrix gets n_samples**2 rows
    :param file_append: data/model sub-directory; derived from features if None
    :param delete_old_data: drop previous predictions of this model first
    :param save_to_file: also dump predictions to a TSV file
    """
    print("\n===========\nRUNNING makePrediction()\n===========\n")
    print("model: {}. n_samples: {}. save_to_file: {}".format(model, n_samples, save_to_file))
    if file_append is None:
        file_append = "with_tags" if "tags" in features else "without_tags"

    ### model selection
    if not os.path.isdir(root_dir + "models"):
        raise Exception("You need to run learning/learn.py first before running this file")

    con, cur = getDbConnection(cfg)
    if delete_old_data:
        # Parameterized instead of string-built SQL.
        cur.execute("delete from predictions where model = %s", (model,))
        con.commit()

    ### Candidate generation: test data
    print("Candidate generation: # test data = {}".format(n_samples))
    sims = {}
    workers = []
    for attr in features:
        sims[attr] = {}
        t = threading.Thread(target=similarity, args=(attr, con, sims))
        t.start()
        workers.append(t)
    # Join only the threads started here, not every live thread in the process.
    for t in workers:
        t.join()
    print("All thread returned")

    cur.execute('''
        select g_id, s_id from labeled_data_test order by g_id, s_id
    ''')
    pairs = [(c[0], c[1]) for c in cur.fetchall()]
    # O(1) lookups instead of pairs.index()/features.index() per cell.
    pair_index = {p: i for i, p in enumerate(pairs)}
    feature_index = {f: i for i, f in enumerate(features)}

    print("Generate input data matrix")
    S = lil_matrix((n_samples * n_samples, len(features)))
    for attr in sims:
        col = feature_index[attr]
        for c, value in sims[attr].items():
            row = pair_index.get((c[0], c[1]))
            if row is not None and not math.isnan(value):
                S[row, col] = value
    S = S.toarray()

    print("Load and predict")
    clf = joblib.load(get_model_path(model, file_append))
    scores = clf.predict_proba(S)
    print("Done. scores - shape: {}. Classes: {}".format(scores.shape, clf.classes_))

    cc = 0
    for p in zip(pairs, scores):
        pred = clf.classes_[1] if p[1][1] > 0.5 else clf.classes_[0]
        proba = p[1][0] if pred == clf.classes_[0] else p[1][1]
        query = "insert into predictions values ({},{},'{}',{},{})".format(p[0][0], p[0][1], model, pred, proba)
        cur.execute(query)
        cc += 1
        # BUG FIX: was `if cc % 10:` which committed on every count NOT
        # divisible by 10; batch-commit every 10th insert instead.
        if cc % 10 == 0:
            con.commit()
    con.commit()

    if save_to_file:
        print("Saving predictions to file")
        cur.execute("select g_id, s_id, pred, proba from predictions where model = %s", (model,))
        with open(root_dir + "data/{}/predicted_{}.tsv".format(file_append, model), 'w') as w:
            w.write("g_id\ts_id\tpred\tproba\n")
            for row in cur.fetchall():
                w.write("\t".join(str(r) for r in row) + "\n")

    cur.close()
    con.close()
    print("==========End makePrediction() for model: {}==========".format(model))
def startLearning(cfg, file_append):
    """Train five classifiers (linear regression, kNN, logistic regression,
    random forest, GBDT) on the precomputed similarity matrix and dump each
    to models/<file_append>/, optionally zipping the model directory.

    :param cfg: database configuration; cfg["zipModel"] toggles zipping
    :param file_append: "with_tags" / "without_tags" data & model sub-directory
    """
    print("\n===========\nRUNNING startLearning()\n===========\n")
    model_dir = "{}models/{}".format(root_dir, file_append)
    if not os.path.isdir(model_dir):
        os.makedirs(model_dir)
    con, cur = getDbConnection(cfg)

    ### Preparing labels of pairs
    print("Preparing labels of pairs")
    cur.execute('''
        select g_id, s_id, label from labeled_data order by g_id, s_id
    ''')
    label_list = [x[2] for x in cur.fetchall()]
    # Direct dense array; the previous lil_matrix round-trip was pointless.
    labels = np.asarray(label_list, dtype=float)

    ### Load similarity matrix
    print("Load similarity matrix")
    # BUG FIX: path was 'data/{}.s.mtx' but generateSimilarityMatrix writes
    # to 'data/{}/s.mtx' — the file could never be found.
    S = io.mmread(root_dir + 'data/{}/s.mtx'.format(file_append))
    S = S.toarray()

    # (human-readable name, estimator, dump filename) — trained in order.
    classifiers = [
        ("linear regression", LinearRegression(), 'lr.pkl'),
        ("kNN", KNeighborsClassifier(), 'knn.pkl'),
        ("logistic regression", LogisticRegression(), 'lg.pkl'),
        ("random forest", RandomForestClassifier(n_estimators=100), 'rf.pkl'),
        ("gradient boosting decision tree", GradientBoostingClassifier(n_estimators=100), 'gbdt.pkl'),
    ]
    for name, clf, filename in classifiers:
        print("Learn {} classifier".format(name))
        clf.fit(S, labels)
        joblib.dump(clf, model_dir + '/' + filename)

    cur.close()
    con.close()

    if bool(cfg["zipModel"]):
        try:
            print("\nDone. Now zip the models")
            import zipfile
            with zipfile.ZipFile(model_dir + "_models.zip", "w", zipfile.ZIP_DEFLATED) as zf:
                abs_src = os.path.abspath(model_dir)
                for dirname, subdirs, files in os.walk(model_dir):
                    for filename in files:
                        absname = os.path.abspath(os.path.join(dirname, filename))
                        # archive path relative to the model directory
                        arcname = absname[len(abs_src) + 1:]
                        print('zipping %s as %s' % (os.path.join(dirname, filename), arcname))
                        zf.write(absname, arcname)
        except Exception as ex:
            # Zipping is best-effort; training results are already on disk.
            print(ex)
            print("That was error zipping file. Continuing...")
    print("===========End startLearning()============")
def generateNegativeDataPairs(cfg, redo=False):
    """Generate negative training pairs: each positive GH user is paired with
    a random SO user outside the positive set, and vice versa; results go to
    negative_user_pairs. Populates gh_so_common_users from CSV if empty.

    :param cfg: database configuration passed to getDbConnection
    :param redo: when True, wipe and regenerate; otherwise skip if the table
        already has rows
    """
    print("\n===========\nRUNNING generateNegativeDataPairs()\n===========\n")
    ### Connect to database
    con, cur = getDbConnection(cfg)
    if redo:
        cur.execute('delete from negative_user_pairs')
        con.commit()
    else:
        # Check if done before
        cur.execute('select gh_user_id from negative_user_pairs limit 1')
        if cur.fetchall():
            print("negative_user_pairs has already been generated")
            # BUG FIX: close before the early return instead of leaking the connection.
            cur.close()
            con.close()
            return

    # check if common_users table has been populated
    print("check if gh_so_common_users table has been populated")
    cur.execute('select gh_user_id from gh_so_common_users limit 1')
    if not cur.fetchall():
        import csv
        import os
        print("populate gh_so_common_users table")
        root_dir = os.path.join(os.path.dirname(__file__), "../")
        with open(root_dir + "data/common_users.csv", "r") as f:
            reader = csv.reader(f)
            values = ["({},{})".format(row[0], row[1]) for row in reader]
        # BUG FIX: the list itself was formatted into the SQL, producing
        # "values ['(1,2)', ...]" — invalid SQL. Join the tuples with commas.
        cur.execute('insert into gh_so_common_users (gh_user_id, so_user_id) values {}'
                    .format(','.join(values)))
        con.commit()
        print("gh_so_common_users table populated")

    print("Generating negative_user_pairs...")
    ### Load GH users (positive)
    cur.execute('select distinct gh_user_id from gh_so_common_users')
    gh_users = [r[0] for r in cur.fetchall()]
    ### Load SO users (positive)
    cur.execute('select distinct so_user_id from gh_so_common_users')
    so_users = [r[0] for r in cur.fetchall()]

    ### Load SO users (negative): random SO ids outside the positive set,
    ### one per positive GH user.
    cur.execute('''
        select * from (
            select id from so_users
            except
            select so_user_id from gh_so_common_users
        ) as a order by random() limit %s
    ''', (len(gh_users),))
    ### Pairing GH positive users with SO negative users.
    out = ['(%d, %d)' % p for p in zip(gh_users, [r[0] for r in cur.fetchall()])]
    cur.execute('''
        insert into negative_user_pairs (gh_user_id, so_user_id) values %s
    ''' % ','.join(out))

    ### Load GH users (negative)
    cur.execute('''
        select * from (
            select id from users
            except
            select gh_user_id from gh_so_common_users
        ) as a order by random() limit %s
    ''', (len(so_users),))
    ### Pairing SO positive users with GH negative users.
    # NOTE(review): this insert omits column names and the tuples are
    # (so_id, gh_id) — verify the column order of negative_user_pairs;
    # it may be swapped relative to the first insert.
    out = ['(%d, %d)' % p for p in zip(so_users, [r[0] for r in cur.fetchall()])]
    cur.execute('''
        insert into negative_user_pairs values %s
    ''' % ','.join(out))
    con.commit()
    print("Done generating negative_user_pairs.")
    cur.close()
    con.close()
    print("========End generateNegativeDataPairs()=======")
def generateDescCommentSimilarity(cfg, redoSimilarity=False):
    """Populate similarities_among_desc_comment with TF-IDF cosine similarity
    between GH project descriptions and concatenated SO comments per user pair.

    :param cfg: database configuration passed to getDbConnection
    :param redoSimilarity: when True, wipe and regenerate; otherwise skip if
        the table already has rows
    """
    print("\n===========\nRUNNING generateDescCommentSimilarity()\n===========\n")
    con, cur = getDbConnection(cfg)

    # check if done before
    if redoSimilarity:
        cur.execute('delete from similarities_among_desc_comment')
        con.commit()
    else:
        cur.execute('select g_id from similarities_among_desc_comment limit 1')
        if cur.fetchall():
            print("similarities_among_desc_comment has already been generated")
            # BUG FIX: close before the early return instead of leaking the connection.
            cur.close()
            con.close()
            return
    print("created table similarities_among_desc_comment")

    ### Load user info of GitHub
    g_users = loadGithubProjectDescription(cur, "labeled_data")

    ### Load user info of Stack Overflow: concatenate all comments per user.
    cur.execute('''
        select distinct l.s_id, u.text
        from so_comments u, labeled_data l
        where u.text != '' and u.user_id = l.s_id
    ''')
    s_users = {}
    for c in cur.fetchall():
        if c[0] in s_users:
            s_users[c[0]] += " " + c[1]
        else:
            s_users[c[0]] = c[1]

    ### TF-IDF computation
    distances, g_key_indices, s_key_indices = tfidfSimilarities(g_users, s_users)
    print("shape - distances: {}.\n".format(distances.shape))

    ### store similarities (distance -> similarity via 1 - distance)
    cur.execute('''
        select distinct l.g_id, l.s_id
        from user_project_description g, labeled_data l, so_comments s
        where g.description != '' and g.user_id = l.g_id
          and s.text != '' and s.user_id = l.s_id
    ''')
    good = 0
    bad = 0
    for p in cur.fetchall():
        g_ind = g_key_indices.get(p[0])
        s_ind = s_key_indices.get(p[1])
        if g_ind is not None and s_ind is not None:
            distance = distances[g_ind][s_ind]
            good += 1
        else:
            # pair present in the join but missing from the TF-IDF index
            bad += 1
            continue
        cur.execute('''
            insert into similarities_among_desc_comment values (%s, %s, %s)
        ''', (p[0], p[1], 1 - distance))

    print("Close connection")
    con.commit()
    cur.close()
    con.close()
    print("\nAll done. #good ones: {}, #bad ones: {}".format(good, bad))
    print("=======End generateDescCommentSimilarity()=======")
def main():
    """For each SO post part (body, title, tags), create a
    similarities_among_desc_p<mode> table and fill it with TF-IDF cosine
    similarities between GH project descriptions and SO post text.

    Relies on a module-level cfg and the usual DB helper functions.
    """
    con, cur = getDbConnection(cfg)

    ### TF-IDF vectorizer
    tfidf = TfidfVectorizer(stop_words='english')

    ### Load user info of GitHub: concatenate all descriptions per user.
    cur.execute('''
        select distinct l.g_id, u.description
        from user_project_description u, labeled_data l
        where u.description != '' and u.user_id = l.g_id
    ''')
    g_users = {}
    for c in cur.fetchall():
        if c[0] in g_users:
            g_users[c[0]] += " " + c[1]
        else:
            g_users[c[0]] = c[1]
    # BUG FIX: dict.keys() has no .index() in Python 3; materialize a list
    # and an index map for O(1) lookups.
    g_keys = list(g_users.keys())
    g_key_index = {k: i for i, k in enumerate(g_keys)}

    ### post contents
    modes = ['body', 'title', 'tags']
    for mode in modes:
        ### create table for description-vs-post-x similarity
        cur.execute('''
            create table similarities_among_desc_p%s
            (g_id int, s_id int, similarity float8, primary key(g_id, s_id))
        ''' % mode)

        ### Load user info of Stack Overflow
        cur.execute('''
            select distinct l.s_id, u.text
            from so_user_post_%s u, labeled_data l
            where u.text != '' and u.user_id = l.s_id
        ''' % mode)
        s_users = {}
        for c in cur.fetchall():
            # BUG FIX: the membership test was `c[0] in g_users` (wrong dict),
            # which raised KeyError on += for ids present only in g_users and
            # mis-accumulated everything else.
            if c[0] in s_users:
                s_users[c[0]] += " " + c[1]
            else:
                s_users[c[0]] = c[1]
        s_keys = list(s_users.keys())
        s_key_index = {k: i for i, k in enumerate(s_keys)}

        ### TF-IDF computation: vectorize GH and SO documents together so
        ### they share one vocabulary, then split the matrix back.
        values = []
        values.extend(g_users.values())
        values.extend(s_users.values())
        vecs = tfidf.fit_transform(values)
        g_vecs = vecs[:len(g_keys), :]
        s_vecs = vecs[len(g_keys):, :]

        ### similarities between vectors (cosine distance; similarity = 1 - d)
        sims = pairwise_distances(g_vecs, s_vecs, metric='cosine')

        ### store similarities
        cur.execute('''
            select distinct l.g_id, l.s_id
            from user_project_description g, labeled_data l, so_user_post_%s s
            where g.description != '' and g.user_id = l.g_id
              and s.text != '' and s.user_id = l.s_id
        ''' % mode)
        pairs = cur.fetchall()
        for p in pairs:
            # Parameterized values; only the table name is interpolated.
            cur.execute(
                'insert into similarities_among_desc_p{} values (%s, %s, %s)'.format(mode),
                (p[0], p[1], float(1 - sims[g_key_index[p[0]]][s_key_index[p[1]]])))
        con.commit()
    con.commit()
    cur.close()
    con.close()
def generateNameSimilarity(cfg, redoSimilarity=False):
    """Populate similarities_among_user_names with trigram cosine similarity
    between GH and SO display names, via a temp table keyed by name strings.

    :param cfg: database configuration passed to getDbConnection
    :param redoSimilarity: when True, wipe and regenerate; otherwise skip if
        the table already has rows
    """
    print("\n===========\nRUNNING generateNameSimilarity()\n===========\n")
    con, cur = getDbConnection(cfg)

    # check if done before
    if redoSimilarity:
        cur.execute('delete from similarities_among_user_names')
        con.commit()
    else:
        cur.execute('select g_id from similarities_among_user_names limit 1')
        if cur.fetchall():
            print("similarities_among_user_names has already been generated")
            # BUG FIX: close before the early return instead of leaking the connection.
            cur.close()
            con.close()
            return
    print("created table similarities_among_user_names")

    # temp table keyed by name text; joined back to user ids at the end
    cur.execute('''
        drop table if exists similarities_among_names;
        create temp table similarities_among_names
        (g_name text, s_name text, similarity float8);
    ''')
    print("created table similarities_among_names")

    # names of GH users
    cur.execute('''
        select distinct g_name as username from labeled_data
        union
        select distinct s_name as username from labeled_data
    ''')
    gh_and_so_users = [r[0] for r in cur.fetchall()]
    print("Combined users list. Total: {}".format(len(gh_and_so_users)))

    name_trigrams, trigrams = buildTrigram(gh_and_so_users)
    vectors = vectorizeNamesByTrigram(trigrams, name_trigrams)
    print("Constructed vectors for vectorizing names by trigram. length of vectors dict: {}"
          .format(len(vectors)))

    # similarities between names
    cur.execute('''
        select distinct g_name, s_name from labeled_data
        where g_name != '' and s_name != ''
    ''')
    n_errors = 0
    pairs = cur.fetchall()
    for p in pairs:
        try:
            gv = np.array(vectors[p[0]]).reshape(1, -1)
            sv = np.array(vectors[p[1]]).reshape(1, -1)
        except KeyError:
            # Narrowed from a bare except: only missing names are expected here.
            n_errors += 1
            continue
        sim = cosine_similarity(gv, sv)
        cur.execute('''
            insert into similarities_among_names values (%s, %s, %s)
        ''', (p[0], p[1], sim[0][0]))
    con.commit()
    print("similarities_among_names processed. {} names not found in vectors dictionary"
          .format(n_errors))

    ### store similarities between GH and SO users
    cur.execute('''
        insert into similarities_among_user_names
        select g.id, s.id, c.similarity
        from users g, so_users s, similarities_among_names c
        where g.name = c.g_name and s.display_name = c.s_name
    ''')
    print("Delete temp table: similarities_among_names")
    cur.execute('drop table if exists similarities_among_names')
    con.commit()
    print("similarities_among_user_names processed")
    print("Close connection")
    cur.close()
    con.close()
    print("=======End=======")
def generateLocationSimilarity(cfg, redoSimilarity=False):
    """Populate similarities_among_locations with TF-IDF cosine similarity
    between GH and SO user location strings for each labelled pair.

    :param cfg: database configuration passed to getDbConnection
    :param redoSimilarity: when True, wipe and regenerate; otherwise skip if
        the table already has rows
    """
    print("\n===========\nRUNNING generateLocationSimilarity()\n===========\n")
    con, cur = getDbConnection(cfg)

    # check if done before
    if redoSimilarity:
        cur.execute('delete from similarities_among_locations')
        con.commit()
    else:
        cur.execute('select g_id from similarities_among_locations limit 1')
        if cur.fetchall():
            print("similarities_among_locations has already been generated")
            # BUG FIX: close before the early return instead of leaking the connection.
            cur.close()
            con.close()
            return

    ### Load user info of GitHub
    cur.execute('''
        select distinct l.g_id, u.location
        from labeled_data l, users u
        where l.g_id = u.id and u.location != ''
    ''')
    g_users = {c[0]: c[1] for c in cur.fetchall()}

    ### Load user info of Stack Overflow
    cur.execute('''
        select distinct l.s_id, u.location
        from labeled_data l, so_users u
        where l.s_id = u.id and u.location != ''
    ''')
    s_users = {c[0]: c[1] for c in cur.fetchall()}

    ### TF-IDF computation
    distances, g_key_indices, s_key_indices = tfidfSimilarities(g_users, s_users)
    print("shape - distances: {}.\n".format(distances.shape))

    ### store similarities (distance -> similarity via 1 - distance)
    cur.execute('''
        select distinct g_id, s_id
        from labeled_data l, users g, so_users s
        where l.g_id = g.id and l.s_id = s.id
          and g.location != '' and s.location != ''
    ''')
    good = 0
    bad = 0
    for p in cur.fetchall():
        g_ind = g_key_indices.get(p[0])
        s_ind = s_key_indices.get(p[1])
        if g_ind is not None and s_ind is not None:
            distance = distances[g_ind][s_ind]
            good += 1
        else:
            # pair present in the join but missing from the TF-IDF index
            bad += 1
            continue
        cur.execute('''
            insert into similarities_among_locations values (%s, %s, %s)
        ''', (p[0], p[1], 1 - distance))

    print("Close connection")
    con.commit()
    cur.close()
    con.close()
    print("\nAll done. #good ones: {}, #bad ones: {}".format(good, bad))
    print("=======End=======")