def write_probabilities(self, train_data, test_data): """ Writes log probabilities to file. To be called only with a fitted model. :param train_data: unsliced train data, including 'repo_name' and 'label' :param test_data: unsliced test data, including 'repo_name' and 'label' """ train_data_trees = pd.DataFrame(data=self.select_features(train_data)) train_data_trees['blob_paths_updated'] = train_data_trees.apply( lambda row: self.row_to_words(row), axis=1) x_train = self.cV.transform(train_data_trees['blob_paths_updated']) train_repo_names = train_data['repo_name'] train_labels = train_data['label'] Helper().write_probabilities(self.clf, x_train, train_repo_names, train_labels, 'prob/prob_%s_train' % 'trees') test_data_trees = pd.DataFrame(data=self.select_features(test_data)) test_data_trees['blob_paths_updated'] = test_data_trees.apply( lambda row: self.row_to_words(row), axis=1) x_test = self.cV.transform(test_data_trees['blob_paths_updated']) test_repo_names = test_data['repo_name'] test_labels = test_data['label'] Helper().write_probabilities(self.clf, x_test, test_repo_names, test_labels, 'prob/prob_%s_test' % 'trees')
def getReadmes(self, label, keyword):
    names = self.repos_names_search % (label, label, keyword)
    folder = self.readmes_repos_folder % label
    with open(names, 'r') as names_file:
        repos = names_file.readlines()
    for repo in repos:
        filename = Helper().build_path_from_folder_and_repo_name(
            repo, folder, MD_README_FILE_NAME)
        if os.path.exists(filename):
            print 'exists'
            continue
        r = requests.get("https://api.github.com/repos/" + repo[:-1] + "/readme",
                         auth=HTTPBasicAuth(self.username, self.password))
        if r.status_code == 200:
            if not os.path.exists(os.path.dirname(filename)):
                os.makedirs(os.path.dirname(filename))
            with open(filename, 'w') as readme_file:
                print "Writing to %s" % readme_file.name
                content = r.json()['content']
                decoded = content.decode('base64')
                readme_file.write(decoded)
        else:
            print r.headers
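# Note: the GitHub API returns README content base64-encoded, and
# content.decode('base64') only works on Python 2. Under Python 3 the same
# decoding could be done with the standard base64 module, e.g.:
#
#     import base64
#     decoded = base64.b64decode(r.json()['content']).decode('utf-8')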
def confusion_matrix(self, y_test, y_pred):
    """
    Saves confusion matrix to file.

    :param y_test: test labels
    :param y_pred: predicted labels
    """
    confusion_m = confusion_matrix(y_test, y_pred)
    Helper().plot_confusion_matrix(self.input_type, confusion_m,
                                   normalize=True, classes=Labels.toArray(),
                                   title='Confusion matrix for %s classifier' % self.input_type)
    Helper().plot_confusion_matrix(self.input_type, confusion_m,
                                   normalize=False, classes=Labels.toArray(),
                                   title='Confusion matrix for %s classifier' % self.input_type)
def write_proba(self, dataframe_train, dataframe_test):
    X_train, Y_train = self.build_keyword_features(dataframe_train), \
        self.build_labels(dataframe_train)
    X_test, Y_test = self.build_keyword_features(dataframe_test), \
        self.build_labels(dataframe_test)
    Helper().write_probabilities(self.clf, X_train, dataframe_train['repo_name'],
                                 dataframe_train['label'],
                                 'prob/prob_keyword_train')
    Helper().write_probabilities(self.clf, X_test, dataframe_test['repo_name'],
                                 dataframe_test['label'],
                                 'prob/prob_keyword_test')
def write_probabilities(self, train_data, test_data):
    """
    Writes log probabilities to file.

    :param train_data: unsliced train data, including 'repo_name' and 'label'
    :param test_data: unsliced test data, including 'repo_name' and 'label'
    """
    x_train = self.select_features(train_data)
    train_repo_names = train_data['repo_name']
    train_labels = train_data['label']
    Helper().write_probabilities(self.clf, x_train, train_repo_names,
                                 train_labels,
                                 'prob/prob_%s_train' % self.input_type)

    x_test = self.select_features(test_data)
    test_repo_names = test_data['repo_name']
    test_labels = test_data['label']
    Helper().write_probabilities(self.clf, x_test, test_repo_names,
                                 test_labels,
                                 'prob/prob_%s_test' % self.input_type)
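# Helper.write_probabilities itself is not shown in this section. A minimal
# sketch of what it might look like, assuming the fitted classifier exposes
# predict_log_proba and that one line per repository is written; the exact
# output format of the original Helper may differ.
def write_probabilities(self, clf, features, repo_names, labels, output_path):
    log_probs = clf.predict_log_proba(features)  # shape: (n_samples, n_classes)
    with open(output_path, 'w') as out:
        for name, label, row in zip(repo_names, labels, log_probs):
            out.write("%s %s %s\n" % (name, label,
                                      " ".join(str(p) for p in row)))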
def get_contents(self, label, keyword):
    names = self.repos_names_search % (label, label, keyword)
    folder = self.contents_repos_folder % label
    with open(names, 'r') as names_file:
        repos = names_file.readlines()
    for repo in repos:
        filename = Helper().build_path_from_folder_and_repo_name(
            repo, folder, JSON_CONTENTS_FILE_NAME)
        if os.path.exists(filename):
            print filename, " exists"
            continue
        r = requests.get("https://api.github.com/repos/" + repo[:-1] + "/contents",
                         auth=HTTPBasicAuth(self.username, self.password))
        if r.status_code == 200:
            if not os.path.exists(os.path.dirname(filename)):
                os.makedirs(os.path.dirname(filename))
            with open(filename, 'w') as contents_file:
                print "Writing to %s" % contents_file.name
                contents = json.dumps(r.json())
                contents_file.write(contents)
        else:
            print r.headers
def update_existing(self):
    with open(self.output_file, 'r') as output_file:
        lines = output_file.readlines()
    for line in lines:
        repo_link = line.split(" ")[0]
        assigned_label = line.split(" ")[1].rstrip('\n')
        # append the repo name to the per-label file, closing it each time
        with open("labelledd_%s" % assigned_label, 'a') as labelled_class:
            labelled_class.write(
                Helper().build_repo_name_from_repo_link(repo_link) + '\n')
def getCommitActivity(self, label, keyword):
    names = self.repos_names_search % (label, label, keyword)
    folder = self.commit_activity_repos_folder % label
    with open(names, 'r') as names_file:
        repos = names_file.readlines()
    print len(repos)
    for repo in repos:
        filename = Helper().build_path_from_folder_and_repo_name(
            repo, folder, JSON_COMMIT_ACTIVITY_FILE_NAME)
        if os.path.exists(filename):
            print filename, " exists"
            continue
        r = requests.get("https://api.github.com/repos/" + repo[:-1] +
                         "/stats/commit_activity",
                         auth=HTTPBasicAuth(self.username, self.password))
        # GitHub answers 202 while it is still computing the statistics;
        # poll until the result is ready
        while r.status_code == 202:
            print "status code: ", r.status_code
            r = requests.get("https://api.github.com/repos/" + repo[:-1] +
                             "/stats/commit_activity",
                             auth=HTTPBasicAuth(self.username, self.password))
            time.sleep(3)
        if r.status_code == 200:
            print "status code: ", r.status_code
            if not os.path.exists(os.path.dirname(filename)):
                os.makedirs(os.path.dirname(filename))
            with open(filename, 'w') as activity_file:
                print "Writing to %s" % activity_file.name
                jsonContent = json.dumps(r.json())
                activity_file.write(jsonContent)
        else:
            print r.headers
def get_repos_additional_data(self, label):
    names = self.additional_repos_names % label
    folder = self.additional_repos_folder % label
    with open(names, 'r') as names_file:
        repos = names_file.readlines()
    for repo in repos:
        repo_name = Helper().build_repo_name_from_repo_link(repo)
        r = requests.get("https://api.github.com/repos/" + repo_name,
                         auth=HTTPBasicAuth(self.username, self.password))
        if r.status_code == 200:
            filename = Helper().build_path_from_folder_and_repo_link(
                repo, folder, JSON_REPO_FILE_NAME)
            if not os.path.exists(os.path.dirname(filename)):
                os.makedirs(os.path.dirname(filename))
            with open(filename, 'w') as repo_file:
                print "Writing to %s" % repo_file.name
                jsonContent = json.dumps(r.json())
                repo_file.write(jsonContent)
        else:
            print r.headers
def get_all_commits_additional_data(self, label):
    names = self.additional_repos_names % label
    folder = self.additional_commits_repos_folder % label
    query = {'per_page': 100}
    with open(names, 'r') as names_file:
        repos = names_file.readlines()
    print len(repos)
    for repo in repos:
        filename = Helper().build_path_from_folder_and_repo_link(
            repo, folder, JSON_COMMITS_FILE_NAME)
        if os.path.exists(filename):
            print filename, " exists"
            continue
        repo_name = Helper().build_repo_name_from_repo_link(repo)
        print repo_name
        r = requests.get("https://api.github.com/repos/" + repo_name + "/commits",
                         params=query,
                         auth=HTTPBasicAuth(self.username, self.password))
        if r.status_code == 200:
            print "status code: ", r.status_code
            if not os.path.exists(os.path.dirname(filename)):
                os.makedirs(os.path.dirname(filename))
            jsonCommits = r.json()
            links = r.links
            print "commits loaded:", len(jsonCommits)
            # follow the 'next' entries of the Link header to fetch all pages
            while 'next' in links:
                next_page_url = links['next']['url']
                next_page_request = requests.get(
                    next_page_url,
                    auth=HTTPBasicAuth(self.username, self.password))
                if next_page_request.status_code == 200:
                    jsonCommits.extend(next_page_request.json())
                    links = next_page_request.links
                    print "commits loaded:", len(jsonCommits)
                else:
                    # avoid looping forever on a failed page request
                    break
            jsonCommitsList = []
            for commit in jsonCommits:
                author = commit['commit']['author']
                committer = commit['commit']['committer']
                comment_count = commit['commit']['comment_count']
                commit_data = {
                    'author_date': author['date'],
                    'committer_date': committer['date'],
                    'comment_count': comment_count,
                    'author_email': author['email'],
                    'committer_email': committer['email']
                }
                jsonCommitsList.append(commit_data)
            with open(filename, 'w') as commits_file:
                print "Writing %d commits to %s" % (len(jsonCommitsList),
                                                    commits_file.name)
                commits_file.write(json.dumps(jsonCommitsList))
        else:
            print r.headers
    print 'Successfully loaded commits for %d repos' % len(repos)
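# The pagination pattern above generalizes to any paginated GitHub endpoint.
# A minimal, reusable sketch (the function name is illustrative and not part
# of the original code; it assumes the module-level requests import used
# throughout this file):
def fetch_all_pages(url, auth, params=None):
    """Collect all items from a paginated GitHub API endpoint."""
    items = []
    response = requests.get(url, params=params, auth=auth)
    while response.status_code == 200:
        items.extend(response.json())
        if 'next' not in response.links:
            break
        response = requests.get(response.links['next']['url'], auth=auth)
    return items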
def get_commits_interval(self, label, keyword):
    names = self.repos_names_search % (label, label, keyword)
    folder = self.commits_interval_folder % label
    with open(names, 'r') as names_file:
        repos = names_file.readlines()
    print len(repos)
    for repo in repos:
        filename = Helper().build_path_from_folder_and_repo_name(
            repo, folder, JSON_COMMITS_INTERVAL)
        if os.path.exists(filename):
            print filename, " exists"
            continue
        r = requests.get("https://api.github.com/repos/" + repo[:-1] + "/commits",
                         auth=HTTPBasicAuth(self.username, self.password))
        if r.status_code == 200:
            if 'last' not in r.links:
                # all commits fit on a single page
                only_page_json_object = r.json()
                first_commit = only_page_json_object[0]
                last_commit = only_page_json_object[-1]
                last_page = -1
                count = len(only_page_json_object)
            else:
                print r.links
                first_page_json_object = r.json()
                first_commit = first_page_json_object[0]
                last_page_url = r.links['last']['url']
                last_page = int(last_page_url.split('=')[1])
                print last_page
            if last_page != -1:
                req = requests.get(last_page_url,
                                   auth=HTTPBasicAuth(self.username,
                                                      self.password))
                if req.status_code == 200:
                    last_page_json_object = req.json()
                    last_commit = last_page_json_object[-1]
                else:
                    print 'Page request failed'
                    # skip this repo; we have no last commit to record
                    continue
                # add the full pages (GitHub's default page size is 30)
                count = len(last_page_json_object) + (last_page - 1) * 30
            if not os.path.exists(os.path.dirname(filename)):
                os.makedirs(os.path.dirname(filename))
            json_interval_object = {
                "commits_count": count,
                "first_commit": first_commit,
                "last_commit": last_commit
            }
            with open(filename, 'w') as interval_file:
                print "Writing commits interval and count %d to %s" % (
                    count, interval_file.name)
                interval_file.write(json.dumps(json_interval_object))
        else:
            print r.headers
    print 'Successfully loaded commits for %d repos' % len(repos)
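# Illustrative consumer of the interval files written above. The field names
# follow json_interval_object; the nested commit date path follows the GitHub
# commit schema, and the function itself is a sketch, not original code.
import json
from datetime import datetime

def repo_activity_span_days(interval_path):
    """Return (commit count, days between first and last recorded commit)."""
    with open(interval_path) as f:
        interval = json.load(f)
    fmt = "%Y-%m-%dT%H:%M:%SZ"
    first = datetime.strptime(
        interval['first_commit']['commit']['author']['date'], fmt)
    last = datetime.strptime(
        interval['last_commit']['commit']['author']['date'], fmt)
    return interval['commits_count'], abs((last - first).days)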
repo_names = data['repo_name']
data = data.drop(labels='repo_name', axis=1)

train_data, test_data = train_test_split(data, test_size=0.2)
train_labels = train_data['label']
test_labels = test_data['label']
train_data = train_data.drop(labels='label', axis=1)
test_data = test_data.drop(labels='label', axis=1)
print data.shape
print train_data.shape
print test_data.shape

forest_classifier = RandomForestClassifier(n_estimators=2000, max_depth=3)
forest = forest_classifier.fit(train_data, train_labels)
output = forest.predict(test_data)
print mean_squared_error(output, test_labels)
print accuracy_score(test_labels, output)
# precision_score expects (y_true, y_pred)
score = precision_score(test_labels, output, average=None)
# precision values high for hw and web, meaning that commit info is able to
# identify these classes?
print score
print np.mean(score)
Helper().write_probabilities(forest, data, repo_names,
                             'prob/prob_commit_interval_data')
# precision scores much closer to one another and highly oscillating
# forest_classifier = RandomForestClassifier(n_estimators=1000, max_depth=10)

# below finds docs with high precision
# precision average is lower: 31-42 %, sometimes even oscillates 21 - 56
# forest_classifier = RandomForestClassifier(n_estimators=1000, max_depth=5)
forest_classifier = RandomForestClassifier(n_estimators=2000, max_depth=5)
forest = forest_classifier.fit(train_data, train_labels)
output = forest.predict(test_data)
print mean_squared_error(output, test_labels)
print accuracy_score(test_labels, output)
score = precision_score(test_labels, output, average=None)
print score
print np.mean(score)
Helper().write_probabilities(forest, data, repo_names, 'prob/prob_language_data')

# linear_svc = LinearSVC()
# rbf_svc = SVC(kernel='rbf')
# compare_performance_scores(linear_svc=linear_svc, rbf_svc=rbf_svc, data=data)
# tune_rbf_svc_hyperparameters(rbf_svc, data)
# rbf_svc = SVC(kernel='rbf', C=1, gamma=0.01)
# compare_performance_scores(linear_svc=linear_svc, rbf_svc=rbf_svc, data=data)
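# A minimal sketch of what tune_rbf_svc_hyperparameters might do, assuming it
# wraps scikit-learn's GridSearchCV; the grid values and the assumption that
# data still carries its 'label' column are illustrative, not taken from the
# original code.
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

def tune_rbf_svc_hyperparameters(estimator, data):
    features = data.drop(labels='label', axis=1)
    labels = data['label']
    param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1]}
    search = GridSearchCV(estimator, param_grid, cv=5)
    search.fit(features, labels)
    print search.best_params_  # e.g. {'C': 1, 'gamma': 0.01}
    return search.best_estimator_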