Example #1
    def write_probabilities(self, train_data, test_data):
        """
        Writes log probabilities to file. To be called only with a fitted model.
        :param train_data: unsliced train data, including 'repo_name' and 'label'
        :param test_data: unsliced test data, including 'repo_name' and 'label'
        """
        train_data_trees = pd.DataFrame(data=self.select_features(train_data))
        train_data_trees['blob_paths_updated'] = train_data_trees.apply(
            lambda row: self.row_to_words(row), axis=1)
        x_train = self.cV.transform(train_data_trees['blob_paths_updated'])
        train_repo_names = train_data['repo_name']
        train_labels = train_data['label']

        Helper().write_probabilities(self.clf, x_train, train_repo_names,
                                     train_labels, 'prob/prob_trees_train')

        test_data_trees = pd.DataFrame(data=self.select_features(test_data))
        test_data_trees['blob_paths_updated'] = test_data_trees.apply(
            lambda row: self.row_to_words(row), axis=1)
        x_test = self.cV.transform(test_data_trees['blob_paths_updated'])

        test_repo_names = test_data['repo_name']
        test_labels = test_data['label']
        Helper().write_probabilities(self.clf, x_test, test_repo_names,
                                     test_labels, 'prob/prob_trees_test')
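The Helper().write_probabilities signature is never shown in these examples. Below is a minimal sketch of what it plausibly does, assuming self.clf is a fitted scikit-learn classifier exposing predict_log_proba; the helper body and output format here are assumptions, not the project's actual implementation.

def write_probabilities(clf, X, repo_names, labels, path):
    # hypothetical stand-in for Helper.write_probabilities
    log_probs = clf.predict_log_proba(X)  # per-class log probabilities
    with open(path, 'w') as f:
        for name, label, row in zip(repo_names, labels, log_probs):
            # one whitespace-separated line per repository
            f.write("%s %s %s\n" % (name, label,
                                    " ".join("%f" % p for p in row)))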
Example #2
    def getReadmes(self, label, keyword):
        names = self.repos_names_search % (label, label, keyword)
        folder = self.readmes_repos_folder % label

        with open(names, 'r') as file:
            repos = file.readlines()
        for repo in repos:
            filename = Helper().build_path_from_folder_and_repo_name(
                repo, folder, MD_README_FILE_NAME)
            if os.path.exists(filename):
                print(filename, "exists")
                continue

            # repo still carries the newline from readlines(), hence repo[:-1]
            r = requests.get("https://api.github.com/repos/" + repo[:-1] +
                             "/readme",
                             auth=HTTPBasicAuth(self.username, self.password))
            if r.status_code == 200:
                filename = Helper().build_path_from_folder_and_repo_name(
                    repo, folder, MD_README_FILE_NAME)
                if not os.path.exists(os.path.dirname(filename)):
                    os.makedirs(os.path.dirname(filename))
                with open(filename, 'wb') as file:
                    print("Writing to %s" % file.name)
                    content = r.json()['content']
                    # the API returns the README base64-encoded; requires "import base64"
                    decoded = base64.b64decode(content)
                    file.write(decoded)
            else:
                print(r.headers)
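For reference, the /readme endpoint returns the file body base64-encoded in the content field. A minimal standalone sketch of the decode step used above (Python 3, unauthenticated, against a public repo):

import base64
import requests

r = requests.get("https://api.github.com/repos/octocat/Hello-World/readme")
if r.status_code == 200:
    # b64decode discards the newlines GitHub embeds in the encoded payload
    readme_bytes = base64.b64decode(r.json()['content'])
    print(readme_bytes.decode('utf-8'))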
Example #3
    def confusion_matrix(self, y_test, y_pred):
        """
        Saves confusion matrix to file

        :param y_test: test labels
        :param y_pred: predicted labels
        """
        confusion_m = confusion_matrix(y_test, y_pred)

        Helper().plot_confusion_matrix(self.input_type, confusion_m, normalize=True, classes=Labels.toArray(),
                                       title='Confusion matrix for %s classifier' % self.input_type)
        Helper().plot_confusion_matrix(self.input_type, confusion_m, normalize=False, classes=Labels.toArray(),
                                       title='Confusion matrix for %s classifier' % self.input_type)
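Helper().plot_confusion_matrix is not shown either. A minimal matplotlib sketch of the normalized variant, assuming cm is the matrix from sklearn.metrics.confusion_matrix and classes is a list of label names (the row-wise normalization is an assumption mirroring the normalize=True flag above):

import matplotlib.pyplot as plt
import numpy as np

def plot_confusion_matrix(cm, classes, title):
    cm = cm.astype(float) / cm.sum(axis=1, keepdims=True)  # per-class fractions
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title(title)
    plt.colorbar()
    ticks = np.arange(len(classes))
    plt.xticks(ticks, classes, rotation=45)
    plt.yticks(ticks, classes)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.savefig('confusion_matrix.png')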
Example #4
    def write_proba(self, dataframe_train, dataframe_test):
        X_train, Y_train = self.build_keyword_features(
            dataframe_train), self.build_labels(dataframe_train)
        X_test, Y_test = self.build_keyword_features(
            dataframe_test), self.build_labels(dataframe_test)

        Helper().write_probabilities(self.clf, X_train,
                                     dataframe_train['repo_name'],
                                     dataframe_train['label'],
                                     'prob/prob_keyword_train')
        Helper().write_probabilities(self.clf, X_test,
                                     dataframe_test['repo_name'],
                                     dataframe_test['label'],
                                     'prob/prob_keyword_test')
Example #5
    def write_probabilities(self, train_data, test_data):
        """
        Writes log probabilities to file
        :param train_data: unsliced train data, including 'repo_name' and 'label'
        :param test_data: unsliced test data, including 'repo_name' and 'label'
        """
        X_train = self.select_features(train_data)
        train_repo_names = train_data['repo_name']
        train_labels = train_data['label']
        Helper().write_probabilities(self.clf, X_train, train_repo_names,
                                     train_labels,
                                     'prob/prob_%s_train' % self.input_type)

        X_test = self.select_features(test_data)
        test_repo_names = test_data['repo_name']
        test_labels = test_data['label']
        Helper().write_probabilities(self.clf, X_test, test_repo_names,
                                     test_labels,
                                     'prob/prob_%s_test' % self.input_type)
Example #6
    def get_contents(self, label, keyword):
        names = self.repos_names_search % (label, label, keyword)
        folder = self.contents_repos_folder % label

        with open(names, 'r') as file:
            repos = file.readlines()
        for repo in repos:
            filename = Helper().build_path_from_folder_and_repo_name(
                repo, folder, JSON_CONTENTS_FILE_NAME)
            if os.path.exists(filename):
                print(filename, "exists")
                continue

            r = requests.get("https://api.github.com/repos/" + repo[:-1] +
                             "/contents",
                             auth=HTTPBasicAuth(self.username, self.password))
            if r.status_code == 200:

                if not os.path.exists(os.path.dirname(filename)):
                    os.makedirs(os.path.dirname(filename))
                with open(filename, 'w') as file:
                    print("Writing to %s" % file.name)
                    contents = json.dumps(r.json())
                    file.write(contents)
            else:
                print(r.headers)
Example #7
    def update_existing(self):
        with open(self.output_file, 'r') as file:
            lines = file.readlines()
        for line in lines:
            repo_link = line.split(" ")[0]
            assigned_label = line.split(" ")[1].rstrip('\n')

            # append each repo name to a per-label file
            with open("labelledd_%s" % assigned_label, 'a') as labelled_class:
                labelled_class.write(
                    Helper().build_repo_name_from_repo_link(repo_link) + '\n')
Example #8
    def getCommitActivity(self, label, keyword):
        names = self.repos_names_search % (label, label, keyword)
        folder = self.commit_activity_repos_folder % label
        with open(names, 'r') as file:
            repos = file.readlines()
            print(len(repos))
        for repo in repos:
            filename = Helper().build_path_from_folder_and_repo_name(
                repo, folder, JSON_COMMIT_ACTIVITY_FILE_NAME)

            if os.path.exists(filename):
                print(filename, "exists")
                continue

            r = requests.get("https://api.github.com/repos/" + repo[:-1] +
                             "/stats/commit_activity",
                             auth=HTTPBasicAuth(self.username, self.password))

            # GitHub returns 202 while it computes the stats; poll until ready
            while r.status_code == 202:
                print("status code:", r.status_code)
                time.sleep(3)
                r = requests.get("https://api.github.com/repos/" +
                                 repo[:-1] + "/stats/commit_activity",
                                 auth=HTTPBasicAuth(
                                     self.username, self.password))

            if r.status_code == 200:
                print("status code:", r.status_code)

                if not os.path.exists(os.path.dirname(filename)):
                    os.makedirs(os.path.dirname(filename))
                with open(filename, 'w') as file:
                    print("Writing to %s" % file.name)
                    jsonContent = json.dumps(r.json())
                    file.write(jsonContent)
            else:
                print(r.headers)
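The 202 handling above is specific to GitHub's statistics endpoints, which answer 202 Accepted while the data is computed in the background. The same idea as a small, bounded retry helper (a sketch; the function name and retry cap are not from the original code):

import time
import requests

def get_when_ready(url, auth=None, delay=3, max_tries=20):
    # poll until GitHub stops answering 202, or give up after max_tries
    for _ in range(max_tries):
        r = requests.get(url, auth=auth)
        if r.status_code != 202:
            return r
        time.sleep(delay)
    return r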
Example #9
    def get_repos_additional_data(self, label):
        names = self.additional_repos_names % label
        folder = self.additional_repos_folder % label

        with open(names, 'r') as file:
            repos = file.readlines()

        for repo in repos:
            repo_name = Helper().build_repo_name_from_repo_link(repo)
            r = requests.get("https://api.github.com/repos/" + repo_name,
                             auth=HTTPBasicAuth(self.username, self.password))
            if r.status_code == 200:
                filename = Helper().build_path_from_folder_and_repo_link(
                    repo, folder, JSON_REPO_FILE_NAME)
                if not os.path.exists(os.path.dirname(filename)):
                    os.makedirs(os.path.dirname(filename))
                with open(filename, 'w') as file:
                    print("Writing to %s" % file.name)
                    jsonContent = json.dumps(r.json())
                    file.write(jsonContent)
            else:
                print(r.headers)
Example #10
    def get_all_commits_additional_data(self, label):
        names = self.additional_repos_names % label
        folder = self.additional_commits_repos_folder % label
        query = {'per_page': 100}

        with open(names, 'r') as file:
            repos = file.readlines()
            print(len(repos))
        for repo in repos:

            filename = Helper().build_path_from_folder_and_repo_link(
                repo, folder, JSON_COMMITS_FILE_NAME)

            if os.path.exists(filename):
                print(filename, "exists")
                continue

            repo_name = Helper().build_repo_name_from_repo_link(repo)
            print(repo_name)
            r = requests.get("https://api.github.com/repos/" + repo_name +
                             "/commits",
                             params=query,
                             auth=HTTPBasicAuth(self.username, self.password))

            if r.status_code == 200:
                print("status code:", r.status_code)

                if not os.path.exists(os.path.dirname(filename)):
                    os.makedirs(os.path.dirname(filename))

                jsonCommits = r.json()
                links = r.links
                print("commits loaded:", len(jsonCommits))
                # follow the parsed Link header until no 'next' page remains
                while 'next' in links:
                    next_page_url = links['next']['url']
                    next_page_request = requests.get(next_page_url,
                                                     auth=HTTPBasicAuth(
                                                         self.username,
                                                         self.password))

                    if next_page_request.status_code == 200:
                        jsonCommits.extend(next_page_request.json())
                        links = next_page_request.links
                    else:
                        break  # a failed page would otherwise loop forever
                    print("commits loaded:", len(jsonCommits))

                jsonCommitsList = []
                for commit in jsonCommits:
                    author = commit['commit']['author']
                    committer = commit['commit']['committer']
                    comment_count = commit['commit']['comment_count']

                    author_date = author['date']
                    committer_date = committer['date']
                    author_email = author['email']
                    committer_email = committer['email']

                    commit_info = {
                        'author_date': author_date,
                        'committer_date': committer_date,
                        'comment_count': comment_count,
                        'author_email': author_email,
                        'committer_email': committer_email
                    }
                    jsonCommitsList.append(commit_info)

                with open(filename, 'w') as file:
                    print("Writing %d commits to %s" % (
                        len(jsonCommitsList), file.name))
                    jsonContent = json.dumps(jsonCommitsList)
                    file.write(jsonContent)

            else:
                print(r.headers)

        print('Successfully loaded commits for %d repos' % len(repos))
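The page-following loop above can be factored into a small generator over the Link headers that requests parses into r.links. A sketch of that pattern (not part of the original code):

import requests

def paginate(url, auth=None, params=None):
    # yield every item from a paginated GitHub list endpoint
    while url:
        r = requests.get(url, auth=auth, params=params)
        r.raise_for_status()
        for item in r.json():
            yield item
        url = r.links.get('next', {}).get('url')
        params = None  # the 'next' URL already carries the query string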
Example #11
    def get_commits_interval(self, label, keyword):
        names = self.repos_names_search % (label, label, keyword)
        folder = self.commits_interval_folder % label

        with open(names, 'r') as file:
            repos = file.readlines()
            print(len(repos))
        for repo in repos:
            filename = Helper().build_path_from_folder_and_repo_name(
                repo, folder, JSON_COMMITS_INTERVAL)

            if os.path.exists(filename):
                print(filename, "exists")
                continue
            r = requests.get("https://api.github.com/repos/" + repo[:-1] +
                             "/commits",
                             auth=HTTPBasicAuth(self.username, self.password))

            if r.status_code == 200:
                if 'last' not in r.links:
                    # all commits fit on a single page
                    only_page_json_object = r.json()
                    first_commit = only_page_json_object[0]
                    last_commit = only_page_json_object[-1]
                    last_page = -1
                    count = len(only_page_json_object)
                else:
                    print(r.links)
                    first_page_json_object = r.json()
                    first_commit = first_page_json_object[0]
                    last_page_url = r.links['last']['url']
                    last_page = int(last_page_url.split('=')[1])
                    print(last_page)

                if last_page != -1:
                    req = requests.get(last_page_url,
                                       auth=HTTPBasicAuth(
                                           self.username, self.password))
                    if req.status_code != 200:
                        print('Page request failed')
                        continue
                    last_page_json_object = req.json()
                    last_commit = last_page_json_object[-1]
                    # every page but the last is full (default page size 30)
                    count = len(last_page_json_object) + (last_page - 1) * 30

                if not os.path.exists(os.path.dirname(filename)):
                    os.makedirs(os.path.dirname(filename))

                json_interval_object = {
                    'commits_count': count,
                    'first_commit': first_commit,
                    'last_commit': last_commit
                }

                with open(filename, 'w') as file:
                    print("Writing commits interval and count %d to %s" % (
                        count, file.name))
                    jsonContent = json.dumps(json_interval_object)
                    file.write(jsonContent)
            else:
                print(r.headers)

        print('Successfully loaded commits for %d repos' % len(repos))
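The commit count above relies on GitHub's default page size of 30: every page except the last is full, so count = (last_page - 1) * 30 + len(last_page). For example, with last_page = 4 and 12 commits on the final page, the repository has 3 * 30 + 12 = 102 commits. Note the arithmetic breaks if a per_page parameter other than 30 is ever passed.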
Example #12
repo_names = data['repo_name']
data = data.drop(labels='repo_name', axis=1)


train_data, test_data = train_test_split(data, test_size=0.2)

train_labels = train_data['label']
test_labels = test_data['label']

train_data = train_data.drop(labels='label', axis=1)
test_data = test_data.drop(labels='label', axis=1)

print(data.shape)
print(train_data.shape)
print(test_data.shape)

forest_classifier = RandomForestClassifier(n_estimators=2000, max_depth=3)
forest = forest_classifier.fit(train_data, train_labels)

output = forest.predict(test_data)

print(mean_squared_error(test_labels, output))
print(accuracy_score(test_labels, output))
# sklearn's signature is precision_score(y_true, y_pred, ...); passing the
# predictions first, as the original did, silently swaps true and predicted labels
score = precision_score(test_labels, output, average=None)

# precision is high for hw and web, suggesting commit info can identify these classes
print(score)
print(np.mean(score))

Helper().write_probabilities(forest, data, repo_names, 'prob/prob_commit_interval_data')
Example #13
# precision scores much closer to one another and highly oscillating
# forest_classifier = RandomForestClassifier(n_estimators=1000, max_depth=10)

# below finds docs with high precision
# precision average is lower: 31-42%, sometimes even oscillating between 21% and 56%
# forest_classifier = RandomForestClassifier(n_estimators=1000, max_depth=5)

forest_classifier = RandomForestClassifier(n_estimators=2000, max_depth=5)

forest = forest_classifier.fit(train_data, train_labels)

output = forest.predict(test_data)

print(mean_squared_error(test_labels, output))
print(accuracy_score(test_labels, output))
score = precision_score(test_labels, output, average=None)
print(score)
print(np.mean(score))

Helper().write_probabilities(forest, data, repo_names,
                             'prob/prob_language_data')

# linear_svc = LinearSVC()
# rbf_svc = SVC(kernel='rbf')

# compare_performance_scores(linear_svc=linear_svc, rbf_svc=rbf_svc, data=data)
# tune_rbf_svc_hyperparameters(rbf_svc, data)

# rbf_svc = SVC(kernel='rbf', C=1, gamma=0.01)
# compare_performance_scores(linear_svc=linear_svc, rbf_svc=rbf_svc, data=data)
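tune_rbf_svc_hyperparameters is referenced but never defined in these examples. A minimal sketch of what such tuning usually looks like with scikit-learn's GridSearchCV (the grid values are illustrative, not the project's):

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

def tune_rbf_svc_hyperparameters(data, labels):
    # cross-validated search over C and gamma for an RBF-kernel SVC
    param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1]}
    search = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=5)
    search.fit(data, labels)
    print(search.best_params_, search.best_score_)
    return search.best_estimator_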