else:
                k += 1
        # clear pos
        del pos[:]

    return results


def convert_day_to_period(day):
    """Reformat a day string from '%Y-%m-%d' into its '%m-%Y' period string."""
    parsed = datetime.strptime(day, '%Y-%m-%d')
    return parsed.strftime('%m-%Y')


if __name__ == '__main__':
    logger = loggingcfg.initialize_logger('SZZ')

    # Column layout of the per-user/per-day activity CSV handled by this script.
    header = ["user_id", "project", "day", "num_commits", "num_file_touches", "num_src_file_touches",
              "num_files_touched", "num_src_files_touched", "loc_added", "src_loc_added", "loc_deleted",
              "src_loc_deleted", "num_bugs_induced", "num_bug_inducing_commits"]

    # Resolve column positions by name so later row indexing does not break
    # if the literal ordering of `header` changes.
    idx_user_id = header.index("user_id")
    idx_project = header.index("project")
    idx_day = header.index("day")
    idx_num_commits = header.index("num_commits")
    idx_num_file_touches = header.index("num_file_touches")
    idx_num_files_touched = header.index("num_files_touched")
    idx_loc_added = header.index("loc_added")
    idx_loc_deleted = header.index("loc_deleted")
    idx_num_src_file_touches = header.index("num_src_file_touches")
    idx_num_src_files_touched = header.index("num_src_files_touched")
from github.GithubException import RateLimitExceededException
from github.GithubException import GithubException
from github.GithubException import BadCredentialsException
from github.GithubException import UnknownObjectException

from githubutils.BaseGithubThreadedExtractor import BaseGitHubThreadedExtractor
from githubutils.Tokens import Tokens
from githubutils.NoAvailableTokenException import NoAvailableTokenException

from loggingcfg import initialize_logger
import numpy as np
from numba import jit, prange
import pandas as pd
import loggingcfg

logger = loggingcfg.initialize_logger('SZZ-EXTRACTOR')


class IssuesAndCommentExtractor(BaseGitHubThreadedExtractor):
    __ISSUES_COLUMN_NAMES = ["SLUG", "ID", "NUMBER", "STATE", "CREATED_AT", "CLOSED_AT", "CREATED_BY_LOGIN",
                             "CLOSED_BY_LOGIN", "ASSIGNEE_LOGIN", "TITLE", "NUM_COMMENTS", "LABELS", "IS_PL"]

    __COMMENTS_COLUMN_NAMES = ["SLUG", "ISSUE_ID", "ISSUE_NUMBER", "COMMENT_ID", "BODY", "CREATED_AT", "UPDATED_AT",
                               "USER_LOGIN", "USER_ID"]

    @jit(parallel=True)
    def __to_df(self, issues, slug, g, issue_data, comment_data):
        repo = g.get_repo(slug)

        for i in prange(0, np.size(issues)):
            logger.debug("Looking for issue number %d", issues[i])
Esempio n. 3
0

def random_selection(population, conf, seed):
    """Return a reproducible random sample of `population`.

    The sample size is computed by `sample_size` for confidence level
    `conf` (confidence interval fixed at 1.0) and capped at the population
    size. Seeding the RNG with `seed` makes the selection deterministic.

    Fix: the original shuffled `population` in place, silently reordering
    the caller's list; we now shuffle a copy. The RNG stream consumed is
    identical, so the returned selection is unchanged.
    """
    random.seed(seed)
    pool = list(population)  # copy to avoid mutating the caller's list
    random.shuffle(pool)
    k = sample_size(population_size=len(pool), confidence_level=conf, confidence_interval=1.0)
    if len(pool) < k:
        k = len(pool)
    selection = random.sample(pool, k)
    return selection


if __name__ == '__main__':
    import logging

    logger = loggingcfg.initialize_logger(name='SZZ:PROLIFICS', console_level=logging.INFO)

    prolific_infile = "prolifics.txt"
    #prolific_ids = None
    prolific_prj = None
    projects_outfile = "project-list"
    seed = 895
    conf_level = .95
    subsample = False

    processed_ids_filename = "tmp/prolific/temp.ids"
    processed_prj_filename = "tmp/prolific/temp.projects"
    seen_ids = set()

    try:
        opts, args = getopt.getopt(sys.argv[1:], "hi:o:c:s:r", ["in=", "out=", "conflev=", "seed=", "random", "help"])
from orm.tables import Commit, Repo
from orm.issue_comments import IssueComment

regex = r'(\S+\/\S+)(#\d+|@\d+)'


# Translation table that deletes parentheses and square brackets in one pass.
_STRIP_BRACKETS = str.maketrans('', '', '()[]')


def replace(_str):
    """Return `_str` with every '(', ')', '[' and ']' character removed.

    Uses a single C-level `str.translate` pass instead of four chained
    `str.replace` calls; the result is byte-identical to the original.
    """
    return _str.translate(_STRIP_BRACKETS)


if __name__ == '__main__':
    logger = initialize_logger()

    session = SessionWrapper.new(init=True)
    logger.info("Connected to db")

    logger.info("Retrieving comments from commits")
    commit_messages = session.query(
        Repo.slug, Commit.message).filter(Repo.id == Commit.repo_id).all()
    logger.info("Extracting cross refs from commit messages")
    idx = 0
    for cm in commit_messages:
        cross_refs = re.finditer(regex, cm.message, re.MULTILINE)
        for ref in cross_refs:
            if cm.slug not in ref.group(0):
                _ref = replace(ref.group(0))
                cref = CrossReference(from_slug=cm.slug,
Esempio n. 5
0
        prj_writer.writerow(row)
    prj_writer.close()
    logger.info("Done writing %s." % prj_outfile)

    lang_writer = CsvWriter(csv_file=lang_outfile, mode='w')
    """
    - user_language_date_totalcommits.csv
    #user_id;language;date;num_commits;num_file_touches;num_files_touched;loc_added;loc_deleted
    #2;c;2013-09-29;1;1;1;8;0
    """
    lang_header = ['user_id', 'language', 'day', 'num_commits', 'num_src_file_touches', 'num_src_files_touched',
                   'src_loc_added', 'src_loc_deleted']
    lang_writer.writerow(lang_header)
    for row in lang_rows:
        lang_writer.writerow(row)
    lang_writer.close()
    logger.info("Done writing %s." % lang_outfile)


if __name__ == '__main__':
    loggingcfg.initialize_logger()
    # Path to the pickled alias map: <argv[1]>/idm/dict/aliasMap.dict
    target = os.path.abspath(sys.argv[1] + os.path.sep + 'idm' + os.path.sep + 'dict' + os.path.sep + 'aliasMap.dict')
    # Only attempt to unpickle when the file is non-empty.
    if os.path.getsize(target) > 0:
        # NOTE(review): unpickling is unsafe on untrusted input — assumed
        # here to be a locally generated artifact; confirm provenance.
        with open(target, "rb") as f:
            unpickler = pickle.Unpickler(f)
            alias_map = unpickler.load()

        bc = BasicFileTypeClassifier()

        # Export results for the remaining CLI arguments using the alias map.
        export(sys.argv[2:], alias_map, bc)
Esempio n. 6
0
            except Exception as e:
                traceback.print_exc(e)
                logger.error(comment, e)
                continue

        session.commit()
        logger.info("New comments added to the database: %s" % str(idx - 1))


if __name__ == '__main__':
    project_file = None
    issue_file = None
    comment_file = None
    into_db = False

    logger = initialize_logger(name="SZZ:ISSUES_COMMENTS")

    try:
        if not sys.argv[1:]:
            raise getopt.GetoptError(
                'No arguments passed from the command line. See help instructions.'
            )
        opts, args = getopt.getopt(sys.argv[1:], "hf:i:c:",
                                   ["from=", "issues=", "comments=", "help"])
        for opt, arg in opts:
            if opt in ("-h", "--help"):
                print(
                    'Usage:\n szzExtractIssuesAndComments.py -f|--from=<file> -i|--issues=<file> -c|--comments=<file>'
                )
                sys.exit(0)
            elif opt in ("-f", "--from"):
Esempio n. 7
0
            if os.path.exists(dest):
                logger.info(
                    'Project {0} already available locally in {1}, performing an update.'
                    .format(slug, dest))
                try:
                    RepoCloner.pull(dest)
                    RepoCloner.update_submodules(dest)
                except:
                    logger.error(
                        "Unknown error updating git repo in folder %s" % dest)
            else:
                RepoCloner.clone(slug, destination_dir)
                logger.info('Project repository {0} cloned into {1}'.format(
                    slug, s2f))
                RepoCloner.update_submodules(dest)
            if symlink_dir:
                try:
                    sym = os.path.join(symlink_dir, s2f)
                    os.symlink(dest, sym)
                except FileExistsError:
                    logger.debug('Symlink already existing.')
                    pass

    logger.info('Done.')


if __name__ == '__main__':
    # CLI entry point: clone/update the repositories named on the command line.
    logger = initialize_logger(name='SZZ:CLONE')
    start(sys.argv[1:])
Esempio n. 8
0
    for ipr in issuepr_messages.itertuples():
        cross_refs = re.finditer(regex, getattr(ipr, "BODY"), re.MULTILINE)
        slug = getattr(ipr, "SLUG")
        for ref in cross_refs:
            if slug not in ref.group(0):
                _ref = __replace(ref.group(0))
                cross_references.append([slug, _ref, "issue/pr"])

    logger.info("Saving cross references")
    df = pd.DataFrame(cross_references, columns=["SLUG", "REF", "TYPE"])
    df.to_csv(os.path.join(input_dir_path, "cross_references.csv"),
              index=False)


if __name__ == '__main__':
    logger = initialize_logger(name="CROSS_REF")
    help_message = 'Usage:\n extractor.py -in|--input=<input_dir> -cp|--commit_pattern=<commit_pattern_file> -ip|--issues_pattern=<issues_pattern_file>'
    input_dir = None
    commit_pattern = "*blamed_commit.csv"
    issues_pattern = "*comments.csv"

    try:
        if not sys.argv[1:]:
            raise getopt.GetoptError(
                'No arguments passed from the command line. See help instructions.'
            )
        opts, args = getopt.getopt(
            sys.argv[1:], "H:in:cp:ip",
            ["input=", "commit_pattern=", "issues_pattern=", "help"])
        for opt, arg in opts:
            if opt in ("-h", "--help"):
Esempio n. 9
0
from typing import Dict
from mpi4py import MPI
from typing import List

import time
import traceback
import hashlib
import pandas as pd
import itertools
from utils import utility

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
mpisize = comm.Get_size()

log = loggingcfg.initialize_logger('SZZ-MPI', console_level=logging.INFO)


class Szz:
    __COMMIT_COLUMNS = [
        "SLUG", "SHA", "TIMESTAMP", "AUTHOR_ID", "COMMITTER_ID", "MESSAGE",
        "NUM_PARENTS", "NUM_ADDITIONS", "NUM_DELETIONS", "NUM_FILES_CHANGED",
        "FILES", "SRC_LOC_ADDED", "SRC_LOC_DELETED", "NUM_SRC_FILES_TOUCHED",
        "SRC_FILES"
    ]

    def __init__(self,
                 repo_path: str,
                 issues_file_path: str,
                 output_folder: str,
                 valid_labels: List[str],
Esempio n. 10
0
import datetime
import logging
import os
import pickle
import sys
import time
from time import strftime
from utils import utility
import pandas
import getopt

import loggingcfg
from activityclassifier import BasicFileTypeClassifier

logger = loggingcfg.initialize_logger(name='RESULT-EXPORT',
                                      console_level=logging.INFO)


def replace_alias(aliases, author_id):
    """Return the canonical id that `aliases` maps `author_id` to.

    Raises KeyError when `author_id` has no alias entry.
    """
    canonical = aliases[author_id]
    return canonical


def parse_timestamp(date, time_unit):
    gb = None
    date_d1 = time.strptime('1990-01-01', "%Y-%m-%d")
    date_d2 = time.strptime(str(date).split(' ')[0], "%Y-%m-%d")

    year_d1 = int((strftime("%Y", date_d1)))
    year_d2 = int((strftime("%Y", date_d2)))

    if time_unit == 'week':