def main():
    cli_args = Config.parse_arguments()

    # A config file must be provided, or else nothing will work.
    if not hasattr(cli_args, 'config_file') or not cli_args.config_file:
        Log.error("A config file must be specified!")
        return

    Config.parse_config(cli_args.config_file)
    Log.config()
    Log.info("Started. Creating database")
    DB.create_db()
    db_session = DB.create_session()

    miner = RepositoryMiner(
        Config.repository_path,
        db_session=db_session,
        branch=Config.repository_branch
    )
    repository = miner.repository_orm

    IssueScanner.assign_issue_tracking(
        repository,
        Config.issue_tracking_system,
        Config.issue_tracking_url,
        Config.issue_tracking_username,
        Config.issue_tracking_password,
        db_session=db_session)
    IssueScanner.scan_for_repository(repository)

    db_session.close()
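
# A minimal entry-point guard sketch, assuming this module doubles as the
# executable script (an assumption; the original project may wire this up elsewhere).
if __name__ == "__main__":
    main()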
def init_db_sessions(self, db_session=None):
    """ Init DB session. When threading is activated, one DB session is created per thread.

    Args:
        db_session: Optional. If not specified, a new session will be created.
    """
    if db_session is None:
        self.db_session = DB.create_session()
    else:
        self.db_session = db_session
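
# Hedged usage sketch for init_db_sessions. The owning class is not shown in
# this snippet; `miner` stands in for an instance of whatever class defines it:
#
#     miner.init_db_sessions()                        # lets the object create its own session
#     miner.init_db_sessions(db_session=my_session)   # reuses a caller-owned session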
def scan_for_repository(repository):
    """ Scans the issue tracking of a repository in the DB and assigns issues to commits.

    Iterates through all recorded commits of this repository, checks their commit messages for
    issue references, tries to retrieve those issues from the associated issue tracking system
    and saves them in the DB.

    Args:
        repository (Repository): The repository to scan.
    """
    assert isinstance(repository, Repository)
    reset_issue_cache()

    # get issue tracking object
    Log.info("Retrieving IssueTracking for Repository " + repository.name + " with id " + str(repository.id))
    db_session = DB.create_session()
    query = db_session.query(IssueTracking).filter(IssueTracking.repository == repository)
    try:
        issue_tracking = query.one()
    except NoResultFound:
        Log.error("No IssueTracking-Entry found for Repository " + repository.name
                  + " with id " + str(repository.id))
        db_session.close()
        return
    Log.debug("IssueTracking found. Type: " + str(issue_tracking.type))

    if issue_tracking.type == TYPE_GITHUB:
        retrieve = GitHub.retrieve
        extract_pattern = '#[0-9]+'
        transform = lambda x: x[1:]  # strip the leading '#'
    elif issue_tracking.type == TYPE_JIRA:
        retrieve = Jira.retrieve
        extract_pattern = Config.issue_scanner_issue_id_regex
        if not extract_pattern:
            # fallback pattern; permissive (e.g. [0-9]* also matches an ID without digits)
            extract_pattern = '[A-Z][A-Z]+-[0-9]*'
        transform = None
    else:
        Log.error("No Implementation found for IssueTracking-Type '" + str(issue_tracking.type) + "'")
        db_session.close()
        return

    repository = issue_tracking.repository
    for commit in repository.commits:
        issue_ids = extract_issue_ids(commit.message, extract_pattern, transform=transform)
        for issue_id in issue_ids:
            process_issue(issue_tracking, commit, issue_id, retrieve, db_session)

    Log.info("Issue Analysis completed")
    db_session.close()
    reset_issue_cache()
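
# Illustration of the two extraction patterns above, using standard `re`
# semantics (how extract_issue_ids applies them is an assumption):
#
#     import re
#     re.findall('#[0-9]+', "Fixes #42 and #7")        # -> ['#42', '#7'];
#                                                      #    transform x[1:] yields '42', '7'
#     re.findall('[A-Z][A-Z]+-[0-9]*', "ABC-123 done") # -> ['ABC-123']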
def assign_issue_tracking(repository, issue_tracking_type, url, username=None, password=None, db_session=None):
    """ Assigns an issue tracking system to a repository, creating or updating its IssueTracking entry.

    Args:
        repository (Repository): The repository (ORM-Object) to assign the issue tracking to.
        issue_tracking_type (str): The issue tracking system type. Use one of the TYPE_X constants from IssueTracking.
        url (str): The url for the issue tracking API.
        username (str): Optional. The username for authentication.
        password (str): Optional. The password for authentication.
        db_session (Session): Optional. The db session to use. If not provided, a new one will be created.
    """
    assert isinstance(repository, Repository)
    close_db_session = False
    if not db_session:
        db_session = DB.create_session()
        close_db_session = True

    if repository.issueTracking is not None:
        Log.info("Repository " + repository.name + " with id " + str(repository.id)
                 + " already has an issue tracker assigned")
        repository.issueTracking.type = issue_tracking_type
        repository.issueTracking.url = url
        repository.issueTracking.username = username
        repository.issueTracking.password = password
        db_session.commit()
    else:
        Log.info("Creating new " + issue_tracking_type + " IssueTracking for Repository "
                 + repository.name + " with id " + str(repository.id))
        issue_tracking = IssueTracking(
            repository=repository,
            type=issue_tracking_type,
            url=url,
            username=username,
            password=password
        )
        db_session.add(issue_tracking)
        repository.issueTracking = issue_tracking
        db_session.commit()

    if close_db_session:
        db_session.close()
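
# Hedged usage sketch: pointing a repository at a GitHub tracker. The URL is a
# placeholder and TYPE_GITHUB is the constant referenced elsewhere in this module:
#
#     session = DB.create_session()
#     assign_issue_tracking(repository, TYPE_GITHUB,
#                           "https://api.github.com/repos/<owner>/<name>",
#                           db_session=session)
#     session.close()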
def __process_commit(self, commit, previous_commit, project_size, project_file_count, db_session=None):
    """Process a single commit.

    Args:
        commit: The commit to process
        previous_commit: Previous commit, used for creating diffs
        project_size: Current size of the project
        project_file_count: Current file count of the project
        db_session: Optional. The db session to use; a new one is created if omitted.

    Returns:
        commit_orm object
    """
    db_session_local = None
    if not db_session:
        db_session_local = True
        db_session = DB.create_session()

    # placeholders for per-file-type worker threads; joined below if ever started
    added_files_thread = None
    changed_files_thread = None
    deleted_files_thread = None

    manipulated_files = self.__get_changed_files(commit, previous_commit)
    added_files = manipulated_files['added_files']
    added_files_count = len(added_files)
    deleted_files = manipulated_files['deleted_files']
    deleted_files_count = len(deleted_files)
    changed_files = manipulated_files['changed_files']
    changed_files_count = len(changed_files)
    renamed_files = manipulated_files['renamed_files']
    renamed_files_count = len(renamed_files)
    files_diff = manipulated_files['files_diff']

    new_project_file_count = project_file_count + added_files_count - deleted_files_count
    commit_time = datetime.datetime.utcfromtimestamp(commit.committed_date)
    commit_id = str(commit)
    commit_orm = self.__create_new_commit(db_session, commit_id, self.repository_id, commit.message,
                                          commit.author.email, commit_time, 0, 0, 0, 0,
                                          project_size, new_project_file_count)

    # no files were changed at all / very unlikely
    if (not added_files) and (not deleted_files) and (not changed_files) and (not renamed_files):
        return commit_orm

    if added_files:
        for file in added_files:
            programming_language = self.__get_programming_langunage(file.path)
            file_orm = self.__create_new_file(db_session, self.repository_id, programming_language)
            created_version = self.__create_new_version(db_session, file_orm.id, commit_id, 0, 0, 0, file.path)
            # skip this file because its language is not interesting for us
            if not programming_language:
                added_files_count -= 1
                continue
            self.__process_file_diff(db_session, commit_id, file, files_diff, created_version)

    if deleted_files:
        for file in deleted_files:
            programming_language = self.__get_programming_langunage(file.path)
            if not programming_language:
                deleted_files_count -= 1
            try:
                version_orm = self.__process_deleted_or_changed_file(db_session, commit_id, file,
                                                                     programming_language, files_diff)
                version_orm.deleted = True
                version_orm.file_size = 0
            except ValueError as e:
                Log.warning("Warning processing commit: " + str(commit_id) + ". File affected: "
                            + str(file.path) + " Reason: " + str(e))

    if changed_files:
        for file in changed_files:
            programming_language = self.__get_programming_langunage(file.path)
            if not programming_language:
                changed_files_count -= 1
            try:
                self.__process_deleted_or_changed_file(db_session, commit_id, file,
                                                       programming_language, files_diff)
            except ValueError as e:
                Log.warning("Warning processing commit: " + str(commit_id) + ". File affected: "
                            + str(file.path) + " Reason: " + str(e))

    # for renamed files just create a new version and link it to the old one
    if renamed_files:
        for file in renamed_files:
            old_file = file['old_file']
            new_file = file['new_file']
            old_version_orm = db_session.query(Commit, Version).filter(
                Commit.id == Version.commit_id,
                Version.path == str(old_file.path),
                Commit.repository_id == str(self.repository_id)).order_by(
                desc(Commit.timestamp)).first()
            programming_language = self.__get_programming_langunage(new_file.path)
            if not old_version_orm:
                Log.warning("Could not process rename in commit " + str(commit_id)
                            + " because the old file was not found. Old file: " + str(old_file.path)
                            + " new file: " + str(new_file.path))
                file_orm = self.__create_new_file(db_session, self.repository_id, programming_language)
                old_version_orm = self.__create_new_version(db_session, file_orm.id, commit_id, 0, 0, 0,
                                                            new_file.path)
                version_orm = old_version_orm
            else:
                old_version_orm = old_version_orm.Version
                version_orm = self.__create_new_version(db_session, old_version_orm.file_id, commit_id, 0, 0, 0,
                                                        new_file.path)
            # skip this file because its language is not interesting for us
            if not programming_language:
                renamed_files_count -= 1
                continue
            version_orm.file_size = old_version_orm.file_size
            self.__process_file_diff(db_session, commit_id, new_file, files_diff, version_orm)

    commit_orm.added_files_count = added_files_count
    commit_orm.deleted_files_count = deleted_files_count
    commit_orm.changed_files_count = changed_files_count
    commit_orm.renamed_files_count = renamed_files_count

    if added_files_thread:
        added_files_thread.join()
    if changed_files_thread:
        changed_files_thread.join()
    if deleted_files_thread:
        deleted_files_thread.join()

    if db_session_local:
        db_session.close()
    return commit_orm
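
# Small standard-library illustration (not project-specific): commit.committed_date
# is a Unix timestamp, which utcfromtimestamp converts to a naive UTC datetime.
#
#     >>> import datetime
#     >>> datetime.datetime.utcfromtimestamp(0)
#     datetime.datetime(1970, 1, 1, 0, 0)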
def get_dataset_from_db(repository, start, end, feature_list, target_id, ngram_sizes=None, ngram_levels=None,
                        label="", eager_load=False, sparse=False):
    """ Reads a dataset from a repository in a specific time range.

    Args:
        repository (Repository): The repository to query. Can also be its name as a string.
        start (datetime): The start of the range.
        end (datetime): The end of the range.
        feature_list (list[str]): A list of the feature-IDs to be read into the dataset.
        target_id (str): The ID of the target. Use a TARGET_X constant from UpcomingBugsForVersion.
        ngram_sizes (list[int]): Optional. The ngram-sizes to be loaded into the dataset
            (e.g. [1, 2] for 1-grams and 2-grams).
        ngram_levels (list[int]): Optional. The ngram-levels to be loaded into the dataset.
        label (str): The label to be assigned to the dataset.
        eager_load (bool): If true, all data will be loaded eagerly. This reduces database calls,
            but uses a lot of RAM.
        sparse (bool): Whether the data and target matrices should be sparse. Recommended in
            combination with ngrams.

    Returns:
        Dataset: The populated dataset.
    """
    if ngram_sizes and type(ngram_sizes) != list:
        ngram_sizes = [ngram_sizes]
    if ngram_levels and type(ngram_levels) != list:
        ngram_levels = [ngram_levels]
    use_ngrams = True if ngram_sizes and ngram_levels else False

    session = DB.create_session()
    if type(repository) is str:
        repository_name = repository
        repository = get_repository_by_name(session, repository_name)
        if repository is None:
            logging.error("Repository with name %s not found! Returning no Dataset" % repository_name)
            return None

    commits = get_commits_in_range(session, repository, start, end,
                                   eager_load_ngrams=use_ngrams and eager_load,
                                   eager_load_features=eager_load)
    if commits is None:
        logging.error("Could not retrieve commits! Returning no Dataset")
        return None
    logging.debug("Commits received.")
    if len(commits) == 0:
        logging.error("No Commits found!")
        return None

    versions = []
    for commit in commits:
        versions += commit.versions
    logging.debug("%i commits with %i versions found." % (len(commits), len(versions)))

    feature_count = len(feature_list)
    logging.debug("%i features found." % feature_count)

    ngram_count = 0
    if use_ngrams:
        ngrams = get_ngram_vector_list(versions[0], ngram_sizes, ngram_levels)
        ngram_count = sum([ngram.vector_size for ngram in ngrams])
        logging.debug("Ngram sizes %s and levels %s amount to %i total ngrams."
                      % (str(ngram_sizes), str(ngram_levels), ngram_count))

    dataset = Dataset(feature_count + ngram_count, len(versions), feature_list, target_id, start, end,
                      ngram_sizes, ngram_levels, label, sparse=sparse, dok=True)

    i = 0
    for version in versions:
        if len(version.upcoming_bugs) == 0:
            raise Exception("Version %s has no upcoming_bugs entry. Can't retrieve target!" % version.id)
        target = version.upcoming_bugs[0].get_target(target_id)
        if target is None:
            raise Exception("Upcoming_bugs entry of Version %s has no target %s!" % (version.id, target_id))
        dataset.target[i] = target
        j = 0
        for feature_value in version.feature_values:
            if feature_value.feature_id in feature_list:
                if not sparse or feature_value.value != 0:
                    dataset.data[i, j] = feature_value.value
                j += 1
        if use_ngrams:
            for ngram_vector in get_ngram_vector_list(version, ngram_sizes, ngram_levels):
                for ngram_value in ngram_vector.ngram_values.split(','):
                    ngram_value = int(ngram_value)
                    if not sparse or ngram_value != 0:
                        dataset.data[i, j] = ngram_value
                    j += 1
        if i % 100 == 0:
            logging.info("{0:.2f}% of versions processed.".format(i / len(versions) * 100))
        i += 1

    logging.info("All versions processed.")
    if sparse:
        dataset.to_csr()
    session.close()
    return dataset
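
# Hedged usage sketch for get_dataset_from_db. The feature IDs and the target
# constant are placeholders; consult the feature definitions and
# UpcomingBugsForVersion for real values:
#
#     import datetime
#     ds = get_dataset_from_db("my-repo",
#                              datetime.datetime(2019, 1, 1),
#                              datetime.datetime(2020, 1, 1),
#                              ["lines_added", "lines_deleted"],  # hypothetical feature IDs
#                              target_id=TARGET_X,                # placeholder constant
#                              label="train", sparse=True)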