def setUp(self):
    """Load Bugzilla API fixtures and reset the test database.

    Side effects: drops the Project/IssueSystem/Issue/IssueComment/Event
    collections and re-creates a minimal project + issue system that the
    bugzilla backend tests run against.
    """
    # All fixtures live next to this test file; compute the base path once
    # instead of repeating the dirname/realpath expression for every file.
    base_dir = os.path.dirname(os.path.realpath(__file__))

    def _load_json(relative_path):
        # Read one JSON fixture relative to the test data directory.
        with open(base_dir + relative_path, 'r', encoding='utf-8') as json_file:
            return json.load(json_file)

    self.issue_1 = _load_json("/data/bugzilla/issue1.json")
    self.issue_95 = _load_json("/data/bugzilla/issue95.json")
    self.issue_95_comments = _load_json("/data/bugzilla/issue95_comments.json")
    self.issue_95_history = _load_json("/data/bugzilla/issue95_history.json")
    self.conor_user = _load_json("/data/bugzilla/conor_apache_org_user.json")
    # NOTE(review): attribute name kept as-is for backward compatibility even
    # though "dev_tomcat_file" holds parsed JSON, not a file object.
    self.dev_tomcat_file = _load_json("/data/bugzilla/dev_tomcat_apache_org_user.json")
    self.craig_user = _load_json("/data/bugzilla/craig_mcclanahan_user.json")

    # Create testconfig
    config = configparser.ConfigParser()
    config.read(base_dir + "/data/used_test_config.cfg")

    # Setting up database with data that is normally put into it via vcs program
    connect(config['Database']['db_database'], username=config['Database']['db_user'],
            password=config['Database']['db_password'], host=config['Database']['db_hostname'],
            port=int(config['Database']['db_port']),
            authentication_source=config['Database']['db_authentication'],
            connect=False)

    # Start from a clean slate for every test.
    Project.drop_collection()
    IssueSystem.drop_collection()
    Issue.drop_collection()
    IssueComment.drop_collection()
    Event.drop_collection()

    self.project_id = Project(name='Bla').save().id
    self.issues_system_id = IssueSystem(
        project_id=self.project_id,
        url="https://issues.apache.org/search?jql=project=BLA",
        last_updated=datetime.datetime.now()).save().id
    self.conf = ConfigMock(None, None, None, None, None, None, 'Bla',
                           'Nonsense?product=Blub', 'bugzilla', None, None,
                           None, None, None, None, 'DEBUG', '123')
def setUp(self):
    """Prepare database fixtures and an on-disk fake Java project for the tests."""
    here = os.path.dirname(os.path.realpath(__file__))
    self.input_path_python = here + '/data/python_project'
    self.input_path_java = here + '/data/java_project2'
    self.out_java = here + '/data/out_java'

    # Clear database first (we need a small hack here, as mongomocks drop_database does not work)
    for collection in (Project, VCSSystem, Commit, File):
        collection.drop_collection()

    self.project_id = Project(name="zookeeper").save().id
    self.vcs_id = VCSSystem(url="http://test.de", project_id=self.project_id,
                            repository_type="test").save().id
    self.commit_id = Commit(revision_hash="2342", vcs_system_id=self.vcs_id).save()
    self.file1 = File(path="contribs/CekiGulcu/AppenderTable.java",
                      vcs_system_id=self.vcs_id).save()
    self.file2 = File(path="contribs/LeosLiterak/TempFileAppender.java",
                      vcs_system_id=self.vcs_id).save()
    self.file3 = File(path="src/main/java/org/apache/log4j/AsyncAppender.java",
                      vcs_system_id=self.vcs_id).save()

    # Recreate the working directories from scratch.
    shutil.rmtree(self.out_java, ignore_errors=True)
    shutil.rmtree(self.input_path_java, ignore_errors=True)
    os.makedirs(self.out_java)

    # Copying up fake files that are generated by SourceMeter
    self.class_csv = here + '/data/csv_data/zookeeper-Class.csv'
    self.package_csv = here + '/data/csv_data/zookeeper-Package.csv'
    self.component_csv = here + '/data/csv_data/zookeeper-Component.csv'
    for csv_path in (self.class_csv, self.package_csv, self.component_csv):
        shutil.copy(csv_path, self.out_java)

    # Create Files and directories
    for rel_dir in ('/contribs/CekiGulcu',
                    '/contribs/LeosLiterak',
                    '/src/main/java/org/apache/log4j'):
        os.makedirs(self.input_path_java + rel_dir)
    for rel_file in ('/contribs/CekiGulcu/AppenderTable.java',
                     '/contribs/LeosLiterak/TempFileAppender.java',
                     '/src/main/java/org/apache/log4j/AsyncAppender.java'):
        Path(self.input_path_java + rel_file).touch()
def setUp(self):
    """Load GitHub API fixtures and reset the test database."""
    data_dir = os.path.dirname(os.path.realpath(__file__))

    # attribute name -> fixture file, loaded in one pass (insertion order kept).
    fixtures = {
        'person': '/data/github/people.json',
        'issue_6131': '/data/github/issue_6131.json',
        'events_issue_6131': '/data/github/issue_6131_events.json',
        'comments_issue_6131': '/data/github/issue_6131_comments.json',
        'issue_6050': '/data/github/issue_6050.json',
        'events_issue_6050': '/data/github/issue_6050_events.json',
        'comments_issue_6050': '/data/github/issue_6050_comments.json',
    }
    for attribute, relative_path in fixtures.items():
        with open(data_dir + relative_path, 'r', encoding='utf-8') as fixture_file:
            setattr(self, attribute, json.load(fixture_file))

    # Create testconfig
    config = configparser.ConfigParser()
    config.read(data_dir + "/data/used_test_config.cfg")

    # Setting up database with data that is normally put into it via vcs program
    connect(config['Database']['db_database'], username=config['Database']['db_user'],
            password=config['Database']['db_password'], host=config['Database']['db_hostname'],
            port=int(config['Database']['db_port']),
            authentication_source=config['Database']['db_authentication'],
            connect=False)

    # Start every test from empty collections.
    for collection in (Project, IssueSystem, Issue, IssueComment, Event):
        collection.drop_collection()

    self.project_id = Project(name='Composer').save().id
    self.issues_system_id = IssueSystem(project_id=self.project_id,
                                        url="http://blub.de",
                                        last_updated=datetime.datetime.now()).save().id
    self.conf = ConfigMock(None, None, None, None, None, None, 'Ant',
                           'http://blub.de', 'github', None, None, None, None,
                           None, None, 'DEBUG', '123')
def initialize(self, config, repository_url, repository_type):
    """Initialize the mongostore.

    Connects to MongoDB, resolves the project, upserts the VCS system and
    spawns :class:`pyvcsshark.datastores.mongostore.CommitStorageProcess`
    workers which read commits out of the commit queue, process them and
    store them into the mongodb.

    :param config: all configuration
    :param repository_url: url of the repository, which is to be analyzed
    :param repository_type: type of the repository, which is to be analyzed (e.g. "git")
    """
    logger.setLevel(config.debug_level)
    logger.info("Initializing MongoStore...")

    # Create queue for multiprocessing
    self.commit_queue = multiprocessing.JoinableQueue()

    # We define, that the user we authenticate with is in the admin database
    logger.info("Connecting to MongoDB...")
    uri = create_mongodb_uri_string(config.db_user, config.db_password,
                                    config.db_hostname, config.db_port,
                                    config.db_authentication, config.ssl_enabled)
    # connect=False defers the actual connection until first use.
    connect(config.db_database, host=uri, connect=False)

    # Get project_id — the project must already exist; exit hard otherwise.
    try:
        project_id = Project.objects(name=config.project_name).get().id
    except DoesNotExist:
        logger.error('Project with name "%s" does not exist in database!'
                     % config.project_name)
        sys.exit(1)

    # Check if vcssystem already exist, and use upsert
    vcs_system_id = VCSSystem.objects(url=repository_url).upsert_one(
        url=repository_url,
        repository_type=repository_type,
        last_updated=datetime.datetime.today(),
        project_id=project_id).id

    # Get the last commit by date of the project (if there is any) — passed to
    # the workers so already-stored commits can be recognized.
    last_commit = Commit.objects(vcs_system_id=vcs_system_id)\
        .only('committer_date').order_by('-committer_date').first()

    if last_commit is not None:
        last_commit_date = last_commit.committer_date
    else:
        last_commit_date = None

    # Start worker, they will wait till something comes into the queue and then process it
    for i in range(self.NUMBER_OF_PROCESSES):
        name = "StorageProcess-%d" % i
        process = CommitStorageProcess(self.commit_queue, vcs_system_id,
                                       last_commit_date, config, name)
        # Daemonized: workers die with the parent instead of blocking shutdown.
        process.daemon = True
        process.start()

    logger.info("Starting storage Process...")
def main(args):
    """Label every commit of the configured project.

    Connects to MongoDB, imports the requested labeling approach modules and
    runs them over all commits of the project's VCS system, storing the
    produced labels on each commit document.
    """
    # Time the whole run.
    started_at = timeit.default_timer()

    if args.log_level and hasattr(logging, args.log_level):
        log.setLevel(getattr(logging, args.log_level))

    mongo_uri = create_mongodb_uri_string(args.db_user, args.db_password,
                                          args.db_hostname, args.db_port,
                                          args.db_authentication, args.ssl)
    connect(args.db_database, host=mongo_uri)

    # Get the id of the project for which the code entities shall be merged
    try:
        project_id = Project.objects(name=args.project_name).get().id
    except DoesNotExist:
        log.error('Project %s not found!' % args.project_name)
        sys.exit(1)

    vcs = VCSSystem.objects(project_id=project_id).get()

    log.info("Starting commit labeling")

    # Determine which approach modules to load, then import them all.
    if args.approaches == 'all':
        # 'all': every module in the approaches package except __init__.
        basepath = os.path.dirname(os.path.abspath(__file__))
        module_names = [entry[:-3]
                        for entry in os.listdir(os.path.join(basepath, 'approaches/'))
                        if entry.endswith('.py') and entry != '__init__.py']
    else:
        # Otherwise: only the explicitly listed approaches.
        module_names = args.approaches.split(',')
    for module_name in module_names:
        __import__('approaches.{}'.format(module_name))

    # add specific configs
    labelshark = LabelSHARK()
    commit_count = Commit.objects(vcs_system_id=vcs.id).count()
    commits = Commit.objects(vcs_system_id=vcs.id).only(
        'id', 'revision_hash', 'vcs_system_id', 'message', 'linked_issue_ids',
        'parents', 'fixed_issue_ids', 'szz_issue_ids').timeout(False)

    for idx, commit in enumerate(commits):
        if idx % 100 == 0:
            log.info("%i/%i commits finished", idx, commit_count)
        labelshark.set_commit(commit)
        labels = labelshark.get_labels()

        # Persist any labels produced for this commit.
        if labels:
            updates = {'set__labels__{}'.format(key): value for key, value in labels}
            Commit.objects(id=commit.id).upsert_one(**updates)

    end = timeit.default_timer() - started_at
    log.info("Finished commit labeling in {:.5f}s".format(end))
def start(self, cfg):
    """
    Starts the collection process

    :param cfg: holds all configuration parameters. Object of class :class:`~issueshark.config.Config`
    """
    logger.setLevel(cfg.get_debug_level())
    started_at = timeit.default_timer()

    # Establish the MongoDB connection first.
    mongo_uri = create_mongodb_uri_string(cfg.user, cfg.password, cfg.host,
                                          cfg.port, cfg.authentication_db,
                                          cfg.ssl_enabled)
    connect(cfg.database, host=mongo_uri)

    # The project must already exist in the database; exit hard otherwise.
    try:
        project_id = Project.objects(name=cfg.project_name).get().id
    except DoesNotExist:
        logger.error('Project %s not found!' % cfg.project_name)
        sys.exit(1)

    # Fetch the issue system for the tracking URL, creating it on first run,
    # and bump its last_updated timestamp either way.
    try:
        issue_system = IssueSystem.objects(url=cfg.tracking_url).get()
    except DoesNotExist:
        issue_system = IssueSystem(project_id=project_id, url=cfg.tracking_url).save()
    issue_system.last_updated = datetime.datetime.now()
    issue_system.save()

    # Delegate the actual collection to the backend matching the tracker type.
    backend = BaseBackend.find_fitting_backend(cfg, issue_system.id, project_id)
    logger.debug("Using backend: %s" % backend.identifier)
    backend.process()

    elapsed = timeit.default_timer() - started_at
    logger.info("Execution time: %0.5f s" % elapsed)
def setUp(self):
    """Load JIRA API fixtures and reset the test database."""
    data_dir = os.path.dirname(os.path.realpath(__file__))

    # (attribute name, fixture file) pairs, loaded in order.
    fixtures = (
        ('issue_drill_1', '/data/jira/drill_1_issue.json'),
        ('issue_drill_138', '/data/jira/drill_138_issue.json'),
        ('issue_drill_38', '/data/jira/drill_38_issue.json'),
        ('user1', '/data/jira/get_user1.json'),
        ('user2', '/data/jira/get_user2.json'),
    )
    for attribute, relative_path in fixtures:
        with open(data_dir + relative_path, 'r', encoding='utf-8') as fixture_file:
            setattr(self, attribute, json.load(fixture_file))

    # Create testconfig
    config = configparser.ConfigParser()
    config.read(data_dir + "/data/used_test_config.cfg")

    # Setting up database with data that is normally put into it via vcs program
    connect(config['Database']['db_database'], username=config['Database']['db_user'],
            password=config['Database']['db_password'], host=config['Database']['db_hostname'],
            port=int(config['Database']['db_port']),
            authentication_source=config['Database']['db_authentication'],
            connect=False)

    # Start every test from empty collections.
    for collection in (Project, IssueSystem, Issue, IssueComment, Event):
        collection.drop_collection()

    self.project_id = Project(name='Bla').save().id
    self.issues_system_id = IssueSystem(
        project_id=self.project_id,
        url="https://issues.apache.org/search?jql=project=BLA",
        last_updated=datetime.datetime.now()).save().id
    self.conf = ConfigMock(
        None, None, None, None, None, None, 'Bla',
        'https://issues.apache.org/search?jql=project=BLA', 'jira',
        None, None, None, None, None, None, 'DEBUG', '123')
def initialize(self, config, repository_url, repository_type):
    """Initialize the mongostore.

    Connects to MongoDB, resolves the project, upserts the VCS system,
    archives the repository into GridFS and spawns
    :class:`pyvcsshark.datastores.mongostore.CommitStorageProcess` workers
    which read commits out of the commit queue, process them and store them
    into the mongodb.

    :param config: all configuration
    :param repository_url: url of the repository, which is to be analyzed
    :param repository_type: type of the repository, which is to be analyzed (e.g. "git")
    """
    logger.setLevel(config.debug_level)
    logger.info("Initializing MongoStore...")

    # Create queue for multiprocessing
    self.commit_queue = multiprocessing.JoinableQueue()

    # we need an extra queue for branches because all commits need to be finished before we can process branches
    self.branch_queue = multiprocessing.JoinableQueue()

    self.config = config
    self.cores_per_job = config.cores_per_job

    # We define, that the user we authenticate with is in the admin database
    logger.info("Connecting to MongoDB...")
    uri = create_mongodb_uri_string(config.db_user, config.db_password,
                                    config.db_hostname, config.db_port,
                                    config.db_authentication, config.ssl_enabled)
    # connect=False defers the actual connection until first use.
    connect(config.db_database, host=uri, connect=False)

    # Get project_id — the project must already exist; exit hard otherwise.
    try:
        project_id = Project.objects(name=config.project_name).get().id
    except DoesNotExist:
        logger.error('Project with name "%s" does not exist in database!'
                     % config.project_name)
        sys.exit(1)

    # Check if vcssystem already exist, and use upsert
    vcs_system = VCSSystem.objects(url=repository_url).upsert_one(
        url=repository_url,
        repository_type=repository_type,
        last_updated=datetime.datetime.today(),
        project_id=project_id)
    self.vcs_system_id = vcs_system.id

    # Tar.gz name based on project name
    tar_gz_name = '{}.tar.gz'.format(config.project_name)

    # Tar.gz of repository folder
    with tarfile.open(tar_gz_name, "w:gz") as tar:
        tar.add(config.path, arcname=config.project_name)

    # Store the archive in GridFS: first run uploads it, later runs replace it.
    if vcs_system.repository_file.grid_id is None:
        logger.info('Copying project to gridfs...')

        # Store in gridfs
        with open(tar_gz_name, 'rb') as tar_file:
            vcs_system.repository_file.put(tar_file,
                                           content_type='application/gzip',
                                           filename=tar_gz_name)
        vcs_system.save()
    else:
        # Replace the previously stored repository archive.
        logger.info('Replacing project file in gridfs...')
        with open(tar_gz_name, 'rb') as tar_file:
            vcs_system.repository_file.replace(
                tar_file, content_type='application/gzip',
                filename=tar_gz_name)
        vcs_system.save()

    # Delete tar.gz file — the local archive is only a staging artifact.
    os.remove(tar_gz_name)

    # Get the last commit by date of the project (if there is any) — passed to
    # the workers so already-stored commits can be recognized.
    last_commit = Commit.objects(vcs_system_id=self.vcs_system_id)\
        .only('committer_date').order_by('-committer_date').first()

    if last_commit is not None:
        last_commit_date = last_commit.committer_date
    else:
        last_commit_date = None

    # Start worker, they will wait till something comes into the queue and then process it
    for i in range(self.cores_per_job):
        name = "StorageProcess-%d" % i
        process = CommitStorageProcess(self.commit_queue, self.vcs_system_id,
                                       last_commit_date, self.config, name)
        # Daemonized: workers die with the parent instead of blocking shutdown.
        process.daemon = True
        process.start()

    logger.info("Starting storage Process...")
if line.startswith('-'): removed += line[1:].strip() elif line.startswith('+'): added += line[1:].strip() return removed != added count_file_actions = 0 for name, master_branch in vcs_systems: print('analyzing', name) commits_per_issue = {} issues_per_commit = {} try: project_id = Project.objects(name=name).get().id except DoesNotExist: print('unknown project:', name) sys.exit(1) cur_vcs_system = VCSSystem.objects(project_id=project_id).get().id cur_issue_system = IssueSystem.objects(project_id=project_id).get().id # 1) fetch commits print('fetching commit ids') issue_ids = ['LLOC'] last_commit = None commit_bug_map = {} for commit in Commit.objects(vcs_system_id=cur_vcs_system, committer_date__gte=date_start, committer_date__lt=date_end,
def start(self, cfg):
    """
    Executes the linkSHARK.

    Connects to MongoDB, collects the issue tracking systems of the project,
    applies the configured key corrections, then links commits to issues
    (regular links and SZZ links) and saves the links on each commit.

    :param cfg: configuration object that is used
    """
    self._log.setLevel(cfg.get_debug_level())
    start_time = timeit.default_timer()

    uri = create_mongodb_uri_string(cfg.user, cfg.password, cfg.host, cfg.port,
                                    cfg.authentication_db, cfg.ssl_enabled)
    connect(cfg.database, host=uri)

    # Get the id of the project for which the code entities shall be merged
    try:
        project_id = Project.objects(name=cfg.project_name).get().id
    except DoesNotExist:
        self._log.error('Project %s not found!' % cfg.project_name)
        sys.exit(1)

    vcs_system = VCSSystem.objects(project_id=project_id).get()
    self._itss = []
    self._log.info('found the following issue tracking systems:')
    for its in IssueSystem.objects(project_id=project_id).order_by('url'):
        self._log.info(its.url)
        self._itss.append(its)

    if len(cfg.correct_key) > 0:
        correct_keys_per_its = cfg.correct_key.split(';')
        if len(correct_keys_per_its) != len(self._itss):
            # BUG FIX: was `self._log_critical(...)` — no such attribute; the
            # call would raise AttributeError instead of logging and exiting.
            self._log.critical(
                '--correct-key must correct keys for all issue tracking systems if specified'
            )
            sys.exit(1)
        for i, correct_key in enumerate(correct_keys_per_its):
            self._correct_key[self._itss[i].url] = correct_key
    if len(cfg.broken_keys) > 0:
        broken_keys_per_its = cfg.broken_keys.split(';')
        if len(broken_keys_per_its) != len(self._itss):
            # BUG FIX: same `self._log_critical` typo as above.
            self._log.critical(
                '--broken-keys must correct keys for all issue tracking systems if specified. '
                'If there are no keys to correct for one of the ITS just use the name of the correct key twice itself'
            )
            sys.exit(1)
        for i, broken_keys in enumerate(broken_keys_per_its):
            self._broken_keys[self._itss[i].url] = broken_keys.split(',')

    self._log.info("Starting issue linking")
    commit_count = Commit.objects(vcs_system_id=vcs_system.id).count()

    # Build a map issue_number -> [issues] used by the SZZ linking below.
    issue_map = {}
    for i, issue_system in enumerate(self._itss):
        # NOTE(review): correct_keys_per_its is only bound when cfg.correct_key
        # is non-empty — with an empty --correct-key this raises NameError.
        # Kept as-is since the intended fallback key is unclear; TODO confirm.
        project_id_string = correct_keys_per_its[i]
        for issue in Issue.objects(issue_system_id=issue_system.id):
            if issue.external_id.startswith(project_id_string):
                try:
                    issue_number = [
                        int(s) for s in issue.external_id.split('-')
                        if s.isdigit()
                    ][0]
                except IndexError:
                    # BUG FIX: added the %s placeholder — the original message
                    # had none, so external_id was never interpolated and the
                    # logging call itself produced a formatting error.
                    self._log.error(
                        "index error because SZZ currently only support JIRA, may not link all issues correctly: %s",
                        issue.external_id)
                    continue
                if issue_number not in issue_map:
                    issue_map[issue_number] = [issue]
                else:
                    issue_map[issue_number].append(issue)

    for i, commit in enumerate(
            Commit.objects(vcs_system_id=vcs_system.id).only(
                'id', 'revision_hash', 'vcs_system_id', 'message',
                'author_id', 'committer_id')):
        if i % 100 == 0:
            self._log.info("%i/%i commits finished", i, commit_count)
        issue_links = self._get_issue_links(commit)
        if len(issue_links) > 0:
            commit.linked_issue_ids = issue_links
            commit.save()
        szz_links = self._get_szz_issue_links(commit, issue_map)
        if len(szz_links) > 0:
            commit.szz_issue_ids = szz_links
            commit.save()

    elapsed = timeit.default_timer() - start_time
    self._log.info("Execution time: %0.5f s" % elapsed)
def start(self, cfg):
    """
    Starts the program

    Downloads the mailing-list archives, unpacks them and stores every
    successfully parsed message in the database.

    :param cfg: configuration of class :class:`mailingshark.config.Config`
    """
    logger.setLevel(cfg.get_debug_level())
    start_time = timeit.default_timer()

    # Connect to mongodb
    uri = create_mongodb_uri_string(cfg.user, cfg.password, cfg.host, cfg.port,
                                    cfg.authentication_db, cfg.ssl_enabled)
    connect(cfg.database, host=uri)

    # Get the project for which issue data is collected
    try:
        project_id = Project.objects(name=cfg.project_name).get().id
    except DoesNotExist:
        logger.error('Project not found. Use vcsSHARK beforehand!')
        sys.exit(1)

    # Try to create the mailing_list in database
    try:
        mailing_list = MailingList.objects(project_id=project_id,
                                           name=cfg.mailing_url).get()
    except DoesNotExist:
        mailing_list = MailingList(project_id=project_id,
                                   name=cfg.mailing_url).save()
    # BUG FIX: mailing_list_id was previously assigned only in the
    # DoesNotExist branch; on every later run (list already in the DB) it was
    # undefined and _store_message raised a NameError. Resolve it either way.
    mailing_list_id = mailing_list.id

    # Find correct backend
    backend = BaseDataCollector.find_fitting_backend(cfg, project_id)
    logger.debug("Using backend: %s" % backend.identifier)

    # Get a list of all file paths to boxes
    found_files = backend.download_mail_boxes(mailing_list)
    logger.debug("Got the following files: %s" % found_files)

    # Unpack boxes (if necessary)
    boxes_to_analyze = self._unpack_files(found_files, cfg.temporary_dir)
    logger.info("Analyzing the following files: %s" % boxes_to_analyze)

    stored_messages, non_stored = (0, 0)
    for path_to_box in boxes_to_analyze:
        box = mailbox.mbox(path_to_box, create=False)
        logger.info("Analyzing: %s" % path_to_box)
        for i in range(0, len(box)):
            try:
                parsed_message = ParsedMessage(cfg, box.get(i))
                logger.debug('Got the following message: %s' % parsed_message)
                self._store_message(parsed_message, mailing_list_id)
                stored_messages += 1
            except Exception as e:
                # Best-effort: a single unparsable message must not abort the run.
                logger.error("Could not parse message. Error: %s" % e)
                non_stored += 1

    # Update mailing list
    mailing_list.last_updated = datetime.datetime.now()
    mailing_list.save()

    logger.info("%d messages stored in database %s" % (stored_messages, cfg.database))
    logger.info("%d messages ignored by the parser" % non_stored)

    elapsed = timeit.default_timer() - start_time
    logger.info("Execution time: %0.5f s" % elapsed)
# Establish connection uri = create_mongodb_uri_string(user, password, host, port, authentication_db, ssl_enabled) connect(database, host=uri) # Fetch project id and version control system id for the 'kafka' project # The only() decides the data that is actually retrieved from the MongoDB. Always restrict this to the field that you require! projects = ['ant-ivy', 'archiva', 'calcite', 'cayenne', 'commons-bcel', 'commons-beanutils', 'commons-codec', 'commons-collections', 'commons-compress', 'commons-configuration', 'commons-dbcp', 'commons-digester', 'commons-io', 'commons-jcs', 'commons-jexl', 'commons-lang', 'commons-math', 'commons-net', 'commons-rdf', 'commons-scxml'] rows_list = [] for projectName in projects: project = Project.objects(name=projectName).only('id').get() #vcs_system = VCSSystem.objects(project_id=project.id).only('id','url').get() #getting issue id from the project issue_id = IssueSystem.objects(project_id=project.id).only('id','url').get() ###########Getting data ready############ for issue in Issue.objects(issue_system_id=issue_id.id).only('issue_type','desc','title','priority', 'status').timeout(False): for row in issue: dict1 = {} dict1.update({'Id':issue_id.id}) dict1.update({'Description':issue.desc}) dict1.update({'Title':issue.title})
def start():
    """
    Compares the commits and code_entity_states of two MongoDBs, whereas the
    first MongoDB is condensed with the memeSHARK and the second MongoDB is
    verbose.

    DB 1 (alias 'default') is the condensed database, DB 2 (alias
    'db-verbose') is the verbose one. For every verbose commit the matching
    condensed commit is located by revision hash and their code entity
    states (CES) are compared field by field.
    """
    setup_logging()
    logger = logging.getLogger("main")
    logger.info("Starting consistency checker...")

    # Two full sets of connection options, one per database.
    parser = argparse.ArgumentParser(description='DB consistency checker.')
    parser.add_argument('-v', '--version', help='Shows the version',
                        action='version', version='0.1.0')
    parser.add_argument('-U1', '--db-user1', help='Database user name',
                        default=None)
    parser.add_argument('-P1', '--db-password1',
                        help='Database user password', default=None)
    parser.add_argument('-DB1', '--db-database1', help='Database name',
                        default='smartshark')
    parser.add_argument(
        '-H1', '--db-hostname1',
        help='Name of the host, where the database server is running',
        default='localhost')
    parser.add_argument('-p1', '--db-port1',
                        help='Port, where the database server is listening',
                        default=27017, type=int)
    parser.add_argument('-a1', '--db-authentication1',
                        help='Name of the authentication database',
                        default=None)
    parser.add_argument('--ssl1', help='Enables SSL', default=False,
                        action='store_true')
    parser.add_argument('-U2', '--db-user2', help='Database user name',
                        default=None)
    parser.add_argument('-P2', '--db-password2',
                        help='Database user password', default=None)
    parser.add_argument('-DB2', '--db-database2', help='Database name',
                        default='smartshark_backup')
    parser.add_argument(
        '-H2', '--db-hostname2',
        help='Name of the host, where the database server is running',
        default='localhost')
    parser.add_argument('-p2', '--db-port2',
                        help='Port, where the database server is listening',
                        default=27017, type=int)
    parser.add_argument('-a2', '--db-authentication2',
                        help='Name of the authentication database',
                        default=None)
    parser.add_argument('--ssl2', help='Enables SSL', default=False,
                        action='store_true')
    parser.add_argument(
        '--debug', help='Sets the debug level.', default='DEBUG',
        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'])
    parser.add_argument('--project-name1', help='Name of the project.',
                        default=None)
    parser.add_argument('--project-name2', help='Name of the project.',
                        default=None)
    args = parser.parse_args()
    logger.info(args)

    logger.info("connecting to database 1 (condensed)...")
    uri1 = create_mongodb_uri_string(args.db_user1, args.db_password1,
                                     args.db_hostname1, args.db_port1,
                                     args.db_authentication1, args.ssl1)
    logger.info(uri1)
    connect(args.db_database1, host=uri1, alias='default')

    logger.info("connecting to database 2 (verbose)...")
    uri2 = create_mongodb_uri_string(args.db_user2, args.db_password2,
                                     args.db_hostname2, args.db_port2,
                                     args.db_authentication2, args.ssl2)
    logger.info(uri2)
    connect(args.db_database2, host=uri2, alias='db-verbose')

    # fetch all verbose commmits
    commits_verbose = []
    with switch_db(Commit, 'db-verbose') as CommitVerbose:
        # fetch only commits for selected project
        try:
            project_id = Project.objects(name=args.project_name2).get().id
        except DoesNotExist:
            logger.error('Project %s not found!' % args.project_name2)
            sys.exit(1)
        # id of the verbose VCS system; reused below for the file lookups.
        vcs_systems = VCSSystem.objects(project_id=project_id).get().id
        logger.info("vcs_system_id: %s", vcs_systems)
        commit_objects = Commit.objects(vcs_system_id=vcs_systems)
        for cur_commit_verbose in commit_objects:
            commits_verbose.append(cur_commit_verbose)

    with switch_db(VCSSystem, 'default') as VCSSystemCondensed:
        # fetch only commits for selected project
        try:
            project_id = Project.objects(name=args.project_name1).get().id
        except DoesNotExist:
            logger.error('Project %s not found!'
                         % args.project_name1)
            sys.exit(1)
        vcs_systems_condensed = VCSSystemCondensed.objects(
            project_id=project_id).get().id

    # fetch files verbose — id -> path maps are used to disambiguate CES
    # with identical long_names in different files.
    with switch_db(File, 'db-verbose') as FilesVerbose:
        files_verbose = {}
        for cur_file_verbose in FilesVerbose.objects(
                vcs_system_id=vcs_systems):
            files_verbose[cur_file_verbose.id] = cur_file_verbose.path
    with switch_db(File, 'default') as FilesCondensed:
        files_condensed = {}
        for cur_file_condensed in FilesCondensed.objects(
                vcs_system_id=vcs_systems_condensed):
            files_condensed[cur_file_condensed.id] = cur_file_condensed.path

    num_commits_verbose = len(commits_verbose)
    logger.info("num commits verbose: %i", num_commits_verbose)

    for commit_nr, commit_verbose in enumerate(commits_verbose):
        logger.info("processing commit %s (%i / %i)", commit_verbose.id,
                    commit_nr + 1, num_commits_verbose)

        # fetch verbose CES, keyed by long_name + file path and by id.
        ces_verbose = {}
        ces_verbose_by_id = {}
        with switch_db(CodeEntityState, 'db-verbose') as CodeEntityStateVerbose:
            for cur_ces_verbose in CodeEntityStateVerbose.objects(
                    commit_id=commit_verbose.id):
                ces_verbose[
                    cur_ces_verbose.long_name +
                    files_verbose[cur_ces_verbose.file_id]] = cur_ces_verbose
                ces_verbose_by_id[cur_ces_verbose.id] = cur_ces_verbose

        # fetch same commit in condensed DB
        with switch_db(Commit, 'default') as CommitCondensed:
            try:
                commit_condensed = CommitCondensed.objects(
                    revision_hash=commit_verbose.revision_hash,
                    vcs_system_id=vcs_systems_condensed).get()
            # NOTE(review): bare except — swallows every error here, not only
            # DoesNotExist; consider narrowing. Kept unchanged.
            except:
                logger.info("commit %s not found in condensed db",
                            commit_verbose.revision_hash)
                continue

        # fetch CES from condensed DB — the condensed commit carries explicit
        # CES id references instead of a commit_id back-reference.
        ces_condensed = {}
        ces_condensed_by_id = {}
        with switch_db(CodeEntityState, 'default') as CodeEntityStateCondensed:
            for ces_id in commit_condensed.code_entity_states:
                cur_ces_condensed = CodeEntityStateCondensed.objects(
                    id=ces_id).get()
                ces_condensed[cur_ces_condensed.long_name + files_condensed[
                    cur_ces_condensed.file_id]] = cur_ces_condensed
                ces_condensed_by_id[cur_ces_condensed.id] = cur_ces_condensed

        logger.info("num CES verbose : %i", len(ces_verbose.keys()))
        logger.info("num CES condensed: %i", len(ces_condensed.keys()))

        ces_unequal = 0
        # compare CES
        for long_name_verbose, cur_ces_verbose in ces_verbose.items():
            if long_name_verbose not in ces_condensed:
                logger.error(
                    "CES with long_name %s not found in condensed DB!",
                    long_name_verbose)
                ces_unequal += 1
                continue
            cur_ces_condensed = ces_condensed[long_name_verbose]
            old, new = compare_dicts(cur_ces_verbose, cur_ces_condensed, {
                'id', 's_key', 'commit_id', 'ce_parent_id', 'cg_ids', 'file_id'
            })
            if len(new.keys()) > 0 or len(old.keys()) > 0:
                logger.error(
                    "CES with long_name %s (id verbose: %s /id condensed %s) not equal!",
                    long_name_verbose, cur_ces_verbose.id,
                    cur_ces_condensed.id)
                logger.error("verbose : %s", old)
                logger.error("condensed: %s", new)
                ces_unequal += 1
                continue

            # check if CES parent is equal
            # NOTE(review): both lookups index by the CES's *own* id, not by
            # ce_parent_id, so this appears to re-compare the same pair rather
            # than the parents — TODO confirm intended behavior.
            ces_parent_verbose = ces_verbose_by_id[cur_ces_verbose.id]
            ces_parent_condensed = ces_condensed_by_id[cur_ces_condensed.id]
            old, new = compare_dicts\
                (ces_parent_verbose, ces_parent_condensed,
                 {'id', 's_key', 'commit_id', 'ce_parent_id', 'cg_ids',
                  'file_id'})
            if len(new.keys()) > 0 or len(old.keys()) > 0:
                logger.error("ce_parent of CES with long_name %s not equal!",
                             long_name_verbose)
                logger.error("verbose : %s", old)
                logger.error("condensed: %s", new)
                ces_unequal += 1
                continue

        logger.info("num CES from verbose not matched: %i", ces_unequal)