def _update_existing_references(self, repo_id, import_type):
    # processes the new (commit, file) pairs of a repository
    pairs = self._get_new_commit_file_pairs(repo_id)
    intervals = [i for i in multiprocessing_util.get_tasks_intervals(pairs, self._num_processes) if len(i) > 0]

    queue_intervals = multiprocessing.JoinableQueue()
    results = multiprocessing.Queue()

    # Start consumers (the task queue, not the interval list, is handed to the workers)
    multiprocessing_util.start_consumers(self._num_processes, queue_intervals, results)

    for interval in intervals:
        commit_file_extractor = Code2DbCommitFile(self._db_name, self._git_repo_path, interval,
                                                  import_type, self._config, self._log_path)
        queue_intervals.put(commit_file_extractor)

    # Add end-of-queue markers
    multiprocessing_util.add_poison_pills(self._num_processes, queue_intervals)

    # Wait for all of the tasks to finish
    queue_intervals.join()
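# NOTE: the helper below is NOT part of the original source. It is a minimal,
# hypothetical sketch of the interval-splitting function from
# multiprocessing_util that every method in this section relies on, shown only
# to make that step concrete; the real implementation may chunk differently.
def get_tasks_intervals(tasks, num_intervals):
    # Distribute tasks over at most num_intervals chunks in round-robin order;
    # callers drop empty chunks with "if len(i) > 0".
    chunks = [[] for _ in range(num_intervals)]
    for pos, task in enumerate(tasks):
        chunks[pos % num_intervals].append(task)
    return chunks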
def _update_channels(self, instant_messaging_id):
    # updates the channels of an instant messaging service
    channel_ids = self._dao.get_channel_ids(instant_messaging_id)
    if channel_ids:
        intervals = [i for i in multiprocessing_util.get_tasks_intervals(channel_ids, len(self._tokens)) if len(i) > 0]

        queue_extractors = multiprocessing.JoinableQueue()
        results = multiprocessing.Queue()

        # Start consumers
        multiprocessing_util.start_consumers(len(self._tokens), queue_extractors, results)

        # pair each interval with its own API token; there is at most one
        # interval per token, so zip never drops work
        for interval, token in zip(intervals, self._tokens):
            channel_extractor = SlackChannel2Db(self._db_name, instant_messaging_id, interval,
                                                token, self._config, self._log_path)
            queue_extractors.put(channel_extractor)

        # Add end-of-queue markers
        multiprocessing_util.add_poison_pills(len(self._tokens), queue_extractors)

        # Wait for all of the tasks to finish
        queue_extractors.join()
def _get_channels(self, instant_messaging_id):
    # processes Slack channels
    channel_ids = self._get_channel_ids(instant_messaging_id)
    intervals = [i for i in multiprocessing_util.get_tasks_intervals(channel_ids, len(self._tokens)) if len(i) > 0]

    queue_extractors = multiprocessing.JoinableQueue()
    results = multiprocessing.Queue()

    # Start consumers
    multiprocessing_util.start_consumers(len(self._tokens), queue_extractors, results)

    for interval, token in zip(intervals, self._tokens):
        channel_extractor = SlackChannel2Db(self._db_name, instant_messaging_id, interval,
                                            token, self._config, self._log_path)
        queue_extractors.put(channel_extractor)

    # Add end-of-queue markers
    multiprocessing_util.add_poison_pills(len(self._tokens), queue_extractors)

    # Wait for all of the tasks to finish
    queue_extractors.join()
def _get_topics(self, forum_id):
    # updates the topics of a forum
    topic_ids = self._dao.get_topic_ids(forum_id)
    if topic_ids:
        self._update_topics_info(forum_id)
        intervals = [i for i in multiprocessing_util.get_tasks_intervals(topic_ids, self._num_processes) if len(i) > 0]

        queue_extractors = multiprocessing.JoinableQueue()
        results = multiprocessing.Queue()

        # Start consumers
        multiprocessing_util.start_consumers(self._num_processes, queue_extractors, results)

        for interval in intervals:
            topic_extractor = EclipseTopic2Db(self._db_name, forum_id, interval,
                                              self._config, self._log_path)
            queue_extractors.put(topic_extractor)

        # Add end-of-queue markers
        multiprocessing_util.add_poison_pills(self._num_processes, queue_extractors)

        # Wait for all of the tasks to finish
        queue_extractors.join()
def _get_info_contribution(self, repo_id):
    # processes Git data
    existing_refs = self._get_existing_references(repo_id)

    queue_references = multiprocessing.JoinableQueue()
    results = multiprocessing.Queue()

    # Start consumers
    multiprocessing_util.start_consumers(self._num_processes, queue_references, results)

    for reference in self._querier.get_references():
        ref_name = reference[0]
        # import only the selected references when a filter is set,
        # otherwise import every reference not already stored in the DB
        if self._references:
            to_import = ref_name in self._references
        else:
            to_import = ref_name not in existing_refs

        if to_import:
            git_ref_extractor = Git2DbReference(self._db_name, repo_id, self._git_repo_path,
                                                self._before_date, self._import_type,
                                                ref_name, "", self._config, self._log_path)
            queue_references.put(git_ref_extractor)

    # Add end-of-queue markers
    multiprocessing_util.add_poison_pills(self._num_processes, queue_references)

    # Wait for all of the tasks to finish
    queue_references.join()
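# Likewise hypothetical, NOT the original multiprocessing_util code: a sketch
# of the consumer-management helpers used above, assuming each queued
# extractor object is callable (the standard JoinableQueue producer/consumer
# pattern). The real implementation may differ in detail.
import multiprocessing

def _consume(task_queue, result_queue):
    # Worker loop: run extractors pulled from the queue until a poison
    # pill (None) arrives, acknowledging each item with task_done().
    while True:
        task = task_queue.get()
        if task is None:  # poison pill: no more work for this worker
            task_queue.task_done()
            break
        result_queue.put(task())  # run the extractor and record its result
        task_queue.task_done()

def start_consumers(num_consumers, task_queue, result_queue):
    # Spawn one worker process per consumer slot.
    for _ in range(num_consumers):
        multiprocessing.Process(target=_consume, args=(task_queue, result_queue)).start()

def add_poison_pills(num_consumers, task_queue):
    # One None per consumer guarantees every worker eventually stops,
    # which is why the pill count matches the consumer count above.
    for _ in range(num_consumers):
        task_queue.put(None)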
def _insert_issue_data(self, repo_id, issue_tracker_id):
    # processes issue data
    imported = self._dao.get_already_imported_issue_ids(issue_tracker_id, repo_id)
    issues = list(set(self._querier.get_issue_ids(self._before_date)) - set(imported))
    intervals = [i for i in multiprocessing_util.get_tasks_intervals(issues, len(self._tokens)) if len(i) > 0]

    queue_intervals = multiprocessing.JoinableQueue()
    results = multiprocessing.Queue()

    # Start consumers
    multiprocessing_util.start_consumers(len(self._tokens), queue_intervals, results)

    for interval, token in zip(intervals, self._tokens):
        issue_extractor = GitHubIssue2Db(self._db_name, repo_id, issue_tracker_id, self._url,
                                         interval, token, self._config, self._log_path)
        queue_intervals.put(issue_extractor)

    # Add end-of-queue markers
    multiprocessing_util.add_poison_pills(len(self._tokens), queue_intervals)

    # Wait for all of the tasks to finish
    queue_intervals.join()
def _get_topics(self, forum_id):
    # updates the topics of a forum
    topic_ids = self._dao.get_topic_own_ids(forum_id)
    if topic_ids:
        intervals = [i for i in multiprocessing_util.get_tasks_intervals(topic_ids, len(self._tokens)) if len(i) > 0]

        queue_extractors = multiprocessing.JoinableQueue()
        results = multiprocessing.Queue()

        # Start consumers
        multiprocessing_util.start_consumers(len(self._tokens), queue_extractors, results)

        for interval, token in zip(intervals, self._tokens):
            topic_extractor = StackOverflowTopic2Db(self._db_name, forum_id, interval,
                                                    token, self._config, self._log_path)
            queue_extractors.put(topic_extractor)

        # Add end-of-queue markers
        multiprocessing_util.add_poison_pills(len(self._tokens), queue_extractors)

        # Wait for all of the tasks to finish
        queue_extractors.join()
def _insert_issue_dependencies(self, repo_id, issue_tracker_id):
    # processes issue dependency data
    issues = self._dao.get_already_imported_issue_ids(issue_tracker_id, repo_id)
    intervals = [i for i in multiprocessing_util.get_tasks_intervals(issues, self._num_processes) if len(i) > 0]

    queue_intervals = multiprocessing.JoinableQueue()
    results = multiprocessing.Queue()

    # Start consumers
    multiprocessing_util.start_consumers(self._num_processes, queue_intervals, results)

    for interval in intervals:
        issue_dependency_extractor = BugzillaIssueDependency2Db(self._db_name, repo_id, issue_tracker_id,
                                                                self._url, self._product, interval,
                                                                self._config, self._log_path)
        queue_intervals.put(issue_dependency_extractor)

    # Add end-of-queue markers
    multiprocessing_util.add_poison_pills(self._num_processes, queue_intervals)

    # Wait for all of the tasks to finish
    queue_intervals.join()
def _get_topics(self, forum_id):
    # processes Stack Overflow questions
    topic_imported = self._dao.get_topic_own_ids(forum_id)
    topic_ids = list(set(self._querier.get_topic_ids(self._search_query, self._before_date)) - set(topic_imported))
    topic_ids.sort()
    intervals = [i for i in multiprocessing_util.get_tasks_intervals(topic_ids, len(self._tokens)) if len(i) > 0]

    queue_extractors = multiprocessing.JoinableQueue()
    results = multiprocessing.Queue()

    # Start consumers
    multiprocessing_util.start_consumers(len(self._tokens), queue_extractors, results)

    for interval, token in zip(intervals, self._tokens):
        topic_extractor = StackOverflowTopic2Db(self._db_name, forum_id, interval,
                                                token, self._config, self._log_path)
        queue_extractors.put(topic_extractor)

    # Add end-of-queue markers
    multiprocessing_util.add_poison_pills(len(self._tokens), queue_extractors)

    # Wait for all of the tasks to finish
    queue_extractors.join()
def _update_existing_references(self, repo_id, import_type):
    # updates existing references in the DB, restarting each one
    # from the last commit already imported
    cursor = self._dao.get_cursor()
    query = "SELECT c.sha, lc.ref_id " \
            "FROM commit c " \
            "JOIN (SELECT ref_id, MAX(commit_id) AS last_commit_id_in_ref " \
            "FROM commit_in_reference WHERE repo_id = %s GROUP BY ref_id) AS lc " \
            "ON c.id = lc.last_commit_id_in_ref"
    arguments = [repo_id]
    self._dao.execute(cursor, query, arguments)

    queue_references = multiprocessing.JoinableQueue()
    results = multiprocessing.Queue()

    # Start consumers
    multiprocessing_util.start_consumers(self._num_processes, queue_references, results)

    row = self._dao.fetchone(cursor)
    while row:
        sha = row[0]
        ref_id = row[1]
        row = self._dao.fetchone(cursor)

        ref_name = self._dao.select_reference_name(repo_id, ref_id)
        for reference in self._querier.get_references():
            reference_name = reference[0]
            if reference_name == ref_name:
                self._existing_refs.append(ref_name)
                git_ref_extractor = Git2DbReference(self._db_name, repo_id, self._git_repo_path,
                                                    self._before_date, import_type,
                                                    reference[0], sha, self._config, self._log_path)
                queue_references.put(git_ref_extractor)
                break

    self._dao.close_cursor(cursor)

    # Add end-of-queue markers
    multiprocessing_util.add_poison_pills(self._num_processes, queue_references)

    # Wait for all of the tasks to finish
    queue_references.join()
def _update_issue_dependency(self, repo_id, issue_tracker_id, intervals, url):
    # updates the issue dependencies already stored in the DB
    queue_intervals = multiprocessing.JoinableQueue()
    results = multiprocessing.Queue()

    # Start consumers
    multiprocessing_util.start_consumers(self._num_processes, queue_intervals, results)

    for interval in intervals:
        issue_dependency_extractor = BugzillaIssueDependency2Db(self._db_name, repo_id, issue_tracker_id,
                                                                url, self._product, interval,
                                                                self._config, self._log_path)
        queue_intervals.put(issue_dependency_extractor)

    # Add end-of-queue markers
    multiprocessing_util.add_poison_pills(self._num_processes, queue_intervals)

    # Wait for all of the tasks to finish
    queue_intervals.join()
def _update_issue_content(self, repo_id, issue_tracker_id, intervals, url):
    # updates the issues already stored in the DB
    queue_intervals = multiprocessing.JoinableQueue()
    results = multiprocessing.Queue()

    # Start consumers
    multiprocessing_util.start_consumers(len(self._tokens), queue_intervals, results)

    # intervals are supplied by the caller here, so keep explicit indexing
    # to pair each interval with its token
    for pos, interval in enumerate(intervals):
        issue_extractor = GitHubIssue2Db(self._db_name, repo_id, issue_tracker_id, url,
                                         interval, self._tokens[pos], self._config, self._log_path)
        queue_intervals.put(issue_extractor)

    # Add end-of-queue markers
    multiprocessing_util.add_poison_pills(len(self._tokens), queue_intervals)

    # Wait for all of the tasks to finish
    queue_intervals.join()
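# Hypothetical end-to-end driver, NOT from the original source: it ties the
# sketched helpers above together and mirrors the lifecycle every method in
# this section follows (create queues -> start consumers -> enqueue one
# extractor per interval -> add poison pills -> join). EchoTask is a
# stand-in for the *2Db extractor classes.
class EchoTask(object):
    def __init__(self, interval):
        self.interval = interval

    def __call__(self):
        return len(self.interval)  # pretend to process the interval

if __name__ == '__main__':
    queue_tasks = multiprocessing.JoinableQueue()
    results = multiprocessing.Queue()
    start_consumers(4, queue_tasks, results)
    for interval in get_tasks_intervals(list(range(100)), 4):
        queue_tasks.put(EchoTask(interval))
    add_poison_pills(4, queue_tasks)
    queue_tasks.join()  # returns only once every task has been processed
    while not results.empty():  # simplified drain; fine for this small sketch
        print(results.get())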