def GetPersonIdentifiers(identities_db, upeople_id):
    """Get people, company and country information.

    Looks up the profile whose uuid is ``upeople_id`` in the
    ``identities_db`` schema, joined with its enrollments, organizations
    and country. If that query raises, a second query without the
    enrollment/organization tables is run instead.

    :param identities_db: schema name, interpolated into the table names
    :param upeople_id: uuid of the profile to look up
    :returns: whatever ExecuteQuery returns for the query that succeeded
    """
    q = """
        SELECT pro.uuid, pro.name, pro.email, cou.name as country, org.name as affiliation
        FROM %s.profiles pro
        JOIN %s.enrollments enr ON enr.uuid= pro.uuid
        JOIN %s.organizations org ON org.id = enr.organization_id
        LEFT JOIN %s.countries cou ON cou.code = pro.country_code
        WHERE pro.uuid ='%s'
        """ % (identities_db, identities_db, identities_db, identities_db, upeople_id)
    try:
        res = ExecuteQuery(q)
    except Exception:
        # No organizations. Just people data and country data.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate instead of triggering the fallback query.
        q = """
            SELECT pro.uuid, pro.name, pro.email, cou.name as country
            FROM %s.profiles pro
            LEFT JOIN %s.countries cou ON cou.code = pro.country_code
            WHERE pro.uuid ='%s'
            """ % (identities_db, identities_db, upeople_id)
        res = ExecuteQuery(q)
    return res
def _remove_issue(cls, issue_id):
    """Delete one issue and the rows that reference it.

    Rows are removed from attachments, changes, comments, related_to, the
    backend-specific ``issues_ext_*`` / ``issues_log_*`` tables and
    issues_watchers, and finally from ``issues`` itself.

    :param issue_id: id of the issue to delete
    """
    # The backend code selects the suffix of the per-backend tables:
    # "lp" and "bg" are expanded, any other code is used unchanged.
    backend_code = cls._get_backend().its_type
    table_suffix = {"lp": "launchpad", "bg": "bugzilla"}.get(backend_code, backend_code)

    # Same statements, in the same order, as a sequence to run in one loop.
    statements = (
        "DELETE FROM attachments WHERE issue_id='%s'" % (issue_id),
        "DELETE FROM changes WHERE issue_id='%s'" % (issue_id),
        "DELETE FROM comments WHERE issue_id='%s'" % (issue_id),
        "DELETE FROM related_to WHERE issue_id='%s'" % (issue_id),
        "DELETE FROM issues_ext_%s WHERE issue_id='%s'" % (table_suffix, issue_id),
        "DELETE FROM issues_log_%s WHERE issue_id='%s'" % (table_suffix, issue_id),
        "DELETE FROM issues_watchers WHERE issue_id='%s'" % (issue_id),
        # The issue row itself goes last.
        "DELETE FROM issues WHERE id='%s'" % (issue_id),
    )
    for statement in statements:
        ExecuteQuery(statement)
def _remove_scmlog(scmlog_id):
    """Delete one commit (scmlog row) and the rows that depend on it.

    Removes the file_copies of the commit's actions, its commits_lines and
    file_links rows, the tags / tag_revisions attached to it, and finally
    the scmlog row itself.

    :param scmlog_id: id of the scmlog row to delete
    """
    def column_values(result, column):
        # An ExecuteQuery column may be absent, a bare value or a list;
        # always hand back a list so it can be iterated safely.
        if column not in result:
            return []
        values = result[column]
        return values if isinstance(values, list) else [values]

    # Get actions and remove mappings
    q = "SELECT * from actions where commit_id='%s'" % (scmlog_id)
    res = ExecuteQuery(q)
    # action_files is a view, so nothing is deleted from it.
    for action_id in column_values(res, 'id'):
        q = "DELETE FROM file_copies WHERE action_id='%s'" % (action_id)
        ExecuteQuery(q)

    # actions_file_names is a VIEW, so nothing is deleted from it either.
    q = "DELETE FROM commits_lines WHERE commit_id='%s'" % (scmlog_id)
    ExecuteQuery(q)
    q = "DELETE FROM file_links WHERE commit_id='%s'" % (scmlog_id)
    ExecuteQuery(q)

    q = "SELECT tag_id from tag_revisions WHERE commit_id='%s'" % (scmlog_id)
    res = ExecuteQuery(q)
    # Normalised like the action ids above: the previous direct iteration
    # over res['tag_id'] broke when the column held a bare value.
    for tag_id in column_values(res, 'tag_id'):
        q = "DELETE FROM tags WHERE id='%s'" % (tag_id)
        ExecuteQuery(q)
        q = "DELETE FROM tag_revisions WHERE tag_id='%s'" % (tag_id)
        ExecuteQuery(q)

    q = "DELETE FROM scmlog WHERE id='%s'" % (scmlog_id)
    ExecuteQuery(q)
def get_url():
    """Get the URL from which the data source was gathered.

    Returns the ExecuteQuery result for the first tracker row joined with
    its supported_trackers entry (columns ``url`` and ``type``).
    """
    tracker_sql = ("SELECT url, name as type FROM trackers t JOIN "
                   "supported_trackers s ON t.type = s.id limit 1")
    return ExecuteQuery(tracker_sql)
def verboseThread(self):
    """Return the most verbose thread (the one with the biggest emails).

    On first use (``self.verbose`` is None) the body lengths of all the
    messages of every thread in ``self.threads`` are added up, and the
    root message_id of the thread with the largest total is cached in
    ``self.verbose``.

    :returns: Email built from the cached message_id and ``self.i_db``
    """
    # TODO: at some point these numbers should be calculated when
    # retrieving the initial list of message_id, is_response_of values
    if self.verbose is None:
        # variable was not initialized
        self.verbose = ""
        biggest_total = 0
        # iterating through the root messages
        for root_id, thread in self.threads.items():
            thread_total = 0  # len of all of the body messages
            # iterating through each of the messages of the thread
            for msg in thread:
                query = """
                    select length(message_body) as length
                    from messages
                    where message_ID = '%s'
                    """ % (msg)
                result = ExecuteQuery(query)
                lengths = result["length"]
                if not isinstance(lengths, list):
                    lengths = [lengths]
                # A NULL body yields no length, and the same message_ID may
                # come back in more than one row; count the message once
                # and treat a missing length as 0 instead of raising.
                sizes = [int(size) for size in lengths if size is not None]
                thread_total += max(sizes) if sizes else 0
            if thread_total > biggest_total:
                # New bigger thread found
                self.verbose = root_id
                biggest_total = thread_total
    return Email(self.verbose, self.i_db)
def get_date_init(startdate=None, enddate=None, identities_db=None, type_analysis=None):
    """Get the date of the first activity in the data source.

    The arguments are accepted but not used: the query always takes the
    earliest ``created_on`` of the projects table, formatted as a date.
    """
    first_activity_sql = "SELECT DATE_FORMAT (MIN(created_on), '%Y-%m-%d') AS first_date FROM projects"
    return ExecuteQuery(first_activity_sql)
def get_date_end(startdate=None, enddate=None, identities_db=None, type_analysis=None):
    """Get the date of the last activity in the data source.

    The arguments are accepted but not used: the query always takes the
    latest ``date`` of the repositories_log table, formatted as a date.
    """
    last_activity_sql = "SELECT DATE_FORMAT (MAX(date),'%Y-%m-%d') as last_date FROM repositories_log"
    return ExecuteQuery(last_activity_sql)
def GetPeopleStaticITS(developer_id, startdate, enddate, closed_condition):
    """Run the non-evolutionary people query for one developer.

    Builds the SQL with GetPeopleQueryITS (period None, evolutionary flag
    False) and returns the ExecuteQuery result.
    """
    ## FIXME is this function used only to calculate closed issues? if not it must be
    ## fixed
    people_sql = GetPeopleQueryITS(developer_id, None, startdate, enddate,
                                   False, closed_condition)
    return ExecuteQuery(people_sql)
def _init_threads(self):
    """Build ``self.threads`` from the messages in the analysed period.

    ``self.threads`` maps the message_id of every root message (a row
    whose ``is_response_of`` is None) to the list of message ids of its
    thread, with the root message first; the rest is whatever
    ``self._build_threads`` returns (not ordered).

    Also stores the raw columns in ``self.list_message_id`` and
    ``self.list_is_response_of``.
    """
    # Retrieving all of the messages.
    query = """
        select DISTINCT message_ID, is_response_of
        from messages
        where first_date >= %s and first_date < %s
        """ % (self.initdate, self.enddate)
    list_messages = ExecuteQuery(query)

    def to_list(value):
        # A column may come back as a bare value; wrap it so both columns
        # can be walked together.
        return value if isinstance(value, (list, dict)) else [value]

    self.list_message_id = to_list(list_messages["message_ID"])
    self.list_is_response_of = to_list(list_messages["is_response_of"])

    messages = {}
    # Walk both columns row by row. The previous list.index() lookup was
    # quadratic and, for a message_ID appearing in several rows, always
    # read the is_response_of of its first row.
    for message_id, parent_id in zip(self.list_message_id,
                                     self.list_is_response_of):
        # Only analyzing those whose is_response_of is None,
        # those are the message 'root' of each thread.
        if parent_id is None:
            thread = self._build_threads(message_id)
            # Adding the root message to the list in first place
            thread.insert(0, message_id)
            messages[message_id] = thread
    self.threads = messages
def people():
    """List of people participating in the source code development.

    Returns the ExecuteQuery result with the ``id`` and ``identifier``
    columns of the upeople table.
    """
    return ExecuteQuery("select id,identifier from upeople")
def GetListPeopleIRC(startdate, enddate):
    """List the people with IRC comments, most active first.

    Counts irclog rows of type COMMENT per person between ``startdate``
    and ``enddate`` and returns the ExecuteQuery result (columns ``id``
    and ``total``).
    """
    select_fields = "DISTINCT(pup.uuid) as id, count(irclog.id) total"
    from_tables = GetTablesOwnUniqueIdsIRC()
    where = (GetFiltersOwnUniqueIdsIRC()
             + " AND irclog.type='COMMENT' "
             + " GROUP BY nick ORDER BY total desc")
    sql = GetSQLGlobal('date', select_fields, from_tables, where,
                       startdate, enddate)
    return ExecuteQuery(sql)
def remove_filter_data(filter_):
    """Remove from the ITS database everything that belongs to one tracker.

    The tracker is the one whose url equals ``filter_.get_item()``. People
    that only appear in that tracker (as submitter or assignee) are
    removed first, then its issues, then the tracker row itself. If the
    tracker is not found an error is logged and nothing is removed.

    :param filter_: object providing ``get_name()`` and ``get_item()``
    """
    def as_list(value):
        # An ExecuteQuery column can hold a bare value instead of a list
        # (presumably for single-row results - confirm); iterating that
        # directly would fail or walk the characters of a string.
        return value if isinstance(value, list) else [value]

    def get_people_one_repo(field):
        # Sub-select: values of `field` that appear in exactly one tracker.
        return """
            SELECT %s FROM (SELECT COUNT(DISTINCT(tracker_id)) AS total, %s
            FROM issues
            GROUP BY %s
            HAVING total=1) t
            """ % (field, field, field)

    uri = filter_.get_item()
    logging.info("Removing ITS filter %s %s" % (filter_.get_name(), filter_.get_item()))
    q = "SELECT * from trackers WHERE url='%s'" % (uri)
    repo = ExecuteQuery(q)
    if 'id' not in repo:
        logging.error("%s not found" % (uri))
        return

    logging.info("Removing people")
    ## Remove submitted_by, then assigned_to, that exist only in this repository
    for field in ("submitted_by", "assigned_to"):
        q = """
            SELECT DISTINCT(%s) from issues
            WHERE tracker_id='%s' AND %s in (%s)
            """ % (field, repo['id'], field, get_people_one_repo(field))
        res = ExecuteQuery(q)
        for people_id in as_list(res[field]):
            ITS._remove_people(people_id)

    # Remove people activity
    logging.info("Removing issues")
    q = "SELECT id from issues WHERE tracker_id='%s'" % (repo['id'])
    res = ExecuteQuery(q)
    for issue_id in as_list(res['id']):
        ITS._remove_issue(issue_id)

    # Remove filter
    q = "DELETE from trackers WHERE id='%s'" % (repo['id'])
    ExecuteQuery(q)
def GetListPeopleMediaWiki(startdate, enddate):
    """List the people with wiki page revisions, most active first.

    Counts wiki_pages_revs rows per person between ``startdate`` and
    ``enddate`` and returns the ExecuteQuery result (columns ``id`` and
    ``total``).
    """
    select_fields = "DISTINCT(pup.uuid) as id, count(wiki_pages_revs.id) total"
    from_tables = GetTablesOwnUniqueIdsMediaWiki()
    where = GetFiltersOwnUniqueIdsMediaWiki() + " GROUP BY user ORDER BY total desc"
    sql = GetSQLGlobal('date', select_fields, from_tables, where,
                       startdate, enddate)
    return ExecuteQuery(sql)
def GetListPeopleMLS(startdate, enddate):
    """List the people who sent messages, most active first.

    Counts message ids per person between ``startdate`` and ``enddate``
    (on ``first_date``) and returns the ExecuteQuery result (columns
    ``id`` and ``total``).
    """
    select_fields = "DISTINCT(pup.uuid) as id, count(m.message_ID) total"
    from_tables = GetTablesOwnUniqueIdsMLS()
    where = GetFiltersOwnUniqueIdsMLS() + " GROUP BY id ORDER BY total desc"
    sql = GetSQLGlobal('first_date', select_fields, from_tables, where,
                       startdate, enddate)
    return ExecuteQuery(sql)
def GetPeopleListITS(startdate, enddate):
    """List the people with changes in the tracker, most active first.

    Counts ``c.id`` per person between ``startdate`` and ``enddate`` (on
    ``changed_on``) and returns the ExecuteQuery result (columns ``pid``
    and ``total``).
    """
    select_fields = "DISTINCT(pup.uuid) as pid, count(c.id) as total"
    from_tables = GetTablesOwnUniqueIdsITS()
    where = GetFiltersOwnUniqueIdsITS() + " GROUP BY pid ORDER BY total desc"
    sql = GetSQLGlobal('changed_on', select_fields, from_tables, where,
                       startdate, enddate)
    return ExecuteQuery(sql)
def GetPeopleListSCM(startdate, enddate):
    """List the people with commits, most active first.

    Counts distinct ``s.id`` per person between ``startdate`` and
    ``enddate`` (on ``s.author_date``); ties are ordered by ``pid``.
    Returns the ExecuteQuery result (columns ``pid`` and ``total``).
    """
    select_fields = "DISTINCT(pup.uuid) as pid, COUNT(distinct(s.id)) as total"
    from_tables = GetTablesOwnUniqueIdsSCM()
    where = GetFiltersOwnUniqueIdsSCM() + " GROUP BY pid ORDER BY total desc, pid"
    sql = GetSQLGlobal('s.author_date', select_fields, from_tables, where,
                       startdate, enddate)
    return ExecuteQuery(sql)
def reposField():
    """Return the messages column to use as the repository field.

    Depending on the mailing list data, the field to be used is
    ``mailing_list`` or ``mailing_list_url``: when the messages table has
    no distinct ``mailing_list`` value at all, ``mailing_list_url`` is
    returned.

    The previous check, ``len(result) == 0``, tested the size of the query
    result rather than the counted value, so it could not detect an empty
    ``mailing_list`` column.
    """
    sql = "select count(distinct(mailing_list)) as total from messages"
    result = ExecuteQuery(sql)

    total = result["total"] if "total" in result else None
    if isinstance(total, list):
        # Defensive: unwrap a list-shaped column (empty list -> no value).
        total = total[0] if total else None

    # No value at all (the case the old check covered) or a count of zero.
    if not total or int(total) == 0:
        return "mailing_list_url"
    return "mailing_list"
def get_date_end(startdate=None, enddate=None, identities_db=None, type_analysis=None):
    """Get the date of the last activity in the data source.

    The arguments are accepted but not used: the query takes the greatest
    of the latest ``updated_on`` / ``created_on`` values of the releases
    and projects tables, formatted as a date.
    """
    releases_sql = "SELECT MAX(updated_on) as ru, MAX(created_on) as rc FROM releases"
    projects_sql = "SELECT MAX(updated_on) as pu, MAX(created_on) as pr FROM projects"
    # Only the inner select is %-formatted; the outer part carries the
    # DATE_FORMAT pattern and must stay out of the formatting.
    outer_sql = "SELECT DATE_FORMAT (last_date,'%Y-%m-%d') as last_date FROM "
    inner_sql = "(SELECT GREATEST(ru, rc, pu, pr) AS last_date FROM (%s) r, (%s) p) t" % (
        releases_sql, projects_sql)
    return ExecuteQuery(outer_sql + inner_sql)
def get_date_end(startdate=None, enddate=None, identities_db=None, type_analysis=None):
    """Get the date of the last activity in the data source.

    The arguments are accepted but not used: the query takes the greatest
    of the latest question, comment and answer dates, formatted as a date.
    """
    questions_sql = "SELECT MAX(added_at) AS aq FROM questions"
    comments_sql = "SELECT MAX(submitted_on) AS sc FROM comments"
    answers_sql = "SELECT MAX(submitted_on) AS sa FROM answers"
    # The DATE_FORMAT pattern is written with %% because the whole
    # template goes through %-formatting.
    template = "SELECT DATE_FORMAT (GREATEST(aq, sc, sa), '%%Y-%%m-%%d') AS last_date FROM (%s) q, (%s) c, (%s) a"
    return ExecuteQuery(template % (questions_sql, comments_sql, answers_sql))
def top_files_modified():
    """Top 10 modified files.

    Returns the ExecuteQuery result with the ``file_name`` and
    ``modifications`` columns of the ten files with most actions of
    type 'M'.
    """
    # FIXME: to be updated to use stardate and enddate values
    top_files_sql = ("select file_name, count(commit_id) as modifications "
                     "from action_files a join files f on a.file_id = f.id "
                     "where action_type='M' "
                     "group by f.id "
                     "order by modifications desc limit 10; ")
    return ExecuteQuery(top_files_sql)
def remove_filter_data(filter_):
    """Remove from the SCM database everything that belongs to one repository.

    The repository is the one whose uri equals ``filter_.get_item()``.
    People that only appear in that repository (as committer or author)
    are removed first, then its commits, then its files and file types,
    then the repository row itself. If the repository is not found an
    error is logged and nothing is removed.

    :param filter_: object providing ``get_name()`` and ``get_item()``
    """
    def as_list(value):
        # An ExecuteQuery column can hold a bare value instead of a list
        # (presumably for single-row results - confirm); iterating that
        # directly would fail or walk the characters of a string.
        return value if isinstance(value, list) else [value]

    def get_people_one_repo(field):
        # Sub-select: values of `field` that appear in exactly one repository.
        return """
            SELECT %s FROM (SELECT COUNT(DISTINCT(repository_id)) AS total, %s
            FROM scmlog
            GROUP BY %s
            HAVING total=1) t
            """ % (field, field, field)

    uri = filter_.get_item()
    logging.info("Removing SCM filter %s %s" % (filter_.get_name(), filter_.get_item()))
    q = "SELECT * from repositories WHERE uri='%s'" % (uri)
    repo = ExecuteQuery(q)
    if 'id' not in repo:
        logging.error("%s not found" % (uri))
        return

    # Remove people
    ## Remove committer_id, then author_id, that exist only in this repository
    for field in ("committer_id", "author_id"):
        q = """
            SELECT DISTINCT(%s) from scmlog
            WHERE repository_id='%s' AND %s in (%s)
            """ % (field, repo['id'], field, get_people_one_repo(field))
        res = ExecuteQuery(q)
        for people_id in as_list(res[field]):
            SCM._remove_people(people_id)

    # Remove people activity
    q = "SELECT id from scmlog WHERE repository_id='%s'" % (repo['id'])
    res = ExecuteQuery(q)
    for scmlog_id in as_list(res['id']):
        SCM._remove_scmlog(scmlog_id)

    # Remove files
    q = "SELECT id FROM files WHERE repository_id='%s'" % (repo['id'])
    res = ExecuteQuery(q)
    for file_id in as_list(res['id']):
        q = "DELETE FROM file_types WHERE file_id='%s'" % (file_id)
        ExecuteQuery(q)
        q = "DELETE FROM files WHERE id='%s'" % (file_id)
        ExecuteQuery(q)

    # Remove filter
    q = "DELETE from repositories WHERE id='%s'" % (repo['id'])
    ExecuteQuery(q)
def GetTopClosersByAssignee(days, startdate, enddate, identities_db, filter):
    """Top 10 assignees by number of issues moved to 'Fix Committed'.

    :param days: when not 0, only changes less than ``days`` days older
        than the latest change are counted
    :param startdate: lower bound for ``changed_on``, interpolated as-is
    :param enddate: upper bound for ``changed_on``, interpolated as-is
    :param identities_db: schema holding enrollments, uidentities and
        organizations
    :param filter: iterable of organization names to leave out
    :returns: ExecuteQuery result with columns ``id``, ``closers`` and
        ``closed``
    """
    # One "org.name<>'...' and " condition per excluded organization.
    affiliations = "".join(" org.name<>'" + aff + "' and " for aff in filter)

    date_limit = ""
    if days != 0:
        # Stores the latest change date in the @maxdate session variable,
        # which the DATEDIFF condition below refers to.
        ExecuteQuery("SELECT @maxdate:=max(changed_on) from changes limit 1")
        date_limit = " AND DATEDIFF(@maxdate, changed_on)<" + str(days)

    pieces = [
        "SELECT up.uuid as id, ",
        " up.identifier as closers, ",
        " count(distinct(ill.issue_id)) as closed ",
        "FROM people_uidentities pup, ",
        " ", identities_db, ".enrollments enr, ",
        " ", identities_db, ".uidentities up, ",
        " ", identities_db, ".organizations org, ",
        " issues_log_launchpad ill ",
        "WHERE ill.assigned_to = pup.people_id and ",
        " pup.uuid = up.uuid and ",
        " up.uuid = enr.uuid and ",
        " enr.organization_id = org.id and ",
        " ", affiliations, " ",
        " ill.date >= enr.start and ",
        " ill.date < enr.end and ",
        " ill.change_id in ( ",
        " select id ",
        " from changes ",
        " where new_value='Fix Committed' and ",
        " changed_on>=", startdate, " and ",
        " changed_on<", enddate, " ", date_limit, ") ",
        "GROUP BY up.identifier ",
        "ORDER BY closed desc, closers limit 10",
    ]
    return ExecuteQuery("".join(pieces))
def GetPeopleListSCR(startdate, enddate, bots):
    """List the people who submitted issues for review, most active first.

    :param startdate: start of the period (applied to ``submitted_on``)
    :param enddate: end of the period
    :param bots: iterable of people names to leave out
    :returns: ExecuteQuery result with columns ``id``, ``total`` and ``name``
    """
    # One "name<>'...' and " condition per bot, placed before the rest.
    bot_conditions = "".join(" name<>'" + bot + "' and " for bot in bots)

    select_fields = "DISTINCT(pup.uuid) as id, count(i.id) as total, name"
    from_tables = GetTablesOwnUniqueIdsSCR('issues') + ", people"
    where = (bot_conditions
             + GetFiltersOwnUniqueIdsSCR('issues') + " and people.id = pup.people_id"
             + " GROUP BY id ORDER BY total desc")
    sql = GetSQLGlobal('submitted_on', select_fields, from_tables, where,
                       startdate, enddate)
    return ExecuteQuery(sql)
def GetDate(startdate, enddate, identities_db, type_analysis, type):
    """Get the first or the last date of the IRC log.

    :param type: "max" selects the latest date (column ``last_date``);
        any other value selects the earliest one (column ``first_date``)
    :returns: ExecuteQuery result for the query built by BuildQuery
    """
    wants_latest = (type == "max")
    if not wants_latest:
        select_fields = " DATE_FORMAT (min(date), '%Y-%m-%d') as first_date"
    else:
        select_fields = " DATE_FORMAT (max(date), '%Y-%m-%d') as last_date"

    from_tables = " irclog i " + GetIRCSQLReportFrom(identities_db, type_analysis)
    where = GetIRCSQLReportWhere(type_analysis)
    sql = BuildQuery(None, startdate, enddate, " i.date ", select_fields,
                     from_tables, where, False)
    return ExecuteQuery(sql)
def GetEmailsSent(period, startdate, enddate, identities_db, type_analysis, evolutionary, projects_db):
    """Generic function that counts emails sent.

    With ``evolutionary`` set only the ``sent`` count is selected;
    otherwise the first and last message dates are selected as well.

    :returns: ExecuteQuery result for the query built by BuildQuery
    """
    select_fields = " count(distinct(m.message_ID)) as sent "
    if not evolutionary:
        select_fields = (" count(distinct(m.message_ID)) as sent, "
                         " DATE_FORMAT (min(m.first_date), '%Y-%m-%d') as first_date, "
                         " DATE_FORMAT (max(m.first_date), '%Y-%m-%d') as last_date ")

    from_tables = " messages m " + GetMLSSQLReportFrom(identities_db, type_analysis)
    where = GetMLSSQLReportWhere(type_analysis, projects_db)
    sql = BuildQuery(period, startdate, enddate, " m.first_date ",
                     select_fields, from_tables, where, evolutionary)
    return ExecuteQuery(sql)
def topCrowdedThread(self, numTop):
    """Return the ``numTop`` threads with the most distinct participants.

    For every thread in ``self.threads`` (keyed by its root message_id)
    the distinct senders of its messages are counted.

    :param numTop: maximum number of threads to return
    :returns: list of ``(Email, number_of_people)`` tuples ordered by
        number of people, highest first; each Email is built from the
        root message_id of its thread
    """
    # [(root message_id, number of different upeople_id), (...,...), ...]
    top_threads = []
    for root_id, thread in self.threads.items():
        people = set()
        for message in thread:
            query = """
                select distinct pup.uuid as upeople_id
                from messages m, messages_people mp, people_uidentities pup
                where m.message_ID = '%s'
                and m.message_ID = mp.message_id
                and mp.type_of_recipient = 'From'
                and mp.email_address = pup.people_id
                """ % (message)
            result = ExecuteQuery(query)
            senders = result["upeople_id"]
            # The column may be a bare value or a list; a list cannot be
            # added to a set as one element, so merge its items instead.
            if not isinstance(senders, list):
                senders = [senders]
            people.update(senders)
        # Record the thread under its root id. The loop variable `message`
        # used here before was whatever message was visited last.
        top_threads.append((root_id, len(people)))

    ranked = sorted(top_threads, key=lambda entry: entry[1], reverse=True)

    # Create a list of emails for the first numTop threads
    return [(Email(message_id, self.i_db), total)
            for message_id, total in ranked[:numTop]]
def get_people_query(developer_id, startdate, enddate, evol=False, period=None):
    """Count the pull requests submitted by one developer.

    With ``evol`` set the count is split by ``period`` (GetSQLPeriod);
    otherwise a single count plus the first and last submission dates is
    produced (GetSQLGlobal). Despite the name, the query is executed and
    the ExecuteQuery result is returned.

    :param developer_id: uuid of the developer
    """
    # The returned builder is not used below; the call itself is kept in
    # case it has side effects.
    Pullpo.get_query_builder()

    select_fields = 'COUNT(distinct(pr.id)) AS submissions'
    from_tables = 'pull_requests pr, people_uidentities pup'
    where = 'pr.user_id = pup.people_id' + " AND pup.uuid='" + str(developer_id) + "'"

    if evol:
        sql = GetSQLPeriod(period, 'pr.created_at', select_fields,
                           from_tables, where, startdate, enddate)
    else:
        select_fields += (",DATE_FORMAT (min(pr.created_at),'%Y-%m-%d') as first_date, "
                          "DATE_FORMAT (max(pr.created_at),'%Y-%m-%d') as last_date")
        sql = GetSQLGlobal('pr.created_at', select_fields, from_tables,
                           where, startdate, enddate)
    return ExecuteQuery(sql)
def GetPersonIdentifiers(identities_db, upeople_id):
    """Get people, company and country information.

    Selects the profile whose uuid is ``upeople_id`` together with its
    country and the organization of the enrollment with the greatest
    ``end`` value. Enrollment, organization and country are LEFT JOINed,
    and the query is limited to one row.

    :param identities_db: schema name, interpolated into the table names
    :param upeople_id: uuid of the profile to look up
    :returns: the ExecuteQuery result
    """
    schema_slots = (identities_db,) * 5
    person_sql = """
        SELECT pro.uuid, pro.name, pro.email, cou.name as country, org.name as affiliation
        FROM %s.profiles pro
        LEFT JOIN (
            SELECT * FROM %s.enrollments
            WHERE (uuid, end) IN (
                SELECT uuid, MAX(end) FROM %s.enrollments GROUP BY uuid
            )) enr ON enr.uuid = pro.uuid
        LEFT JOIN %s.organizations org ON org.id = enr.organization_id
        LEFT JOIN %s.countries cou ON cou.code = pro.country_code
        WHERE pro.uuid ='%s' LIMIT 1
        """ % (schema_slots + (upeople_id,))
    return ExecuteQuery(person_sql)
def _buildEmail(self):
    """Load the details of the email identified by ``self.message_id``.

    Fills ``subject``, ``body``, ``date``, ``initiator_name``,
    ``initiator_id`` and ``url`` on the instance from the query result.
    """
    # WARNING: There may appear in some cases repeated emails.
    # This may be because the same email was sent to different
    # mailing lists. Forcing the query to 1 row, allows to
    # avoid this issue till we understand why this behaviour
    email_sql = """
        select distinct m.message_ID, m.subject, m.message_body, m.first_date,
               pro.name as initiator_name, u.uuid as initiator_id,
               m.mailing_list_url as url
        from messages m, messages_people mp, people_uidentities pup,
             %s.uidentities u, %s.profiles pro
        where m.message_ID = '%s'
        and m.message_ID = mp.message_id
        and mp.type_of_recipient = 'From'
        and mp.email_address = pup.people_id
        and pup.uuid = u.uuid
        and pup.uuid = pro.uuid
        limit 1
        """ % (self.i_db, self.i_db, self.message_id)
    row = ExecuteQuery(email_sql)

    # (instance attribute, result column), assigned in this order.
    attribute_columns = (
        ("subject", "subject"),
        ("body", "message_body"),
        ("date", "first_date"),
        ("initiator_name", "initiator_name"),
        ("initiator_id", "initiator_id"),
        ("url", "url"),
    )
    for attribute, column in attribute_columns:
        setattr(self, attribute, row[column])
def GetPeopleIRC():
    """Return the ids of the IRC participants.

    The value is the ``members`` column of the ExecuteQuery result: the
    distinct uuids found in people_upeople.
    """
    members_sql = "SELECT DISTINCT(uuid) AS members FROM people_upeople"
    return ExecuteQuery(members_sql)['members']