def handle(self, *args, **options):
    """Scan every project with a git URL and store its commits.

    For each ``Project`` the repository under ``/tmp/<slug>`` is opened and
    pulled; when that fails the repository is cloned instead.  Each entry
    of ``repo.commit_info()`` is then stored as a ``ProjectCommit``.  A
    commit that cannot be stored is skipped so that one bad commit never
    aborts the whole run.

    ``args`` and ``options`` are accepted but not used.
    """
    # Function-scope import: the file's import block is not visible here.
    import logging

    logger = logging.getLogger(__name__)

    for project in Project.objects.all():
        print('Checking {0} for new commits'.format(project))
        if not project.git_url:
            continue

        repo_path = '/tmp/' + project.slug
        try:
            repo = Gittle(repo_path, project.git_url)
            repo.pull()
        except Exception:
            # No usable local checkout: fall back to a fresh clone.
            try:
                repo = Gittle.clone(project.git_url, repo_path)
            except Exception:
                logger.exception('Could not pull or clone %s into %s',
                                 project.git_url, repo_path)
                repo = None
        if not repo:
            continue

        # Collected for the caller's benefit; not read in the visible code.
        new_commits = []
        for commit in repo.commit_info():
            try:
                prev_commit = repo.get_previous_commit(commit['sha'])
                # NOTE(review): the commit's own offset is added to a
                # local-time datetime and the result is labelled UTC --
                # confirm this is the intended conversion.
                committed_at = (
                    datetime.fromtimestamp(commit['time'])
                    + timedelta(hours=commit['timezone'] / (60 * 60))
                ).replace(tzinfo=pytz.utc)

                try:
                    user_author = get_user_model().objects.get(
                        email=commit['author']['email'])
                    string_author = None
                except Exception:
                    # No (unique) user for that e-mail: keep the plain name.
                    string_author = commit['author']['name']
                    user_author = None

                # First line of the message, capped at 45 characters.
                summary = commit['message'].split('\n')[0][:45]
                # next() works on Python 2 and 3; the ``.next()`` method
                # only exists on Python 2 iterators.
                first_diff = next(repo.diff(commit['sha'], prev_commit))
                pcommit = ProjectCommit.objects.create(
                    project=project,
                    chash=commit['sha'],
                    message=commit['message'],
                    summary=summary,
                    user_author=user_author,
                    string_author=string_author,
                    created=committed_at,
                    time=committed_at,
                    diff=first_diff['diff'])
                print(pcommit, ' added.')
                new_commits.append(pcommit)
            except Exception:
                # Best effort: skip this commit, but leave a trace instead
                # of swallowing the error silently.
                logger.debug('Skipping a commit of %s', project,
                             exc_info=True)
def handle(self, *args, **options):
    """Check each project's git repository and record its commits.

    Projects without a ``git_url`` are ignored.  The working copy lives in
    ``/tmp/<project.slug>``: it is pulled when it can be opened and cloned
    otherwise.  Every commit reported by ``repo.commit_info()`` is written
    as a ``ProjectCommit``; commits that fail to be written are skipped.

    ``args`` and ``options`` are accepted but not used.
    """
    # Function-scope import: the file's import block is not visible here.
    import logging

    logger = logging.getLogger(__name__)

    projects = Project.objects.all()
    for project in projects:
        print('Checking {0} for new commits'.format(project))
        if not project.git_url:
            continue

        repo_path = '/tmp/' + project.slug
        try:
            repo = Gittle(repo_path, project.git_url)
            repo.pull()
        except Exception:
            # Opening/pulling failed; try to clone from scratch.
            try:
                repo = Gittle.clone(project.git_url, repo_path)
            except Exception:
                logger.exception('Could not pull or clone %s into %s',
                                 project.git_url, repo_path)
                repo = None
        if not repo:
            continue

        # Collected for the caller's benefit; not read in the visible code.
        new_commits = []
        for commit in repo.commit_info():
            try:
                prev_commit = repo.get_previous_commit(commit['sha'])
                # NOTE(review): adds the commit's offset to a local-time
                # datetime and tags the result as UTC -- confirm intent.
                offset = timedelta(hours=commit['timezone'] / (60 * 60))
                committed_at = (
                    datetime.fromtimestamp(commit['time']) + offset
                ).replace(tzinfo=pytz.utc)

                try:
                    user_author = get_user_model().objects.get(
                        email=commit['author']['email'])
                    string_author = None
                except Exception:
                    # Fall back to the author's name from the commit.
                    string_author = commit['author']['name']
                    user_author = None

                # First line of the message, capped at 45 characters.
                summary = commit['message'].split('\n')[0][:45]
                # The built-in next() is valid on Python 2 and 3, unlike
                # the Python-2-only ``.next()`` method.
                first_diff = next(repo.diff(commit['sha'], prev_commit))
                pcommit = ProjectCommit.objects.create(
                    project=project,
                    chash=commit['sha'],
                    message=commit['message'],
                    summary=summary,
                    user_author=user_author,
                    string_author=string_author,
                    created=committed_at,
                    time=committed_at,
                    diff=first_diff['diff'],
                )
                print(pcommit, ' added.')
                new_commits.append(pcommit)
            except Exception:
                # Keep going with the next commit, but record why this one
                # was dropped.
                logger.debug('Skipping a commit of %s', project,
                             exc_info=True)
from gittle import Gittle

# Open the repository located in the current working directory.
repo = Gittle('.')

# Collect the SHAs of the entries at positions 1 and 2 of commit_info().
lastest = []
for info in repo.commit_info()[1:3]:
    lastest.append(info['sha'])

# Diff between those two commits, rendered with the 'classic' diff type.
classic_diff = repo.diff(*lastest, diff_type='classic')
print(classic_diff)

print(""" Last Diff """)

# Diff for HEAD, materialised into a list so it can be printed.
head_diff = list(repo.diff('HEAD'))
print(head_diff)
class GitData(object):
    """Collect contribution data (authors, diffs, ratings) from a Git repo.

    Constructing an instance opens or clones the repository, loads the
    commit metadata into a DataFrame, walks the requested branch recording
    added/removed lines per ``.py`` file, and finally rates every recorded
    commit (see ``eval_commit_to_future``).
    """

    def __init__(self, uri, branch="master", project_name="project",
                 specific_sha=None, threshold=False, correction=False):
        """Open/clone the repository at ``uri`` and collect all data.

        :param uri: local path or remote URL (matched against ``GIT_URL``).
        :param branch: branch to analyse; must exist in the repository.
        :param project_name: used in the name of the ``diff_<name>.txt``
            dump file written to the current directory.
        :param specific_sha: when given, only commits from the first sha
            containing this string onwards are analysed.
        :param threshold: forwarded to ``eval_commit_to_future``.
        :param correction: forwarded to ``eval_commit_to_future``.
        :raises Exception: when cloning fails or ``branch`` is missing.
        """
        self._data_frame = None
        self.files = {}
        self.project_name = project_name
        self.git_repository = uri
        git_url = re.compile(GIT_URL)
        _uri_safe = ''.join(c for c in uri if c.isalnum())
        self.repo_path = os.path.join(TMP_DIR, _uri_safe)
        self.__tmp_repository = self.repo_path
        self.index_sha = 0
        self.size = 0
        self.__first = True
        self.__spec_file = []
        self.specific_sha = specific_sha

        # Never reuse a cached clone or a diff dump from an earlier run.
        if os.path.exists(self.repo_path):
            shutil.rmtree(self.repo_path)
        diff_dump = "diff_{0}.txt".format(self.project_name)
        if os.path.exists(diff_dump):
            os.remove(diff_dump)

        is_url = git_url.search(uri)
        if is_url is None:
            # Not a URL: treat ``uri`` as an existing local repository.
            self.__repository = Gittle(self.git_repository)
            self.__tmp_repository = self.git_repository
        else:
            if self.git_repository.find(".git") < 0:
                LOGGER.info(r"Must end .git i will add manualy")
                self.git_repository += ".git"
            try:
                LOGGER.info(r'Cloning git repo: {0}'.format(self.repo_path))
                Gittle.clone(self.git_repository, self.__tmp_repository)
            except InvalidRemoteUrl as err:
                raise Exception(r"Could not clone repository! Is not url."
                                " Error: {0}".format(err))
            except ValueError as err:
                raise Exception(r"Is not url."
                                " Error: {0}".format(err))
            except KeyError as err:
                raise Exception(r"Could not clone repository."
                                " Error: {0}".format(err))
            except RefFormatError:
                # Fall back to the git command line client.
                import subprocess

                n_path = "/tmp/{0}".format(_uri_safe)
                if os.path.exists(n_path):
                    # Never reuse a cached clone.
                    shutil.rmtree(n_path)
                # An argument list (no shell) keeps ``uri``/``branch`` from
                # being interpreted as shell syntax.
                clone_cmd = ["git", "clone"]
                if branch is not None:
                    clone_cmd += ["-b", branch]
                clone_cmd += [uri, n_path]
                subprocess.call(clone_cmd, stderr=subprocess.STDOUT)
                self.__tmp_repository = n_path
            self.__repository = Gittle(self.__tmp_repository, origin_uri=uri)
            self.__repository.DEFAULT_BRANCH = branch

        if branch not in self.__repository.branches:
            LOGGER.error("Branch {0} is no in {1}".format(branch, uri))
            raise Exception("Branch {0} is no in {1}".format(branch, uri))
        self.__fill_data(branch, specific_sha)
        self.eval_commit_to_future(threshold, correction)

    def return_repository_path(self):
        """Return the path of the repository that is being analysed."""
        return self.__tmp_repository

    @staticmethod
    def _shas_from(ordered_shas, specific_sha):
        """Return the tail of ``ordered_shas`` starting at ``specific_sha``.

        The first sha that contains ``specific_sha`` as a substring marks
        the start of the returned slice.

        :raises Exception: when no sha contains ``specific_sha``.
        """
        for position, sha in enumerate(ordered_shas):
            if specific_sha in sha:
                return ordered_shas[position:]
        raise Exception("This sha {0} is not in this repo."
                        .format(specific_sha))

    def __fill_data(self, branch, specific_sha):
        """Load commit metadata and record the diffs of ``branch``.

        :raises Exception: when the branch cannot be walked or when
            ``specific_sha`` matches no commit of the branch.
        """
        LOGGER.info("Filling data to _data_frame")
        self._data_frame = DataFrame(
            self.__repository.commit_info(branch=branch))
        try:
            branch_shas = [sha.id for sha
                           in self.__repository.branch_walker(branch)]
        except ValueError:
            raise Exception(r"This repository dont have {0} branch".format
                            (branch))
        LOGGER.info(r"Go through master branch and using gittle.diff for"
                    "getting diff output")
        # The walker's order is reversed before processing.
        branch_shas = branch_shas[::-1]
        if specific_sha is not None:
            # The previous ``any(indices)`` test wrongly rejected a match
            # at index 0 (``any([0])`` is False); the helper tests for
            # "no match at all" instead.
            branch_shas = self._shas_from(branch_shas, specific_sha)
        self.size = len(branch_shas)
        self.diff_for_shas(branch_shas)

    def diff_for_shas(self, list_shas):
        """Call ``_diff`` for every sha in ``list_shas`` that has a diff."""
        for idx, sha in enumerate(list_shas):
            diff = self.__repository.diff(sha)
            if diff is None:
                continue
            # Materialise first: ``any()`` on a one-shot iterator would
            # consume entries that ``_diff`` still has to see.
            diff = list(diff)
            if not any(diff):
                continue
            self._diff({"sha": sha,
                        "diff": diff,
                        "index": idx})

    def _diff(self, params):
        """Record added/removed lines for every ``.py`` file of one commit.

        ``params`` carries ``sha``, ``diff`` (iterable of per-file diff
        dicts) and ``index``.  Files matching ``setup.py`` are skipped.
        One row per file is appended to ``self.files[<path>]`` and the
        annotated diff is appended to ``diff_<project_name>.txt``.
        """
        author = self.find_author_by_sha(params["sha"])
        rtime = self.find_time_by_sha(params["sha"])
        for dict_diff in params["diff"]:
            fname = dict_diff["new"]["path"]
            if fname == '' or fname is None:
                fname = dict_diff["old"]["path"]
            if re.search(r".*\.py$", fname) is None:
                continue
            if re.search(r"setup.py", fname) is not None:
                continue
            if self.specific_sha is not None:
                # With a starting sha, only files touched by the first
                # processed commit are followed afterwards.
                if self.__first:
                    self.__spec_file.append(fname)
                elif fname not in self.__spec_file:
                    continue

            lines = dict_diff["diff"].split("\n")
            list_added = []
            list_removed = []
            add_hashed = []
            rem_hashed = []
            line_num, rem_line = 0, 0
            removed, added = 0, 0

            # Counting for graphs starts at 0, not at 1.
            header = "\nindex: {0}".format(params["index"])
            header += " sha: {0}".format(params["sha"])
            header += " date: {0}\n".format(time.ctime(rtime))
            header += "LN\tRL\tDIFF\n"
            name_diff = "diff_{0}.txt".format(self.project_name)
            with open(name_diff, "a") as diff_file:
                diff_file.write(header)
                for line in lines:
                    if not line:
                        continue
                    if line.startswith(('diff ', 'index ', '--- ',
                                        '+++ ', 'new mode')):
                        continue
                    if line.startswith('@@ '):
                        # Hunk header: "@@ -old[,n] +new[,n] @@ ...".
                        _, old_nr, new_nr, _ = line.split(' ', 3)
                        line_num = abs(int(new_nr.split(',')[0]))
                        rem_line = abs(int(old_nr.split(',')[0]))
                        continue
                    if line.startswith(' '):
                        line_num += 1
                        rem_line += 1
                    if line[0] == '-':
                        removed += 1
                        list_removed.append(rem_line)
                        rem_hashed.append(
                            line[1:].encode('base64', 'strict'))
                        rem_line += 1
                    if line[0] == '+':
                        added += 1
                        list_added.append(line_num)
                        add_hashed.append(
                            line[1:].encode('base64', 'strict'))
                        line_num += 1
                    diff_file.write("{0}\t{1}\t{2}\n".format(
                        (line_num - 1), (rem_line - 1), line))

            changed = added - abs(removed)
            dict_df = [{
                "added_lines": list_added,
                "removed_lines": list_removed,
                "hash_added": add_hashed,
                "hash_removed": rem_hashed,
                "author": author,
                "sha": params["sha"],
                "range": params["index"],
                "rating_one": 100,
                "rating_two": 100,
                "removed_counter": removed,
                "added_counter": added,
                "changed_lines": changed,
                "modification": 0.0,
                "time": rtime,
                "file": fname,
            }]
            if fname in self.files:
                new_row = DataFrame(dict_df)
                self.files[fname] = self.files[fname].append(
                    new_row, ignore_index=True)
            else:
                self.files[fname] = DataFrame(dict_df)
        self.__first = False

    def eval_commit_to_future(self, thresh_fl, correction):
        """Rate every recorded commit by how long its added lines survive.

        For each row (commit) of each file, later rows are scanned and
        every removed line that matches one of this commit's added lines
        is struck off.

        * ``rating_one``: starts at 4; when all added lines were removed
          again it becomes ``calculate_rating(distance, threshold)``.
          With ``correction`` it is lowered by 2 or 1 when fewer than a
          quarter / half of the added lines are left.  Stored as
          ``(rating / 4) * 100``.
        * ``rating_two``: starts at 100 and loses ``100 / added_lines``
          for every added line that is removed later.

        Commits whose added lines are all empty (or that added nothing)
        keep their default ratings.

        :param thresh_fl: when true, stop scanning a commit once the row
            index reaches five times the threshold.
        :param correction: enable the survival-based correction above.
        """
        for file_name in self.files:
            file_ = self.files[file_name]
            # Number of rows recorded for this file.
            size = file_.count().values[0]
            for row in xrange(size):
                start_index = row + 1
                # Number of later commits that can be inspected.
                max_size = size - start_index
                added_lines = list(file_.at[row, "hash_added"])
                threshold = max_size * 0.1
                rating = 4
                # Special case: the commit only removed lines.
                if not any(added_lines):
                    continue
                rating_two = 100
                minus_val = len(added_lines)
                for idx in xrange(start_index, size):
                    for line in file_.at[idx, "hash_removed"]:
                        if line in added_lines:
                            added_lines.remove(line)
                            # NOTE(review): integer division on Python 2;
                            # confirm the truncation is intended.
                            rating_two -= (100 / minus_val)
                    if not added_lines:
                        range_ = idx - start_index
                        rating = calculate_rating(range_, threshold)
                        break
                    if thresh_fl and idx >= (threshold * 5):
                        rating = 4
                        break
                first_added = len(file_.at[row, "hash_added"])
                if correction:
                    if rating > 1 and len(added_lines) < first_added / 4:
                        rating -= 2
                    elif rating > 0 and len(added_lines) < first_added / 2:
                        rating -= 1
                # NOTE(review): with an int rating this is integer
                # division on Python 2 -- confirm that is intended.
                rating = (rating / 4) * 100
                self.files[file_name].at[row, "rating_one"] = rating
                self.files[file_name].at[row, "rating_two"] = rating_two

    def find_author_by_sha(self, sha):
        """Return the author name stored for ``sha`` or ``None``."""
        # Guard first so an empty sha never reaches the DataFrame lookup.
        if sha == '' or sha == []:
            return None
        index = self._data_frame[self._data_frame.sha == sha]["author"]
        try:
            return index.values[0]["name"]
        except IndexError:
            LOGGER.warning(r"Sha {0}, {1} is not in data frame.".format(
                sha, index))
            return None

    def find_time_by_sha(self, sha):
        """Return the timestamp stored for ``sha`` or ``None``."""
        # Guard first so an empty sha never reaches the DataFrame lookup.
        if sha == '' or sha == []:
            return None
        index = self._data_frame[self._data_frame.sha == sha].index
        try:
            return self._data_frame.time[index].values[0]
        except IndexError:
            LOGGER.warning("Sha {0}, {1} is not in data frame.".format(
                sha, index))
            return None

    def rollback(self, sha):
        """Check out the version given by ``sha``; failures are logged."""
        try:
            self.__repository.checkout_all(sha)
        except (IOError, KeyError):
            LOGGER.warning("Couldn't rollback on sha {0}.".format(sha))

    def get_git_data(self):
        """Return ``(commit DataFrame, dict of per-file DataFrames)``."""
        return (self._data_frame, self.files)
class GitStat:
    """Per-committer statistics parsed from a git repository.

    After construction:

    * ``User[name][file]`` holds ``'modify'`` (number of parsed commits
      touching the file), ``'time'`` (list of commit times), ``'exist'``
      (whether the file is present in the clone) and, per hunk range
      string, a dict with ``'counter'`` and ``'time'``.
    * ``Commits[name][file]`` holds ``'array_commits'`` (diff texts) and
      ``'time'`` (commit times).
    """

    def __init__(self, git_path, rskey=None, logger=dg):
        """Clone ``git_path`` into /tmp and build the statistics.

        :param git_path: repository location handed to ``Gittle.clone``.
        :param rskey: optional path of a private key file used for auth.
        :param logger: stored on the instance; not called by this class.
        """
        self.User = {}
        self.Commits = {}
        self.__logger = logger
        self.__tmp_repository = "/tmp/temporary_git_repository"
        if os.path.exists(self.__tmp_repository):
            # Avoid clashing with an earlier clone: add a timestamp.
            self.__tmp_repository = (self.__tmp_repository + "_" +
                                     datetime.datetime.now().isoformat())
        print(git_path)
        try:
            Gittle.clone(git_path, self.__tmp_repository)
        except Exception:
            # Best effort: a failed clone is tolerated here; opening the
            # repository below is what reports an unusable path.
            pass
        self.__repository = Gittle(self.__tmp_repository)
        if rskey is not None:
            # NOTE(review): the key file is never closed; confirm whether
            # auth() reads it eagerly before adding a ``with`` block.
            key_file = open(rskey)
            self.__repository.auth(pkey=key_file)
        self.fill_User()

    def parse(self, name, commit, time):
        """Record one commit diff for committer ``name``.

        Only the first ``diff --git`` header and the first ``@@`` hunk
        header found in ``commit`` are used.

        :param name: committer name; must already be a key of ``User``
            and ``Commits``.
        :param commit: diff text of the commit.
        :param time: commit time, appended to the ``'time'`` lists.
        :returns: the ``a/`` path from the first ``diff --git`` header.
        :raises AttributeError: when ``commit`` has no file or hunk header.
        """
        file_pattern = re.compile("diff --git a/(.*) b/(.*)")
        line_pattern = re.compile("@@ (.*) (.*) @@")
        file1 = file_pattern.search(commit).group(1)
        line1 = line_pattern.search(commit).group(1)

        user_files = self.User[name]
        if file1 not in user_files:
            # First commit seen for this file: create empty records.
            user_files[file1] = {'modify': 0, 'time': []}
            self.Commits[name][file1] = {'array_commits': [], 'time': []}
        file_stats = user_files[file1]

        if line1 not in file_stats:
            file_stats[line1] = {"counter": 0, 'time': []}
        file_stats[line1]["counter"] += 1
        file_stats[line1]['time'].append(time)

        file_stats['modify'] += 1
        file_stats['time'].append(time)
        file_stats['exist'] = os.path.isfile(
            self.__tmp_repository + '/' + file1)

        commit_log = self.Commits[name][file1]
        commit_log['array_commits'].append(commit)
        commit_log['time'].append(time)
        return file1

    def fill_User(self):
        """Parse the first diff of every commit into User/Commits."""
        for commit in self.__repository.commit_info():
            try:
                committer = commit['committer']['name']
                if committer not in self.User:
                    self.User[committer] = {}
                    self.Commits[committer] = {}
                first_diff = self.__repository.diff(commit['sha'])[0]
                self.parse(committer, first_diff['diff'], commit['time'])
            except Exception:
                # Best effort: commits that cannot be parsed (for example
                # no diff or no hunk header) are skipped.
                pass

    def return_User(self):
        """Return the per-committer file statistics."""
        return self.User

    def return_commits(self):
        """Return the per-committer commit log."""
        return self.Commits
from gittle import Gittle

# Work on the repository found in the current directory.
repo = Gittle('.')

# Take the 'sha' of the commit_info() entries at positions 1 and 2.
recent_infos = repo.commit_info()[1:3]
lastest = [info['sha'] for info in recent_infos]

# Show the 'classic' diff between those two commits.
print(repo.diff(*lastest, diff_type='classic'))

print(""" Last Diff """)

# Show every entry of the diff for HEAD.
last_diff = repo.diff('HEAD')
print(list(last_diff))