Beispiel #1
0
    class GitData(object):
        """
        This class is for getting contribution, users and other data from
        Git repository.
        """
        def __init__(self, uri, branch="master", project_name="project",
                         specific_sha=None, threshold=False, correction=False):
            """
            Open (or clone) a repository and collect its per-file commit data.

            :param uri: path of a local repository, or a remote address
                matching GIT_URL; a remote address is cloned first.
            :param branch: branch to analyse; it must be listed in the
                repository's branches.
            :param project_name: used to build the name of the
                ``diff_<project_name>.txt`` dump; a dump left over in the
                current working directory is removed.
            :param specific_sha: forwarded to the data-filling step to limit
                which commits are processed.
            :param threshold: forwarded to eval_commit_to_future.
            :param correction: forwarded to eval_commit_to_future.
            :raises Exception: when cloning fails or ``branch`` is missing.
            """
            # Function-scope import: only the fallback clone needs it.
            import subprocess

            self._data_frame = None
            self.files = {}
            self.project_name = project_name
            self.git_repository = uri
            git_url = re.compile(GIT_URL)
            _uri_safe = ''.join([c for c in uri if c.isalnum()])
            self.repo_path = os.path.join(TMP_DIR, _uri_safe)
            self.__tmp_repository = self.repo_path
            self.index_sha = 0
            self.size = 0
            self.__first = True
            self.__spec_file = []
            self.specific_sha = specific_sha
            if os.path.exists(self.repo_path):
                # Never reuse a previously cloned copy.
                shutil.rmtree(self.repo_path)
            diff_dump = "diff_{0}.txt".format(self.project_name)
            if os.path.exists(diff_dump):
                os.remove(diff_dump)
            if git_url.search(uri) is None:
                # Not a remote address: open the path in place.
                self.__repository = Gittle(self.git_repository)
                self.__tmp_repository = self.git_repository
            else:
                # Test the suffix, not a substring: host names such as
                # "code.gitea.io" contain ".git" without ending in it.
                trimmed = self.git_repository.rstrip("/")
                if not trimmed.endswith(".git"):
                    LOGGER.info(r"Must end .git i will add manualy")
                    self.git_repository = trimmed + ".git"
                try:
                    LOGGER.info(r'Cloning git repo: {0}'.format(self.repo_path))
                    Gittle.clone(self.git_repository, self.__tmp_repository)
                except InvalidRemoteUrl as err:
                    raise Exception(r"Could not clone repository! Is not url."
                        " Error: {0}".format(err))
                except ValueError as err:
                    raise Exception(r"Is not url."
                        " Error: {0}".format(err))
                except KeyError as err:
                    raise Exception(r"Could not clone repository."
                        " Error: {0}".format(err))
                except RefFormatError:
                    # Fall back to the git command line client.
                    n_path = "/tmp/{0}".format(_uri_safe)
                    if os.path.exists(n_path):
                        # Never reuse a previously cloned copy.
                        shutil.rmtree(n_path)
                    # An argument list (no shell) keeps caller-supplied text
                    # in uri/branch from being interpreted as shell syntax.
                    command = ["git", "clone"]
                    if branch is not None:
                        command += ["-b", branch]
                    command += ["--", uri, n_path]
                    try:
                        status = subprocess.call(command,
                                                 stderr=subprocess.STDOUT)
                    except OSError as err:
                        # Still best-effort: opening the repository below
                        # reports the real failure.
                        LOGGER.error("Could not run git: {0}".format(err))
                    else:
                        if status != 0:
                            LOGGER.error("git clone exited with status {0}"
                                         .format(status))
                    self.__tmp_repository = n_path
                self.__repository = Gittle(self.__tmp_repository, origin_uri=uri)
                self.__repository.DEFAULT_BRANCH = branch
            if branch not in self.__repository.branches:
                LOGGER.error("Branch {0} is no in {1}".format(branch, uri))
                raise Exception("Branch {0} is no in {1}".format(branch, uri))
            self.__fill_data(branch, specific_sha)
            self.eval_commit_to_future(threshold, correction)

        def return_repository_path(self):
            """ This method returns path to tmp repository"""
            return self.__tmp_repository

        def __fill_data(self, branch, specific_sha):
            """ This method fill and parsing data to DataFrame."""
            LOGGER.info("Filling data to _data_frame")
            self._data_frame = DataFrame(self.__repository.commit_info(branch=branch))
            try:
                __branch = [sha.id for sha in
                    self.__repository.branch_walker(branch)]
            except ValueError:
                raise Exception(r"This repository dont have {0} branch".format
                                (branch))
            LOGGER.info(r"Go through master branch and using gittle.diff for"
            "getting diff output")
            if specific_sha is None:
                __branch = __branch[::-1]
                self.size = len(__branch)
                self.diff_for_shas(__branch)
            else:
                __branch = __branch[::-1]
                after_comm = [idx for idx, found in enumerate(__branch)
                                        if found.find(specific_sha) >= 0]
                if not any(after_comm):
                    raise Exception("This sha {0} is not in this repo.".format(specific_sha))
                after_sha = __branch[after_comm[0]:]
                self.size = len(after_sha)
                self.diff_for_shas(after_sha)

        def diff_for_shas(self, list_shas):
            """
            Method for itereting through list od shas and call _diff
            method.
            """
            for idx, sha in enumerate(list_shas):
                diff = None
                diff = self.__repository.diff(sha)
                if diff is None or not any(diff):
                    continue
                self._diff({"sha": sha,
                             "diff": diff,
                            "index": idx
                            })

        def _diff(self, params):
            """
            This method take diff and returns from this output added and
            removed lines for evaluation. Also creates file with diff output.
            """
            author = self.find_author_by_sha(params["sha"])
            rtime = self.find_time_by_sha(params["sha"])
            diff = params["diff"]
            for dict_diff in diff:
                fname = dict_diff["new"]["path"]
                if fname == '' or fname is None:
                    fname = dict_diff["old"]["path"]
                if re.search(r".*\.py$", fname) is None:
                    continue
                if re.search(r"setup.py", fname) is not None:
                    continue
                if self.specific_sha is not None:
                    if self.__first:
                        self.__spec_file.append(fname)
                    elif fname not in self.__spec_file:
                        continue
                lines = dict_diff["diff"].split("\n")
                list_lines = []
                list_added = []
                list_removed = []
                add_hashed = []
                rem_hashed = []
                line_num, rem_line = 0, 0
                removed, added, changed = 0, 0, 0
                #counting for graphs is from 0 no from 1
                diff = "\nindex: {0}".format(params["index"])
                diff += " sha: {0}".format(params["sha"])
                diff += " date: {0}\n".format(time.ctime(rtime))
                diff += "LN\tRL\tDIFF\n"
                name_diff = "diff_{0}.txt".format(self.project_name)
                with open(name_diff, "a") as diff_file:
                    diff_file.write(diff)
                    for line in lines:
                        if len(line) <= 0 or line is None:
                            continue
                        if (line.startswith('diff ') or
                            line.startswith('index ') or
                            line.startswith('--- ') or
                            line.startswith('+++ ') or
                            line.startswith('new mode')):
                            continue
                        if line.startswith('@@ '):
                            _, old_nr, new_nr, _ = line.split(' ', 3)
                            line_num = abs(int(new_nr.split(',')[0]))
                            rem_line = abs(int(old_nr.split(',')[0]))
                            continue
                        if line.startswith(' '):
                            line_num += 1
                            rem_line += 1
                        if line[0] == '-':
                            removed += 1
                            list_removed.append(rem_line)
                            rem_hashed.append(line[1:].encode('base64', 'strict'))
                            rem_line += 1
                        if line[0] == '+':
                            added += 1
                            list_added.append(line_num)
                            add_hashed.append(line[1:].encode('base64', 'strict'))
                            list_lines.append(line_num)
                            line_num += 1
                        diff_file.write("{0}\t{1}\t{2}\n".format((line_num - 1),
                                        (rem_line - 1), line))
                changed = added - abs(removed)
                dict_df = [{
                            "added_lines": list_added,
                            "removed_lines": list_removed,
                            "hash_added": add_hashed,
                            "hash_removed": rem_hashed,
                            "author": author,
                            "sha": params["sha"],
                            "range": params["index"],
                            "rating_one": 100,
                            "rating_two": 100,
                            "removed_counter": removed,
                            "added_counter" : added,
                            "changed_lines": changed,
                            "modification": 0.0,
                            "time": rtime,
                            "file": fname
                            }]
                if fname in self.files:
                    __tmp = DataFrame(dict_df)
                    self.files[fname] = self.files[fname].append(__tmp, ignore_index=True)
                else:
                    self.files[fname] = DataFrame(dict_df)
                self.__first = False

        def eval_commit_to_future(self, thresh_fl, correction):
            """
            Using this method I will check through all the saved commits and
            evaluate the theoretic quality of these commits. Both algorithms
            used here are based on a simple principle of added and removed
            lines. According to these algorithms, there is a calculation of
            the rating.
            Output data structure is:
            sha1-> range, removed_counter, added_counter, added_lines,
            removed_lines, rating_one, rating_two etc.
            sha2-> range, removed_counter, added_counter, added_lines,
            removed_lines, rating etc.
            etc.
            The direction of rating goes from the first commit to the last one.
            There are two different ways. The first algorithm ends if there are
            no lines left of if it reaches the end of list. After that the rate
            is being calculated.
            The second algorithm is set for default rating value of 100. It
            subtracts value from this default one and is calculated this way:
            value = (100/added_lines) * removed lines
            """
            for file_name in self.files.keys():
                file_ = self.files[file_name]
                #get size of df
                size = file_.count().values[0]
                for row in xrange(size):
                    #start index
                    start_index = row + 1
                    # how many commits we must iterate
                    max_size = size - start_index
                    added_lines = list(file_.at[row, "hash_added"])
                    threshold = max_size * 0.1
                    rating = 4
                    #special case when was only removing
                    if not any(added_lines):
                        continue
                    rating_two = 100
                    minus_val = len(added_lines)
                    for idx in xrange(start_index, size):
                        for line in (file_.at[idx, "hash_removed"]):
                            if line in added_lines:
                                added_lines.remove(line)
                                rating_two -= (100 / minus_val)
                                added_lines = added_lines
                        if len(added_lines) <= 0:
                            range_ = idx - start_index
                            rating = calculate_rating(range_, threshold)
                            break
                        if thresh_fl and idx >= (threshold * 5):
                            rating = 4
                            break
                    first_added = len(file_.at[row, "hash_added"])
                    if correction:
                        if rating > 1 and len(added_lines) < first_added / 4:
                            rating -= 2
                        elif rating > 0 and len(added_lines) < first_added / 2:
                            rating -= 1
                    rating = (rating / 4) * 100
                    self.files[file_name].at[row, "rating_one"] = rating
                    self.files[file_name].at[row, "rating_two"] = rating_two

        def find_author_by_sha(self, sha):
            """This method finds the author by sha in dataFrame. If not found
               return None.
            """
            index = self._data_frame[self._data_frame.sha == sha]["author"]
            if sha == '' or sha == []:
                return None
            try:
                return index.values[0]["name"]
            except IndexError:
                LOGGER.warning(r"Sha {0}, {1} is not in data frame.".format(sha, index))
            return None

        def find_time_by_sha(self, sha):
            """This method finds timestamp by sha in dataFrame. If not found
               return None.
            """
            index = self._data_frame[self._data_frame.sha == sha].index
            if sha == '' or sha == []:
                return None
            try:
                return self._data_frame.time[index].values[0]
            except IndexError:
                LOGGER.warning("Sha {0}, {1} is not in data frame.".format(sha, index))
            return None

        def rollback(self, sha):
            """
            This method will make rollback to version which is set by sha.
            """
            try:
                self.__repository.checkout_all(sha)
            except IOError:
                LOGGER.warning("Couldn't rollback on sha {0}.".format(sha))
            except KeyError:
                LOGGER.warning("Couldn't rollback on sha {0}.".format(sha))

        def get_git_data(self):
            """ This method returns data frame for project or None. """
            return (self._data_frame, self.files)