def git_pull(remote_url, local, idc_dir):
    """
    @remote_url: git remote address
    @local: local git repo dir
    """
    commit_msg = None
    if not os.path.exists(local):
        os.mkdir(local)
    logging.info('git start pull from remote repo.')
    if not os.path.exists("{0}/.git".format(local)) or not git.repo.fun.is_git_dir(
            "{0}/.git".format(local)):
        repo_clone = Repo.clone_from(remote_url, local)
        if not repo_clone:
            logging.debug("clone repo from {0} to {1} failed.\n".format(
                remote_url, local))
            return
    repo = Repo(local)
    remote = None
    git_commit = git.Git(idc_dir)
    last_commit_log = "/var/log/last_commit.log"
    last_commit = None
    if os.path.exists(last_commit_log):
        with open(last_commit_log) as fhand:
            last_log_commit = fhand.read().strip()
        # Warning: for performance, only the latest 500 commits are checked;
        # consider persisting this to MySQL instead.
        for commit in repo.iter_commits('master', max_count=500):
            if last_log_commit == commit.hexsha:
                last_commit = last_log_commit
                break
    if not last_commit:
        last_commit = repo.head.commit.hexsha
    with FileLock(os.path.split(local)[1]) as _flock:
        for remote_name in repo.remotes:
            if remote_name.url == remote_url:
                remote = remote_name
                break
        if remote is None:
            remote_name = "%x" % (random.randint(0, 1000000) ^ int(time.time()))
            remote = repo.create_remote(remote_name, remote_url)
        ret = 0
        try:
            info_list = remote.pull()
            for info in info_list:
                ret = info.flags & (git.FetchInfo.REJECTED | git.FetchInfo.ERROR)
                if ret > 0:
                    logging.warning("[conf-pull] pull from remote error!\n")
            commit_msg = git_commit.log("{0}..HEAD".format(last_commit),
                                        "--pretty=%H,%ai,%s", idc_dir)
        except (git.GitCommandError, git.RepositoryDirtyError) as e:
            logging.warning("git pull error, error={0}\n".format(str(e)))
        except Exception:
            logging.warning("git pull error, other exception.\n")
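# Usage sketch (not part of the original snippet): one way git_pull might be
# driven. The remote URL and directories below are hypothetical placeholders,
# and the surrounding module is assumed to import os, logging, random, time,
# git, Repo (from git) and a FileLock context manager.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    git_pull(remote_url='https://example.com/conf/configs.git',  # hypothetical remote
             local='/data/conf_repo',                            # hypothetical clone directory
             idc_dir='/data/conf_repo/idc')                      # hypothetical directory whose log is collected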
def getCommits(repo_name):
    shas = set()
    commits_list = []
    repo = Repo(os.path.join(work_dir, repo_name))
    for b in repo.remote().fetch():
        if '/' not in b.name:
            continue
        print("start to check branch {} of {}".format(b.name, repo_name))
        branch_name = b.name.split('/')[1]
        repo.git.checkout('-B', branch_name, b.name)
        commits = list(repo.iter_commits())
        for idx, commit in enumerate(commits):
            if str(commit) in shas:
                continue
            else:
                shas.add(str(commit))
            commit_collection = {}
            xml_cnt = 0
            kot_jav_cnt = 0
            file_list = list(commit.stats.files)
            for file in file_list:
                if len(file) > 4 and file[-4:] == '.xml':
                    xml_cnt += 1
                elif len(file) > 3 and file[-3:] == '.kt' or len(
                        file) > 5 and file[-5:] == '.java':
                    kot_jav_cnt += 1
            if xml_cnt >= 1 and kot_jav_cnt >= 1:
                commit_collection["commit_id"] = str(commit)
                commit_collection["commit_msg"] = str.strip(commit.message)
                commit_collection["commit_time"] = str(commit.committed_datetime)
                commit_collection["committer_name"] = commit.author.name
                commit_collection["committer_email"] = commit.author.email
                diff_files = []
                if not commit.parents:
                    continue
                else:
                    for diff in commit.parents[0].diff(commit):
                        diff_file = {}
                        diff_file["file_path"] = diff.a_path
                        diff_file["change_type"] = diff.change_type
                        diff_file["lang"] = os.path.splitext(diff.a_path)[1][1:]
                        diff_files.append(diff_file)
                    commit_collection["diff_files"] = diff_files
                commit_collection["parent_commit_num"] = len(commit.parents)
                commits_list.append(commit_collection)
    repo_dump = json.dumps(commits_list, indent=2, ensure_ascii=False)
    with open(os.path.join(stats_dir, repo_name + ".json"), "w") as commit_fd:
        commit_fd.write(repo_dump)
    print(repo_name + " done")
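# Usage sketch (not part of the original snippet): getCommits expects work_dir
# and stats_dir to be module-level globals and the repository to already be
# cloned beneath work_dir; the names below are hypothetical.
if __name__ == '__main__':
    work_dir = '/data/android_repos'    # hypothetical checkout root
    stats_dir = '/data/commit_stats'    # hypothetical output directory
    getCommits('example-android-app')   # hypothetical repo directory name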
class GitSync(object):
    def __init__(self, database):
        logging.info("Initializing GitSync.")
        self.database = database
        self.charts = dict()
        self.url = DEFAULT_GITREPO
        try:
            self.repo = Repo(REPO_DIRECTORY)
        except InvalidGitRepositoryError:
            logging.info("Cloning repository in %s", REPO_DIRECTORY)
            self.repo = Repo.clone_from(self.url, REPO_DIRECTORY)

    @coroutine
    def update_repo(self, document):
        if document['o']["charts"]["repo_url"] != self.url:
            logging.info("Repo url updated to '%s'.",
                         document['o']["charts"]["repo_url"])
            self.url = document['o']["charts"]["repo_url"]

    @coroutine
    def sync_loop(self):
        logging.info("Initializing sync loop.")
        yield add_callback("Settings", self.update_repo)

        synced_head = None
        settings = yield Query(self.database, "Settings").find_one() or dict()
        if settings["charts"]:
            self.url = settings["charts"]["repo_url"]

        while True:
            try:
                if self.url != self.repo.remotes.origin.url:
                    logging.info("Changing reference from '%s' to '%s'.",
                                 self.repo.remotes.origin.url, self.url)
                    self.repo.delete_remote('origin')
                    self.repo.create_remote('origin', url=self.url)
                    synced_head = None

                self.repo.git.fetch('--all')
                self.repo.git.reset('--hard', 'origin/master')

                if synced_head != self.repo.head.ref.commit:
                    yield self.sync()
                    synced_head = self.repo.head.ref.commit
            except:
                logging.exception("Failed to pull repository.")

            yield sleep(5)

    @coroutine
    def sync(self):
        logging.info("Syncing '%s'.", REPO_DIRECTORY)

        charts = yield Query(self.database, "Charts", manipulate=True).find()
        for chart in charts:
            path = chart["path"]
            self.charts[path] = chart

        discovered_charts = dict()
        for subdir, _, files in os.walk(REPO_DIRECTORY):
            for chart_file in files:
                if chart_file == "Chart.yaml":
                    try:
                        discovered_charts[subdir] = yield self.import_chart(subdir)
                    except Exception:
                        logging.exception("Failed to import chart at '%s'", subdir)

        for path, existing in self.charts.iteritems():
            discovered = discovered_charts.get(path, None)
            if discovered is None:
                logging.debug("Deleting chart %(name)s", existing)
                yield Query(self.database, 'Charts').remove(existing)
            else:
                discovered["_id"] = existing["_id"]
                discovered["metadata"] = existing["metadata"]
                if discovered["commit"] != existing["commit"]:
                    logging.debug("Updating existing chart %(name)s", discovered)
                    yield Query(self.database, "Charts",
                                manipulate=True).update(discovered)

        for path, discovered in discovered_charts.iteritems():
            if discovered and "_id" not in discovered:
                logging.debug("Inserting new chart %(name)s", discovered)
                try:
                    yield Query(self.database, "Charts",
                                manipulate=True).insert(discovered)
                except:
                    logging.error("Failed to insert chart %(name)s", discovered)

        self.charts = discovered_charts

    @coroutine
    def import_chart(self, directory):
        chart_path = os.path.join(directory, "Chart.yaml")
        with open(chart_path, "r") as stream:
            chart = load(stream)

        chart["path"] = directory

        commit = self.repo.iter_commits(paths=chart_path).next()
        chart["commit"] = binascii.hexlify(commit.binsha)
        chart["committed_date"] = commit.committed_date
        chart["resources"] = []

        manifests = yield self.import_manifests(directory)
        for _, manifest in manifests.iteritems():
            if commit.committed_date < manifest["commit"].committed_date:
                chart["commit"] = binascii.hexlify(manifest["commit"].binsha)
                chart["committed_date"] = manifest["commit"].committed_date

            for resource in manifest["resources"]:
                chart["resources"].append(resource)

        raise Return(chart)

    @coroutine
    def import_manifests(self, directory):
        manifests = dict()

        manifests_path = os.path.join(directory, "manifests", "*.yaml")
        for manifest in glob.glob(manifests_path):
            with open(manifest, "r") as stream:
                manifests[manifest] = dict(
                    resources=[resource for resource in load_all(stream)],
                    commit=self.repo.iter_commits(paths=manifest).next())

        manifests_path = os.path.join(directory, "templates", "*.yaml")
        for manifest in glob.glob(manifests_path):
            manifest_filename = ntpath.basename(manifest)
            rendered_manifest = check_output([
                "tide", "view", "-f", "templates/" + manifest_filename, directory
            ])
            with io.TextIOWrapper(io.BytesIO(rendered_manifest)) as stream:
                manifests[manifest] = dict(
                    resources=[resource for resource in load_all(stream)],
                    commit=self.repo.iter_commits(paths=manifest).next())

        raise Return(manifests)
class DLLearnerRepo(Iterator):
    """
    TODO:
    - add progress bar for repo download
    - find out automatically whether the repo is already cloned and, in case
      it is, just run a pull (after having set the branch)
    """
    _commits_get_url = 'https://api.github.com/repos/AKSW/DL-Learner/commits' + \
        '?per_page=10000&since=%s&sha=%s'
    _github_repo_url = 'https://github.com/AKSW/DL-Learner.git'

    def __init__(self, repo_dir_path, since=None, branch='develop',
                 already_cloned=False):
        # strip off trailing directory separator
        if repo_dir_path.endswith(os.path.sep):
            self.repo_dir_path = repo_dir_path[:-1]
        else:
            self.repo_dir_path = repo_dir_path

        if since is None:
            self.since = datetime.now() - timedelta(default_time_delta_in_days)
        else:
            self.since = since

        self.branch = branch
        self.commit_sha1s = None
        self.next_idx = None

        self._setup_repo(already_cloned)

    def __len__(self):
        if self.commit_sha1s is None:
            self._init_commit_sha1s()
        return len(self.commit_sha1s)

    def _setup_repo(self, already_cloned):
        if already_cloned:
            self._git_repo = Repo(self.repo_dir_path)
        else:
            _log.info('Cloning repo from %s into %s' % (
                self._github_repo_url, self.repo_dir_path))
            self._git_repo = Repo.clone_from(
                self._github_repo_url, self.repo_dir_path)
            _log.info('-Done-')

        if self.branch:
            self._git_repo.git.checkout(self.branch)
            # self._git_repo.active_branch = self.branch

    def __iter__(self):
        return self

    def __next__(self):
        if self.commit_sha1s is None:
            self._init_commit_sha1s()

        self.next_idx += 1
        if self.next_idx >= len(self.commit_sha1s):
            raise StopIteration

        return DLLearnerCommit(self.commit_sha1s[self.next_idx], self)

    def _init_commit_sha1s(self):
        commit_sha1s = []
        # iter_commits walks the history backwards in time, i.e. the latest
        # commit comes first and the oldest comes last
        for c in self._git_repo.iter_commits():
            c_date = datetime.fromtimestamp(c.committed_date)
            if c_date < self.since:
                break
            commit_sha1s.append(c.hexsha)

        commit_sha1s.reverse()
        self.commit_sha1s = \
            [c for c in commit_sha1s if c not in commits_to_skip]
        self.next_idx = -1

    def get_checkout_cmd(self):
        return self._git_repo.git.checkout
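# Usage sketch (not part of the original class): iterating commits newer than a
# cutoff date. DLLearnerCommit, commits_to_skip, default_time_delta_in_days and
# _log are assumed to be defined elsewhere in the original module; the local
# path is a hypothetical placeholder.
if __name__ == '__main__':
    repo = DLLearnerRepo('/tmp/dl-learner', since=datetime(2019, 1, 1),
                         branch='develop', already_cloned=False)
    print('commits to process: %d' % len(repo))
    for commit in repo:   # yields DLLearnerCommit objects, oldest first
        print(commit)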
def getEventsOf(owner, repo_name):
    fix_commits = []
    # res = requests.get("https://api.github.com/repos/{}/{}/events".format(owner, repo_name))
    referencePairs = {}
    after = ""
    id = 0
    evt_cnt = 0
    while True:
        request = requests.post(
            'https://api.github.com/graphql',
            json={'query': getQuery(owner, repo_name, after)},
            headers={"Authorization": "Bearer %s" % token})
        result = request.json()
        # result = json.dumps(result['data'], indent=2)
        # with open("repo_name_" + str(id), "w") as fd:
        #     fd.write(result)
        # break
        if 'errors' in result:
            print(result)
            return 1
        search = result['data']['search']
        edges = search['edges']
        for event in edges:
            if len(event['node']) == 0:
                continue
            commits = event['node']['timelineItems']['nodes']
            if commits != []:
                for commit in commits:
                    commit_ = commit['commit']
                    if commit_ is None:
                        continue
                    if 'fix' in commit_['message']:
                        fix_commits.append(commit_['oid'])
                        referencePairs[event['node']['number']] = {
                            "creationdate": event['node']['createdAt'].replace(
                                'T', " ").replace('Z', " +0000"),
                            "resolutiondate": event['node']['closedAt'].replace(
                                'T', " ").replace('Z', " +0000"),
                            "hash": commit_['oid'],
                            "commitdate": commit_['committedDate'].replace(
                                'T', " ").replace('Z', " +0000"),
                        }
                        evt_cnt += 1
        # print("Issue count:", search['issueCount'], "number of edges:",
        #       len(search['edges']))
        # print("PageInfo:", search['pageInfo'], "cursor:",
        #       search['edges'][-1]['cursor'], "\n")
        if not search['pageInfo']['hasNextPage']:
            print("{}/{} done with {} commits, issue count {}".format(
                owner, repo_name, evt_cnt, search['issueCount']))
            break
        after = 'after: "%s"' % search['edges'][-1]['cursor']
        time.sleep(1)
        id += 1

    repo = Repo(os.path.join(work_dir, owner + '-' + repo_name))
    commits = list(repo.iter_commits())
    for idx, commit in enumerate(commits):
        if 'fix' in commit.message and str(commit) not in fix_commits:
            res = re.search(r'[+-]\d{2}:', str(commit.committed_datetime))
            referencePairs[str(commit)] = {
                "creationdate": str(commit.committed_datetime)[:res.start()] + " +0000",
                "resolutiondate": str(commit.committed_datetime)[:res.start()] + " +0000",
                "hash": str(commit),
                "commitdate": str(commit.committed_datetime)[:res.start()] + " +0000",
            }
            evt_cnt += 1

    print("{}/{} find {} fix commits in total".format(owner, repo_name, evt_cnt))
    if evt_cnt != 0:
        with open(os.path.join(res_dir, repo_name + "_issue.json"), "w") as fd:
            fd.write(json.dumps(referencePairs, indent=2))
        with open("evet_list", "a") as list_fd:
            list_fd.write("{} {}\n".format(owner, repo_name))
    return evt_cnt
def check(repo_obj):
    global row
    global commit_cnt
    global changed_files_cnt
    global all_commit_cnt
    global all_changed_cnt
    repo_name = repo_obj["owner"]["login"] + "-" + repo_obj["name"]
    # print("="*8, "start to check repo ", repo_name, "="*8)
    commit_total = 0
    commit_cross = 0
    percent = 0
    shas = set()
    repo = Repo(os.path.join(work_dir, repo_name))
    for b in repo.remote().fetch():
        if '/' not in b.name:
            continue
        print("start to check branch {} of {}".format(b.name, repo_name))
        branch_name = b.name.split('/')[1]
        repo.git.checkout('-B', branch_name, b.name)
        commits = list(repo.iter_commits())
        for idx, commit in enumerate(commits):
            if str(commit) in shas:
                continue
            else:
                shas.add(str(commit))
            if len(commit.parents) > 1:
                continue
            commit_total += 1
            xml_cnt = 0
            kot_jav_cnt = 0
            all_commit_cnt += 1
            # if idx == len(commits)-1:
            file_list = list(commit.stats.files)
            all_changed_cnt += len(file_list)
            for file in file_list:
                if len(file) > 4 and file[-4:] == '.xml':
                    xml_cnt += 1
                elif len(file) > 3 and file[-3:] == '.kt' or len(
                        file) > 5 and file[-5:] == '.java':
                    kot_jav_cnt += 1
            if xml_cnt >= 1 and kot_jav_cnt >= 1:
                commit_cnt += 1
                changed_files_cnt += len(file_list)
                commit_cross += 1
            # break
            # diff_index = commit.diff(commits[idx+1])
            # for diff_item in diff_index:
            #     if diff_item.a_path[-4:] == '.xml':
            #         xml_cnt += 1
            #     elif diff_item.a_path[-3:] == '.kt' or diff_item.a_path[-5:] == '.java':
            #         kot_jav_cnt += 1
            # if xml_cnt >= 1 and kot_jav_cnt >= 1:
            #     commit_cross += 1
    percent = float(commit_cross) / commit_total
    repo_name_full = repo_obj["full_name"]
    res = "{} Total: {}, Cross: {}, Percent: {}".format(
        repo_name_full, commit_total, commit_cross, percent)
    print(res)
    percent_coll.append((repo_obj["full_name"], percent * 100))
    # print("="*8, "check repo ", repo_name, "completed", "="*8)
    return commit_total, commit_cross, percent
def check(repo_name):
    # global commit_cnt
    # global changed_files_cnt
    # global all_commit_cnt
    # global all_changed_cnt
    global multi_lang
    global other
    shas = set()
    repo = Repo(os.path.join(work_dir, repo_name))
    for b in repo.remote().fetch():
        if '/' not in b.name:
            continue
        print("start to check branch {} of {}".format(b.name, repo_name))
        # branch_name = b.name.split('/')[1]
        # repo.git.checkout('-B', branch_name, b.name)
        commits = list(repo.iter_commits('remotes/' + b.name))
        for idx, commit in enumerate(commits):
            if str(commit) in shas:
                continue
            else:
                shas.add(str(commit))
            if len(commits[idx - 1].parents) > 1:
                continue
            if idx == 0:
                continue
            xml_cnt = 0
            kot_jav_cnt = 0
            # if idx == len(commits)-1:
            file_list = list(commits[idx - 1].stats.files)
            dir_set = set()
            for file in file_list:
                file = file.split(' => ')[-1]
                dir = file.split('/')[0]
                if dir == file:
                    dir_set.add('.')
                else:
                    dir_set.add(dir)
                if len(file) > 4 and file[-4:] == '.xml':
                    xml_cnt += 1
                elif len(file) > 3 and file[-3:] == '.kt' or len(
                        file) > 5 and file[-5:] == '.java':
                    kot_jav_cnt += 1
            dir_cnt = len(dir_set)
            patch = repo.git.diff(commit.tree, commits[idx - 1].tree).split('\n')
            hunks_cnt = 0
            added = 0
            deleted = 0
            for line in patch:
                if len(line) >= 2 and line[0] == '@' and line[1] == '@':
                    hunks_cnt += 1
                elif len(line) >= 1 and line[0] == '+' and (
                        len(line) < 3 or line[1] != '+' or line[2] != '+'):
                    added += 1
                elif len(line) >= 1 and line[0] == '-' and (
                        len(line) < 3 or line[1] != '-' or line[2] != '-'):
                    deleted += 1
            commit_type = ""
            if xml_cnt >= 1 and kot_jav_cnt >= 1:
                commit_type = "Multi-lang"
                multi_lang += 1
            else:
                commit_type = "Other"
                other += 1
            with open(os.path.join(csv_dir, metric_type[0] + ".csv"), 'a',
                      newline="") as csv_fd:
                csv_writer = csv.writer(csv_fd, delimiter=',', quotechar='"',
                                        quoting=csv.QUOTE_MINIMAL)
                csv_writer.writerow([commit_type, len(file_list)])
                if len(file_list) == 0:
                    print("!!" + str(commits[idx - 1]))
            with open(os.path.join(csv_dir, metric_type[1] + ".csv"), 'a',
                      newline="") as csv_fd:
                csv_writer = csv.writer(csv_fd, delimiter=',', quotechar='"',
                                        quoting=csv.QUOTE_MINIMAL)
                csv_writer.writerow([commit_type, hunks_cnt])
            with open(os.path.join(csv_dir, metric_type[2] + ".csv"), 'a',
                      newline="") as csv_fd:
                csv_writer = csv.writer(csv_fd, delimiter=',', quotechar='"',
                                        quoting=csv.QUOTE_MINIMAL)
                csv_writer.writerow([commit_type, added])
            with open(os.path.join(csv_dir, metric_type[3] + ".csv"), 'a',
                      newline="") as csv_fd:
                csv_writer = csv.writer(csv_fd, delimiter=',', quotechar='"',
                                        quoting=csv.QUOTE_MINIMAL)
                csv_writer.writerow([commit_type, deleted])
            with open(os.path.join(csv_dir, metric_type[4] + ".csv"), 'a',
                      newline="") as csv_fd:
                csv_writer = csv.writer(csv_fd, delimiter=',', quotechar='"',
                                        quoting=csv.QUOTE_MINIMAL)
                csv_writer.writerow([commit_type, dir_cnt])
            # break
            # diff_index = commit.diff(commits[idx+1])
            # for diff_item in diff_index:
            #     if diff_item.a_path[-4:] == '.xml':
            #         xml_cnt += 1
            #     elif diff_item.a_path[-3:] == '.kt' or diff_item.a_path[-5:] == '.java':
            #         kot_jav_cnt += 1
            # if xml_cnt >= 1 and kot_jav_cnt >= 1:
            #     commit_cross += 1
    print("repo {} done".format(repo_name))
def getQRCommits(repo_name):
    global QRCommits_cnt
    repo_cnt = 0
    commits_list = []
    shas = set()
    repo = Repo(os.path.join(work_dir, repo_name))
    for b in repo.remote().fetch():
        if '/' not in b.name:
            continue
        print("start to check branch {} of {}".format(b.name, repo_name))
        branch_name = b.name.split('/')[1]
        repo.git.checkout('-B', branch_name, b.name)
        commits = list(repo.iter_commits(branch_name))
        for idx, commit in enumerate(commits):
            if str(commit) in shas:
                continue
            else:
                shas.add(str(commit))
            commit_collection = {}
            xml_cnt = 0
            kot_jav_cnt = 0
            file_list = list(commit.stats.files)
            for file in file_list:
                if len(file) > 4 and file[-4:] == '.xml':
                    xml_cnt += 1
                elif len(file) > 3 and file[-3:] == '.kt' or len(
                        file) > 5 and file[-5:] == '.java':
                    kot_jav_cnt += 1
            if xml_cnt >= 1 and kot_jav_cnt >= 1:
                if idx != 0:
                    valid = False
                    for f in list(commits[idx - 1].stats.files):
                        if '.xml' in f or '.kt' in f or '.java' in f:
                            valid = True
                            break
                    if not valid:
                        continue
                    time_next_commit = commits[idx - 1].committed_datetime - \
                        commit.committed_datetime
                    if time_next_commit.total_seconds() / 60 <= 30 and len(
                            commit.parents) < 2 and len(commits[idx - 1].parents) < 2:
                        QRCommits_cnt += 1
                        repo_cnt += 1
                        commit_collection["commit_id"] = str(commit)
                        commit_collection["commit_time"] = str(commit.committed_datetime)
                        commit_collection["commit_msg"] = str.strip(commit.message)
                        commit_collection["remedy_id"] = str(commits[idx - 1])
                        commit_collection["remedy_time"] = str(
                            commits[idx - 1].committed_datetime)
                        commit_collection["remedy_msg"] = str.strip(
                            commits[idx - 1].message)
                        commits_list.append(commit_collection)
                    # diff_files = []
                    # if not commit.parents:
                    #     continue
                    # else:
                    #     for diff in commit.diff(commit.parents[0]):
                    #         diff_file = {}
                    #         diff_file["file_path"] = diff.a_path
                    #         diff_file["change_type"] = diff.change_type
                    #         diff_file["lang"] = os.path.splitext(diff.a_path)[1][1:]
                    #         diff_files.append(diff_file)
                    #     commit_collection["diff_files"] = diff_files
                    # commit_collection["parent_commit_num"] = len(commit.parents)
    repo_dump = json.dumps(commits_list, indent=2, ensure_ascii=False)
    with open(os.path.join(stats_dir, repo_name + "_" + str(repo_cnt)),
              "w") as commit_fd:
        commit_fd.write(repo_dump)
    print(repo_name + " done with {} commits".format(repo_cnt))
def computeRepo(repo_name):
    global issue_cnt
    global chinese_mst_cnt
    shas = set()
    print(8 * '=', "start to handle repo ", repo_name, 8 * '=')
    repo = Repo(os.path.join(repo_dir, repo_name))
    for b in repo.remote().fetch():
        if '/' not in b.name:
            continue
        print("start to check branch {} of {}".format(b.name, repo_name))
        branch_name = b.name.split('/')[1]
        repo.git.checkout('-B', branch_name, b.name)
        commits = list(repo.iter_commits(branch_name))
        for idx, commit in enumerate(commits):
            if str(commit) in shas:
                continue
            else:
                shas.add(str(commit))
            xml_cnt = 0
            kot_jav_cnt = 0
            for file in list(commit.stats.files):
                if len(file) > 4 and file[-4:] == '.xml':
                    xml_cnt += 1
                elif len(file) > 3 and file[-3:] == '.kt' or len(
                        file) > 5 and file[-5:] == '.java':
                    kot_jav_cnt += 1
            if xml_cnt >= 1 and kot_jav_cnt >= 1:
                msg = commit.message
                # msg = str.strip(
                #     translator.translate(msg, lang_tgt='en')).lower()
                # print(msg)
                if reg_issue.search(msg) and len(commit.parents) < 2:
                    msg = "fix " + msg
                    issue_cnt += 1
                    if check_contain_chinese(msg):
                        chinese_mst_cnt += 1
                        seg_list = jieba.cut(msg, cut_all=False)
                        cn = True
                    else:
                        seg_list = wordninja.split(msg.lower())
                        cn = False
                    for seg in seg_list:
                        if cn is True and check_contain_chinese(seg) is False:
                            seg_split_again = wordninja.split(seg)
                            for _seg in seg_split_again:
                                if (len(_seg) > 1 and _seg != '\t' and _seg != '\n'
                                        and _seg != '\r\n' and _seg not in stop_words):
                                    if _seg in word_bags:
                                        word_bags[_seg] += 1
                                    else:
                                        word_bags[_seg] = 1
                        elif cn is True:
                            if (len(seg) > 1 and seg != '\t' and seg != '\n'
                                    and seg != '\r\n' and seg not in stop_words):
                                if seg in word_bags:
                                    word_bags[seg] += 1
                                else:
                                    word_bags[seg] = 1
                        else:
                            if (len(seg) > 1 and seg != '\t' and seg != '\n'
                                    and seg != '\r\n' and seg not in stop_words):
                                if seg in word_bags:
                                    word_bags[seg] += 1
                                else:
                                    word_bags[seg] = 1
    print(8 * '=', "repo ", repo_name, "done", 8 * '=')
class GitArchiver(BaseArchiver):
    """Gitpython implementation of the base archiver."""

    name = "git"

    def __init__(self, config):
        """
        Instantiate a new Git Archiver.

        :param config: The wily configuration
        :type config: :class:`wily.config.WilyConfig`
        """
        try:
            self.repo = Repo(config.path)
        except git.exc.InvalidGitRepositoryError as e:
            raise InvalidGitRepositoryError from e

        self.config = config
        if self.repo.head.is_detached:
            self.current_branch = self.repo.head.object.hexsha
        else:
            self.current_branch = self.repo.active_branch
        assert not self.repo.bare, "Not a Git repository"

    def revisions(self, path: str, max_revisions: int) -> List[Revision]:
        """
        Get the list of revisions.

        :param path: the path to target.
        :type path: ``str``

        :param max_revisions: the maximum number of revisions.
        :type max_revisions: ``int``

        :return: A list of revisions.
        :rtype: ``list`` of :class:`Revision`
        """
        if self.repo.is_dirty():
            raise DirtyGitRepositoryError(self.repo.untracked_files)

        revisions = []
        for commit in self.repo.iter_commits(
                self.current_branch, max_count=max_revisions, reverse=True):
            tracked_files, tracked_dirs = get_tracked_files_dirs(self.repo, commit)
            if not commit.parents or not revisions:
                added_files = tracked_files
                modified_files = []
                deleted_files = []
            else:
                added_files, modified_files, deleted_files = whatchanged(
                    commit, self.repo.commit(commit.hexsha + "~1"))

            logger.debug(f"For revision {commit.name_rev.split(' ')[0]} found:")
            logger.debug(f"Tracked files: {tracked_files}")
            logger.debug(f"Tracked directories: {tracked_dirs}")
            logger.debug(f"Added files: {added_files}")
            logger.debug(f"Modified files: {modified_files}")
            logger.debug(f"Deleted files: {deleted_files}")

            rev = Revision(
                key=commit.name_rev.split(" ")[0],
                author_name=commit.author.name,
                author_email=commit.author.email,
                date=commit.committed_date,
                message=commit.message,
                tracked_files=tracked_files,
                tracked_dirs=tracked_dirs,
                added_files=added_files,
                modified_files=modified_files,
                deleted_files=deleted_files,
            )
            revisions.append(rev)
        return revisions[::-1]

    def checkout(self, revision: Revision, options: Dict):
        """
        Checkout a specific revision.

        :param revision: The revision identifier.
        :type revision: :class:`Revision`

        :param options: Any additional options.
        :type options: ``dict``
        """
        rev = revision.key
        self.repo.git.checkout(rev)

    def finish(self):
        """
        Clean up any state if processing completed/failed.

        For git, will checkout HEAD on the original branch when finishing.
        """
        self.repo.git.checkout(self.current_branch)
        self.repo.close()

    def find(self, search: str) -> Revision:
        """
        Search a string and return a single revision.

        :param search: The search term.
        :type search: ``str``

        :return: An instance of revision.
        :rtype: Instance of :class:`Revision`
        """
        commit = self.repo.commit(search)
        tracked_files, tracked_dirs = get_tracked_files_dirs(self.repo, commit)
        if not commit.parents:
            added_files = tracked_files
            modified_files = []
            deleted_files = []
        else:
            added_files, modified_files, deleted_files = whatchanged(
                commit, self.repo.commit(commit.hexsha + "~1"))

        return Revision(
            key=commit.name_rev.split(" ")[0],
            author_name=commit.author.name,
            author_email=commit.author.email,
            date=commit.committed_date,
            message=commit.message,
            tracked_files=tracked_files,
            tracked_dirs=tracked_dirs,
            added_files=added_files,
            modified_files=modified_files,
            deleted_files=deleted_files,
        )
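# Usage sketch (not part of the original archiver): walking recent revisions.
# GitArchiver only reads config.path in __init__ above, so a simple namespace
# object stands in here for wily's real WilyConfig, which would normally be
# loaded from the wily configuration.
if __name__ == "__main__":
    from types import SimpleNamespace

    archiver = GitArchiver(SimpleNamespace(path="."))
    for rev in archiver.revisions(".", max_revisions=10):
        print(rev.key, rev.author_name, rev.message.splitlines()[0])
    archiver.finish()  # check out the original branch again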
class RepoExporter:
    def __init__(self, source_repo_path, source_repo_branch, target_repo_path,
                 target_repo_relative_directory, target_repo_commit_branch):
        self.source_repo_path = source_repo_path
        self.source_repo_branch = source_repo_branch
        self.target_repo_path = target_repo_path
        self.target_repo_commit_branch = target_repo_commit_branch

        # Instantiate repo objects
        self.source_repo = Repo(self.source_repo_path)
        self.target_repo = Repo(self.target_repo_path)

        # saving the head of the source repo
        self.source_head = self.source_repo.head.name

        # extract repo names
        self.source_repo_name = get_repo_name(self.source_repo)
        self.target_repo_name = get_repo_name(self.target_repo)

        # export paths
        self.target_repo_archive_path_root = "{}/{}".format(
            COMMIT_EXPORT_PATH, self.source_repo_name)
        self.target_repo_archive_path_tar = "{}/{}".format(
            self.target_repo_archive_path_root, "commit_archive.tar")
        self.target_repo_directory = "{}/{}".format(
            self.target_repo_path, target_repo_relative_directory)

        # ignores to add at each commit
        self.ignores_to_add = extract_gitinore(self.source_repo_path)

        # initialize commit infos
        self._exact_commits()

    def _clean_tmp(self):
        rmtree(COMMIT_EXPORT_PATH, ignore_errors=True)

    def _prepare_export(self):
        # clean the previous export data, recreate the folders
        rmtree(COMMIT_EXPORT_PATH, ignore_errors=True)
        rmtree(self.target_repo_directory, ignore_errors=True)
        makedirs(self.target_repo_archive_path_root, exist_ok=True)
        # makedirs(self.target_repo_directory, exist_ok=True)

    def _exact_commits(self):
        # switch to the source branch
        self.source_repo.git.checkout(self.source_repo_branch)
        commits_infos = []
        for commit in self.source_repo.iter_commits():
            commit_id = commit.hexsha
            commit_infos = {
                # "author": commit.author,
                "message": commit.message,
                "date": datetime.fromtimestamp(commit.authored_date).isoformat()
            }
            print("found_commit --> ", commit_infos)
            commits_infos.append((commit_id, commit_infos))
        self.commits_infos = list(reversed(commits_infos))
        # return to the default HEAD
        self.source_repo.git.checkout(self.source_head)

    def _add_ignores(self):
        gitignore_path = join(self.source_repo_path, ".gitignore")
        if self.ignores_to_add is not None:
            with open(gitignore_path, "a") as gitignore_file:
                gitignore_file.write(self.ignores_to_add)

    def extract_source_files(self, commit_id: str):
        # clean the export tmp directory
        self._prepare_export()
        self.source_repo.git.checkout(commit_id, force=True)
        # get all the files of the commit
        with open(self.target_repo_archive_path_tar, "wb+") as export_archive_io:
            self.source_repo.archive(export_archive_io)
        # extract all the source files of the commit and remove the archive tar
        archive_tar = tarfile.open(self.target_repo_archive_path_tar)
        archive_tar.extractall(self.target_repo_archive_path_root)
        unlink(self.target_repo_archive_path_tar)

    def move_source_to_target(self):
        # copy the commit source files to the target repo
        copytree(self.target_repo_archive_path_root, self.target_repo_directory)
        # Add problematic files to gitignore

    def commit_to_target(self, commit_infos: dict):
        # try to create the branch; if it already exists, just check it out
        try:
            self.target_repo.git.checkout(b=self.target_repo_commit_branch)
        except Exception:
            self.target_repo.git.checkout(self.target_repo_commit_branch)
        self.target_repo.git.stage(".")
        self.target_repo.git.commit(**commit_infos)

    def transfer_commits(self):
        for commit_id, commit_infos in self.commits_infos:
            print("transfering commit --> ", commit_infos["message"])
            # faking commit date ;)
            environ["GIT_AUTHOR_DATE"] = commit_infos["date"]
            environ["GIT_COMMITTER_DATE"] = commit_infos["date"]
            self.extract_source_files(commit_id)
            self.move_source_to_target()
            self.commit_to_target(commit_infos)

    def __del__(self):
        self._clean_tmp()
if sys.argv[1] == "transform": args = parse_transform() with open(".replay.json", "r") as f: name = json.load(f)["NAME"] dst = Path.home() / ".flor" / name / "repo.git" if dst.exists(): shutil.rmtree(dst) transformed = Path.home() / ".flor" / name / "transformed" if not transformed.exists(): transformed.mkdir() r = Repo() assert "flor.shadow" in str(r.active_branch) r.clone(dst) r = Repo(dst) commits = [ c for c in r.iter_commits() if "flor.shadow" in c.message and ".json" == c.message[-len(".json"):] ] root = args.source.absolute() cwd = os.getcwd() os.chdir(dst) active = r.active_branch for version in commits: r.git.checkout(version) n = transformed / (str(version.hexsha) + "." + str(args.source)) try: backprop(None, root, args.source, open(n, "w")) # type: ignore print( f'transformed {(str(version.hexsha) + "::" + str(args.source))}' ) except FileNotFoundError: