def stackoverflow(framework, projects):
    global api
    api = StackAPI("stackoverflow")
    samples = get_samples(projects)
    # "directory" is expected to be a module-level global, like "api".
    output_write(framework, directory, "questions_and_answers", get_header(), True)
    for index, sample in enumerate(samples):
        print_status_samples(index + 1, len(samples))
        questions = get_questions_when_body_has(sample)
        for indx, question in enumerate(questions["items"]):
            # Progress is relative to the items list, not to the response
            # dictionary itself.
            print("{0}% questions analysed of {1}".format(
                (indx + 1) / len(questions["items"]) * 100, sample))
            try:
                answer = api.fetch(
                    "answers/{ids}",
                    ids=[question["accepted_answer_id"]])["items"][0]
                answer_owner = get_owner_by_user_id(api, answer["owner"]["user_id"])
            except KeyError:
                # Question without an accepted answer (or anonymous owner):
                # emit empty fields instead of failing.
                answer = {"answer_id": "", "score": "", "creation_date": ""}
                answer_owner = {"user_id": "", "reputation": "",
                                "creation_date": "", "tags": []}
            question_owner = get_owner_by_user_id(api, question["owner"]["user_id"])
            output = create_output(framework, sample, question, answer,
                                   question_owner, answer_owner)
            output_write(framework, directory, "questions_and_answers", output, False)
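# get_owner_by_user_id is called above but defined elsewhere; a minimal
# sketch, assuming it wraps StackAPI's users endpoints (the top-tags call is
# an assumption, based on the "tags" key the fallback dictionaries provide):
def get_owner_by_user_id(api, user_id):
    owner = api.fetch("users/{ids}", ids=[user_id])["items"][0]
    top_tags = api.fetch("users/{ids}/top-tags", ids=[user_id])["items"]
    owner["tags"] = [tag["tag_name"] for tag in top_tags]
    return owner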
def delay(framework, projects, githubtoken):
    print("Computing delay to update")
    path_dos_repositorios = 'repositories'
    measure = "delay"
    output_write(framework, measure, measure,
                 "framework,path,current_version,next_version,"
                 "framework_release_date (YYYY-MM-DD),"
                 "sample_update_date (YYYY-MM-DD),delay_in_days", True)
    framework_release_data = buscar_dados_de_lancamento_de_versoes(framework, githubtoken)
    configuration_file = define_arquivo_de_configuracao(framework)
    samples = get_samples(projects)
    for index, sample in enumerate(samples):
        print_status_samples(index + 1, len(samples))
        sample_path = path_dos_repositorios + "/" + sample
        paths_configuration_file = find_paths(configuration_file, sample_path)
        repository = Repo(sample_path)
        reversed_commits = get_commits(repository)
        for path in paths_configuration_file:
            current_version, reversed_commits = get_first_version(
                framework, path, repository, reversed_commits)
            if current_version == {}:
                continue
            for commit in reversed_commits:
                repository.git.checkout(commit, '-f')
                next_version = buscar_versao_do_framework(framework, path)
                # Record a delay only when both versions resolved and the
                # version actually changed in this commit.
                if current_version and next_version and current_version != next_version:
                    sample_update_date = get_commit_date(commit)
                    framework_release_date = framework_release_data[next_version]
                    delay_in_days = calculate_delay(framework_release_date,
                                                    sample_update_date)
                    output_write(framework, measure, measure,
                                 create_output(current_version, delay_in_days,
                                               framework, framework_release_date,
                                               next_version, path,
                                               sample_update_date), False)
                    current_version = next_version
        # Restore the working tree before moving on to the next sample.
        repository.git.checkout('master', '-f')
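# calculate_delay is called above but defined elsewhere; a minimal sketch,
# assuming both arguments are datetime values in the same timezone:
def calculate_delay(framework_release_date, sample_update_date):
    # Days between the framework release and the commit that adopted it.
    return (sample_update_date - framework_release_date).days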
def allanswers(framework, projects):
    global api
    api = StackAPI("stackoverflow")
    samples = get_samples(projects)
    output_write(framework, directory, "all_answers", get_header(), True)
    with open("stackoverflow/" + framework + "_questions_and_answers_output.csv") as questions:
        for index, question in enumerate(questions):
            if index == 0:
                continue  # skip the CSV header row
            print("Questions from sample " + question.split(",")[1])
            question = question.replace("\n", "")
            question_id = question.split(",")[2]
            answers = api.fetch("questions/" + question_id + "/answers")["items"]
            print(len(answers))
            for indx, answer in enumerate(answers):
                print("{0}% answers analysed of question {1}".format(
                    (indx + 1) / len(answers) * 100, question_id))
                try:
                    answer_owner = get_owner_by_user_id(api, answer["owner"]["user_id"])
                except KeyError:
                    # Anonymous or deleted owner: emit empty fields.
                    answer_owner = {"user_id": "", "reputation": "",
                                    "creation_date": "", "tags": []}
                output = create_output(framework, question.split(",")[1],
                                       question_id, answer, answer_owner)
                output_write(framework, directory, "all_answers", output, False)
def output_callback(path, filter_result):
    path = path.strip()
    path = path[brick_path_len + 1:]
    output_write(fout, path, args.output_prefix,
                 encode=(not args.no_encode), tag=args.tag,
                 field_separator=args.field_separator)
def githubmetadata(framework, projects, githubtoken):
    print("Computing github metadata")
    measure = "githubmetadata"
    output_write(framework, measure, measure,
                 "framework,repository,forks,stargazers,watchers,openedIssues,"
                 "closedIssues,commits,openedPullRequests,closedPullRequests,"
                 "updatedAt,projects,lifetime,lifetime per commit", True)
    g = Github(githubtoken)
    samples = get_samples(projects)
    for index, sample in enumerate(samples):
        print_status_samples(index + 1, len(samples))
        repo = g.get_repo(sample)
        output = create_output(framework, repo, sample)
        output_write(framework, measure, measure, output, False)
def write_header(action_in_files, configuration_files, extension_files, framework, measure):
    output = "framework,path"
    for action in action_in_files:
        output += "," + action
    output += ",total_actions"
    for file in extension_files:
        output += "," + file
    for file in configuration_files:
        output += "," + file
    output_write(framework, "file_extension_changes", measure, output, True)
def count_forks_ahead(framework, forks, repository):
    forks_ahead = 0
    for fork in forks:
        manage_limit_rate(forks.totalCount)
        try:
            comparison = repository.compare(
                repository.default_branch,
                fork.owner.login + ":" + fork.default_branch)
            if comparison.ahead_by > 0:
                output_write(framework, "forksahead", "forks_ahead",
                             framework + "," + fork.full_name + ","
                             + str(comparison.ahead_by), False)
                forks_ahead += 1
        except Exception:
            # Comparison fails for forks whose default branch was deleted or
            # renamed; skip them rather than aborting the crawl.
            continue
    return forks_ahead
def write_content(action_in_files, configuration_files, extension_files, framework, sample, measure):
    output = framework + "," + sample
    actions_count = 0
    for action in action_in_files:
        output += "," + str(action_in_files[action])
        actions_count += action_in_files[action]
    output += "," + str(actions_count)
    for file in extension_files:
        output += "," + str(extension_files[file])
    for file in configuration_files:
        output += "," + str(configuration_files[file])
    output_write(framework, "file_extension_changes", measure, output, False)
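# write_header and write_content above form a pair: one header per run, then
# one row per sample. A usage sketch with hypothetical dictionaries and a
# hypothetical measure name (dict keys become columns, values become cells):
action_in_files = {"ADD": 3, "MODIFY": 7, "DELETE": 1}
extension_files = {"java": 42, "xml": 5}
configuration_files = {"pom.xml": 2}
write_header(action_in_files, configuration_files, extension_files,
             "spring", "file_extension_changes_by_sample")
write_content(action_in_files, configuration_files, extension_files,
              "spring", "owner/sample-repo", "file_extension_changes_by_sample")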
def generalprojects(projects):
    samples = get_samples(projects)
    output_write("", "generalprojects", "projects",
                 "path,stars,language,framework", True)
    for repository in samples:
        clone(repository)
        print("{0} downloaded".format(repository))
        framework = get_framework(repository)
        print("{0} classified as {1}".format(repository, framework))
        shutil.rmtree("generalprojects/repositories/" + repository.split("/")[0])
        print("{0} deleted".format(repository))
        # Note: only path and framework are written, although the header also
        # declares stars and language columns.
        output_write("", "generalprojects", "projects",
                     "{0},{1}".format(repository, framework), False)
def output_callback(path, filter_result, is_dir):
    path = path.strip()
    path = path[brick_path_len + 1:]
    if args.type == "both":
        output_write(fout, path, args.output_prefix,
                     encode=(not args.no_encode), tag=args.tag,
                     field_separator=args.field_separator)
    elif (is_dir and args.type == "d") or (not is_dir and args.type == "f"):
        output_write(fout, path, args.output_prefix,
                     encode=(not args.no_encode), tag=args.tag,
                     field_separator=args.field_separator)
def importcount(framework, projects):
    print("Computing imports")
    measure = "importcount"
    output_write(framework, measure, measure,
                 "framework,path,imports,javaFiles,imports/java_files", True)
    samples = get_samples(projects)
    for index, sample in enumerate(samples):
        print_status_samples(index + 1, len(samples))
        deal_with_empty_repo(sample)
        java_files_path = find_paths("*.java", "repositories/" + sample)
        imports = get_imports(framework, java_files_path)
        relative = calculate_relative(imports, java_files_path)
        output_write(framework, measure, measure,
                     create_output(framework, imports, java_files_path,
                                   relative, sample), False)
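# calculate_relative is called above but defined elsewhere; a minimal sketch,
# assuming it is the imports-per-Java-file ratio of the last CSV column:
def calculate_relative(imports, java_files_path):
    # Guard against samples with no Java files.
    return imports / len(java_files_path) if java_files_path else 0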
def numberofextensionfile(framework, projects):
    print("Computing extension files")
    extensions = create_extension_files()
    measure = "numberofextensionfile"
    output_write(framework, measure, measure,
                 'framework,project,java,properties,jar,build.gradle,pom.xml,'
                 'manifest.xml,xml,bat,md,adoc,README,yaml,txt,sh,travis.yml,'
                 'yml,cmd,kt,json,numberOfFiles,others', True)
    samples = get_samples(projects)
    for index, sample in enumerate(samples):
        print_status_samples(index + 1, len(samples))
        deal_with_empty_repo(sample)
        count_extension_files(extensions, sample)
        others = count_others(extensions)
        output = concat_output(extensions) + str(others)
        output_write(framework, measure, measure,
                     framework + "," + sample + "," + output, False)
def currentframeworkversion(framework, projects):
    print("Computing current framework version")
    configuration_file = find_config_file(framework)
    configuration_file_key_words = get_key_words(framework)
    write_output_header(configuration_file_key_words, framework)
    samples = get_samples(projects)
    for index, sample in enumerate(samples):
        print_status_samples(index + 1, len(samples))
        checkout_default_branch_repository(sample)
        deal_with_empty_repo(sample)
        configuration_files_paths = find_paths(configuration_file,
                                               "repositories/" + sample)
        for path in configuration_files_paths:
            output = framework + "," + path
            for key, value in configuration_file_key_words.items():
                version = get_framework_version(framework, path, key)
                output = output + "," + version
            # Skip rows with runs of empty version columns; Spring versions
            # are only considered valid when tagged RELEASE.
            if ",,," not in output and (framework != "spring" or "RELEASE" in output):
                output_write(framework, "currentframeworkversion",
                             "currentframeworkversion", output, False)
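# get_framework_version is called above but defined elsewhere; a minimal
# sketch, assuming a plain-text scan of the configuration file for the key
# word followed by a version-like token (the regex is an assumption):
import re

def get_framework_version(framework, path, key_word):
    try:
        with open(path, encoding="utf-8", errors="ignore") as config:
            match = re.search(re.escape(key_word) + r"\D*(\d[\w.\-]*)",
                              config.read())
            return match.group(1) if match else ""
    except OSError:
        return ""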
def understandmetrics(framework, projects):
    samples = get_samples(projects)
    owner = samples[0].split("/")[0]
    create_output_directory("understandmetrics", owner)
    output_write(
        framework, "understandmetrics", "understandmetrics",
        "framework,projeto,AvgCyclomatic,AvgCyclomaticModified,AvgCyclomaticStrict,AvgEssential,AvgLine,AvgLineBlank,AvgLineCode,AvgLineComment,CountClassBase,CountClassCoupled,CountClassCoupledModified,CountClassDerived,CountDeclClass,CountDeclClassMethod,CountDeclClassVariable,CountDeclExecutableUnit,CountDeclFile,CountDeclFunction,CountDeclInstanceMethod,CountDeclInstanceVariable,CountDeclMethod,CountDeclMethodAll,CountDeclMethodDefault,CountDeclMethodPrivate,CountDeclMethodProtected,CountDeclMethodPublic,CountInput,CountLine,CountLineBlank,CountLineCode,CountLineCodeDecl,CountLineCodeExe,CountLineComment,CountOutput,CountPath,CountPathLog,CountSemicolon,CountStmt,CountStmtDecl,CountStmtExe,Cyclomatic,CyclomaticModified,CyclomaticStrict,Essential,Knots,MaxCyclomatic,MaxCyclomaticModified,MaxCyclomaticStrict,MaxEssential,MaxEssentialKnots,MaxInheritanceTree,MaxNesting,MinEssentialKnots,PercentLackOfCohesion,PercentLackOfCohesionModified,RatioCommentToCode,SumCyclomatic,SumCyclomaticModified,SumCyclomaticStrict,SumEssential,?,numberOfJavaFiles", True)
    # Machine-specific absolute path to the cloned repositories.
    repositories_path = "/home/gabriel/Documentos/gabrielsmenezes/pesquisamestrado/repositories/"
    for sample in samples:
        sample_path = repositories_path + sample
        udb_path = "understandmetrics/" + sample
        deal_with_empty_repo(sample)
        metrics = get_understand_metrics(framework, sample, udb_path, sample_path)
        output = create_output(metrics)
        output_write(framework, "understandmetrics", "understandmetrics", output, False)
def maintainers(framework, projects, githubtoken):
    print("Computing maintainers data")
    output_write(
        framework, "maintainers", "maintainers",
        "framework,path,framework_contributors,sample_contributors,"
        "common_contributors,common/framework,common/sample", True)
    framework_repository = get_repository_name(framework)
    framework_contributors = get_contributors(framework_repository, githubtoken)
    # Touch totalCount so the paginated list is resolved once, up front,
    # before being reused for every sample below.
    framework_contributors.totalCount
    samples = get_samples(projects)
    for index, sample in enumerate(samples):
        print_status_samples(index + 1, len(samples))
        sample_contributors = get_contributors(sample, githubtoken)
        common_contributors = get_commom_contributors(framework_contributors,
                                                      sample_contributors)
        output_write(
            framework, "maintainers", "maintainers",
            create_output(framework, sample, framework_contributors,
                          sample_contributors, common_contributors), False)
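# get_commom_contributors is called above but defined elsewhere (the spelling
# follows the call site); a minimal sketch, assuming PyGithub NamedUser lists
# and login as the identity key:
def get_commom_contributors(framework_contributors, sample_contributors):
    framework_logins = {contributor.login for contributor in framework_contributors}
    return [contributor for contributor in sample_contributors
            if contributor.login in framework_logins]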
def forksahead(framework, projects, githubtoken):
    print("Computing forks ahead data")
    g = Github(githubtoken)
    output_write(framework, "forksahead", "forks_ahead_by_projects",
                 "framework,path,number_of_forks,forks_ahead,ratio", True)
    output_write(framework, "forksahead", "forks_ahead",
                 "framework,path,number_of_forks,forks_ahead,ratio", True)
    samples = get_samples(projects)
    for index, sample in enumerate(samples):
        manage_limit_rate(len(samples))
        print_status_samples(index + 1, len(samples))
        repository = g.get_repo(sample)
        forks = repository.get_forks()
        forks_ahead = count_forks_ahead(framework, forks, repository)
        number_of_forks = repository.forks_count
        # Guard against repositories with no forks to avoid ZeroDivisionError.
        ratio_forks_ahead = forks_ahead / number_of_forks if number_of_forks else 0
        output = create_output(sample, framework, number_of_forks,
                               forks_ahead, ratio_forks_ahead)
        output_write(framework, "forksahead", "forks_ahead_by_projects", output, False)
def write_output_header(configuration_file_key_words, framework):
    header = "framework,path"
    for config in configuration_file_key_words:
        header = header + "," + config
    output_write(framework, "currentframeworkversion",
                 "currentframeworkversion", header, True)
def output_callback(path, filter_result):
    path = path.strip()
    path = path[brick_path_len + 1:]
    output_write(fout, path, args.output_prefix, encode=True)
def gfid_to_path_using_pgfid(brick, gfids_file, output_file, outfile_failures):
    """
    Parent GFIDs are saved as xattrs; collect the parent GFIDs of all the
    files listed in gfids_file. Convert each parent GFID to a path and crawl
    that directory to list the files/dirs sharing an inode number with the
    GFIDs. Run find with maxdepth 1 and print the output in
    <INODE_NUM> <PATH> format; use it to look up the in-memory dictionary of
    inode numbers built from the list of GFIDs.
    """
    with open(output_file, "a+") as fout:
        pgfids = set()
        inode_dict = {}
        with open(gfids_file) as f:
            for gfid in f:
                gfid = gfid.strip()
                p = os.path.join(brick, ".glusterfs", gfid[0:2], gfid[2:4], gfid)
                if os.path.islink(p):
                    # Directories: the GFID file is a symlink, resolvable
                    # directly to a path.
                    path = symlink_gfid_to_path(brick, gfid)
                    output_write(fout, path, args.output_prefix)
                else:
                    try:
                        inode_dict[str(os.stat(p).st_ino)] = 1
                        file_xattrs = xattr.list(p)
                        num_parent_gfid = 0
                        for x in file_xattrs:
                            if x.startswith("trusted.pgfid."):
                                num_parent_gfid += 1
                                pgfids.add(x.split(".")[-1])
                        if num_parent_gfid == 0:
                            # Use a distinct handle name so the gfids_file
                            # iterator "f" is not shadowed.
                            with open(outfile_failures, "a") as fail_f:
                                fail_f.write("%s\n" % gfid)
                                fail_f.flush()
                                os.fsync(fail_f.fileno())
                    except (IOError, OSError) as e:
                        if e.errno == ENOENT:
                            continue
                        else:
                            fail("%s Failed to convert to path from "
                                 "GFID %s: %s" % (brick, gfid, e), logger=logger)

        if not inode_dict:
            return

        def inode_filter(path):
            try:
                st = os.lstat(path)
            except (OSError, IOError) as e:
                if e.errno == ENOENT:
                    st = None
                else:
                    raise
            if st and inode_dict.get(str(st.st_ino), None):
                return True
            return False

        # Length of brick path, to remove from output path
        brick_path_len = len(brick)

        def output_callback(path):
            path = path.strip()
            path = path[brick_path_len + 1:]
            output_write(fout, path, args.output_prefix)

        ignore_dirs = [os.path.join(brick, dirname)
                       for dirname in
                       conf.get_opt("brick_ignore_dirs").split(",")]

        for pgfid in pgfids:
            path = symlink_gfid_to_path(brick, pgfid)
            find(os.path.join(brick, path), callback_func=output_callback,
                 filter_func=inode_filter, ignore_dirs=ignore_dirs,
                 subdirs_crawl=False)

        fout.flush()
        os.fsync(fout.fileno())
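# symlink_gfid_to_path is used above but defined elsewhere; a sketch of the
# usual GlusterFS brick layout: a directory's GFID file under .glusterfs is a
# symlink shaped like ../../<aa>/<bb>/<parent-gfid>/<basename>, so the path
# is rebuilt by following readlink upward until the root GFID is reached
# (the ROOT_GFID value is assumed from the standard layout):
import os

ROOT_GFID = "00000000-0000-0000-0000-000000000001"

def symlink_gfid_to_path(brick, gfid):
    if gfid == ROOT_GFID:
        return ""
    out_path = ""
    while True:
        path = os.path.join(brick, ".glusterfs", gfid[0:2], gfid[2:4], gfid)
        path_readlink = os.readlink(path)
        pgfid = os.path.dirname(path_readlink)
        # Prepend this component, then climb to the parent GFID.
        out_path = os.path.join(os.path.basename(path_readlink), out_path)
        if pgfid == "../../00/00/%s" % ROOT_GFID:
            break
        gfid = os.path.basename(pgfid)
    return out_path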
def output_callback(path):
    path = path.strip()
    path = path[brick_path_len + 1:]
    output_write(fout, path, args.output_prefix)