def read_data(path, listfile, word_to_id=None, gen_def_positions=True):
    """Tokenise the Python files named in `listfile` and return, per file, the
    preprocessed token stream plus (optionally) the definition positions and
    identifier usages collected by the AST walker."""
    if isinstance(listfile, list):
        python_files = [os.path.join(path, f) for f in listfile]
    else:
        with open(listfile, encoding='utf-8') as f:
            python_files = [os.path.join(path, x) for x in f.read().splitlines()]

    # Map tokens to vocabulary ids when a vocabulary is supplied; otherwise keep raw tokens.
    mapping = (lambda x: x) if word_to_id is None else (
        lambda x: word_to_id.get(x, oov_id))

    data = []
    definition_positions = []
    identifier_usage = []
    for i, filename in enumerate(python_files):
        try:
            source, tree = get_source_tree(filename)
            tokens = tokenize.generate_tokens(StringIO(source).readline)
            # Drop comments and docstrings; keep DEDENT tokens even though their value is empty.
            data.append([
                (mapping(preprocess(token_type, token_val)), start)
                for token_type, token_val, start, _, _ in tokens
                if token_type != tokenize.COMMENT
                and not token_val.startswith("'''")
                and not token_val.startswith('"""')
                and (token_type == tokenize.DEDENT or token_val != "")
            ])
            if gen_def_positions:
                walker = astwalker.ASTWalker()
                walker.walk(tree)
                definition_positions.append(walker.definition_positions)
                identifier_usage.append(walker.name_usage)
            # print('OK.. {}/{}'.format(i, len(python_files)))
        except Exception:
            print('{}/{}'.format(i, len(python_files)))
            print("Error when tokenizing %s: %s" % (filename, sys.exc_info()[0]))
    return data, definition_positions, identifier_usage
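
# Illustrative sketch (not called anywhere in the pipeline): shows how read_data might be
# driven with a vocabulary mapping. The corpus directory, the "train_files.txt" listing and
# the toy vocabulary below are hypothetical; oov_id is assumed to be defined at module level.
def _example_read_data():
    corpus_path = "corpus_normalised"                    # hypothetical directory
    vocab = {"var1": 0, "function1": 1, "Class1": 2}     # hypothetical vocabulary
    data, def_positions, usages = read_data(
        corpus_path, "train_files.txt", word_to_id=vocab, gen_def_positions=True)
    print("Tokenised %d files, %d with definition positions"
          % (len(data), len(def_positions)))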
def count_variables(path):
    """For every project directory directly under `path`, count how many names of
    each identifier type the AST walker finds, and report the file holding the
    maximum count per type."""
    subdirectories = [
        os.path.join(path, o) for o in os.listdir(path)
        if os.path.isdir(os.path.join(path, o))
    ]
    python_files = [(directory, [
        y for x in os.walk(directory)
        for y in iglob(os.path.join(x[0], '*.py'))
    ]) for directory in subdirectories]

    max_count = {}
    max_file = {}
    syntax_error_count = 0
    for _, files in python_files:
        for file in files:
            try:
                source, tree = get_source_tree(file)
                walker = astwalker.ASTWalker()
                walker.walk(tree)
                counters = {}
                for scope in walker.names:
                    for typename in walker.names[scope]:
                        counters[typename] = counters.get(typename, 0) + len(
                            walker.names[scope][typename])
                for typename in counters:
                    if counters[typename] > max_count.get(typename, 0):
                        max_count[typename] = counters[typename]
                        max_file[typename] = file
            except SyntaxError:
                syntax_error_count += 1
            except Exception:
                print("Skipping file %s due to %s" % (file, sys.exc_info()[0]))
    print("%d syntax errors" % syntax_error_count)
    print(max_count)
    print(max_file)
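
# Illustrative sketch (not called anywhere): count_variables expects a directory whose
# immediate subdirectories are individual projects. "corpus" below is a hypothetical path;
# the call prints the syntax-error count and the per-type maxima to stdout.
def _example_count_variables():
    count_variables("corpus")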
def normalise(path):
    """Rewrite every Python file under `path` with normalised identifiers, writing
    the results to a sibling `<path>_normalised` tree and recording progress in
    processed.txt so an interrupted run can be resumed."""
    base, dirname = os.path.split(path)
    normalised_target_path = os.path.join(base, dirname + "_normalised")
    processed_file_path = os.path.join(path, "processed.txt")
    print("Writing normalised files to %s" % normalised_target_path)

    python_files = [y[len(path) + 1:] for x in os.walk(path)
                    for y in iglob(os.path.join(x[0], '*.py'))]
    # For debugging
    # python_files = ["debug/test.py"]
    # python_files = ["web2py/gluon/contrib/memcache/memcache.py"]

    processed_files = []
    initial_processed = 0
    syntax_errors = []
    filenotfound_errors = []
    errors = []
    skipped = []
    if os.path.exists(processed_file_path):
        print("Found processed files from previous session, continuing...")
        with open(processed_file_path) as p:
            processed_files = p.read().splitlines()
        initial_processed = len(processed_files)

    def complete():
        write_output(processed_file_path, processed_files)
        print("Processed files: %d\nSyntax errors: %d\n"
              "File not found errors: %d\nOther errors: %d\nSkipped: %d" % (
                  len(processed_files) - initial_processed, len(syntax_errors),
                  len(filenotfound_errors), len(errors), len(skipped)))

    for filename in python_files:
        if filename in processed_files:
            skipped.append(filename)
            continue
        error = False
        try:
            input_file = os.path.join(path, filename)
            normalised_target_file = os.path.join(normalised_target_path, filename)
            source, tree = get_source_tree(input_file)
        except SyntaxError:
            syntax_errors.append(filename)
            continue
        except FileNotFoundError:
            filenotfound_errors.append(filename)
            continue
        except KeyboardInterrupt:
            print("Keyboard interrupt, saving...")
            complete()
            sys.exit()
        except Exception:
            print("Failed to parse %s due to %s" % (filename, sys.exc_info()[0]))
            errors.append((filename, sys.exc_info()[0]))
            continue

        # AST variable replacement and formatting
        try:
            walker = astwalker.ASTWalker()
            # walker.randomise = False  # For debugging
            walker.walk(tree)
            walker.process_replace_queue()
            ast_source = astor.to_source(tree)
            writefile(normalised_target_file, ast_source)
        except KeyboardInterrupt:
            print("Keyboard interrupt, saving...")
            complete()
            sys.exit()
        except Exception:
            print("Failed to process normalisation for file %s" % filename)
            print(sys.exc_info()[0])
            error = True
            if len(python_files) == 1:
                # Re-raise when processing a single file so the full traceback is visible.
                raise
        if not error:
            processed_files.append(filename)
    complete()
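
# Illustrative sketch (not called anywhere): normalise("corpus") would write renamed sources
# to "corpus_normalised" and keep a resumable processed.txt inside "corpus". "corpus" is a
# hypothetical path; write_output and writefile are assumed to be this module's file helpers.
def _example_normalise():
    normalise("corpus")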
def id_distance(path):
    """Measure, in token positions, how far identifier usages are from their
    definitions, and print summary statistics per identifier type."""
    types = ["var", "function", "Class", "attribute", "arg"]

    def type_of(target_name):
        # The identifier kind is encoded as a prefix of the normalised name.
        for t in types:
            if target_name.startswith(t):
                return t
        return None

    def process_queue(walker):
        # Map each (definition position, type) pair to the usage positions that resolve to it.
        position_maps = {}
        for scope, typenames, candidate, class_scope in walker.queue:
            name = candidate.id if isinstance(candidate, ast.Name) else candidate.attr
            current_position = (candidate.lineno, candidate.col_offset)
            target_name = walker.lookup_name(scope, typenames, name)
            if target_name is None and class_scope:
                for s in walker.linked_class_scopes.get(scope, []):
                    target_name = walker.lookup_name(s, typenames, name)
            if target_name is not None:
                def_position = walker.name_mapping[target_name]
                vartype = type_of(target_name)
                if (def_position, vartype) not in position_maps:
                    position_maps[(def_position, vartype)] = []
                position_maps[(def_position, vartype)].append(current_position)
        return position_maps

    def calc_distances(source, targets, tokens):
        if source not in tokens:
            return []
        source_i = tokens.index(source)

        def dist(t):
            if t not in tokens:
                return -1
            t_i = tokens.index(t)
            return abs(t_i - source_i)

        return [x for x in [dist(t) for t in targets] if x >= 0]

    type_distances = {t: [] for t in types}
    with open(os.path.join(path, "valid_files.txt")) as l:
        files = l.readlines()
    for i, file in enumerate(files):
        if i % 100 == 0:
            print(file)
        source, tree = get_source_tree(os.path.join(path, file.strip()))
        tokens = tokenize.generate_tokens(StringIO(source).readline)
        tokens = [start for _, _, start, _, _ in tokens]
        walker = astwalker.ASTWalker()
        walker.walk(tree)
        pos_maps = process_queue(walker)
        for k in pos_maps:
            type_distances[k[1]].extend(calc_distances(k[0], pos_maps[k], tokens))

    for t in type_distances:
        data = np.array(type_distances[t])
        print(t)
        print("Minimum: %d" % np.min(data))
        print("Q1: %d" % np.percentile(data, 25))
        print("Median: %d" % np.percentile(data, 50))
        print("Q3: %d" % np.percentile(data, 75))
        print("90th Percentile: %d" % np.percentile(data, 90))
        print("Maximum: %d" % np.max(data))
        print()
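
# Toy illustration (not used by the pipeline): the distance computed above is simply the
# absolute difference of token-stream indices between a definition position and each usage
# position. The (line, col) stream below is made up for demonstration only.
def _example_token_distance():
    token_positions = [(1, 0), (1, 4), (2, 0), (2, 8), (3, 0)]  # hypothetical (line, col) stream
    definition = (1, 4)
    usages = [(2, 8), (3, 0)]
    source_i = token_positions.index(definition)
    distances = [abs(token_positions.index(u) - source_i) for u in usages]
    print(distances)  # -> [2, 3]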