Example #1
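All four examples appear to share one module's context. A hedged sketch of the imports they rely on; get_source_tree, preprocess, oov_id, write_output, writefile and astwalker look like project-local helpers rather than library code:

import ast
import os
import sys
import tokenize
from glob import iglob
from io import StringIO

import astor           # third-party: converts an AST back to source
import numpy as np

# Assumed project-local helpers (not shown on this page):
# astwalker, get_source_tree, preprocess, oov_id, write_output, writefile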
def read_data(path, listfile, word_to_id=None, gen_def_positions=True):
    if isinstance(listfile, list):
        python_files = [os.path.join(path, f) for f in listfile]
    else:
        with open(listfile, encoding='utf-8') as f:
            python_files = [
                os.path.join(path, x) for x in f.read().splitlines()
            ]

    # Identity mapping when no vocabulary is given; otherwise map each
    # token to its id, falling back to the out-of-vocabulary id.
    mapping = (lambda x: x) if word_to_id is None else (
        lambda x: word_to_id.get(x, oov_id))

    data = []
    definition_positions = []
    identifier_usage = []
    for i, filename in enumerate(python_files):
        try:
            source, tree = get_source_tree(filename)
            tokens = tokenize.generate_tokens(StringIO(source).readline)

            # Keep (mapped token, start position) pairs, skipping comments,
            # docstring-style literals and empty non-DEDENT tokens.
            data.append([
                (mapping(preprocess(token_type, token_val)), start)
                for token_type, token_val, start, _, _ in tokens
                if token_type != tokenize.COMMENT
                and not token_val.startswith("'''")
                and not token_val.startswith('"""')
                and (token_type == tokenize.DEDENT or token_val != "")
            ])

            if gen_def_positions:
                walker = astwalker.ASTWalker()
                walker.walk(tree)
                definition_positions.append(walker.definition_positions)
                identifier_usage.append(walker.name_usage)
            # print('OK.. {}/{}'.format(i, len(python_files)))
        except Exception:
            print('{}/{}'.format(i, len(python_files)))
            print("Error when tokenizing %s: %s" %
                  (filename, sys.exc_info()[0]))

    return data, definition_positions, identifier_usage
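A minimal usage sketch for read_data; the corpus directory, list file and toy vocabulary below are hypothetical, and oov_id is assumed to be a module-level constant:

# Hypothetical: tokenize every file named in train.txt, one relative path per line.
data, def_positions, usage = read_data('corpus', 'train.txt')

# An explicit file list and a vocabulary can be passed instead.
toy_vocab = {'def': 0, 'return': 1}  # hypothetical vocabulary
data, _, _ = read_data('corpus', ['a.py', 'b.py'],
                       word_to_id=toy_vocab, gen_def_positions=False)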
Example #2
def count_variables(path):

    # Each immediate subdirectory is treated as one project.
    subdirectories = [
        os.path.join(path, o) for o in os.listdir(path)
        if os.path.isdir(os.path.join(path, o))
    ]

    # Pair each project directory with every .py file found beneath it.
    python_files = [(directory, [
        y for x in os.walk(directory)
        for y in iglob(os.path.join(x[0], '*.py'))
    ]) for directory in subdirectories]

    max_count = {}
    max_file = {}
    syntax_error_count = 0

    for _directory, files in python_files:
        for filename in files:
            try:
                source, tree = get_source_tree(filename)
                walker = astwalker.ASTWalker()
                walker.walk(tree)
                # Count identifiers per type across all scopes in the file.
                counters = {}
                for scope in walker.names:
                    for typename in walker.names[scope]:
                        counters[typename] = counters.get(typename, 0) + len(
                            walker.names[scope][typename])

                # Track which file holds the highest count for each type.
                for typename in counters:
                    if counters[typename] > max_count.get(typename, 0):
                        max_count[typename] = counters[typename]
                        max_file[typename] = filename
            except SyntaxError:
                syntax_error_count += 1
            except Exception:
                print("Skipping file %s due to %s" %
                      (filename, sys.exc_info()[0]))

    print("%d syntax errors" % syntax_error_count)
    print(max_count)
    print(max_file)
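Usage is a single call; the directory layout below is an assumption:

# Hypothetical layout: repos/<project>/**/*.py, one subdirectory per project.
count_variables('repos')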
Example #3
def normalise(path):

    # Normalised copies go to a sibling directory,
    # e.g. "corpus" -> "corpus_normalised".
    base, dirname = os.path.split(path)

    normalised_target_path = os.path.join(base, dirname + "_normalised")
    processed_file_path = os.path.join(path, "processed.txt")

    print("Writing normalised files to %s" % normalised_target_path)

    # Collect every .py file under path, stored relative to path.
    python_files = [
        y[len(path) + 1:] for x in os.walk(path)
        for y in iglob(os.path.join(x[0], '*.py'))
    ]

    # For debugging
    # python_files = ["debug/test.py"]
    # python_files = ["web2py/gluon/contrib/memcache/memcache.py"]

    processed_files = []
    initial_processed = 0
    syntax_errors = []
    filenotfound_errors = []
    errors = []
    skipped = []

    if os.path.exists(processed_file_path):
        print("Found processed files from previous session, continuing...")
        with open(processed_file_path) as p:
            processed_files = p.read().splitlines()
            initial_processed = len(processed_files)

    def complete():
        write_output(processed_file_path, processed_files)
        print("Processed files: %d\nSyntax errors: %d\nFile not found errors: %d\nOther errors: %d\nSkipped: %d" %
              (len(processed_files) - initial_processed, len(syntax_errors),
               len(filenotfound_errors), len(errors), len(skipped)))

    for filename in python_files:
        if filename in processed_files:
            skipped.append(filename)
            continue

        error = False
        try:
            input_file = os.path.join(path, filename)
            normalised_target_file = os.path.join(normalised_target_path, filename)
            source, tree = get_source_tree(input_file)
        except SyntaxError:
            syntax_errors.append(filename)
            continue
        except FileNotFoundError:
            filenotfound_errors.append(filename)
            continue
        except KeyboardInterrupt:
            print("Keyboard interrupt, saving...")
            complete()
            sys.exit()
        except Exception:
            print("Failed to parse %s due to %s" % (filename, sys.exc_info()[0]))
            errors.append((filename, sys.exc_info()[0]))
            continue

        # AST variable replacement and formatting
        try:
            walker = astwalker.ASTWalker()
            # walker.randomise = False  # For debugging
            walker.walk(tree)
            walker.process_replace_queue()
            ast_source = astor.to_source(tree)
            writefile(normalised_target_file, ast_source)
        except KeyboardInterrupt:
            print("Keyboard interrupt, saving...")
            complete()
            sys.exit()
        except Exception:
            print("Failed to process normalisation for file %s" % filename)
            print(sys.exc_info()[0])
            error = True
            # When debugging a single file, surface the full traceback.
            if len(python_files) == 1:
                raise

        if not error:
            processed_files.append(filename)

    complete()
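A usage sketch under the assumption that path is a corpus root; output lands in a sibling <path>_normalised directory and progress is checkpointed to processed.txt:

# Hypothetical corpus root; re-running resumes from corpus/processed.txt.
normalise('corpus')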
Example #4
def id_distance(path):
    types = ["var", "function", "Class", "attribute", "arg"]

    # Recover an identifier's type from the prefix of its normalised name.
    def name_type(target_name):
        for t in types:
            if target_name.startswith(t):
                return t
        return None

    def process_queue(walker):
        position_maps = {}
        for scope, typenames, candidate, class_scope in walker.queue:
            name = candidate.id if isinstance(candidate,
                                              ast.Name) else candidate.attr
            current_position = (candidate.lineno, candidate.col_offset)
            target_name = walker.lookup_name(scope, typenames, name)
            if target_name is None and class_scope:
                # Fall back to any linked class scopes.
                for s in walker.linked_class_scopes.get(scope, []):
                    target_name = walker.lookup_name(s, typenames, name)
                    if target_name is not None:
                        break

            if target_name is not None:
                def_position = walker.name_mapping[target_name]
                vartype = name_type(target_name)
                # Group usage positions by (definition position, type).
                key = (def_position, vartype)
                if key not in position_maps:
                    position_maps[key] = []
                position_maps[key].append(current_position)

        return position_maps

    def calc_distances(source, targets, tokens):
        # Token-index distance from the definition to each usage position.
        if source not in tokens:
            return []

        source_i = tokens.index(source)

        def dist(t):
            if t not in tokens:
                return -1
            t_i = tokens.index(t)
            return abs(t_i - source_i)

        return [x for x in [dist(t) for t in targets] if x >= 0]

    type_distances = {}
    for t in types:
        type_distances[t] = []

    with open(os.path.join(path, "valid_files.txt")) as file_list:
        files = file_list.readlines()
        for i, filename in enumerate(files):
            if i % 100 == 0:
                print(filename)  # progress indicator
            source, tree = get_source_tree(os.path.join(path, filename.strip()))
            tokens = tokenize.generate_tokens(StringIO(source).readline)
            # Keep only each token's (line, column) start position.
            tokens = [start for _, _, start, _, _ in tokens]
            walker = astwalker.ASTWalker()
            walker.walk(tree)
            pos_maps = process_queue(walker)

            for k in pos_maps:
                type_distances[k[1]].extend(
                    calc_distances(k[0], pos_maps[k], tokens))

    # Print summary statistics of usage-to-definition distances per type.
    for t in type_distances:
        data = np.array(type_distances[t])
        print(t)
        print("Minimum: %d" % np.min(data))
        print("Q1: %d" % np.percentile(data, 25))
        print("Median: %d" % np.percentile(data, 50))
        print("Q3: %d" % np.percentile(data, 75))
        print("90th Percentile: %d" % np.percentile(data, 90))
        print("Maximum: %d" % np.max(data))
        print()
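Assuming path holds a valid_files.txt of parseable sources (relative paths, one per line), the statistics print directly:

# Hypothetical: corpus/valid_files.txt lists files relative to corpus/.
id_distance('corpus')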