def tokenize_errored_file(file, file_orig, error):
    """Tokenize the errored file, wrap the errored line in <ErrorType> tags and
    return the errored token sequence plus the corresponding correct one."""
    spaces, tokens = jlu.tokenize_with_white_space(jlu.open_file(file))

    token_started = False
    from_token = -1
    to_token = -1
    count = 0
    tokens_errored = []
    n_lines = 5
    for token, space in zip(tokens, spaces):
        # Open the tag on the first token of the errored line.
        if not token_started and int(error['line']) == token.position[0]:
            token_started = True
            tokens_errored.append(f'<{error["type"]}>')
            from_token = count
        # Close the tag on the first token past the errored line.
        if token_started and int(error['line']) < token.position[0]:
            token_started = False
            tokens_errored.append(f'</{error["type"]}>')
            to_token = count
        # Keep only a window of n_lines lines around the error.
        if int(error['line']) - n_lines <= token.position[0] <= int(error['line']) + n_lines:
            tokens_errored.append(get_token_value(token))
            tokens_errored.append(get_space_value(space))
        count += 1
    # The errored line produced no token: emit an empty tag pair.
    if from_token == -1:
        tokens_errored.append(f'<{error["type"]}>')
        tokens_errored.append(f'</{error["type"]}>')

    # Extract the same token window from the original (correct) file.
    spaces, tokens = jlu.tokenize_with_white_space(jlu.open_file(file_orig))
    tokens_correct = []
    for token, space in zip(tokens[from_token:to_token], spaces[from_token:to_token]):
        tokens_correct.append(get_token_value(token))
        tokens_correct.append(get_space_value(space))

    return tokens_errored, tokens_correct
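# Usage sketch (not part of the original code): the file paths, the error dict
# and the _example_* function name below are hypothetical; the function only
# assumes a checkstyle-like record with at least 'line' and 'type' keys.
def _example_tokenize_errored_file():
    error = {'line': '42', 'type': 'WhitespaceAround'}
    tokens_errored, tokens_correct = tokenize_errored_file(
        './errored/Foo.java', './orig/Foo.java', error)
    print(tokens_errored[:10], tokens_correct[:10])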
def tokenize_errored_file_model2(file, file_orig, error):
    """Model-2 variant: delegate the windowing to tokenize_file_to_repair and
    also return the tagged tokens plus an info dict with the diff count."""
    # else:
    #     for token, space in zip(tokens[start:end], spaces[start:end]):
    #         tokens_errored.append(get_token_value(token))
    #         tokens_errored.append(get_space_value(space))
    #     tokens_errored.append(f'<{error["type"]}>')
    #     tokens_errored.append(f'</{error["type"]}>')
    tokens_errored, info = tokenize_file_to_repair(file, error)
    tokens_errored_in_tag = info['tokens_errored_in_tag']
    from_token = info['from_token']
    to_token = info['to_token']

    # Extract the same token window from the original (correct) file.
    spaces, tokens = jlu.tokenize_with_white_space(jlu.open_file(file_orig))
    tokens_correct = []
    for token, space in zip(tokens[from_token:to_token], spaces[from_token:to_token]):
        tokens_correct.append(get_token_value(token))
        tokens_correct.append(get_space_value(space))

    if len(tokens_errored_in_tag) != len(tokens_correct):
        print(f'Warning: window length mismatch '
              f'({len(tokens_errored_in_tag)} errored vs {len(tokens_correct)} correct tokens)')

    # Count how many tokens differ between the errored and the correct window.
    info['count_diff'] = 0
    for t_A, t_B in zip(tokens_errored_in_tag, tokens_correct):
        if t_A != t_B:
            info['count_diff'] += 1

    return tokens_errored, tokens_correct, tokens_errored_in_tag, info
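# Usage sketch (hypothetical inputs and _example_* name, not part of the
# original code): the model-2 variant takes the same (errored file, original
# file, error record) triple and additionally returns the tokens inside the
# error tags plus an info dict with the token window and the diff count.
def _example_tokenize_errored_file_model2():
    error = {'line': '42', 'column': '17', 'type': 'WhitespaceAround'}
    errored, correct, in_tag, info = tokenize_errored_file_model2(
        './errored/Foo.java', './orig/Foo.java', error)
    print(info['count_diff'], info['from_token'], info['to_token'])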
def vectorize_file(path, vectorizer):
    spaces, tokens = jlu.tokenize_with_white_space(jlu.open_file(path))
    return [vectorizer(ws, t) for ws, t in zip(spaces, tokens)]
def build_vocabulary(files):
    """Build a one-hot vectorizer over the token literals (above a frequency
    threshold) and the whitespace shapes found in the given files."""
    count = {}
    tokenized_files = [
        jlu.tokenize_with_white_space(jlu.open_file(path)) for path in files
    ]
    whitespace_id = set()
    threshold = 30
    for spaces, tokens in tokenized_files:
        whitespace_id |= set(spaces)
        for token in tokens:
            name = get_token_value(token)
            if name not in count:
                count[name] = 0
            count[name] += 1
    # Keep only the literals seen at least `threshold` times, then index them.
    litterals = [key for key in count if count[key] >= threshold]
    litterals = {key: index for index, key in enumerate(litterals)}
    whitespace_id = {key: index for index, key in enumerate(whitespace_id)}
    len_litterals = len(litterals)
    len_whitespace = len(whitespace_id)
    # Vector layout: [literal one-hot | unknown-literal slot | whitespace one-hot].
    vec_size = len_litterals + 1 + len_whitespace

    def get_vector(space, token):
        vector = np.zeros(vec_size, dtype=int)
        if get_token_value(token) in litterals:
            vector[litterals[get_token_value(token)]] = 1
        else:
            vector[len_litterals] = 1
        vector[len_litterals + 1 + whitespace_id[space]] = 1
        return vector

    # Debug output: the literal vocabulary that was retained.
    print(litterals.keys())
    return get_vector, whitespace_id
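# Usage sketch (hypothetical paths and _example_* name, not part of the
# original code): build the one-hot vectorizer from a corpus of files, then
# vectorize a single file with it.
def _example_vectorize_corpus():
    corpus = ['./corpus/A.java', './corpus/B.java']
    vectorizer, whitespace_id = build_vocabulary(corpus)
    vectors = vectorize_file('./corpus/A.java', vectorizer)
    print(len(vectors), len(vectors[0]) if vectors else 0)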
def tokenize_file_to_repair(file_path, error):
    """Tokenize a file around a checkstyle error and wrap the tokens that are
    candidates for repair in <ErrorType> tags. Returns the tagged token
    sequence and an info dict describing the selected windows."""
    spaces, tokens = jlu.tokenize_with_white_space(jlu.open_file(file_path))
    info = {}
    token_started = False
    token_line_start = -1
    token_line_end = -1
    count = 0
    tokens_errored = []
    n_lines = 6
    start = len(tokens)
    end = 0
    from_token = 0
    to_token = 0
    # First pass: find the context window (start/end) of n_lines lines around
    # the error and the token range of the errored line itself.
    for token, space in zip(tokens, spaces):
        if int(error['line']) - n_lines <= token.position[0] <= int(error['line']) + n_lines:
            start = min(count, start)
            end = max(count, end)
        if not token_started and int(error['line']) == token.position[0]:
            token_started = True
            token_line_start = count
        if token_started and int(error['line']) < token.position[0]:
            token_started = False
            token_line_end = count
        count += 1
    start = max(0, start - 2)
    end = min(len(tokens), end + 2)
    if token_line_end == -1:
        token_line_end = token_line_start

    # print(error)
    # Second pass: choose the tagged window [from_token, to_token).
    if 'column' in error and error['type'] != 'OneStatementPerLine':
        # The error carries a column: centre the window on the offending token.
        errored_token_index = -1
        around = 10
        for index, token in enumerate(tokens):
            if token.position[0] <= int(error['line']) and token.position[1] <= int(error['column']):
                errored_token_index = index
        from_token = max(0, errored_token_index - around)
        to_token = min(len(tokens), errored_token_index + 1 + around)
    else:
        around = 2
        around_after = 13
        errored_token_index = -1
        if token_line_start != -1:
            # Line-level error: take the errored line plus a few tokens around it.
            from_token = max(start, token_line_start - around)
            to_token = min(end, token_line_end + around_after + 1)
        else:
            # The errored line has no token: anchor on the last token before it.
            errored_token_index = -1
            around = 2
            around_after = 18
            for index, token in enumerate(tokens):
                if token.position[0] < int(error['line']):
                    errored_token_index = index
            from_token = max(0, errored_token_index - around)
            to_token = min(len(tokens), errored_token_index + 1 + around_after)

    # Tokens inside the tags only.
    tokens_errored_in_tag = []
    for token, space in zip(tokens[from_token:to_token], spaces[from_token:to_token]):
        tokens_errored_in_tag.append(get_token_value(token))
        tokens_errored_in_tag.append(get_space_value(space))

    # Full context: prefix, <ErrorType>, tagged window, </ErrorType>, suffix.
    for token, space in zip(tokens[start:from_token], spaces[start:from_token]):
        tokens_errored.append(get_token_value(token))
        tokens_errored.append(get_space_value(space))
    tokens_errored.append(f'<{error["type"]}>')
    for token, space in zip(tokens[from_token:to_token], spaces[from_token:to_token]):
        tokens_errored.append(get_token_value(token))
        tokens_errored.append(get_space_value(space))
    tokens_errored.append(f'</{error["type"]}>')
    for token, space in zip(tokens[to_token:end], spaces[to_token:end]):
        tokens_errored.append(get_token_value(token))
        tokens_errored.append(get_space_value(space))

    info['from_token'] = from_token
    info['to_token'] = to_token
    info['start'] = start
    info['end'] = end
    info['error'] = error
    info['tokens_errored_in_tag'] = tokens_errored_in_tag

    return tokens_errored, info
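# Usage sketch (hypothetical inputs and _example_* name, not part of the
# original code): tokenize a single errored file for repair. 'column' is
# optional and, when present (and the error is not OneStatementPerLine),
# narrows the tagged window around the offending token.
def _example_tokenize_file_to_repair():
    error = {'line': '42', 'column': '17', 'type': 'WhitespaceAround'}
    tokens_errored, info = tokenize_file_to_repair('./errored/Foo.java', error)
    print(info['from_token'], info['to_token'], len(info['tokens_errored_in_tag']))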