def get_data(path, listfile, seq_length, word_to_id):
    """Load the token files and cut each one into fixed-length sequences.

    The raw data comes from ``read_data(path, listfile, word_to_id)``. For
    every file a ``Container`` is built holding zero-padded arrays with one
    row per sequence of ``seq_length`` tokens:

    * inputs: token ids of the sequence,
    * targets: token ids shifted one token ahead of the inputs,
    * masks: one boolean row per identifier type (as listed by
      ``astwalker.possible_types()``), true where the input token sits at a
      definition position of that type,
    * identifier usage: true where the target token's position is listed in
      the file's identifier usage.

    Args:
        path: Forwarded unchanged to ``read_data``.
        listfile: Forwarded unchanged to ``read_data``.
        seq_length: Number of tokens per sequence (row width of the arrays).
        word_to_id: Vocabulary mapping; also used to look up the ids of the
            indent and dedent tokens.

    Returns:
        A list with one ``Container`` per file returned by ``read_data``.
    """
    all_data, def_positions, identifier_usage = read_data(
        path, listfile, word_to_id)

    identifier_types = astwalker.possible_types()
    num_masks = len(identifier_types)

    # Regroup each file's (type, position) entries into one list of positions
    # per identifier type, ordered like identifier_types.
    def_positions = [
        [[entry[1] for entry in per_file if entry[0] == kind]
         for kind in identifier_types]
        for per_file in def_positions
    ]

    # Prevent indent and dedent tokens from being flagged as variables, which
    # can occur because dedent in particular takes up no columns.
    non_vars = [word_to_id[indent_token], word_to_id[dedent_token]]

    data = []
    for file_index in range(len(all_data)):
        # NOTE(review): each token is presumably an (id, position) pair -
        # inferred from the tok[0] / tok[1] accesses below; confirm in read_data.
        tokens = list(all_data[file_index])
        positions_by_type = def_positions[file_index]
        usage_positions = identifier_usage[file_index]

        num_sequences = math.ceil(len(tokens) / seq_length)
        input_data = np.zeros([num_sequences, seq_length], dtype="int64")
        targets = np.zeros([num_sequences, seq_length], dtype="int64")
        masks_data = np.zeros([num_sequences, num_masks, seq_length], dtype="bool")
        id_usage_data = np.zeros([num_sequences, seq_length], dtype="bool")
        actual_lengths = []

        for seq_index in range(num_sequences):
            start = seq_index * seq_length
            stop = start + seq_length
            window = tokens[start:stop]
            # Targets are the inputs shifted by one token, so the final
            # sequence of a file can be one element shorter than its input.
            shifted = tokens[start + 1:stop + 1]

            x = [tok[0] for tok in window]
            y = [tok[0] for tok in shifted]
            # One row per token, one column per identifier type; transposed
            # below to match the [type, position] layout of masks_data.
            masks = [
                [tok[1] in positions and tok[0] not in non_vars
                 for positions in positions_by_type]
                for tok in window
            ]
            ids = [tok[1] in usage_positions for tok in shifted]

            input_len = len(x)
            target_len = len(y)
            input_data[seq_index, :input_len] = x
            targets[seq_index, :target_len] = y
            masks_data[seq_index, :, :input_len] = np.transpose(masks)
            id_usage_data[seq_index, :target_len] = ids
            actual_lengths.append(target_len)

        data.append(Container(input_data, targets, actual_lengths,
                              masks_data, id_usage_data))

    return data
def get_data(path, listfile, seq_length, word_to_id):
    """Build per-file training containers of fixed-length token sequences.

    Reads everything through ``read_data(path, listfile, word_to_id)`` and,
    for each file, produces a ``Container`` of zero-padded arrays whose rows
    are consecutive chunks of ``seq_length`` tokens: the input ids, the
    target ids (inputs shifted ahead by one token), a boolean definition
    mask per identifier type from ``astwalker.possible_types()``, and a
    boolean identifier-usage flag per target token. The number of target
    tokens actually present in each row is recorded alongside.

    Args:
        path: Passed through to ``read_data``.
        listfile: Passed through to ``read_data``.
        seq_length: Width of every sequence row.
        word_to_id: Vocabulary mapping, also used to resolve the indent and
            dedent token ids.

    Returns:
        list of ``Container``, one per file.
    """
    all_data, def_positions, identifier_usage = read_data(path, listfile, word_to_id)

    identifier_types = astwalker.possible_types()
    num_masks = len(identifier_types)

    # For every file: a list (one slot per identifier type) of the positions
    # at which an identifier of that type is defined.
    def_positions = [
        [[pos_entry[1] for pos_entry in fp if pos_entry[0] == type_name]
         for type_name in identifier_types]
        for fp in def_positions
    ]

    # Prevent indent and dedent tokens from being flagged as variables, which
    # can occur because dedent in particular takes up no columns.
    non_vars = [word_to_id[indent_token], word_to_id[dedent_token]]

    data = []
    for j, raw_tokens in enumerate(all_data):
        # NOTE(review): tokens look like (id, position) pairs judging by the
        # [0] / [1] indexing below - confirm against read_data.
        filedata = list(raw_tokens)
        file_positions = def_positions[j]
        identifier_positions = identifier_usage[j]

        num_sequences = math.ceil(len(filedata) / seq_length)
        shape_2d = [num_sequences, seq_length]
        input_data = np.zeros(shape_2d, dtype="int64")
        targets = np.zeros(shape_2d, dtype="int64")
        masks_data = np.zeros([num_sequences, num_masks, seq_length], dtype="bool")
        id_usage_data = np.zeros(shape_2d, dtype="bool")

        # Half-open [lo, hi) token ranges, one per sequence row.
        bounds = [(row * seq_length, (row + 1) * seq_length)
                  for row in range(num_sequences)]

        actual_lengths = []
        for row, (lo, hi) in enumerate(bounds):
            inputs_chunk = filedata[lo:hi]
            # Shifted one token ahead; may be shorter than inputs_chunk at
            # the end of the file.
            targets_chunk = filedata[lo + 1:hi + 1]

            masks = []
            for tok in inputs_chunk:
                masks.append([tok[1] in fp and tok[0] not in non_vars
                              for fp in file_positions])

            n_in = len(inputs_chunk)
            n_out = len(targets_chunk)

            input_data[row, :n_in] = [tok[0] for tok in inputs_chunk]
            targets[row, :n_out] = [tok[0] for tok in targets_chunk]
            # masks is [token, type]; masks_data wants [type, token].
            masks_data[row, :, :n_in] = np.transpose(masks)
            id_usage_data[row, :n_out] = [tok[1] in identifier_positions
                                          for tok in targets_chunk]
            actual_lengths.append(n_out)

        container = Container(input_data, targets, actual_lengths, masks_data,
                              id_usage_data)
        data.append(container)

    return data
def map_token(map, token):
    """Resolve a token string to ``(token_id, mask)``.

    A token that starts with the marker ``"(*) "`` has the marker removed and
    is reported with ``mask == 1``; every other token has ``mask == 0``.

    Lookup order:
      1. the token itself in ``map``;
      2. for a token containing ``"|"``: its second ``|``-separated part,
         then its first part;
      3. if the token starts with one of ``astwalker.possible_types()``,
         the out-of-vocabulary id ``pyreader.oov_id``.

    Every successful path returns the same ``(token_id, mask)`` 2-tuple so
    callers can always unpack the result.

    Args:
        map: Mapping from token string to id. (The name shadows the builtin
            but is kept because it is part of the signature.)
        token: Token string, optionally prefixed with ``"(*) "``.

    Returns:
        Tuple ``(token_id, mask)``.

    Raises:
        KeyError: If the token cannot be resolved by any of the steps above.
    """
    mask = 0
    if token.startswith("(*) "):
        mask = 1
        token = token.replace("(*) ", "")

    if token in map:
        return map[token], mask

    # Not in map, is it an identifier?
    if "|" in token:
        spl = token.split("|")
        # The second component takes precedence over the first.
        for candidate in (spl[1], spl[0]):
            if candidate in map:
                return map[candidate], mask

    # Unknown identifiers of any recognised type collapse to the OOV id.
    if any(token.startswith(prefix) for prefix in astwalker.possible_types()):
        return pyreader.oov_id, mask

    raise KeyError(token)
def map_token(map, token):
    """Look up a token's vocabulary id and report whether it was marked.

    Tokens prefixed with ``"(*) "`` are marked: the marker is removed before
    the lookup and the returned mask is 1 (otherwise 0).

    The id is taken from the first of these that applies:
      * ``map[token]``;
      * when the token contains ``"|"``: the entry for its second
        ``|``-separated component, else the entry for its first component;
      * ``pyreader.oov_id`` when the token starts with one of the prefixes
        from ``astwalker.possible_types()``.

    The result is always a ``(token_id, mask)`` pair, whichever branch
    produced the id, so the mask is never lost and callers can unpack
    uniformly.

    Args:
        map: Token-string-to-id mapping (name shadows the builtin; kept for
            interface compatibility).
        token: The token string to resolve.

    Returns:
        ``(token_id, mask)`` tuple.

    Raises:
        KeyError: When none of the lookups succeeds.
    """
    marker = "(*) "
    mask = 0
    if token.startswith(marker):
        mask = 1
        token = token.replace(marker, "")

    if token in map:
        return map[token], mask

    # Not in map, is it an identifier?
    if "|" in token:
        spl = token.split("|")
        if spl[1] in map:
            return map[spl[1]], mask
        if spl[0] in map:
            return map[spl[0]], mask

    # Identifier-typed tokens that are not in the vocabulary become OOV.
    for prefix in astwalker.possible_types():
        if token.startswith(prefix):
            return pyreader.oov_id, mask

    raise KeyError(token)
def is_identifier(token_id):
    """Return 1 if the token behind ``token_id`` is an identifier, else 0.

    The id is translated back to its token string through ``self.inv_map``;
    the token counts as an identifier when it starts with any of the
    prefixes returned by ``astwalker.possible_types()``.
    """
    # NOTE(review): ``self`` is not a parameter of this function; presumably
    # it is captured from an enclosing method's scope - confirm.
    word = self.inv_map[token_id]
    for type_prefix in astwalker.possible_types():
        if word.startswith(type_prefix):
            return 1
    return 0
def adjust_flags():
    """Expand the ``FLAGS.attention`` string into a list, in place.

    Does nothing when ``FLAGS.attention`` is falsy. Otherwise the value is
    split on ``"+"`` and stored back as a list. If that list contains
    ``"identifiers"``, further ``"identifiers"`` entries are appended so the
    list gains ``len(astwalker.possible_types()) - 1`` extra copies.
    """
    if not FLAGS.attention:
        return

    FLAGS.attention = FLAGS.attention.split("+")

    mechanisms = FLAGS.attention
    if "identifiers" not in mechanisms:
        return

    # One "identifiers" entry is already present; add the remaining ones.
    extra_copies = len(astwalker.possible_types()) - 1
    mechanisms.extend(["identifiers"] * extra_copies)