Ejemplo n.º 1
0
def get_data(path, listfile, seq_length, word_to_id):
    """Load the listed files and cut each one into padded, fixed-length sequences.

    ``path``, ``listfile`` and ``word_to_id`` are forwarded unchanged to
    ``read_data``, which returns three parallel per-file collections: token
    data, definition positions and identifier-usage positions.

    For every file the tokens are split into ``ceil(len / seq_length)`` rows.
    Row ``i`` of the inputs holds tokens ``[i*seq_length, (i+1)*seq_length)``;
    the targets hold the same window shifted one token ahead.  Unused trailing
    cells stay zero / False.

    Args:
        path: Passed through to ``read_data``.
        listfile: Passed through to ``read_data``.
        seq_length: Number of tokens per sequence row.
        word_to_id: Mapping used to look up the ids of the indent and dedent
            tokens; also passed through to ``read_data``.

    Returns:
        A list with one ``Container`` per file, constructed from
        ``(input_data, targets, actual_lengths, masks_data, id_usage_data)``.
    """
    all_data, def_positions, identifier_usage = read_data(
        path, listfile, word_to_id)
    identifier_types = astwalker.possible_types()
    num_masks = len(identifier_types)

    # Regroup every file's (type, position) pairs into one position list per
    # identifier type, in the order given by identifier_types.
    positions_by_type = []
    for raw_positions in def_positions:
        grouped = []
        for kind in identifier_types:
            grouped.append(
                [entry[1] for entry in raw_positions if entry[0] == kind])
        positions_by_type.append(grouped)

    # Prevent indent and dedent tokens from being flagged as variables, which
    # can occur because dedent in particular takes up no columns.
    non_vars = [word_to_id[indent_token], word_to_id[dedent_token]]

    containers = []
    for file_index, file_tokens in enumerate(all_data):
        filedata = list(file_tokens)
        file_positions = positions_by_type[file_index]
        identifier_positions = identifier_usage[file_index]

        num_sequences = math.ceil(len(filedata) / seq_length)
        input_data = np.zeros([num_sequences, seq_length], dtype="int64")
        targets = np.zeros([num_sequences, seq_length], dtype="int64")
        masks_data = np.zeros([num_sequences, num_masks, seq_length],
                              dtype="bool")
        id_usage_data = np.zeros([num_sequences, seq_length], dtype="bool")
        actual_lengths = []

        for row in range(num_sequences):
            begin = row * seq_length
            end = begin + seq_length
            current = filedata[begin:end]
            # Targets are the inputs shifted by one token.
            following = filedata[begin + 1:end + 1]

            x = [tok[0] for tok in current]
            y = [tok[0] for tok in following]

            # One row of per-type flags for each input token.
            masks = []
            for tok in current:
                flags = []
                for positions in file_positions:
                    flags.append(
                        tok[1] in positions and tok[0] not in non_vars)
                masks.append(flags)

            ids = [tok[1] in identifier_positions for tok in following]

            x_len = len(x)
            y_len = len(y)

            input_data[row, :x_len] = x
            targets[row, :y_len] = y
            # masks is (tokens, types); the storage layout is (types, tokens).
            masks_data[row, :, :x_len] = np.transpose(masks)
            id_usage_data[row, :y_len] = ids

            actual_lengths.append(y_len)

        containers.append(
            Container(input_data, targets, actual_lengths, masks_data,
                      id_usage_data))

    return containers
Ejemplo n.º 2
0
def get_data(path, listfile, seq_length, word_to_id):
    """Turn the files read by ``read_data`` into per-file ``Container`` objects.

    ``read_data(path, listfile, word_to_id)`` supplies, per file, the token
    data, the definition positions and the identifier-usage positions.  Each
    file is chopped into rows of ``seq_length`` tokens (the last row may be
    shorter and is zero-padded).  Targets are the input tokens shifted forward
    by one position.

    Args:
        path: Forwarded to ``read_data``.
        listfile: Forwarded to ``read_data``.
        seq_length: Width of every sequence row.
        word_to_id: Forwarded to ``read_data``; also used to resolve the ids
            of the indent and dedent tokens.

    Returns:
        list: one ``Container(input_data, targets, actual_lengths, masks_data,
        id_usage_data)`` per file.
    """
    all_data, raw_def_positions, identifier_usage = read_data(path, listfile, word_to_id)
    identifier_types = astwalker.possible_types()
    num_masks = len(identifier_types)

    # For each file: one list of positions per identifier type.
    def_positions = [
        [[pair[1] for pair in per_file if pair[0] == id_type] for id_type in identifier_types]
        for per_file in raw_def_positions
    ]

    # Prevent indent and dedent tokens from being flagged as variables, which can occur because
    # dedent in particular takes up no columns
    non_vars = [word_to_id[indent_token], word_to_id[dedent_token]]

    def new_grid(dtype, *dims):
        # Zero-initialised array of the requested shape and dtype.
        return np.zeros(list(dims), dtype=dtype)

    data = []
    for file_no, raw_tokens in enumerate(all_data):
        filedata = list(raw_tokens)
        file_positions = def_positions[file_no]
        identifier_positions = identifier_usage[file_no]

        num_sequences = math.ceil(len(filedata) / seq_length)
        input_data = new_grid("int64", num_sequences, seq_length)
        targets = new_grid("int64", num_sequences, seq_length)
        masks_data = new_grid("bool", num_sequences, num_masks, seq_length)
        id_usage_data = new_grid("bool", num_sequences, seq_length)
        actual_lengths = []

        for seq_no in range(num_sequences):
            lo = seq_no * seq_length
            hi = lo + seq_length
            inputs_window = filedata[lo:hi]
            targets_window = filedata[lo + 1:hi + 1]

            n_inputs = len(inputs_window)
            n_targets = len(targets_window)

            input_data[seq_no, :n_inputs] = [item[0] for item in inputs_window]
            targets[seq_no, :n_targets] = [item[0] for item in targets_window]

            # Rows are tokens, columns are identifier types; transposed on store.
            token_masks = [
                [item[1] in type_positions and item[0] not in non_vars
                 for type_positions in file_positions]
                for item in inputs_window
            ]
            masks_data[seq_no, :, :n_inputs] = np.transpose(token_masks)

            id_usage_data[seq_no, :n_targets] = [
                item[1] in identifier_positions for item in targets_window
            ]

            actual_lengths.append(n_targets)

        data.append(Container(input_data, targets, actual_lengths, masks_data, id_usage_data))

    return data
Ejemplo n.º 3
0
def map_token(map, token):
    """Resolve a token string to an ``(id, mask)`` pair.

    A token that starts with the marker ``"(*) "`` is flagged: the leading
    marker is stripped and the returned mask is 1; otherwise the mask is 0.

    Lookup order after stripping:
      1. the token itself in ``map``;
      2. if the token contains ``"|"``, its second ``|``-separated part and
         then its first part;
      3. otherwise, if the token starts with one of
         ``astwalker.possible_types()``, ``pyreader.oov_id`` is used.

    Every successful path returns the same ``(id, mask)`` shape, so the mask
    is no longer dropped by the fallback lookups.

    Args:
        map: Mapping from token string to id. (The name shadows the builtin
            but is kept so keyword callers keep working.)
        token: Token text, optionally prefixed with ``"(*) "``.

    Returns:
        tuple: ``(token_id, mask)`` with ``mask`` being 0 or 1.

    Raises:
        KeyError: If no lookup succeeds; the argument is the stripped token.
    """
    marker = "(*) "
    mask = 0
    if token.startswith(marker):
        mask = 1
        # Remove only the leading marker; occurrences inside the token text
        # are part of the token and must survive.
        token = token[len(marker):]

    if token in map:
        return map[token], mask

    # Not in map, is it an identifier?
    if "|" in token:
        parts = token.split("|")
        # The second part takes precedence over the first.
        for candidate in (parts[1], parts[0]):
            if candidate in map:
                return map[candidate], mask
    elif any(token.startswith(prefix) for prefix in astwalker.possible_types()):
        return pyreader.oov_id, mask

    raise KeyError(token)
Ejemplo n.º 4
0
def map_token(map, token):
    """Look up the id for ``token`` and report whether it carried the marker.

    Tokens beginning with ``"(*) "`` have that leading marker removed and are
    reported with ``mask == 1``; all other tokens get ``mask == 0``.

    The stripped token is resolved as follows:
      * directly through ``map`` when present;
      * for tokens containing ``"|"``: through the second ``|``-separated
        piece, falling back to the first piece;
      * for remaining tokens that start with any prefix from
        ``astwalker.possible_types()``: as ``pyreader.oov_id``.

    All successful paths return a two-element ``(id, mask)`` tuple. The
    fallback paths used to return one-element tuples, which lost the mask and
    could not be unpacked like the direct-hit result; ``result[0]`` still
    yields the id.

    Args:
        map: Token-string to id mapping (name kept for caller compatibility
            even though it shadows the builtin).
        token: Token text, optionally prefixed with ``"(*) "``.

    Returns:
        tuple: ``(token_id, mask)``.

    Raises:
        KeyError: When the token cannot be resolved by any rule above.
    """
    prefix_marker = "(*) "
    mask = 1 if token.startswith(prefix_marker) else 0
    if mask:
        # Strip just the prefix rather than every occurrence of the marker.
        token = token[len(prefix_marker):]

    if token in map:
        return map[token], mask

    # Not in map, is it an identifier?
    if "|" in token:
        spl = token.split("|")
        if spl[1] in map:
            return map[spl[1]], mask
        if spl[0] in map:
            return map[spl[0]], mask
    elif any(token.startswith(prefix) for prefix in astwalker.possible_types()):
        return pyreader.oov_id, mask

    raise KeyError(token)
Ejemplo n.º 5
0
 def is_identifier(token_id):
     """Return 1 if the token for ``token_id`` starts with an identifier-type prefix, else 0.

     The token text is read from ``self.inv_map``; ``self`` is taken from the
     enclosing scope (not visible here), and the prefixes come from
     ``astwalker.possible_types()``.
     """
     text = self.inv_map[token_id]
     for type_prefix in astwalker.possible_types():
         if text.startswith(type_prefix):
             return 1
     return 0
Ejemplo n.º 6
0
 def is_identifier(token_id):
     """Flag (1/0) whether the token behind ``token_id`` looks like an identifier.

     ``self.inv_map`` (``self`` is captured from the surrounding scope, which
     is outside this view) maps the id back to its token text; the token
     counts as an identifier when it begins with any of the prefixes returned
     by ``astwalker.possible_types()``.
     """
     token_text = self.inv_map[token_id]
     has_type_prefix = any(
         token_text.startswith(candidate)
         for candidate in astwalker.possible_types()
     )
     return 1 if has_type_prefix else 0
Ejemplo n.º 7
0
def adjust_flags():
    """Expand the ``FLAGS.attention`` string into a list of attention names.

    A truthy ``FLAGS.attention`` is split on ``"+"`` and stored back as a
    list.  When that list contains ``"identifiers"``, additional
    ``"identifiers"`` entries are appended: ``len(astwalker.possible_types()) - 1``
    of them.  A falsy ``FLAGS.attention`` is left untouched.
    """
    if not FLAGS.attention:
        return

    FLAGS.attention = FLAGS.attention.split("+")
    if "identifiers" not in FLAGS.attention:
        return

    # One entry is already present, so add one fewer than the number of types.
    extra_entries = len(astwalker.possible_types()) - 1
    FLAGS.attention.extend(["identifiers"] * extra_entries)
Ejemplo n.º 8
0
def adjust_flags():
    """Normalise ``FLAGS.attention`` from a ``"+"``-joined string to a list.

    Nothing happens when ``FLAGS.attention`` is falsy.  Otherwise the string
    is split on ``"+"`` and written back to ``FLAGS.attention``; if
    ``"identifiers"`` is among the parts, the list is extended in place with
    ``len(astwalker.possible_types()) - 1`` further ``"identifiers"`` entries.
    """
    if FLAGS.attention:
        FLAGS.attention = FLAGS.attention.split("+")
        wants_identifiers = "identifiers" in FLAGS.attention
        if wants_identifiers:
            type_count = len(astwalker.possible_types())
            padding = ["identifiers"] * (type_count - 1)
            FLAGS.attention.extend(padding)