Example #1
from utils.dataset import Dataset                    # assumed project-local import path
from utils.code_processing import tokenize_raw_code  # assumed project-local import path


def main(args):
    dataset = Dataset(args['TAR_FILES'])
    code_line_file = open(args['OUTPUT_CODE_FILE'], 'w')
    all_preserved_tokens = set()
    for example in dataset.get_iterator(num_workers=5):
        code = example.ast.code
        code_tokens = tokenize_raw_code(code)
        # tokens wrapped in '@@' markers are preserved verbatim by the tokenizer
        preserved_tokens = [token for token in code_tokens
                            if token.startswith('@@') and token.endswith('@@')]
        all_preserved_tokens.update(preserved_tokens)

        # one whitespace-joined tokenized example per line
        code_line_file.write(' '.join(code_tokens) + '\n')

    code_line_file.close()

    with open(args['OUTPUT_CODE_FILE'] + '.preserved_tokens.txt', 'w') as f:
        for token in all_preserved_tokens:
            f.write(token + '\n')
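The function reads `args['TAR_FILES']` and `args['OUTPUT_CODE_FILE']`, which matches the docopt argument-dict pattern visible in Example #2 below. As a minimal sketch, assuming a hypothetical script name and usage docstring, the entry point might look like this:

"""
Usage:
    extract_code.py TAR_FILES OUTPUT_CODE_FILE
"""
from docopt import docopt

if __name__ == '__main__':
    # docopt parses the docstring above into a dict such as
    # {'TAR_FILES': 'data.tar', 'OUTPUT_CODE_FILE': 'code.txt'}
    main(docopt(__doc__))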
Example #2
from docopt import docopt  # assumed import; docopt turns the usage docstring into an args dict
# Dataset is assumed to come from the same project-local module as in Example #1.


def main():  # enclosing function assumed: the original fragment begins mid-function
    args = docopt(__doc__)
    vocab_size = int(args['--size'])
    vocab_file = args['VOCAB_FILE']
    train_set = Dataset(args['TRAIN_FILE'])

    src_code_tokens_file = vocab_file + '.src_code_tokens.txt'
    src_preserved_tokens = set()
    f_src_token = open(src_code_tokens_file, 'w')

    # extract vocab and node types
    node_types = set()
    src_words = []
    tgt_words = []
    identifier_names = []
    type_tokens = []
    for example in train_set.get_iterator(progress=True, num_workers=5):
        for node in example.ast:
            node_types.add(node.node_type)

            if node.is_variable_node:
                old_var_name = node.old_name
                new_var_name = node.new_name

                src_words.append(old_var_name)

                if old_var_name != new_var_name:
                    tgt_words.append(new_var_name)

            # parentheses matter here: without them, 'and' binds tighter than
            # 'or', and an 'obj' node lacking a 'name' attribute would raise
            if (node.node_type == 'obj' or node.node_type == 'block') and hasattr(node, 'name'):
                identifier_names.append(node.name)
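The fragment ends before the collected word lists are turned into an actual vocabulary, even though `vocab_size` and `vocab_file` are read at the top. A minimal sketch of one plausible continuation, assuming a simple frequency cutoff (the real project may build its vocabulary differently; the `.src.txt` suffix is hypothetical):

    from collections import Counter

    # keep only the vocab_size most frequent source tokens; anything rarer
    # would be mapped to an <unk> entry at lookup time
    src_freq = Counter(src_words)
    with open(vocab_file + '.src.txt', 'w') as f:
        for word, _ in src_freq.most_common(vocab_size):
            f.write(word + '\n')

    f_src_token.close()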