Code Example #1
def load_dataset(transition_system, dataset_file, reorder_predicates=True):
    grammar = transition_system.grammar  # used by logical_form_to_ast below
    examples = []
    for idx, line in enumerate(open(dataset_file)):
        src_query, tgt_code = line.strip().split('\t')

        src_query_tokens = src_query.split(' ')

        lf = parse_lambda_expr(tgt_code)
        assert lf.to_string() == tgt_code

        if reorder_predicates:
            ordered_lf = get_canonical_order_of_logical_form(
                lf, order_by='alphabet')
            # the logical form is expected to already be in canonical order
            assert ordered_lf == lf
            lf = ordered_lf

        gold_source = lf.to_string()

        tgt_ast = logical_form_to_ast(grammar, lf)
        reconstructed_lf = ast_to_logical_form(tgt_ast)
        assert lf == reconstructed_lf

        tgt_actions = transition_system.get_actions(tgt_ast)

        print(idx)
        print('Utterance: %s' % src_query)
        print('Reference: %s' % tgt_code)
        # print('===== Actions =====')
        # sanity check
        hyp = Hypothesis()
        for action in tgt_actions:
            assert action.__class__ in transition_system.get_valid_continuation_types(
                hyp)
            if isinstance(action, ApplyRuleAction):
                assert action.production in transition_system.get_valid_continuating_productions(
                    hyp)
            hyp = hyp.clone_and_apply_action(action)
            # print(action)

        assert hyp.frontier_node is None and hyp.frontier_field is None

        src_from_hyp = transition_system.ast_to_surface_code(hyp.tree)
        assert src_from_hyp == gold_source

        tgt_action_infos = get_action_infos(src_query_tokens, tgt_actions)

        # print(' '.join(src_query_tokens))
        print('***')
        print(lf.to_string())
        print()
        example = Example(idx=idx,
                          src_sent=src_query_tokens,
                          tgt_actions=tgt_action_infos,
                          tgt_code=gold_source,
                          tgt_ast=tgt_ast,
                          meta=None)

        examples.append(example)

    return examples
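For context, a driver for a loader like the one above could be sketched as follows. The helper name build_examples, the import path, and the output path are assumptions, not part of the snippet; the grammar/transition-system construction mirrors Example #5 below and the pickle storage mirrors Example #3.

import pickle

from asdl.asdl import ASDLGrammar  # import path assumed from the tranX layout


def build_examples(asdl_file, dataset_file, out_file, transition_system_cls):
    """Hypothetical driver: build the grammar and transition system, load the
    dataset with load_dataset() above, and pickle the resulting examples."""
    grammar = ASDLGrammar.from_text(open(asdl_file).read())
    transition_system = transition_system_cls(grammar)
    examples = load_dataset(transition_system, dataset_file)
    pickle.dump(examples, open(out_file, 'wb'))
    return examples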
Code Example #2
    def load_regex_dataset(transition_system, split):
        prefix = 'data/regex/'
        src_file = join(prefix, "src-{}.txt".format(split))
        spec_file = join(prefix, "spec-{}.txt".format(split))

        examples = []
        for idx, (src_line,
                  spec_line) in enumerate(zip(open(src_file),
                                              open(spec_file))):
            print(idx)

            src_line = src_line.rstrip()
            spec_line = spec_line.rstrip()
            src_toks = src_line.split()

            spec_toks = spec_line.rstrip().split()
            spec_ast = regex_expr_to_ast(transition_system.grammar, spec_toks)

            # sanity check
            reconstructed_expr = transition_system.ast_to_surface_code(
                spec_ast)
            print(spec_line, reconstructed_expr)
            assert spec_line == reconstructed_expr

            tgt_actions = transition_system.get_actions(spec_ast)

            # sanity check
            hyp = Hypothesis()
            for action in tgt_actions:
                assert action.__class__ in transition_system.get_valid_continuation_types(
                    hyp)
                if isinstance(action, ApplyRuleAction):
                    assert action.production in transition_system.get_valid_continuating_productions(
                        hyp)
                hyp = hyp.clone_and_apply_action(action)

            assert hyp.frontier_node is None and hyp.frontier_field is None
            assert is_equal_ast(hyp.tree, spec_ast)

            expr_from_hyp = transition_system.ast_to_surface_code(hyp.tree)
            assert expr_from_hyp == spec_line

            tgt_action_infos = get_action_infos(src_toks, tgt_actions)
            example = Example(idx=idx,
                              src_sent=src_toks,
                              tgt_actions=tgt_action_infos,
                              tgt_code=spec_line,
                              tgt_ast=spec_ast,
                              meta=None)

            examples.append(example)
        return examples
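The replay-based sanity check above recurs almost verbatim in every example in this listing. Purely as an illustration (not part of any of the listed projects), it could be factored into a helper like this, using only names that already appear in the snippets:

def check_action_sequence(transition_system, tgt_actions, tgt_ast=None):
    """Replay tgt_actions on an empty Hypothesis, verifying that each action is
    a valid continuation and that the finished hypothesis has no open frontier."""
    hyp = Hypothesis()
    for action in tgt_actions:
        assert action.__class__ in transition_system.get_valid_continuation_types(hyp)
        if isinstance(action, ApplyRuleAction):
            assert action.production in transition_system.get_valid_continuating_productions(hyp)
        hyp = hyp.clone_and_apply_action(action)

    assert hyp.frontier_node is None and hyp.frontier_field is None
    if tgt_ast is not None:
        # optionally compare the rebuilt tree against the reference AST
        assert is_equal_ast(hyp.tree, tgt_ast)
    return hyp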
Code Example #3
def load_dataset(transition_system, path, num, reorder_predicates=True):
    grammar = transition_system.grammar

    examples = []
    pre_len = 0
    if os.path.exists('data/pdf/train.bin'):
        examples = pickle.load(open('data/pdf/train.bin', 'rb'))
        pre_len = len(examples)

    idx = 0
    for item in os.listdir(path):
        item_path = os.path.join(path, item)
        print(item)

        try:
            pdf = PdfReader(item_path)
        except Exception:
            # skip files that cannot be opened as PDFs
            continue

        for page in pdf.pages:
            idx += 1
            if idx <= pre_len:
                continue
            print(idx)

            try:
                tgt_ast = pdf_to_ast(grammar, page, [])
            except Exception:
                # skip pages that cannot be converted to an AST
                continue

            tgt_actions = transition_system.get_actions(tgt_ast)

            """
            hyp = Hypothesis()
            for action in tgt_actions:
                assert action.__class__ in transition_system.get_valid_continuation_types(hyp)
                if isinstance(action, ApplyRuleAction):
                    assert action.production in transition_system.get_valid_continuating_productions(hyp)
                hyp = hyp.clone_and_apply_action(action)
            assert hyp.frontier_node is None and hyp.frontier_field is None 
            """

            tgt_action_infos = get_action_infos(tgt_actions)

            example = Example(idx=idx, tgt_actions=tgt_action_infos, meta=None)

            examples.append(example)

        if idx >= num:
            break

    return examples
Code Example #4
File: dataset.py Project: Amirutha/tranX
def load_dataset(transition_system, dataset_file):
    examples = []
    for idx, line in enumerate(open(dataset_file)):
        print(line)
        src_query, tgt_code = line.strip().split('~')

        tgt_code = tgt_code.replace("("," ( ")
        tgt_code = tgt_code.replace(")"," ) ")
        tgt_code = " ".join(tgt_code.split())
        src_query = src_query.replace("(","")
        src_query = src_query.replace(")","")
        src_query_tokens = src_query.split(' ')

        tgt_ast = lisp_expr_to_ast(transition_system.grammar, tgt_code)
        reconstructed_lisp_expr = ast_to_lisp_expr(tgt_ast)
        assert tgt_code == reconstructed_lisp_expr

        tgt_actions = transition_system.get_actions(tgt_ast)

        # sanity check
        hyp = Hypothesis()
        for action in tgt_actions:
            assert action.__class__ in transition_system.get_valid_continuation_types(hyp)
            if isinstance(action, ApplyRuleAction):
                assert action.production in transition_system.get_valid_continuating_productions(hyp)
            hyp = hyp.clone_and_apply_action(action)

        assert hyp.frontier_node is None and hyp.frontier_field is None

        assert is_equal_ast(hyp.tree, tgt_ast)

        expr_from_hyp = transition_system.ast_to_surface_code(hyp.tree)
        assert expr_from_hyp == tgt_code

        tgt_action_infos = get_action_infos(src_query_tokens, tgt_actions)

        print(idx)
        example = Example(idx=idx,
                          src_sent=src_query_tokens,
                          tgt_actions=tgt_action_infos,
                          tgt_code=tgt_code,
                          tgt_ast=tgt_ast,
                          meta=None)

        examples.append(example)

    return examples
Code Example #5
File: dataset.py Project: rogeriochaves/tranX
    def parse_django_dataset(annot_file,
                             code_file,
                             asdl_file_path,
                             max_query_len=70,
                             vocab_freq_cutoff=10):
        asdl_text = open(asdl_file_path).read()
        grammar = ASDLGrammar.from_text(asdl_text)
        transition_system = PythonTransitionSystem(grammar)

        loaded_examples = []

        from components.vocab import Vocab, VocabEntry
        from components.dataset import Example

        for idx, (src_query,
                  tgt_code) in enumerate(zip(open(annot_file),
                                             open(code_file))):
            src_query = src_query.strip()
            tgt_code = tgt_code.strip()

            src_query_tokens, tgt_canonical_code, str_map = Django.canonicalize_example(
                src_query, tgt_code)
            python_ast = ast.parse(tgt_canonical_code).body[0]
            gold_source = astor.to_source(python_ast).strip()
            tgt_ast = python_ast_to_asdl_ast(python_ast, grammar)
            tgt_actions = transition_system.get_actions(tgt_ast)

            # print('+' * 60)
            # print('Example: %d' % idx)
            # print('Source: %s' % ' '.join(src_query_tokens))
            # if str_map:
            #     print('Original String Map:')
            #     for str_literal, str_repr in str_map.items():
            #         print('\t%s: %s' % (str_literal, str_repr))
            # print('Code:\n%s' % gold_source)
            # print('Actions:')

            # sanity check
            try:
                hyp = Hypothesis()
                for t, action in enumerate(tgt_actions):
                    # assert action.__class__ in transition_system.get_valid_continuation_types(hyp)
                    # if isinstance(action, ApplyRuleAction):
                    #     assert action.production in transition_system.get_valid_continuating_productions(hyp)

                    p_t = -1
                    f_t = None
                    if hyp.frontier_node:
                        p_t = hyp.frontier_node.created_time
                        f_t = hyp.frontier_field.field.__repr__(plain=True)

                    # print('\t[%d] %s, frontier field: %s, parent: %d' % (t, action, f_t, p_t))
                    hyp = hyp.clone_and_apply_action(action)

                assert hyp.frontier_node is None and hyp.frontier_field is None

                src_from_hyp = astor.to_source(
                    asdl_ast_to_python_ast(hyp.tree, grammar)).strip()
                assert src_from_hyp == gold_source

                # print('+' * 60)
            except Exception:
                # skip examples whose action replay or round-trip check fails
                continue

            loaded_examples.append({
                'src_query_tokens': src_query_tokens,
                'tgt_canonical_code': gold_source,
                'tgt_ast': tgt_ast,
                'tgt_actions': tgt_actions,
                'raw_code': tgt_code,
                'str_map': str_map
            })

            # print('first pass, processed %d' % idx, file=sys.stderr)

        train_examples = []
        dev_examples = []
        test_examples = []

        action_len = []

        for idx, e in enumerate(loaded_examples):
            src_query_tokens = e['src_query_tokens'][:max_query_len]
            tgt_actions = e['tgt_actions']
            tgt_action_infos = get_action_infos(src_query_tokens, tgt_actions)

            example = Example(idx=idx,
                              src_sent=src_query_tokens,
                              tgt_actions=tgt_action_infos,
                              tgt_code=e['tgt_canonical_code'],
                              tgt_ast=e['tgt_ast'],
                              meta={
                                  'raw_code': e['raw_code'],
                                  'str_map': e['str_map']
                              })

            # print('second pass, processed %d' % idx, file=sys.stderr)

            action_len.append(len(tgt_action_infos))

            # train, valid, test split
            if 0 <= idx < 16000:
                train_examples.append(example)
            elif 16000 <= idx < 17000:
                dev_examples.append(example)
            else:
                test_examples.append(example)

        print('Max action len: %d' % max(action_len), file=sys.stderr)
        print('Avg action len: %d' % np.average(action_len), file=sys.stderr)
        print('Actions larger than 100: %d' %
              len(list(filter(lambda x: x > 100, action_len))),
              file=sys.stderr)

        src_vocab = VocabEntry.from_corpus(
            [e.src_sent for e in train_examples],
            size=5000,
            freq_cutoff=vocab_freq_cutoff)

        primitive_tokens = [
            map(
                lambda a: a.action.token,
                filter(lambda a: isinstance(a.action, GenTokenAction),
                       e.tgt_actions)) for e in train_examples
        ]

        primitive_vocab = VocabEntry.from_corpus(primitive_tokens,
                                                 size=5000,
                                                 freq_cutoff=vocab_freq_cutoff)
        assert '_STR:0_' in primitive_vocab

        # generate vocabulary for the code tokens!
        code_tokens = [
            tokenize_code(e.tgt_code, mode='decoder') for e in train_examples
        ]
        code_vocab = VocabEntry.from_corpus(code_tokens,
                                            size=5000,
                                            freq_cutoff=vocab_freq_cutoff)

        vocab = Vocab(source=src_vocab,
                      primitive=primitive_vocab,
                      code=code_vocab)
        print('generated vocabulary %s' % repr(vocab), file=sys.stderr)

        return (train_examples, dev_examples, test_examples), vocab
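A caller of parse_django_dataset would typically persist the returned splits and vocabulary. A minimal sketch with pickle follows; the argument values and output file names are placeholders (and the method is shown as a plain function call for brevity), not paths taken from the project.

import pickle

(train_examples, dev_examples, test_examples), vocab = parse_django_dataset(
    annot_file='all.anno', code_file='all.code', asdl_file_path='py_asdl.txt')

for split_name, split in [('train', train_examples),
                          ('dev', dev_examples),
                          ('test', test_examples)]:
    pickle.dump(split, open('django.%s.bin' % split_name, 'wb'))
pickle.dump(vocab, open('django.vocab.bin', 'wb'))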
Code Example #6
File: dataset.py Project: zkcpku/tranX
def preprocess_dataset(file_path,
                       transition_system,
                       name='train',
                       firstk=None):
    try:
        dataset = json.load(open(file_path))
    except ValueError:
        # fall back to JSON Lines: one JSON object per line
        dataset = [json.loads(jline) for jline in open(file_path).readlines()]
    if firstk:
        dataset = dataset[:firstk]
    examples = []
    evaluator = ConalaEvaluator(transition_system)
    f = open(file_path + '.debug', 'w')
    skipped_list = []
    for i, example_json in enumerate(dataset):
        try:
            example_dict = preprocess_example(example_json)

            python_ast = ast.parse(example_dict['canonical_snippet'])
            canonical_code = astor.to_source(python_ast).strip()
            tgt_ast = python_ast_to_asdl_ast(python_ast,
                                             transition_system.grammar)
            tgt_actions = transition_system.get_actions(tgt_ast)

            # sanity check
            hyp = Hypothesis()
            for t, action in enumerate(tgt_actions):
                assert action.__class__ in transition_system.get_valid_continuation_types(
                    hyp)
                if isinstance(action, ApplyRuleAction):
                    assert action.production in transition_system.get_valid_continuating_productions(
                        hyp)
                # p_t = -1
                # f_t = None
                # if hyp.frontier_node:
                #     p_t = hyp.frontier_node.created_time
                #     f_t = hyp.frontier_field.field.__repr__(plain=True)
                #
                # # print('\t[%d] %s, frontier field: %s, parent: %d' % (t, action, f_t, p_t))
                hyp = hyp.clone_and_apply_action(action)

            assert hyp.frontier_node is None and hyp.frontier_field is None
            hyp.code = code_from_hyp = astor.to_source(
                asdl_ast_to_python_ast(hyp.tree,
                                       transition_system.grammar)).strip()
            # print(code_from_hyp)
            # print(canonical_code)
            assert code_from_hyp == canonical_code

            decanonicalized_code_from_hyp = decanonicalize_code(
                code_from_hyp, example_dict['slot_map'])
            assert compare_ast(ast.parse(example_json['snippet']),
                               ast.parse(decanonicalized_code_from_hyp))
            assert transition_system.compare_ast(
                transition_system.surface_code_to_ast(
                    decanonicalized_code_from_hyp),
                transition_system.surface_code_to_ast(example_json['snippet']))

            tgt_action_infos = get_action_infos(example_dict['intent_tokens'],
                                                tgt_actions)
        except (AssertionError, SyntaxError, ValueError, OverflowError) as e:
            skipped_list.append(example_json['question_id'])
            continue
        example = Example(idx=f'{i}-{example_json["question_id"]}',
                          src_sent=example_dict['intent_tokens'],
                          tgt_actions=tgt_action_infos,
                          tgt_code=canonical_code,
                          tgt_ast=tgt_ast,
                          meta=dict(example_dict=example_json,
                                    slot_map=example_dict['slot_map']))
        assert evaluator.is_hyp_correct(example, hyp)

        examples.append(example)

        # log!
        f.write(f'Example: {example.idx}\n')
        if 'rewritten_intent' in example.meta['example_dict']:
            f.write(
                f"Original Utterance: {example.meta['example_dict']['rewritten_intent']}\n"
            )
        else:
            f.write(
                f"Original Utterance: {example.meta['example_dict']['intent']}\n"
            )
        f.write(
            f"Original Snippet: {example.meta['example_dict']['snippet']}\n")
        f.write(f"\n")
        f.write(f"Utterance: {' '.join(example.src_sent)}\n")
        f.write(f"Snippet: {example.tgt_code}\n")
        f.write(
            f"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n"
        )

    f.close()
    print('Skipped due to exceptions: %d' % len(skipped_list), file=sys.stderr)
    return examples
Code Example #7
    def load_regex_dataset(transition_system, split):
        prefix = 'data/streg/'
        src_file = join(prefix, "src-{}.txt".format(split))
        spec_file = join(prefix, "targ-{}.txt".format(split))
        map_file = join(prefix, "map-{}.txt".format(split))
        exs_file = join(prefix, "exs-{}.txt".format(split))
        rec_file = join(prefix, "rec-{}.pkl".format(split))

        exs_info = StReg.load_examples(exs_file)
        map_info = StReg.load_map_file(map_file)
        rec_info = StReg.load_rec(rec_file)

        examples = []
        for idx, (src_line, spec_line, str_exs, cmap, rec) in enumerate(
                zip(open(src_file), open(spec_file), exs_info, map_info,
                    rec_info)):
            print(idx)

            src_line = src_line.rstrip()
            spec_line = spec_line.rstrip()
            src_toks = src_line.split()

            spec_toks = spec_line.rstrip().split()
            spec_ast = streg_expr_to_ast(transition_system.grammar, spec_toks)
            # sanity check
            reconstructed_expr = transition_system.ast_to_surface_code(
                spec_ast)
            # print("Spec", spec_line)
            # print("Rcon", reconstructed_expr)
            assert spec_line == reconstructed_expr

            tgt_actions = transition_system.get_actions(spec_ast)
            # sanity check
            hyp = Hypothesis()
            for action in tgt_actions:
                assert action.__class__ in transition_system.get_valid_continuation_types(
                    hyp)
                if isinstance(action, ApplyRuleAction):
                    assert action.production in transition_system.get_valid_continuating_productions(
                        hyp)
                hyp = hyp.clone_and_apply_action(action)

            assert hyp.frontier_node is None and hyp.frontier_field is None
            assert is_equal_ast(hyp.tree, spec_ast)

            expr_from_hyp = transition_system.ast_to_surface_code(hyp.tree)
            assert expr_from_hyp == spec_line

            tgt_action_infos = get_action_infos(src_toks, tgt_actions)

            example = Example(idx=idx,
                              src_sent=src_toks,
                              tgt_actions=tgt_action_infos,
                              tgt_code=spec_line,
                              tgt_ast=spec_ast,
                              meta={
                                  "str_exs": str_exs,
                                  "const_map": cmap,
                                  "worker_info": rec
                              })
            examples.append(example)
        return examples
Code Example #8
def preprocess_dataset(file_path, transition_system, name='train'):
    dataset = json.load(open(file_path))
    examples = []
    evaluator = ConalaEvaluator(transition_system)

    f = open(file_path + '.debug', 'w')

    for i, example_json in enumerate(dataset):
        example_dict = preprocess_example(example_json)
        if example_json['question_id'] in (18351951, 9497290, 19641579,
                                           32283692):
            pprint(preprocess_example(example_json))
            continue

        python_ast = ast.parse(example_dict['canonical_snippet'])
        canonical_code = astor.to_source(python_ast).strip()
        tgt_ast = python_ast_to_asdl_ast(python_ast, transition_system.grammar)
        tgt_actions = transition_system.get_actions(tgt_ast)

        # sanity check
        hyp = Hypothesis()
        for t, action in enumerate(tgt_actions):
            assert action.__class__ in transition_system.get_valid_continuation_types(
                hyp)
            if isinstance(action, ApplyRuleAction):
                assert action.production in transition_system.get_valid_continuating_productions(
                    hyp)

            p_t = -1
            f_t = None
            if hyp.frontier_node:
                p_t = hyp.frontier_node.created_time
                f_t = hyp.frontier_field.field.__repr__(plain=True)

            # print('\t[%d] %s, frontier field: %s, parent: %d' % (t, action, f_t, p_t))
            hyp = hyp.clone_and_apply_action(action)

        assert hyp.frontier_node is None and hyp.frontier_field is None
        hyp.code = code_from_hyp = astor.to_source(
            asdl_ast_to_python_ast(hyp.tree,
                                   transition_system.grammar)).strip()
        assert code_from_hyp == canonical_code

        decanonicalized_code_from_hyp = decanonicalize_code(
            code_from_hyp, example_dict['slot_map'])
        assert compare_ast(ast.parse(example_json['snippet']),
                           ast.parse(decanonicalized_code_from_hyp))
        assert transition_system.compare_ast(
            transition_system.surface_code_to_ast(
                decanonicalized_code_from_hyp),
            transition_system.surface_code_to_ast(example_json['snippet']))

        tgt_action_infos = get_action_infos(example_dict['intent_tokens'],
                                            tgt_actions)

        example = Example(idx=f'{i}-{example_json["question_id"]}',
                          src_sent=example_dict['intent_tokens'],
                          tgt_actions=tgt_action_infos,
                          tgt_code=canonical_code,
                          tgt_ast=tgt_ast,
                          meta=dict(example_dict=example_json,
                                    slot_map=example_dict['slot_map']))
        assert evaluator.is_hyp_correct(example, hyp)

        examples.append(example)

        # log!
        f.write(f'Example: {example.idx}\n')
        f.write(
            f"Original Utterance: {example.meta['example_dict']['rewritten_intent']}\n"
        )
        f.write(
            f"Original Snippet: {example.meta['example_dict']['snippet']}\n")
        f.write(f"\n")
        f.write(f"Utterance: {' '.join(example.src_sent)}\n")
        f.write(f"Snippet: {example.tgt_code}\n")
        f.write(
            f"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n"
        )

    f.close()

    return examples
Code Example #9
def preprocess_dataset(file_path, transition_system, name='train', firstk=None):
    file_path = os.path.join(os.getcwd(), *file_path.split('/' if '/' in file_path else "\\"))

    try:
        dataset = json.load(open(file_path))
    except ValueError:
        # fall back to JSON Lines: one JSON object per line
        dataset = [json.loads(jline) for jline in open(file_path).readlines()]
    if firstk:
        dataset = dataset[:firstk]
    examples = []
    evaluator = ConalaEvaluator(transition_system)

    # Author: Gabe
    # Added in encoding to try and deal with UnicodeEncodeErrors
    f = open(file_path + '.debug', 'w', encoding='utf-8')

    skipped_list = []
    for i, example_json in tqdm(enumerate(dataset), file=sys.stdout, total=len(dataset),
                                desc='Preproc'):

        # Author: Gabe
        # Have to skip this one question because it causes the program to hang and never recover.
        if example_json['question_id'] in [39525993]:
            skipped_list.append(example_json['question_id'])
            tqdm.write(f"Skipping {example_json['question_id']} because it causes errors")
            continue
        try:
            example_dict = preprocess_example(example_json)

            python_ast = ast.parse(example_dict['canonical_snippet'])
            canonical_code = astor.to_source(python_ast).strip()
            tgt_ast = python_ast_to_asdl_ast(python_ast, transition_system.grammar)
            tgt_actions = transition_system.get_actions(tgt_ast)

            # sanity check
            hyp = Hypothesis()
            for t, action in enumerate(tgt_actions):
                assert action.__class__ in transition_system.get_valid_continuation_types(hyp)
                if isinstance(action, ApplyRuleAction):
                    assert action.production in \
                           transition_system.get_valid_continuating_productions(
                               hyp)
                # p_t = -1
                # f_t = None
                # if hyp.frontier_node:
                #     p_t = hyp.frontier_node.created_time
                #     f_t = hyp.frontier_field.field.__repr__(plain=True)
                #
                # # print('\t[%d] %s, frontier field: %s, parent: %d' % (t, action, f_t, p_t))
                hyp = hyp.clone_and_apply_action(action)

            assert hyp.frontier_node is None and hyp.frontier_field is None
            hyp.code = code_from_hyp = astor.to_source(
                asdl_ast_to_python_ast(hyp.tree, transition_system.grammar)).strip()
            # print(code_from_hyp)
            # print(canonical_code)
            assert code_from_hyp == canonical_code

            decanonicalized_code_from_hyp = decanonicalize_code(code_from_hyp,
                                                                example_dict['slot_map'])
            assert compare_ast(ast.parse(example_json['snippet']),
                               ast.parse(decanonicalized_code_from_hyp))
            assert transition_system.compare_ast(
                transition_system.surface_code_to_ast(decanonicalized_code_from_hyp),
                transition_system.surface_code_to_ast(example_json['snippet']))

            tgt_action_infos = get_action_infos(example_dict['intent_tokens'], tgt_actions)
        except (AssertionError, SyntaxError, ValueError, OverflowError) as e:
            skipped_list.append(example_json['question_id'])
            tqdm.write(
                f"Skipping example {example_json['question_id']} because of {type(e).__name__}:{e}"
            )
            continue
        example = Example(idx=f'{i}-{example_json["question_id"]}',
                          src_sent=example_dict['intent_tokens'],
                          tgt_actions=tgt_action_infos,
                          tgt_code=canonical_code,
                          tgt_ast=tgt_ast,
                          meta=dict(example_dict=example_json,
                                    slot_map=example_dict['slot_map']))
        assert evaluator.is_hyp_correct(example, hyp)

        examples.append(example)

        # Author: Gabe
        # Had to remove logging, when the log file would get too large, it would cause the
        # program to hang.

        # log!
        # f.write(f'Example: {example.idx}\n')
        # if 'rewritten_intent' in example.meta['example_dict']:
        #     f.write(f"Original Utterance: {example.meta['example_dict']['rewritten_intent']}\n")
        # else:
        #     f.write(f"Original Utterance: {example.meta['example_dict']['intent']}\n")
        # f.write(f"Original Snippet: {example.meta['example_dict']['snippet']}\n")
        # f.write(f"\n")
        # f.write(f"Utterance: {' '.join(example.src_sent)}\n")
        # f.write(f"Snippet: {example.tgt_code}\n")
        # f.write(f"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n")

    f.close()
    print('Skipped due to exceptions: %d' % len(skipped_list), file=sys.stderr)
    return examples
Code Example #10
    def parse_natural_dataset(asdl_file_path,
                              max_query_len=70,
                              vocab_freq_cutoff=10):
        asdl_text = open(asdl_file_path).read()
        print('building grammar')
        grammar = ASDLGrammar.from_text(asdl_text)
        transition_system = Python3TransitionSystem(grammar)

        loaded_examples = []

        annotations = []
        codes = []
        path = os.path.join(os.path.dirname(__file__), "datagen")
        datagens = os.listdir(path)
        for folder in datagens:
            if "__" in folder or not os.path.isdir(os.path.join(path, folder)):
                continue
            with open(os.path.join(path, folder, "inputs.txt"), 'r') as file:
                annotations += file.read().split('\n')
            with open(os.path.join(path, folder, "outputs.txt"), 'r') as file:
                codes += file.read().split('\n')
        annotation_codes = list(zip(annotations, codes))
        np.random.seed(42)
        np.random.shuffle(annotation_codes)

        from components.vocab import Vocab, VocabEntry
        from components.dataset import Example

        print('processing examples')
        for idx, (src_query, tgt_code) in enumerate(annotation_codes):
            if (idx % 100 == 0):
                sys.stdout.write("\r%s / %s" % (idx, len(annotation_codes)))
                sys.stdout.flush()

            src_query = src_query.strip()
            tgt_code = tgt_code.strip()

            src_query_tokens, tgt_canonical_code, str_map = Natural.canonicalize_example(
                src_query, tgt_code)
            python_ast = ast.parse(tgt_canonical_code)  #.body[0]
            gold_source = astor.to_source(python_ast).strip()
            tgt_ast = python_ast_to_asdl_ast(python_ast,
                                             transition_system.grammar)
            tgt_actions = transition_system.get_actions(tgt_ast)
            # print('+' * 60)
            # print('Example: %d' % idx)
            # print('Source: %s' % ' '.join(src_query_tokens))
            # if str_map:
            #     print('Original String Map:')
            #     for str_literal, str_repr in str_map.items():
            #         print('\t%s: %s' % (str_literal, str_repr))
            # print('Code:\n%s' % gold_source)
            # print('Actions:')

            # sanity check
            hyp = Hypothesis()
            for t, action in enumerate(tgt_actions):
                assert action.__class__ in transition_system.get_valid_continuation_types(
                    hyp)
                if isinstance(action, ApplyRuleAction):
                    assert action.production in transition_system.get_valid_continuating_productions(
                        hyp)
                # assert action.__class__ in transition_system.get_valid_continuation_types(
                # hyp)

                p_t = -1
                f_t = None
                if hyp.frontier_node:
                    p_t = hyp.frontier_node.created_time
                    f_t = hyp.frontier_field.field.__repr__(plain=True)

                # print('\t[%d] %s, frontier field: %s, parent: %d' %
                #     (t, action, f_t, p_t))
                hyp = hyp.clone_and_apply_action(action)

            # assert hyp.frontier_node is None and hyp.frontier_field is None

            src_from_hyp = astor.to_source(
                asdl_ast_to_python_ast(hyp.tree, grammar)).strip()
            # skip the round-trip check for code containing bytes literals
            if "b'" not in str(gold_source) and 'b"' not in str(gold_source):
                assert src_from_hyp == gold_source

            # print('+' * 60)

            loaded_examples.append({
                'src_query_tokens': src_query_tokens,
                'tgt_canonical_code': gold_source,
                'tgt_ast': tgt_ast,
                'tgt_actions': tgt_actions,
                'raw_code': tgt_code,
                'str_map': str_map
            })

            # print('first pass, processed %d' % idx, file=sys.stderr)

        train_examples = []
        dev_examples = []
        test_examples = []

        action_len = []

        print("\nsplitting train/dev/test")
        for idx, e in enumerate(loaded_examples):
            src_query_tokens = e['src_query_tokens'][:max_query_len]
            tgt_actions = e['tgt_actions']
            tgt_action_infos = get_action_infos(src_query_tokens, tgt_actions)

            example = Example(idx=idx,
                              src_sent=src_query_tokens,
                              tgt_actions=tgt_action_infos,
                              tgt_code=e['tgt_canonical_code'],
                              tgt_ast=e['tgt_ast'],
                              meta={
                                  'raw_code': e['raw_code'],
                                  'str_map': e['str_map']
                              })

            # print('second pass, processed %d' % idx, file=sys.stderr)

            action_len.append(len(tgt_action_infos))

            # train, valid, test split
            total_examples = len(loaded_examples)
            split_size = np.ceil(total_examples * 0.05)
            (dev_split, test_split) = (total_examples - split_size * 2,
                                       total_examples - split_size)
            if 0 <= idx < dev_split:
                train_examples.append(example)
            elif dev_split <= idx < test_split:
                dev_examples.append(example)
            else:
                test_examples.append(example)

        print('Max action len: %d' % max(action_len), file=sys.stderr)
        print('Avg action len: %d' % np.average(action_len), file=sys.stderr)
        print('Actions larger than 100: %d' %
              len(list(filter(lambda x: x > 100, action_len))),
              file=sys.stderr)

        src_vocab = VocabEntry.from_corpus(
            [e.src_sent for e in train_examples],
            size=5000,
            freq_cutoff=vocab_freq_cutoff)

        primitive_tokens = [
            map(
                lambda a: a.action.token,
                filter(lambda a: isinstance(a.action, GenTokenAction),
                       e.tgt_actions)) for e in train_examples
        ]

        primitive_vocab = VocabEntry.from_corpus(primitive_tokens,
                                                 size=5000,
                                                 freq_cutoff=vocab_freq_cutoff)
        # assert '_STR:0_' in primitive_vocab

        # generate vocabulary for the code tokens!
        code_tokens = [
            tokenize_code(e.tgt_code, mode='decoder') for e in train_examples
        ]
        code_vocab = VocabEntry.from_corpus(code_tokens,
                                            size=5000,
                                            freq_cutoff=vocab_freq_cutoff)

        vocab = Vocab(source=src_vocab,
                      primitive=primitive_vocab,
                      code=code_vocab)
        print('generated vocabulary %s' % repr(vocab), file=sys.stderr)

        return (train_examples, dev_examples, test_examples), vocab
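Examples #5 and #10 end with the same vocabulary-building boilerplate. As an illustration only (again using just the names that already appear in those snippets), that block could be factored out like this:

def build_vocab(train_examples, size=5000, freq_cutoff=10):
    """Build source / primitive / code vocabularies from the training split,
    mirroring the shared block at the end of Examples #5 and #10."""
    src_vocab = VocabEntry.from_corpus(
        [e.src_sent for e in train_examples],
        size=size, freq_cutoff=freq_cutoff)

    # tokens emitted by GenTokenAction form the primitive vocabulary
    primitive_tokens = [
        [a.action.token for a in e.tgt_actions
         if isinstance(a.action, GenTokenAction)]
        for e in train_examples
    ]
    primitive_vocab = VocabEntry.from_corpus(
        primitive_tokens, size=size, freq_cutoff=freq_cutoff)

    # vocabulary over surface code tokens
    code_tokens = [tokenize_code(e.tgt_code, mode='decoder')
                   for e in train_examples]
    code_vocab = VocabEntry.from_corpus(
        code_tokens, size=size, freq_cutoff=freq_cutoff)

    return Vocab(source=src_vocab, primitive=primitive_vocab, code=code_vocab)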