Example #1
evaluate_parser.add_argument('-is_nbest', default=False, action='store_true')

# misc
parser.add_argument('-ifttt_test_split', default='data/ifff.test_data.gold.id')

# interactive operation
interactive_parser.add_argument('-mode', default='dataset')

if __name__ == '__main__':
    args = parser.parse_args()

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    np.random.seed(args.random_seed)
    init_logging(os.path.join(args.output_dir, 'parser.log'), logging.INFO)
    logging.info('command line: %s', ' '.join(sys.argv))

    logging.info('loading dataset [%s]', args.data)
    train_data, dev_data, test_data = deserialize_from_file(args.data)

    if not args.source_vocab_size:
        args.source_vocab_size = train_data.annot_vocab.size
    if not args.target_vocab_size:
        args.target_vocab_size = train_data.terminal_vocab.size
    if not args.rule_num:
        args.rule_num = len(train_data.grammar.rules)
    if not args.node_num:
        args.node_num = len(train_data.grammar.node_type_to_id)

    logging.info('current config: %s', args)
Example #2
        if breakCamelStr:
            sub_tokens = re.sub(r'([a-z])([A-Z])', r'\1 #MERGE# \2',
                                tokval).split(' ')
            tokens.extend(sub_tokens)
        else:
            tokens.append(tokval)

        if toknum == tk.STRING:
            tokens.append(quote)

    return tokens


if __name__ == '__main__':
    from nn.utils.generic_utils import init_logging
    init_logging('misc.log')

    # django_code_file = '/Users/yinpengcheng/Research/SemanticParsing/CodeGeneration/en-django/all.code'
    #
    # grammar, parse_trees = extract_grammar(django_code_file)
    # id = 1888
    # parse_tree = parse_trees[id]
    # print parse_tree
    # from components import Hyp
    # hyp = Hyp(grammar)
    # rules, rule_parents = parse_tree.get_productions()
    #
    # while hyp.frontier_nt():
    #     nt = hyp.frontier_nt()
    #     if grammar.is_value_node(nt):
    #         hyp.append_token('111<eos>')
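The camelCase-splitting step above is easy to misread, so here is a small standalone sketch (not part of the repository) of what the regex produces for a typical identifier:

import re

# Hypothetical input; the substitution marks every lower-to-upper case boundary with '#MERGE#',
# and the split then yields the sub-tokens interleaved with the merge markers.
tokval = 'getUserName'
sub_tokens = re.sub(r'([a-z])([A-Z])', r'\1 #MERGE# \2', tokval).split(' ')
print(sub_tokens)  # ['get', '#MERGE#', 'User', '#MERGE#', 'Name']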
Example #3
    f_gold = open('data/ifff.test_data.gold.id', 'w')
    for url in lt_three_agree_with_gold:
        i = url2id[url]
        f_gold.write(str(i) + '\n')
    f_gold.close()

    f_gold = open('data/ifff.test_data.omit_unintelligible.id', 'w')
    for url in omit_unintelligible_examples:
        i = url2id[url]
        f_gold.write(str(i) + '\n')
    f_gold.close()

    f_gold = open('data/ifff.test_data.omit_non_english.id', 'w')
    for url in omit_non_english_examples:
        i = url2id[url]
        f_gold.write(str(i) + '\n')
    f_gold.close()

    omit_non_english_examples = [url2id[url] for url in omit_non_english_examples]
    omit_unintelligible_examples = [url2id[url] for url in omit_unintelligible_examples]
    lt_three_agree_with_gold = [url2id[url] for url in lt_three_agree_with_gold]

    return omit_non_english_examples, omit_unintelligible_examples, lt_three_agree_with_gold

if __name__ == '__main__':
    init_logging('ifttt.log')
    # parse_ifttt_dataset()
    # analyze_ifttt_dataset()
    extract_turk_data()
    # parse_data_for_seq2seq()
Example #4
    for url in omit_unintelligible_examples:
        i = url2id[url]
        f_gold.write(str(i) + '\n')
    f_gold.close()

    f_gold = open('data/ifff.test_data.omit_non_english.id', 'w')
    for url in omit_non_english_examples:
        i = url2id[url]
        f_gold.write(str(i) + '\n')
    f_gold.close()

    omit_non_english_examples = [
        url2id[url] for url in omit_non_english_examples
    ]
    omit_unintelligible_examples = [
        url2id[url] for url in omit_unintelligible_examples
    ]
    lt_three_agree_with_gold = [
        url2id[url] for url in lt_three_agree_with_gold
    ]

    return omit_non_english_examples, omit_unintelligible_examples, lt_three_agree_with_gold


if __name__ == '__main__':
    init_logging('ifttt.log')
    # parse_ifttt_dataset()
    # analyze_ifttt_dataset()
    extract_turk_data()
    # parse_data_for_seq2seq()
Example #5
        idx += 1

    f_annot.close()
    f_annot.close()  # NOTE: duplicate close; the second call was presumably meant for the code output file handle

    # serialize_to_file(examples, 'django.cleaned.bin')

    print('error num: %d' % err_num)
    print('preprocess_dataset: cleaned example num: %d' % len(examples))

    return examples


if __name__ == '__main__':
    from nn.utils.generic_utils import init_logging
    init_logging('parse.log')

    # annot_file = 'all.anno'
    # code_file = 'all.code'

    # preprocess_dataset(annot_file, code_file)

    # parse_django_dataset()
    # check_terminals()

    # print process_query(""" ALLOWED_VARIABLE_CHARS is a string 'abcdefgh"ijklm" nop"%s"qrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_.'.""")

    # for i, query in enumerate(open('all.anno')):
    #     print i, process_query(query)

    # clean_dataset()
Example #6
        idx += 1

    f_annot.close()
    f_annot.close()  # NOTE: duplicate close; the second call was presumably meant for the code output file handle

    # serialize_to_file(examples, 'django.cleaned.bin')

    print('error num: %d' % err_num)
    print('preprocess_dataset: cleaned example num: %d' % len(examples))

    return examples


if __name__ == '__main__':
    from nn.utils.generic_utils import init_logging
    init_logging('parse.log')

    # annot_file = '/Users/yinpengcheng/Research/SemanticParsing/CodeGeneration/en-django/all.anno'
    # code_file = '/Users/yinpengcheng/Research/SemanticParsing/CodeGeneration/en-django/all.code'

    # preprocess_dataset(annot_file, code_file)

    # parse_django_dataset()
    # check_terminals()

    # print process_query(""" ALLOWED_VARIABLE_CHARS is a string 'abcdefgh"ijklm" nop"%s"qrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_.'.""")

    # for i, query in enumerate(open('/Users/yinpengcheng/Research/SemanticParsing/CodeGeneration/en-django/all.anno')):
    #     print i, process_query(query)

    # clean_dataset()
Example #7
    for split in ['ifff.test_data.omit_non_english.id',
                  'ifff.test_data.omit_unintelligible.id',
                  'ifff.test_data.gold.id']:
        # NOTE: the original snippet passed `split` to open() as the file mode by mistake;
        # presumably the split id file is read from the same directory as
        # config.ifttt_test_split (default 'data/ifff.test_data.gold.id').
        raw_ids = [
            int(i.strip())
            for i in open(os.path.join(os.path.dirname(config.ifttt_test_split), split))
        ]
        eids = [
            i for i, e in enumerate(test_data.examples) if e.raw_id in raw_ids
        ]
        test_data_subset = test_data.get_dataset_by_ids(
            eids, test_data.name + '.' + split)

        from decoder import decode_ifttt_dataset
        decode_results = decode_ifttt_dataset(model,
                                              test_data_subset,
                                              verbose=True)
        evaluate_ifttt_results(test_data_subset, decode_results)


if __name__ == '__main__':
    from dataset import DataEntry, DataSet, Vocab, Action
    init_logging('parser.log', logging.INFO)

    train_data, dev_data, test_data = deserialize_from_file(
        'data/ifttt.freq3.bin')
    decoding_results = []
    for eid in range(test_data.count):
        example = test_data.examples[eid]
        decoding_results.append([(eid, example.parse_tree)])

    evaluate_ifttt_results(test_data, decoding_results, verbose=True)
Example #8
            target_code = target_code.strip()
            tokenized_target = tokenize_code_adv(
                target_code,
                breakCamelStr=False if data_type == 'django' else True)
            tokenized_target = [
                tk.replace('\n', '#NEWLINE#') for tk in tokenized_target
            ]
            tokenized_target = [
                tk for tk in tokenized_target if tk is not None
            ]

            while tokenized_target[-1] == '#INDENT#':
                tokenized_target = tokenized_target[:-1]

            f_source.write(' '.join(query_tokens) + '\n')
            f_target.write(' '.join(tokenized_target) + '\n')

        f_source.close()
        f_target.close()


if __name__ == '__main__':
    init_logging('py.log')
    # rule_vs_node_stat()
    # process_heart_stone_dataset()
    parse_hs_dataset()
    # dump_data_for_evaluation(data_file='data/django.cleaned.dataset.freq5.par_info.refact.space_only.bin')
    # dump_data_for_evaluation(data_type='hs', data_file='data/hs.freq3.pre_suf.unary_closure.bin')
    # code_file = '/Users/yinpengcheng/Research/SemanticParsing/CodeGeneration/en-django/all.code'
    # py_grammar, _ = extract_grammar(code_file)
    # serialize_to_file(py_grammar, 'py_grammar.bin')
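As a quick toy illustration (not taken from the repository) of the target-token post-processing shown above: newline tokens become '#NEWLINE#' markers, None entries are dropped, and trailing '#INDENT#' tokens are trimmed before the line is written out.

# Hypothetical token list standing in for tokenize_code_adv output.
tokenized_target = ['if', 'x', ':', '\n', '#INDENT#', 'pass', '#INDENT#', '#INDENT#']
tokenized_target = [tk.replace('\n', '#NEWLINE#') for tk in tokenized_target]
tokenized_target = [tk for tk in tokenized_target if tk is not None]
while tokenized_target[-1] == '#INDENT#':
    tokenized_target = tokenized_target[:-1]
print(tokenized_target)  # ['if', 'x', ':', '#NEWLINE#', '#INDENT#', 'pass']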
Example #9
            tokenized_target = [
                tk for tk in tokenized_target if tk is not None
            ]

            while tokenized_target[-1] == '#INDENT#':
                tokenized_target = tokenized_target[:-1]

            f_source.write(' '.join(query_tokens) + '\n')
            f_target.write(' '.join(tokenized_target) + '\n')

        f_source.close()
        f_target.close()


if __name__ == '__main__':
    init_logging('sql.log')
    parser = argparse.ArgumentParser()
    parser.add_argument('-table_schema')
    parser.add_argument('-train_data')
    parser.add_argument('-train_data_ast')
    parser.add_argument('-train_data_size', type=int)
    parser.add_argument('-dev_data')
    parser.add_argument('-dev_data_ast')
    parser.add_argument('-dev_data_size', type=int)
    parser.add_argument('-test_data')
    parser.add_argument('-test_data_ast')
    # parser.add_argument('-test_data_size', type=int)
    parser.add_argument('-output_path')
    args = parser.parse_args()
    # parser.add_argument('-random_seed', default=181783, type=int)
    # parser.add_argument('-output_dir', default='.outputs')
Example #10
            if data_type == 'django':
                target_code = de_canonicalize_code_for_seq2seq(code, e.meta_data['raw_code'])
            else:
                target_code = code

            # tokenize code
            target_code = target_code.strip()
            tokenized_target = tokenize_code_adv(target_code, breakCamelStr=False if data_type == 'django' else True)
            tokenized_target = [tk.replace('\n', '#NEWLINE#') for tk in tokenized_target]
            tokenized_target = [tk for tk in tokenized_target if tk is not None]

            while tokenized_target[-1] == '#INDENT#':
                tokenized_target = tokenized_target[:-1]

            f_source.write(' '.join(query_tokens) + '\n')
            f_target.write(' '.join(tokenized_target) + '\n')

        f_source.close()
        f_target.close()


if __name__ == '__main__':
    init_logging('py.log')
    # rule_vs_node_stat()
    # process_heart_stone_dataset()
    parse_hs_dataset()
    # dump_data_for_evaluation(data_file='data/django.cleaned.dataset.freq5.par_info.refact.space_only.bin')
    # dump_data_for_evaluation(data_type='hs', data_file='data/hs.freq3.pre_suf.unary_closure.bin')
    # code_file = '/Users/yinpengcheng/Research/SemanticParsing/CodeGeneration/en-django/all.code'
    # py_grammar, _ = extract_grammar(code_file)
    # serialize_to_file(py_grammar, 'py_grammar.bin')
Example #11
    from decoder import decode_ifttt_dataset
    decode_results = decode_ifttt_dataset(model, test_data_subset, verbose=True)
    evaluate_ifttt_results(test_data_subset, decode_results)

    return decode_results


def decode_and_evaluate_ifttt_by_split(model, test_data):
    for split in ['ifff.test_data.omit_non_english.id', 'ifff.test_data.omit_unintelligible.id', 'ifff.test_data.gold.id']:
        # NOTE: the original snippet passed `split` to open() as the file mode by mistake; presumably the split
        # id file is read from the same directory as config.ifttt_test_split (default 'data/ifff.test_data.gold.id').
        raw_ids = [int(i.strip()) for i in open(os.path.join(os.path.dirname(config.ifttt_test_split), split))]
        eids = [i for i, e in enumerate(test_data.examples) if e.raw_id in raw_ids]
        test_data_subset = test_data.get_dataset_by_ids(eids, test_data.name + '.' + split)

        from decoder import decode_ifttt_dataset
        decode_results = decode_ifttt_dataset(model, test_data_subset, verbose=True)
        evaluate_ifttt_results(test_data_subset, decode_results)


if __name__ == '__main__':
    from dataset import DataEntry, DataSet, Vocab, Action
    init_logging('parser.log', logging.INFO)

    train_data, dev_data, test_data = deserialize_from_file('data/ifttt.freq3.bin')
    decoding_results = []
    for eid in range(test_data.count):
        example = test_data.examples[eid]
        decoding_results.append([(eid, example.parse_tree)])

    evaluate_ifttt_results(test_data, decoding_results, verbose=True)