Ejemplo n.º 1
0
                    test_file = f_n
        assert test_file is not None

        # Pick the category handed to reader.get_raw: 'zh' when the path
        # mentions Chinese or Japanese, 'other' otherwise, and 'gold' when
        # the leading lines of the test file carry a '# sentence' or
        # '# text' marker.
        cat = 'other'
        if 'Chinese' in path or 'Japanese' in path:
            cat = 'zh'
        # Open through a context manager so the handle is closed as soon as
        # the header scan is done instead of whenever it is garbage collected.
        with codecs.open(path + '/' + test_file, 'r',
                         encoding='utf-8') as test_f:
            for line in test_f:
                # Only the first group of lines is inspected: stop at the
                # first line shorter than two characters.
                if len(line) < 2:
                    break
                if '# sentence' in line or '# text' in line:
                    cat = 'gold'
                    # Nothing further down can change the category again.
                    break
        # NOTE(review): presumably this produces path/raw_test.txt, which is
        # read back right below -- confirm against reader.get_raw.
        reader.get_raw(path, test_file, 'raw_test.txt', cat, form=args.format)

        raws_test = reader.raw(path + '/raw_test.txt')
        test_y_gold = reader.test_gold(path + '/' + test_file,
                                       form=args.format,
                                       is_space=is_space,
                                       ignore_mwt=args.ignore_mwt)

        new_chars = toolbox.get_new_chars(path + '/raw_test.txt', char2idx,
                                          is_space)

        if emb_path is not None:
            # list(...) keeps the concatenation valid on Python 3, where
            # dict.keys() is a view that cannot be added to a list; on
            # Python 2 it is a harmless copy.
            valid_chars = toolbox.get_valid_chars(
                new_chars + list(char2idx.keys()), emb_path)
        else:
            valid_chars = None

        char2idx, idx2char, unk_chars_idx, sub_dict = toolbox.update_char_dict(
            char2idx, new_chars, unk_chars_idx, valid_chars)

        test_x, max_len_test = toolbox.get_input_vec_raw(
Ejemplo n.º 2
0
                    test_file = f_n
        assert test_file is not None

        # Pick the category handed to reader.get_raw: 'zh' when the path
        # mentions Chinese or Japanese, 'other' otherwise, and 'gold' when
        # the leading lines of the test file carry a '# sentence' or
        # '# text' marker.
        cat = 'other'
        if 'Chinese' in path or 'Japanese' in path:
            cat = 'zh'
        # Open through a context manager so the handle is closed as soon as
        # the header scan is done instead of whenever it is garbage collected.
        with codecs.open(path + '/' + test_file, 'r',
                         encoding='utf-8') as test_f:
            for line in test_f:
                # Only the first group of lines is inspected: stop at the
                # first line shorter than two characters.
                if len(line) < 2:
                    break
                if '# sentence' in line or '# text' in line:
                    cat = 'gold'
                    # Nothing further down can change the category again.
                    break
        # NOTE(review): presumably this produces path/raw_test.txt, which is
        # read back right below -- confirm against reader.get_raw.
        reader.get_raw(path, test_file, 'raw_test.txt', cat, form=args.format)

        raws_test = reader.raw(path + '/raw_test.txt')
        test_y_gold = reader.test_gold(path + '/' + test_file,
                                       form=args.format,
                                       is_space=is_space)

        new_chars = toolbox.get_new_chars(path + '/raw_test.txt', char2idx,
                                          is_space)

        if emb_path is not None:
            # list(...) keeps the concatenation valid on Python 3, where
            # dict.keys() is a view that cannot be added to a list; on
            # Python 2 it is a harmless copy.
            valid_chars = toolbox.get_valid_chars(
                new_chars + list(char2idx.keys()), emb_path)
        else:
            valid_chars = None

        char2idx, idx2char, unk_chars = toolbox.update_char_dict(
            char2idx, new_chars, unk_chars, valid_chars)

        test_x, max_len_test = toolbox.get_input_vec_raw(
Ejemplo n.º 3
0
        # Scan the leading lines of the test file; a '# sentence' or '# text'
        # marker switches the category to 'gold'. The handle is opened through
        # a context manager so it is closed as soon as the scan is done
        # instead of whenever it is garbage collected.
        with codecs.open(test_language_dir + '/' + test_file,
                         'r',
                         encoding='utf-8') as test_f:
            for line in test_f:
                # Only the first group of lines is inspected: stop at the
                # first line shorter than two characters.
                if len(line) < 2:
                    break
                if '# sentence' in line or '# text' in line:
                    cat = 'gold'
                    # Nothing further down can change the category again.
                    break
        # NOTE(review): presumably this produces
        # test_language_dir/raw_test.txt, which is read back right below --
        # confirm against reader.get_raw.
        reader.get_raw(test_language_dir,
                       test_file,
                       'raw_test.txt',
                       cat,
                       form=args.format)

        raws_test = reader.raw(test_language_dir + '/raw_test.txt')
        test_y_gold = reader.test_gold(test_language_dir + '/' + test_file,
                                       form=args.format,
                                       is_space=is_space,
                                       ignore_mwt=args.ignore_mwt)

        new_chars = toolbox.get_new_chars(test_language_dir + '/raw_test.txt',
                                          char2idx, is_space)

        if emb_path is not None:
            # list(...) keeps the concatenation valid on Python 3, where
            # dict.keys() is a view that cannot be added to a list; on
            # Python 2 it is a harmless copy.
            valid_chars = toolbox.get_valid_chars(
                new_chars + list(char2idx.keys()), emb_path)
        else:
            valid_chars = None

        char2idx, idx2char, unk_chars_idx, sub_dict = toolbox.update_char_dict(
            char2idx, new_chars, unk_chars_idx, valid_chars)

        test_x1, test_x2, max_len_test = toolbox.get_input_vec_raw_test_new(