Ejemplo n.º 1
0
def encode_pattern(pattern, token_start, token_end, unknowns):
    """Encode all elements of ``pattern`` into one combined representation.

    Every element returned by ``pattern.get_element_list()`` is expanded into
    ``(text, level)`` pairs. Each text is run through
    ``Base64Encoder.b64decode`` and the pieces are folded together with
    ``HuffmanEncoder.combine``.

    Args:
        pattern: Object exposing ``get_element_list()``.
        token_start: Marker placed before the fields of an element that has
            an ``items`` attribute.
        token_end: Marker placed after the fields of such an element.
        unknowns: Mapping, indexed by level, that supplies the replacement
            text used when ``b64decode`` raises ``binascii.Error``.

    Returns:
        The value of ``Base64Encoder.b64encode(combined, binary=False)``.
    """

    def _no_level(_key, _default):
        ## stand-in for elements that do not offer a ``get`` method
        return None

    kept_levels = {'lemma', 'upos', 'deprel'}
    combined = b''

    for element in pattern.get_element_list():
        if hasattr(element, 'items'):
            ## mapping-like element: wrap its selected fields in the markers
            pairs = [(token_start, None)]
            pairs.extend((value, name) for name, value in element.items()
                         if name in kept_levels)
            pairs.append((token_end, None))
        else:
            ## can be special element (string), or a PatternElement
            text = str(element)
            lookup = getattr(element, 'get', _no_level)
            pairs = [(text, lookup('level', None))]

        for text, level in pairs:
            ## just a quick fix - np_function needs to be handled differently
            unknown_key = 'np_function' if level == 'deprel' else level
            try:
                piece = Base64Encoder.b64decode(text)
            except binascii.Error:
                ## fall back to the unknown entry for this level
                piece = Base64Encoder.b64decode(unknowns[unknown_key])

            combined = HuffmanEncoder.combine(combined, piece)

    return Base64Encoder.b64encode(combined, binary=False)
Ejemplo n.º 2
0
def test_encode_vocabulary():
    """Run ``bin/encode_vocabulary`` and check each encoded entry decodes back.

    The script is executed on the example dictionary inside an isolated
    filesystem. Every entry outside the ``__special__`` level must decode to
    exactly one element whose ``level`` and ``form`` match the entry.
    """

    infile_path = os.path.abspath(
        'example_data/example_data_dict_filtered.json')
    encoder_path = os.path.abspath('example_data/example_data_encoder')
    configfile_path = os.path.abspath('example_data/example_config.json')
    script_file = os.path.abspath('bin/encode_vocabulary')

    runner = CliRunner()
    with runner.isolated_filesystem():

        outfile = "example_data_dict_filtered_encoded.json"

        exit_status = os.system(" ".join([
            script_file, infile_path, outfile, encoder_path, configfile_path
        ]))
        ## fail here instead of with a confusing error when reading outfile
        assert exit_status == 0

        ## context managers make sure both handles are closed again
        with open(encoder_path, 'rb') as encoder_file:
            encoder = Base64Encoder(PatternEncoder.load(encoder_file))

        with open(outfile, 'r') as result_file:
            result_dict = json.load(result_file)

        decoded_entries = [
            (level, decoded, encoder.decode(encoded).get_element_list())
            for level, elements in result_dict.items()
            if level != "__special__"
            for decoded, encoded in elements.items()
        ]
        results = [
            len(pe) == 1 and pe[0].level == level and pe[0].form == word
            for level, word, pe in decoded_entries
        ]

    assert all(results)
Ejemplo n.º 3
0
def test_create_encoder():
    """Run ``bin/create_encoder`` and check the encoder round-trips each item.

    The script is executed on the example dictionary inside an isolated
    filesystem. For every ``(word, level)`` entry of the dictionary a
    ``PatternElement`` is encoded with ``encode_item`` and decoded again; the
    first decoded element must equal the original.
    """

    infile_path = os.path.abspath(
        'example_data/example_data_dict_filtered.json')
    configfile_path = os.path.abspath('example_data/example_config.json')
    script_file = os.path.abspath('bin/create_encoder')

    runner = CliRunner()
    with runner.isolated_filesystem():

        outfile = "example_data_encoder"

        exit_status = os.system(" ".join(
            [script_file, infile_path, outfile, configfile_path]))
        ## the status used to be stored but never looked at
        assert exit_status == 0

        ## context managers make sure both handles are closed again
        with open(outfile, 'rb') as encoder_file:
            encoder = Base64Encoder(PatternEncoder.load(encoder_file))

        with open(infile_path, 'r') as dict_file:
            dict_ = json.load(dict_file)

        pattern_elements = [
            PatternElement(word, level) for level, elements in dict_.items()
            for word in elements.keys()
        ]
        results = [
            encoder.decode(encoder.encode_item(pe)).get_element_list()[0] == pe
            for pe in pattern_elements
        ]

    assert all(results)
Ejemplo n.º 4
0
def decode_pattern_collection(ctx, infile, encoder, outfile, string, unknown):
    """Decode a JSON-lines pattern collection and write it to ``outfile``.

    Each input line is a JSON pair ``(pattern, content)``. The pattern and
    every entry of ``content['base_patterns']`` are decoded with the loaded
    encoder, and one JSON pair per line is written to the output.

    Args:
        ctx: Context object; ``ctx.obj['logger']`` receives a warning for
            every base pattern that could not be processed.
        infile: Input passed to ``open_file`` for reading.
        encoder: Encoder file passed to ``open_file`` in binary mode.
        outfile: Output passed to ``open_file`` for writing.
        string: If true, decoded patterns are written as ``str(pattern)``;
            otherwise as a base64-encoded pickle.
        unknown: If not ``None``, base patterns containing an element equal
            to this value are left out of the output.
    """

    def _serialize(decoded):
        ## one place for the two output representations
        if string:
            return str(decoded)
        return base64.b64encode(pickle.dumps(decoded)).decode('ascii')

    with open_file(encoder, 'rb') as encoder_file:
        pattern_encoder = Base64Encoder(PatternEncoder.load(encoder_file),
                                        binary=False)

    with open_file(infile) as lines, open_file(outfile, 'w') as o:

        for line in lines:

            pattern, content = json.loads(line)
            out_pattern = _serialize(pattern_encoder.decode(pattern))

            decoded_base_patterns = []
            for base_pattern in content.get('base_patterns', []):

                try:
                    ## an entry may be a bare pattern or [pattern, examples]
                    examples = []
                    if len(base_pattern) == 2:
                        examples = base_pattern[1]
                        base_pattern = base_pattern[0]

                    decoded_base = pattern_encoder.decode(base_pattern)

                    if unknown is None or all(
                            element != unknown
                            for element in decoded_base.get_element_list()):
                        decoded_base_patterns.append(
                            [_serialize(decoded_base), examples])
                except Exception:
                    ## not a bare except: KeyboardInterrupt and SystemExit
                    ## must still be able to stop the run
                    ctx.obj['logger'].warning(
                        "Could not test pattern for unknown, skipping.")

            ## the key is always written, even if the input had none
            content['base_patterns'] = decoded_base_patterns

            json.dump((out_pattern, content), o)
            o.write("\n")
Ejemplo n.º 5
0
def decode_patterns(ctx, infile, encoder, outfile, processes):
    """Decode the patterns read from ``infile`` and pickle them to ``outfile``.

    The encoder is loaded from ``encoder``. The items of ``infile`` are then
    mapped through ``decode_pattern`` by a ``MultiprocessMap`` created with
    ``processes``, and each resulting ``(pattern, decoded_pattern)`` pair is
    pickled to the output. ``ctx.obj['logger']`` gets one info message per
    pair.
    """

    with open_file(encoder, 'rb') as encoder_file:
        loaded_encoder = PatternEncoder.load(encoder_file)
        pattern_encoder = Base64Encoder(loaded_encoder, binary=False)

    with open_file(infile) as source, open_file(outfile, 'wb') as sink:
        with MultiprocessMap(processes, chunksize=1000) as parallel_map:

            ## bind the encoder so the mapped callable takes one argument
            worker = functools.partial(decode_pattern,
                                       pattern_encoder=pattern_encoder)

            for pattern, decoded_pattern in parallel_map(worker, source):
                ctx.obj['logger'].info("Pattern")
                pickle.dump((pattern, decoded_pattern), sink)
Ejemplo n.º 6
0
        'fox': 0,
        'The': 2,
        'quick': 1,
        'brown': 3
    }}, SNGram),
    BitEncoder({'form': set(['fox', 'The', 'quick', 'brown'])}, SNGram),
    HuffmanEncoder({'form': {
        'fox': 5,
        'The': 10,
        'quick': 3,
        'brown': 8
    }}, SNGram),
    Base64Encoder(
        HuffmanEncoder({'form': {
            'fox': 5,
            'The': 10,
            'quick': 3,
            'brown': 8
        }}, SNGram)),
    Base64Encoder(HuffmanEncoder(
        {'form': {
            'fox': 5,
            'The': 10,
            'quick': 3,
            'brown': 8
        }}, SNGram),
                  binary=False)
]


@pytest.mark.parametrize("encoder", encoder)
Ejemplo n.º 7
0
def test_extract_patterns_with_phrases(parameters, expected_patterns,
                                       expected_basepatterns):
    """Run ``extract-patterns`` on the example data and compare the output.

    The extracted pattern and base pattern lists are sorted (the pattern list
    is also de-duplicated), converted with ``utils convert-pattern-list``,
    decoded with the example encoder and compared with the expected values.

    Args:
        parameters: Extra command line arguments appended to the
            ``extract-patterns`` invocation.
        expected_patterns: Expected mapping from decoded pattern string to
            the set of its decoded base pattern strings.
        expected_basepatterns: Expected mapping from decoded base pattern
            string to its content.
    """

    infile_path = os.path.abspath('example_data/example_data_encoded.conllu')
    dictfile_path = os.path.abspath(
        'example_data/example_data_dict_filtered_encoded.json')

    encoder_path = os.path.abspath('example_data/example_data_encoder')

    runner = CliRunner()
    with runner.isolated_filesystem():

        patterns_list_filename = 'patterns_sorted.json'
        base_list_filename = 'base_sorted.json'

        patterns_filename = 'patterns.json'
        base_filename = 'base.json'

        runner.invoke(main, [
            'extract-patterns', infile_path, patterns_list_filename,
            base_list_filename, dictfile_path
        ] + parameters)

        # files need to be sorted
        for filename in [patterns_list_filename, base_list_filename]:
            with open(filename, 'r') as pattern_file:
                lines = [line.rstrip() for line in pattern_file]
            ## patterns list needs sorted with unique
            if filename == patterns_list_filename:
                lines = set(lines)
            sorted_lines = sorted(lines)

            with open(filename, 'w') as pattern_file:
                pattern_file.write('\n'.join(sorted_lines) + '\n')

        with open(patterns_list_filename, 'r') as pattern_file:
            listed_patterns = [line.rstrip() for line in pattern_file]
            print('\n'.join(listed_patterns))

        runner.invoke(main, [
            'utils', 'convert-pattern-list', base_list_filename, base_filename
        ])

        runner.invoke(main, [
            'utils',
            'convert-pattern-list',
            patterns_list_filename,
            patterns_filename,
        ])

        ## read the converted files through context managers so the handles
        ## are closed before the isolated filesystem is removed
        encoded_patterns = {}
        with open(patterns_filename) as patterns_file:
            for line in patterns_file:
                pattern, base_patterns = json.loads(line)
                encoded_patterns[pattern] = base_patterns

        with open(base_filename) as base_file:
            encoded_basepatterns = [json.loads(line) for line in base_file]

        with open(encoder_path, 'rb') as encoder_file:
            encoder = Base64Encoder(PatternEncoder.load(encoder_file))

        patterns = {
            str(encoder.decode(pattern)):
            {str(encoder.decode(base)) for base in bases}
            for pattern, bases in encoded_patterns.items()
        }
        basepatterns = {
            str(encoder.decode(pattern)): content
            for pattern, content in encoded_basepatterns
        }

        assert patterns == expected_patterns

        assert basepatterns == expected_basepatterns