Ejemplo n.º 1
0
    def test_serialization(self):
        """Save a dictionary to a temp file and check a fresh one reads it back.

        The words held in ``self.monkey``, ``self.silly`` and ``self.hit``
        are added to a ``Dictionary`` and saved. A second, empty
        ``Dictionary`` then loads the same storage, and each word must be
        retrievable with the expected ``normalized`` value.
        """
        # Local import so the cleanup below does not rely on a module-level
        # ``import os`` that may not exist in this file.
        import os

        # delete=False keeps the file on disk once the handle is closed, so
        # only its path is taken here. Closing the handle before save()/load()
        # reopen the path also keeps the test usable on platforms that refuse
        # to open a file twice.
        with tempfile.NamedTemporaryFile(delete=False) as f:
            storage_path = f.name

        try:
            dictionary = Dictionary()

            dictionary.add_word(self.monkey)
            dictionary.add_word(self.silly)
            dictionary.add_word(self.hit)

            dictionary.save(storage=storage_path)

            # Start from an empty dictionary so the assertions can only pass
            # if the words really came from the saved storage.
            dictionary = Dictionary()
            dictionary.load(storage=storage_path)

            self.assertEqual(dictionary.get_word(u'обезьянка').normalized, u'обезьянка')
            self.assertEqual(dictionary.get_word(u'глупый').normalized, u'глупый')
            self.assertEqual(dictionary.get_word(u'ударил').normalized, u'ударил')
        finally:
            # Nothing else removes a delete=False temp file; without this
            # every test run leaves one behind.
            if os.path.exists(storage_path):
                os.remove(storage_path)
Ejemplo n.º 2
0
    def test_serialization(self):
        """Round-trip a dictionary through a temporary storage file.

        Three words (``self.monkey``, ``self.silly``, ``self.hit``) are added
        and saved; a brand-new ``Dictionary`` loads the same storage and must
        return each word with the expected ``normalized`` value.
        """
        # Imported locally: the cleanup needs ``os`` and this file may not
        # import it at module level.
        import os

        # Only the path is needed. delete=False leaves the file in place
        # after the handle closes, and closing it first avoids reopening a
        # file that is still held open.
        with tempfile.NamedTemporaryFile(delete=False) as f:
            storage_path = f.name

        try:
            dictionary = Dictionary()

            dictionary.add_word(self.monkey)
            dictionary.add_word(self.silly)
            dictionary.add_word(self.hit)

            dictionary.save(storage=storage_path)

            # A fresh instance guarantees the lookups below are served from
            # the loaded storage, not from words added above.
            dictionary = Dictionary()
            dictionary.load(storage=storage_path)

            self.assertEqual(
                dictionary.get_word(u'обезьянка').normalized, u'обезьянка')
            self.assertEqual(
                dictionary.get_word(u'глупый').normalized, u'глупый')
            self.assertEqual(
                dictionary.get_word(u'ударил').normalized, u'ударил')
        finally:
            # delete=False means the temp file is ours to remove; otherwise
            # it is leaked on every run.
            if os.path.exists(storage_path):
                os.remove(storage_path)
Ejemplo n.º 3
0
def import_texts(morph, source_dir, tech_vocabulary_path, voc_storage, dict_storage, tmp_dir='/tmp', check=False):
    """Import phrase templates from the ``*.json`` files found in ``source_dir``.

    Every key of the tech vocabulary is first added to the dictionary as a
    word. Each JSON file is then handled as one group: its phrase types are
    registered in the vocabulary, its templates are added, the words the
    templates and variables need are added to the dictionary, and every
    template is rendered and compared with the test phrase stored next to it.

    Args:
        morph: object handed to ``WordBase.create_from_string`` and
            ``Template.create``.
        source_dir: directory listed (non-recursively) for ``*.json`` files.
        tech_vocabulary_path: passed to ``get_tech_vocabulary``.
        voc_storage: vocabulary storage path; loaded when it exists and
            ``check`` is false, written at the end when ``check`` is false.
        dict_storage: dictionary storage path; loaded when it exists,
            written at the end when ``check`` is false.
        tmp_dir: directory holding the per-file marker files of check mode.
        check: when true, nothing is saved; a file whose marker in
            ``tmp_dir`` is newer than the file itself is skipped, and a
            marker is written after a file has been processed successfully.

    Returns:
        ``{'modules': {...}}`` with one ``get_user_data_for_module(data)``
        entry per processed file, keyed by the file's ``prefix``.

    Raises:
        Exception: the file name differs from the ``prefix`` in the file, a
            type name is the empty string, or a variable has no entry in
            ``variables_verbose``.
        TextgenException: a rendered template differs from its test phrase.
    """
    from textgen.templates import Dictionary, Vocabulary, Template
    from textgen.words import WordBase

    vocabulary = Vocabulary()

    user_data = {'modules': {}}

    if not check and os.path.exists(voc_storage):
        vocabulary.load(storage=voc_storage)

    dictionary = Dictionary()
    if os.path.exists(dict_storage):
        dictionary.load(storage=dict_storage)

    tech_vocabulary = get_tech_vocabulary(tech_vocabulary_path)

    for raw_word in tech_vocabulary.keys():
        dictionary.add_word(WordBase.create_from_string(morph, raw_word.strip(), tech_vocabulary))

    for filename in os.listdir(source_dir):

        if not filename.endswith('.json'):
            continue

        texts_path = os.path.join(source_dir, filename)

        if not os.path.isfile(texts_path):
            continue

        # Strip the '.json' extension; the remainder must match data['prefix'].
        group = filename[:-5]

        check_path = None
        if check:
            check_path = os.path.join(tmp_dir, 'textgen-files-check-'+filename)

            # A marker newer than the source means this file already passed.
            if os.path.exists(check_path) and os.path.getmtime(check_path) > os.path.getmtime(texts_path):
                # Single-argument call form: prints the same text under the
                # Python 2 print statement and the print function.
                print('group "%s" has been already processed' % group)
                continue

        print('load "%s"' % group)

        # Only parsing needs the file; release the handle before processing.
        with open(texts_path) as f:
            data = json.load(f)

        if group != data['prefix']:
            raise Exception('filename MUST be equal to prefix')

        for suffix in data['types']:
            if suffix == '':
                raise Exception('type MUST be not equal to empty string')

        user_data['modules'][data['prefix']] = get_user_data_for_module(data)

        variables_verbose = data['variables_verbose']

        global_variables = data.get('variables', {})

        for variable_name in global_variables.keys():
            if not variables_verbose.get(variable_name):
                raise Exception('no verbose name for variable "%s"' % variable_name)

        for suffix, type_ in data['types'].items():
            phrase_key = '%s_%s' % (group, suffix)

            vocabulary.register_type(phrase_key)

            # A type is either a bare list of phrases or a mapping that may
            # also carry type-local variables.
            if isinstance(type_, list):
                phrases = type_
                local_variables = {}
            else:
                phrases = type_['phrases']
                local_variables = type_.get('variables', {})

            for variable_name in local_variables.keys():
                if not variables_verbose.get(variable_name):
                    raise Exception('no verbose name for variable "%s"' % variable_name)

            # Local variables override global ones of the same name.
            variables = copy.copy(global_variables)
            variables.update(local_variables)

            for template_phrase, test_phrase in phrases:

                template = Template.create(morph, template_phrase, available_externals=variables.keys(), tech_vocabulary=tech_vocabulary)

                vocabulary.add_phrase(phrase_key, template)

                for value in variables.values():
                    if isinstance(value, numbers.Number):
                        continue
                    dictionary.add_word(WordBase.create_from_string(morph, value, tech_vocabulary))

                for string in template.get_internal_words():
                    dictionary.add_word(WordBase.create_from_string(morph, string, tech_vocabulary))

                rendered = efication(template.substitute(dictionary, variables))
                expected = efication(test_phrase)

                if rendered == expected:
                    continue

                shared = min(len(rendered), len(expected))

                # Index of the first differing character, or ``shared`` when
                # one string is a prefix of the other.
                diff_at = shared
                for index, (got, wanted) in enumerate(zip(rendered, expected)):
                    if got != wanted:
                        diff_at = index
                        break

                if diff_at < shared:
                    msg = '''
wrong test_render for phrase "%s"

prefix: "%s"

diff: %s|%s''' % (template_phrase, rendered[:diff_at], rendered[diff_at], expected[diff_at])
                else:
                    # Start at the last shared character, which is what this
                    # message has always shown. Clamp at 0 so an empty string
                    # is reported here instead of failing on an unbound loop
                    # index.
                    tail_from = max(shared - 1, 0)
                    msg = 'different len: "%s"|"%s"' % (rendered[tail_from:], expected[tail_from:])

                raise TextgenException(msg)

        if check:
            # Mark the file as verified so the next check run can skip it.
            with open(check_path, 'w') as f:
                f.write('1')

    if not check:
        vocabulary.save(storage=voc_storage)
        dictionary.save(storage=dict_storage)

    return user_data
Ejemplo n.º 4
0
def import_texts(morph,
                 source_dir,
                 tech_vocabulary_path,
                 voc_storage,
                 dict_storage,
                 tmp_dir='/tmp',
                 check=False):
    """Load, register and self-test the phrase groups stored in ``source_dir``.

    The keys of the tech vocabulary are added to the dictionary as words.
    After that each ``*.json`` file in ``source_dir`` is one group: its
    phrase types are registered in the vocabulary, its templates are added,
    the words needed by templates and variables go into the dictionary, and
    each template is rendered and compared with its accompanying test phrase.

    Args:
        morph: object passed through to ``WordBase.create_from_string`` and
            ``Template.create``.
        source_dir: directory whose ``*.json`` files are processed
            (sub-directories are not entered).
        tech_vocabulary_path: argument for ``get_tech_vocabulary``.
        voc_storage: path of the vocabulary storage. Loaded if present and
            ``check`` is false; saved at the end if ``check`` is false.
        dict_storage: path of the dictionary storage. Loaded if present;
            saved at the end if ``check`` is false.
        tmp_dir: directory for the marker files used in check mode.
        check: if true, run validation only: skip files whose marker file is
            newer than the file, write a marker after each file that
            passes, and do not save the vocabulary or dictionary.

    Returns:
        A dict ``{'modules': {...}}`` mapping each processed file's
        ``prefix`` to ``get_user_data_for_module(data)``.

    Raises:
        Exception: file name and ``prefix`` disagree, a type name is empty,
            or a variable lacks an entry in ``variables_verbose``.
        TextgenException: a rendered template does not match its test
            phrase.
    """
    from textgen.templates import Dictionary, Vocabulary, Template
    from textgen.words import WordBase

    vocabulary = Vocabulary()

    user_data = {'modules': {}}

    if not check and os.path.exists(voc_storage):
        vocabulary.load(storage=voc_storage)

    dictionary = Dictionary()
    if os.path.exists(dict_storage):
        dictionary.load(storage=dict_storage)

    tech_vocabulary = get_tech_vocabulary(tech_vocabulary_path)

    for raw_word in tech_vocabulary.keys():
        tech_word = WordBase.create_from_string(morph, raw_word.strip(),
                                                tech_vocabulary)
        dictionary.add_word(tech_word)

    for filename in os.listdir(source_dir):

        if not filename.endswith('.json'):
            continue

        texts_path = os.path.join(source_dir, filename)

        if not os.path.isfile(texts_path):
            continue

        # File name without '.json'; it has to equal data['prefix'].
        group = filename[:-5]

        check_path = None
        if check:
            check_path = os.path.join(tmp_dir,
                                      'textgen-files-check-' + filename)

            # Skip files that were verified after their last modification.
            if os.path.exists(check_path) and os.path.getmtime(
                    check_path) > os.path.getmtime(texts_path):
                # One parenthesised argument prints identically with the
                # Python 2 print statement and with the print function.
                print('group "%s" has been already processed' % group)
                continue

        print('load "%s"' % group)

        # The handle is only needed for parsing; close it before the
        # processing below.
        with open(texts_path) as f:
            data = json.load(f)

        if group != data['prefix']:
            raise Exception('filename MUST be equal to prefix')

        for suffix in data['types']:
            if suffix == '':
                raise Exception('type MUST be not equal to empty string')

        user_data['modules'][data['prefix']] = get_user_data_for_module(
            data)

        variables_verbose = data['variables_verbose']

        global_variables = data.get('variables', {})

        for variable_name in global_variables.keys():
            if not variables_verbose.get(variable_name):
                raise Exception('no verbose name for variable "%s"' %
                                variable_name)

        for suffix, type_ in data['types'].items():
            phrase_key = '%s_%s' % (group, suffix)

            vocabulary.register_type(phrase_key)

            # Either a plain list of phrases, or a mapping with 'phrases'
            # and optional type-local 'variables'.
            if isinstance(type_, list):
                phrases = type_
                local_variables = {}
            else:
                phrases = type_['phrases']
                local_variables = type_.get('variables', {})

            for variable_name in local_variables.keys():
                if not variables_verbose.get(variable_name):
                    raise Exception('no verbose name for variable "%s"' %
                                    variable_name)

            # Type-local variables take precedence over global ones.
            variables = copy.copy(global_variables)
            variables.update(local_variables)

            for template_phrase, test_phrase in phrases:

                template = Template.create(
                    morph,
                    template_phrase,
                    available_externals=variables.keys(),
                    tech_vocabulary=tech_vocabulary)

                vocabulary.add_phrase(phrase_key, template)

                for value in variables.values():
                    if isinstance(value, numbers.Number):
                        continue
                    variable_word = WordBase.create_from_string(
                        morph, value, tech_vocabulary)
                    dictionary.add_word(variable_word)

                for string in template.get_internal_words():
                    internal_word = WordBase.create_from_string(
                        morph, string, tech_vocabulary)
                    dictionary.add_word(internal_word)

                test_result = template.substitute(dictionary, variables)

                rendered = efication(test_result)
                expected = efication(test_phrase)

                if rendered == expected:
                    continue

                shared = min(len(rendered), len(expected))

                # First position where the strings disagree; stays at
                # ``shared`` when one string is a prefix of the other.
                diff_at = shared
                for index, (got, wanted) in enumerate(
                        zip(rendered, expected)):
                    if got != wanted:
                        diff_at = index
                        break

                if diff_at < shared:
                    msg = '''
wrong test_render for phrase "%s"

prefix: "%s"

diff: %s|%s''' % (template_phrase, rendered[:diff_at],
                  rendered[diff_at], expected[diff_at])
                else:
                    # Report from the last shared character, as this message
                    # always did. The clamp covers an empty string, which
                    # previously reached an unbound loop index.
                    tail_from = max(shared - 1, 0)
                    msg = 'different len: "%s"|"%s"' % (
                        rendered[tail_from:],
                        expected[tail_from:])

                raise TextgenException(msg)

        if check:
            # Record success so later check runs can skip this file.
            with open(check_path, 'w') as f:
                f.write('1')

    if not check:
        vocabulary.save(storage=voc_storage)
        dictionary.save(storage=dict_storage)

    return user_data