def test_serialization(self):
    with tempfile.NamedTemporaryFile(delete=False) as f:
        dictionary = Dictionary()
        dictionary.add_word(self.monkey)
        dictionary.add_word(self.silly)
        dictionary.add_word(self.hit)

        dictionary.save(storage=f.name)

        dictionary = Dictionary()
        dictionary.load(storage=f.name)

        self.assertEqual(dictionary.get_word(u'обезьянка').normalized, u'обезьянка')
        self.assertEqual(dictionary.get_word(u'глупый').normalized, u'глупый')
        self.assertEqual(dictionary.get_word(u'ударил').normalized, u'ударил')
def import_texts(morph, source_dir, tech_vocabulary_path, voc_storage, dict_storage, tmp_dir='/tmp', check=False):
    from textgen.templates import Dictionary, Vocabulary, Template
    from textgen.words import WordBase

    vocabulary = Vocabulary()

    user_data = {'modules': {}}

    if not check:
        if os.path.exists(voc_storage):
            vocabulary.load(storage=voc_storage)

    dictionary = Dictionary()
    if os.path.exists(dict_storage):
        dictionary.load(storage=dict_storage)

    # seed the dictionary with the technical vocabulary
    tech_vocabulary = get_tech_vocabulary(tech_vocabulary_path)

    for word in tech_vocabulary.keys():
        word = WordBase.create_from_string(morph, word.strip(), tech_vocabulary)
        dictionary.add_word(word)

    for filename in os.listdir(source_dir):

        if not filename.endswith('.json'):
            continue

        texts_path = os.path.join(source_dir, filename)

        if not os.path.isfile(texts_path):
            continue

        group = filename[:-5]

        # in check mode, skip files that have not changed since the last successful check
        if check:
            check_path = os.path.join(tmp_dir, 'textgen-files-check-' + filename)
            if os.path.exists(check_path) and os.path.getmtime(check_path) > os.path.getmtime(texts_path):
                print 'group "%s" has already been processed' % group
                continue

        print 'load "%s"' % group

        with open(texts_path) as f:
            data = json.loads(f.read())

            if group != data['prefix']:
                raise Exception('filename MUST be equal to prefix')

            for suffix in data['types']:
                if suffix == '':
                    raise Exception('type MUST NOT be an empty string')

            user_data['modules'][data['prefix']] = get_user_data_for_module(data)

            variables_verbose = data['variables_verbose']

            global_variables = data.get('variables', {})

            for variable_name in global_variables.keys():
                if not variables_verbose.get(variable_name):
                    raise Exception('no verbose name for variable "%s"' % variable_name)

            for suffix, type_ in data['types'].items():
                phrase_key = '%s_%s' % (group, suffix)

                vocabulary.register_type(phrase_key)

                if isinstance(type_, list):
                    phrases = type_
                    local_variables = {}
                else:
                    phrases = type_['phrases']
                    local_variables = type_.get('variables', {})

                for variable_name in local_variables.keys():
                    if not variables_verbose.get(variable_name):
                        raise Exception('no verbose name for variable "%s"' % variable_name)

                variables = copy.copy(global_variables)
                variables.update(local_variables)

                for phrase in phrases:
                    template_phrase, test_phrase = phrase
                    template = Template.create(morph,
                                               template_phrase,
                                               available_externals=variables.keys(),
                                               tech_vocabulary=tech_vocabulary)

                    vocabulary.add_phrase(phrase_key, template)

                    for value in variables.values():
                        if isinstance(value, numbers.Number):
                            continue
                        word = WordBase.create_from_string(morph, value, tech_vocabulary)
                        dictionary.add_word(word)

                    for string in template.get_internal_words():
                        word = WordBase.create_from_string(morph, string, tech_vocabulary)
                        dictionary.add_word(word)

                    # render the template and compare it with the expected test phrase
                    test_result = template.substitute(dictionary, variables)

                    test_result_normalized = efication(test_result)
                    test_phrase_normalized = efication(test_phrase)

                    if test_result_normalized != test_phrase_normalized:
                        msg = None
                        i = 0  # stays 0 when one of the strings is empty
                        for i in xrange(min(len(test_result_normalized), len(test_phrase_normalized))):
                            if test_result_normalized[i] != test_phrase_normalized[i]:
                                msg = '''
wrong test_render for phrase "%s"
prefix: "%s"
diff: %s|%s''' % (template_phrase,
                  test_result_normalized[:i],
                  test_result_normalized[i],
                  test_phrase_normalized[i])
                                break

                        if msg is None:
                            msg = 'different len: "%s"|"%s"' % (test_result_normalized[i:],
                                                                test_phrase_normalized[i:])

                        raise TextgenException(msg)

        # mark the file as processed so the next check run can skip it
        if check:
            with open(check_path, 'w') as f:
                f.write('1')

    # persist the updated storages only on a real import, not on a check run
    if not check:
        vocabulary.save(storage=voc_storage)
        dictionary.save(storage=dict_storage)

    return user_data
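# A minimal sketch of one source file that import_texts can consume, inferred
# from the checks above; the group name 'quests', the variable 'hero' and the
# phrase texts are hypothetical placeholders, not real project data.  Each
# phrase is a pair of (template phrase, expected rendering), and every
# variable that is used must have an entry in 'variables_verbose'.
EXAMPLE_GROUP = {
    'prefix': 'quests',                       # must match the file name quests.json
    'variables_verbose': {'hero': u'герой'},  # verbose names are mandatory
    'variables': {'hero': u'рыцарь'},         # globals, merged with per-type variables
    'types': {
        'start': [                            # registered as phrase key 'quests_start'
            [u'<template phrase>', u'<expected rendering>'],
        ],
    },
}

# with open(os.path.join(source_dir, 'quests.json'), 'w') as f:
#     f.write(json.dumps(EXAMPLE_GROUP))
#
# A check-only run validates every phrase; it writes marker files under
# tmp_dir but does not touch voc_storage or dict_storage:
#
# user_data = import_texts(morph, source_dir, tech_vocabulary_path,
#                          voc_storage, dict_storage, check=True)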