def test_read_stop_words(self):
    """Loading a stop-words file must make matching source strings unselectable."""
    stop_file = StringIO('translator-credits')
    c = Corpus('')
    # Before the stop words are loaded the string is still selectable.
    assert c._should_select_string(u'translator-credits', '*****@*****.**')
    c._read_stop_words(stop_file)
    # After loading, the very same string must be filtered out.
    assert not c._should_select_string(u'translator-credits', '*****@*****.**')
def process_projects(src_directory, glossary_description, glossary_file):
    """Build the developer and user glossary reports for a source tree.

    Processes the corpus found under *src_directory*, ranks candidate
    terms by their tf×df metric, and writes:
      - a developer HTML report ("dev-<glossary_file>.html"),
      - user-facing HTML and CSV reports rendered from mustache templates,
      - the glossary database via generate_database().
    """
    MAX_TERMS = 8000

    corpus = Corpus(src_directory)
    corpus.process()

    references = ReferenceSources()
    references.read_sources()

    metrics = Metrics()
    metrics.create(corpus)

    # Rank every candidate term by its tf×df score, highest first,
    # and keep only the top MAX_TERMS of them.
    terms_by_score = sorted(metrics.tfxdf, key=metrics.tfxdf.get, reverse=True)
    top_terms = terms_by_score[:MAX_TERMS]

    translations = Translations()

    # Developer report: entries keep the frequency (tf×df) ordering.
    dev_entries = OrderedDict(
        (term,
         translations.create_for_word_sorted_by_frequency(
             corpus.documents, term, references))
        for term in top_terms
    )
    DevGlossarySerializer().create(
        u"dev-" + glossary_file + ".html",
        glossary_description, corpus, dev_entries, references)

    # User report: the same terms, but ordered alphabetically.
    glossary = Glossary(glossary_description)
    for term in sorted(top_terms):
        entry = GlossaryEntry(
            term,
            translations.create_for_word_sorted_by_frequency(
                corpus.documents, term, references)
        )
        glossary.entries.append(entry)

    user_entries = glossary.get_dict()
    process_template('terminology/templates/userglossary-html.mustache',
                     glossary_file + ".html", user_entries)
    process_template('terminology/templates/userglossary-csv.mustache',
                     glossary_file + ".csv", user_entries)
    generate_database(glossary, glossary_file)
def test_should_select_string_nonumericalonly(self):
    """Source strings that are purely numeric must be rejected."""
    c = Corpus('')
    assert not c._should_select_string(u'10', '10')
    # A number embedded in real text is still acceptable.
    assert c._should_select_string(u'10 minutes ago', 'Fa 10 minuts')
def test_should_select_string_noformatters(self):
    """Strings containing printf-style formatters (%s) must be rejected."""
    c = Corpus('')
    assert not c._should_select_string(u'Usage: %s', 'Ús: %s')
    # The same phrase without a formatter is selectable.
    assert c._should_select_string(u'Usage: sample', 'Ús: exemple')
def test_should_select_string_nospaces(self):
    """Semicolon-joined keyword lists (no spaces) must be rejected.

    NOTE(review): a later method with this same name redefines it, so in a
    class body this copy is shadowed; both copies assert the same thing.
    """
    c = Corpus('')
    keyword_list = u'accessibility;development;test;'
    assert not c._should_select_string(keyword_list,
                                       'accessibility;development;test;')
def test_should_select_string_notags(self):
    """Strings wrapped in markup tags must be rejected; plain text is kept."""
    c = Corpus('')
    assert not c._should_select_string(u'<b>_User name</b>',
                                       '<b>_Nom d\'usuari</b>')
    assert c._should_select_string(u'User name', '_Nom d\'usuari')
def test_clean_strings(self):
    """_clean_string lowercases and strips accelerator/ellipsis markup."""
    c = Corpus('')
    cases = (
        (u'_Hard Disk', u'hard disk'),       # underscore accelerator
        (u'Contrasen&ya:', u'contrasenya'),  # ampersand accelerator + colon
        (u'All ~Pages', u'all pages'),       # tilde accelerator
        (u'Properties...', u'properties'),   # trailing ellipsis
    )
    for raw, cleaned in cases:
        assert c._clean_string(raw) == cleaned
def test_should_select_string_parentesis_only_source(self):
    """A string consisting only of parentheses must not be selected.

    Renamed from ``test_should_select_string_empty_target``: that name is
    also used by a later method in this file, so this definition was
    shadowed inside the class body and never executed.  The body does not
    test an empty target at all — it checks parenthesis-only input — so
    the new name describes the actual behavior and removes the collision.
    """
    corpus = Corpus('')
    assert not corpus._should_select_string(u'()', '()')
def test_clean_localized(self):
    """_clean_localized keeps accented characters, normalizes curly quotes."""
    c = Corpus('')
    # Accented characters pass through untouched.
    assert c._clean_localized(u'accès') == u'accès'
    # A typographic apostrophe is converted to the ASCII one.
    assert c._clean_localized(u'àíóè’') == u'àíóè\''
def test_should_not_select_parentesis_only(self):
    """Strings made up of nothing but parentheses must be rejected."""
    c = Corpus('')
    parens = u'()'
    assert not c._should_select_string(parens, '()')
def test_should_select_string_empty_target(self):
    """A translated string is selectable; an empty translation is not."""
    c = Corpus('')
    source = u'This week'
    assert c._should_select_string(source, 'Aquesta setmana')
    # The same source with no translation must be rejected.
    assert not c._should_select_string(source, '')
def test_should_select_string_nospaces(self):
    """Semicolon-joined keyword lists (no spaces) must be rejected."""
    c = Corpus('')
    result = c._should_select_string(u'accessibility;development;test;',
                                     'accessibility;development;test;')
    assert not result