def post(self):
    """Extract keywords for every text submitted as ``salto_texts_list``.

    Each text is handed to a ``KeywordExtractor`` together with its own
    temporary output directory. That directory is removed again after the
    extraction, also when the extraction raises.

    Returns:
        dict: ``{'kw': [...]}`` holding one list of keywords per submitted
        text, in submission order. The list is empty when no texts were
        submitted.
    """
    parser.add_argument('salto_texts_list', action='append')
    args = parser.parse_args()
    # The parsed value may be None when the request carries no
    # 'salto_texts_list' entries; treat that as "nothing to process"
    # instead of failing on iteration.
    st_texts_list = args['salto_texts_list'] or []
    kw_list = []

    # Log next to this script, in "<script name>.log".
    script_folder, script_name = os.path.split(os.path.abspath(__file__))
    output_folder_name = os.path.abspath(script_folder)
    logFile = os.path.join(output_folder_name, script_name + ".log")
    logging.basicConfig(filename=logFile, level=logging.WARNING)

    for text_number, st in enumerate(st_texts_list, 1):
        outputDirectory = os.path.join(
            "/opt/keywords-extractor/keywords/",
            "temp_folder_" + str(text_number))
        make_output_directory(outputDirectory)
        try:
            st_u8 = st.encode('utf8')
            key_word_extractor = KeywordExtractor(st_u8, outputDirectory)
            key_words_set = key_word_extractor.extract_keywords()
            kw_list.append(list(key_words_set))
        finally:
            # Remove the directory containing the output files, even if
            # the extraction failed, so no temp folder is left behind.
            shutil.rmtree(outputDirectory)

    return {'kw': kw_list}
def test_it_extract_keywords_10(self):
    """Check the keywords extracted for input "10152.txt"."""
    expected_keywords = {
        'incidente',
        'contadino',
        'investito da un trattore',
        'Kuppelwies',
    }
    input_folder = os.path.join(self.script_folder, "test")
    extractor = KeywordExtractor(
        input_folder, "10152.txt", self.output_folder)
    self.assertEqual(extractor.extract_keywords(), expected_keywords)
def test_it_fill_dictionaries_with_treetagger(self):
    """
    Exercise KeywordExtractor._fill_dictionaries_with_treetagger with an
    Italian sentence and verify every dictionary and set it populates.
    """
    sentence = ('Ieri il primario di Casa Basaglia Lorenzo Toresini '
                'è andato in pensione.')
    expected_noun_lemmas = {
        'primario': 1.0,
        'Basaglia': 1.0,
        'Lorenzo': 1.0,
        'Toresini': 1.0,
        'pensione': 1.0,
        'andare': 1.0,
    }
    expected_token_to_lemma = {
        'Basaglia': 'basaglia',
        'Casa': 'casa',
        'Ieri': 'ieri',
        'Lorenzo': 'lorenzo',
        'Toresini': 'toresini',
        'andato': 'andare',
        'di': 'di',
        'il': 'il',
        'in': 'in',
        'pensione': 'pensione',
        'primario': 'primario',
        'è': 'essere',
    }
    expected_pos = {
        'Basaglia': {'NOM'},
        'Casa': {'NPR'},
        'Ieri': {'ADV'},
        'Lorenzo': {'NPR'},
        'Toresini': {'NOM'},
        'andare': {'VER:pper'},
        'andato': {'VER:pper'},
        'di': {'PRE'},
        'essere': {'VER:pres'},
        'il': {'DET:def'},
        'in': {'PRE'},
        'pensione': {'NOM'},
        'primario': {'NOM'},
        'è': {'VER:pres'},
    }
    expected_proper_nouns = {'Basaglia', 'Lorenzo', 'Toresini'}

    input_folder = os.path.join(self.script_folder, "test")
    extractor = KeywordExtractor(
        input_folder, "small-mixed2.txt", self.output_folder)
    extractor._fill_dictionaries_with_treetagger(
        sentence, 1.0, ('il', 'di', 'essere', 'casa', 'in'))

    self.assertEqual(extractor.noun_lemma_dict, expected_noun_lemmas)
    self.assertEqual(extractor.token_to_lemma_dict_original_case,
                     expected_token_to_lemma)
    self.assertEqual(extractor.lemma_token_to_POS, expected_pos)
    self.assertEqual(extractor.title_noun_lemmas_dict, {})
    self.assertEqual(extractor.tree_taggers_proper_nouns,
                     expected_proper_nouns)
def test_de_extract_keywords_30(self):
    """Check the keywords extracted for input "21870.txt"."""
    expected_keywords = {
        'Greta Marcolongo',
        'Live-Musik',
        'Andrea Maffei',
        'Fußball-Übertragungen',
    }
    input_folder = os.path.join(self.script_folder, "test")
    extractor = KeywordExtractor(
        input_folder, "21870.txt", self.output_folder)
    self.assertEqual(extractor.extract_keywords(), expected_keywords)
def test_it_extract_keywords_4(self):
    """Check the keywords extracted for input "1028.txt"."""
    expected_keywords = {
        'dialogo',
        'Richard Theiner',
        'Svp',
        'autonomia integrale',
        'sorriso degli italiani',
    }
    input_folder = os.path.join(self.script_folder, "test")
    extractor = KeywordExtractor(
        input_folder, "1028.txt", self.output_folder)
    self.assertEqual(extractor.extract_keywords(), expected_keywords)
def test_it_extract_keywords_13(self):
    """Check the keywords extracted for input "10187.txt"."""
    expected_keywords = {
        'Cisl Alto Adige',
        'imposta sul valore aggiunto',
        "aumento dell'Iva",
        'euro',
        'Freiheitlichen',
    }
    input_folder = os.path.join(self.script_folder, "test")
    extractor = KeywordExtractor(
        input_folder, "10187.txt", self.output_folder)
    self.assertEqual(extractor.extract_keywords(), expected_keywords)
def test_it_extract_keywords_11(self):
    """Check the keywords extracted for input "10157.txt"."""
    expected_keywords = {
        'politico',
        'Andreas Perugini',
        'MoVimento 5 Stelle',
        'presentato',
        'programma',
        'estromissione',
    }
    input_folder = os.path.join(self.script_folder, "test")
    extractor = KeywordExtractor(
        input_folder, "10157.txt", self.output_folder)
    self.assertEqual(extractor.extract_keywords(), expected_keywords)
def test_it_extract_keywords_8(self):
    """Check the keywords extracted for input "10046.txt"."""
    expected_keywords = {
        'Alessandro Vicentini',
        'Bolzano',
        'È una questione strutturale',
        'La crisi morde',
        'auto',
    }
    input_folder = os.path.join(self.script_folder, "test")
    extractor = KeywordExtractor(
        input_folder, "10046.txt", self.output_folder)
    self.assertEqual(extractor.extract_keywords(), expected_keywords)
def test_de_extract_keywords_52(self):
    """Check the keywords extracted for input "22051.txt"."""
    expected_keywords = {
        'Mädchen',
        'Heterobaby',
        'zwanzigminütigen Kurzfilm',
        'Welt',
        'homosexuellen Menschen',
        'Jungs',
    }
    input_folder = os.path.join(self.script_folder, "test")
    extractor = KeywordExtractor(
        input_folder, "22051.txt", self.output_folder)
    self.assertEqual(extractor.extract_keywords(), expected_keywords)
def test_it_extract_keywords_20(self):
    """Check the keywords extracted for input "10275.txt"."""
    expected_keywords = {
        'Karl Zeller',
        'Svp',
        'Eva Klotz',
        "accordo",
        'rifugio',
        'tedesco',
        'provocazione',
        'funzionario',
        'mozione',
        'difenda',
    }
    input_folder = os.path.join(self.script_folder, "test")
    extractor = KeywordExtractor(
        input_folder, "10275.txt", self.output_folder)
    self.assertEqual(extractor.extract_keywords(), expected_keywords)
def test_de_extract_keywords_35(self):
    """Check the keywords extracted for input "21921.txt"."""
    expected_keywords = {
        'Karl Zeller',
        'SVP-Senator',
        'Abänderungsantrag',
        'RAI-Sitze der sprachlichen Minderheiten',
    }
    input_folder = os.path.join(self.script_folder, "test")
    extractor = KeywordExtractor(
        input_folder, "21921.txt", self.output_folder)
    self.assertEqual(extractor.extract_keywords(), expected_keywords)
def test_de_extract_keywords_32(self):
    """Check the keywords extracted for input "21905.txt"."""
    expected_keywords = {
        'Sexarbeiterin',
        'Day',
        'Fotostrecke',
        'Problem',
        'Xenia',
        'internationale Hurentag',
    }
    input_folder = os.path.join(self.script_folder, "test")
    extractor = KeywordExtractor(
        input_folder, "21905.txt", self.output_folder)
    self.assertEqual(extractor.extract_keywords(), expected_keywords)
def test_de_extract_keywords_47(self):
    """Check the keywords extracted for input "22003.txt"."""
    expected_keywords = {
        'Freiheitlichen',
        'SVP und PD',
        'Volksabstimmung',
        'Seilbahnprojekt',
        'Brixen',
    }
    input_folder = os.path.join(self.script_folder, "test")
    extractor = KeywordExtractor(
        input_folder, "22003.txt", self.output_folder)
    self.assertEqual(extractor.extract_keywords(), expected_keywords)
def test_de_extract_keywords_48(self):
    """Check the keywords extracted for input "22008.txt"."""
    expected_keywords = {
        'Politikerrenten',
        'Landeshauptmann Arno Kompatscher',
        'Freiheitlichen',
        'Südtiroler Frühling',
    }
    input_folder = os.path.join(self.script_folder, "test")
    extractor = KeywordExtractor(
        input_folder, "22008.txt", self.output_folder)
    self.assertEqual(extractor.extract_keywords(), expected_keywords)
def test_de_extract_keywords_43(self):
    """Check the keywords extracted for input "21984.txt"."""
    expected_keywords = {
        'Andreas Pöder',
        'Regionalratsabgeordnete',
        'Movimento 5 Stelle',
        'Paul Köllensperger',
    }
    input_folder = os.path.join(self.script_folder, "test")
    extractor = KeywordExtractor(
        input_folder, "21984.txt", self.output_folder)
    self.assertEqual(extractor.extract_keywords(), expected_keywords)
def test_de_extract_keywords_44(self):
    """Check the keywords extracted for input "21986.txt"."""
    expected_keywords = {
        'René Benkos',
        'Busbahnhofsareals',
        'Willi Hüsler',
        'Erlebnishaus Südtirol',
        'Kaufhausprojekt',
        'Boris Podrecca',
    }
    input_folder = os.path.join(self.script_folder, "test")
    extractor = KeywordExtractor(
        input_folder, "21986.txt", self.output_folder)
    self.assertEqual(extractor.extract_keywords(), expected_keywords)
def test_de_extract_keywords_40(self):
    """Check the keywords extracted for input "21965.txt"."""
    expected_keywords = {
        'Freiheitlichen',
        'Stocker',
        'Landtagswahlen',
        'Wählerstimme',
        'Pius',
        'Ulli',
        'SVP',
    }
    input_folder = os.path.join(self.script_folder, "test")
    extractor = KeywordExtractor(
        input_folder, "21965.txt", self.output_folder)
    self.assertEqual(extractor.extract_keywords(), expected_keywords)
def test_de_extract_keywords_53(self):
    """Check the keywords extracted for input "22056.txt"."""
    expected_keywords = {
        'Frau',
        'schwer',
        'Kuh',
        'Franz',
        'Jungbäuerin',
        'Mölten',
        'Sattlerhüttenwirt',
    }
    input_folder = os.path.join(self.script_folder, "test")
    extractor = KeywordExtractor(
        input_folder, "22056.txt", self.output_folder)
    self.assertEqual(extractor.extract_keywords(), expected_keywords)
def test_detect_german_in_italian(self):
    """Check the result of _detect_german_in_italian for two sentences."""
    input_folder = os.path.join(self.script_folder, "test")
    extractor = KeywordExtractor(
        input_folder, "small-mixed.txt", self.output_folder)

    # (sentence, expected return value), checked in this order.
    cases = (
        ('Bernardo Magnagi dice spesso: Gesundheit und Danke.', "de"),
        ('Heinrich Hund dice spesso: mio dio!', "it"),
    )
    for sentence, expected_language in cases:
        detected = extractor._detect_german_in_italian(sentence)
        self.assertEqual(detected, expected_language)
def test_it_extract_keywords_24(self):
    """Check the keywords extracted for input "10293.txt"."""
    expected_keywords = {
        'Giunta comunale',
        'Bolzano',
        'Mercatino di Natale',
        'turismo',
        'Alto Adige',
        'gennaio',
        'ambientalista',
    }
    input_folder = os.path.join(self.script_folder, "test")
    extractor = KeywordExtractor(
        input_folder, "10293.txt", self.output_folder)
    self.assertEqual(extractor.extract_keywords(), expected_keywords)
def test_it_extract_keywords_22(self):
    """Check the keywords extracted for input "10288.txt"."""
    expected_keywords = {
        'Bizzo',
        'tedesco',
        'italiano',
        'quotidiano',
        'Innovation Festival',
        'Alto Adige',
    }
    input_folder = os.path.join(self.script_folder, "test")
    extractor = KeywordExtractor(
        input_folder, "10288.txt", self.output_folder)
    self.assertEqual(extractor.extract_keywords(), expected_keywords)
def test_it_extract_keywords_15(self):
    """Check the keywords extracted for input "10212.txt"."""
    expected_keywords = {
        'Bolzano',
        'quartiere',
        'partecipante',
        'bolzanobici around the world',
        'dedicata alle due ruote',
        'popolare',
        'iniziativa',
    }
    input_folder = os.path.join(self.script_folder, "test")
    extractor = KeywordExtractor(
        input_folder, "10212.txt", self.output_folder)
    self.assertEqual(extractor.extract_keywords(), expected_keywords)
def test_de_extract_keywords_50(self):
    """Check that required keywords are extracted for input "22024.txt".

    Only membership is verified; additional keywords are tolerated.
    """
    required_keywords = (
        'rechtextreme Bewegung',
        'rechtsradikale Bewegung',
        'DIGOS-Ermittlungen',
        'Luigi Spagnoli',
        'legge Scelba',
        'Socialismo Nazionale',
    )
    input_folder = os.path.join(self.script_folder, "test")
    extractor = KeywordExtractor(
        input_folder, "22024.txt", self.output_folder)
    extracted = extractor.extract_keywords()
    for keyword in required_keywords:
        self.assertIn(keyword, extracted)
def test_de_extract_keywords_41(self):
    """Check the keywords extracted for input "21969.txt"."""
    expected_keywords = {
        'Bewerbungsdossiers',
        'Giorgio Orsoni',
        'Kulturhauptstadtregion',
        'Christian Tommasini',
        'Alberto Stenico',
        'Luis Durnwalder',
        'Venedig',
        'Nordest',
    }
    input_folder = os.path.join(self.script_folder, "test")
    extractor = KeywordExtractor(
        input_folder, "21969.txt", self.output_folder)
    self.assertEqual(extractor.extract_keywords(), expected_keywords)
def test_it_extract_keywords_16(self):
    """Check the keywords extracted for input "10225.txt"."""
    expected_keywords = {
        'pulizia linguistica',
        'Florian Kronbichler',
        'quota Svp',
        'polemica',
        'risolve',
        'teoria',
        'soluzione',
        'rifugi di montagna',
        'ribadisce',
    }
    input_folder = os.path.join(self.script_folder, "test")
    extractor = KeywordExtractor(
        input_folder, "10225.txt", self.output_folder)
    self.assertEqual(extractor.extract_keywords(), expected_keywords)
def test_it_extract_keywords(self):
    """Check that required keywords are extracted for input "1008-it.txt".

    Only membership is verified; additional keywords are tolerated.
    """
    key_word_extractor = KeywordExtractor(
        os.path.join(self.script_folder, "test"), "1008-it.txt",
        self.output_folder)
    key_words_set = key_word_extractor.extract_keywords()
    # assertIn reports both the missing keyword and the actual set on
    # failure, whereas assertTrue(x in s) only says "False is not true".
    self.assertIn('pensione', key_words_set)
    self.assertIn('struttura', key_words_set)
    self.assertIn('don Chisciotte', key_words_set)
    self.assertIn('Franco Basaglia', key_words_set)
    self.assertIn('Lorenzo Toresini', key_words_set)
def test_it_extract_keywords_21(self):
    """Check the keywords extracted for input "10286.txt"."""
    expected_keywords = {
        'Arno Kompatscher',
        'tre domande',
        'entrature particolari',
        'il prossimo Landeshauptmann',
        'democrazia diretta',
        'firma',
        'legge provinciale',
        'Svp',
    }
    input_folder = os.path.join(self.script_folder, "test")
    extractor = KeywordExtractor(
        input_folder, "10286.txt", self.output_folder)
    self.assertEqual(extractor.extract_keywords(), expected_keywords)
def test_it_extract_keywords_28(self):
    """Check the keywords extracted for input "10328.txt"."""
    expected_keywords = {
        'studio Besier',
        'comitato proALTvor',
        'Tiziana Campagnoli',
        'Bressanone',
        'Provincia',
        'Comune',
        'Stephan Besier',
        'tecnici',
        'incontro',
        'interrompe',
        'ancora valido',
        'sindaco',
    }
    input_folder = os.path.join(self.script_folder, "test")
    extractor = KeywordExtractor(
        input_folder, "10328.txt", self.output_folder)
    self.assertEqual(extractor.extract_keywords(), expected_keywords)
def test_it_extract_keywords_9(self):
    """Check the keywords extracted for input "10053.txt"."""
    expected_keywords = {
        'dossier',
        'Bolzano',
        'sito',
        'Venezia e Nordest',
        'capitale europea della cultura',
        'candidatura',
        'caldo involucro',
        'raccomandazione',
    }
    input_folder = os.path.join(self.script_folder, "test")
    extractor = KeywordExtractor(
        input_folder, "10053.txt", self.output_folder)
    self.assertEqual(extractor.extract_keywords(), expected_keywords)
def test_de_fill_dictionaries_with_treetagger(self):
    """
    Exercise KeywordExtractor._fill_dictionaries_with_treetagger with a
    German sentence and verify every dictionary and set it populates.
    """
    sentence = ('Antworten auf diese Fragen gab es aus dem Passeiertal '
                'bereits einige.')
    expected_noun_lemmas = {
        'Antwort': 1.0,
        'Frage': 1.0,
        'Passeiertal': 1.0,
        'geben': 1.0,
    }
    expected_token_to_lemma = {
        'Fragen': 'frage',
        'bereits': 'bereits',
        'es': 'es',
        'aus': 'aus',
        'gab': 'geben',
        'auf': 'auf',
        'Passeiertal': 'passeiertal',
        'Antworten': 'antwort',
        'diese': 'dies',
        'einige': 'einige',
        'dem': 'die',
    }
    expected_pos = {
        'Frage': {'NN'},
        'die': {'ART'},
        'bereits': {'ADV'},
        'einige': {'PIS'},
        'Antworten': {'NN'},
        'auf': {'APPR'},
        'diese': {'PDAT'},
        'dem': {'ART'},
        'Fragen': {'NN'},
        'geben': {'VVFIN'},
        'Antwort': {'NN'},
        'aus': {'APPR'},
        'gab': {'VVFIN'},
        'Passeiertal': {'NN'},
        'dies': {'PDAT'},
        'es': {'PPER'},
    }

    input_folder = os.path.join(self.script_folder, "test")
    extractor = KeywordExtractor(
        input_folder, "small-mixed.txt", self.output_folder)
    extractor._fill_dictionaries_with_treetagger(
        sentence, 1.0, ('auf', 'dem', 'es', 'aus'))

    self.assertEqual(extractor.noun_lemma_dict, expected_noun_lemmas)
    self.assertEqual(extractor.token_to_lemma_dict_original_case,
                     expected_token_to_lemma)
    self.assertEqual(extractor.lemma_token_to_POS, expected_pos)
    self.assertEqual(extractor.title_noun_lemmas_dict, {})
    self.assertEqual(extractor.tree_taggers_proper_nouns, set())