Beispiel #1
0
    def post(self):
        """Extract keywords for every text submitted in ``salto_texts_list``.

        Each submitted text gets its own temporary output directory under
        ``/opt/keywords-extractor/keywords/``; the directory is removed again
        once the text has been processed, even when extraction fails.

        Returns:
            dict: ``{'kw': [...]}`` with one list of keywords per submitted
            text, in submission order. The list is empty when no texts were
            submitted.
        """
        # NOTE(review): the argument is registered again on every request;
        # presumably this belongs at parser construction time -- confirm
        # against the module that defines ``parser`` before moving it.
        parser.add_argument('salto_texts_list', action='append')
        args = parser.parse_args()
        # A request without any 'salto_texts_list' value presumably yields
        # None here; treat that as "no texts" instead of failing in the loop.
        st_texts_list = args['salto_texts_list'] or []
        kw_list = []

        script_folder, script_name = os.path.split(os.path.abspath(__file__))
        output_folder_name = os.path.abspath(script_folder)

        # Warnings and errors go to "<script name>.log" next to this script.
        logFile = os.path.join(output_folder_name, script_name + ".log")
        logging.basicConfig(filename=logFile, level=logging.WARNING)

        for text_number, st in enumerate(st_texts_list, start=1):
            outputDirectory = os.path.join("/opt/keywords-extractor/keywords/",
                                           "temp_folder_" + str(text_number))

            make_output_directory(outputDirectory)
            try:
                st_u8 = st.encode('utf8')
                key_word_extractor = KeywordExtractor(st_u8, outputDirectory)
                key_words_set = key_word_extractor.extract_keywords()
                kw_list.append(list(key_words_set))
            finally:
                # Remove the directory containing the output files, also when
                # extraction raised, so temp folders do not pile up.
                shutil.rmtree(outputDirectory)

        return {'kw': kw_list}
Beispiel #2
0
 def test_it_extract_keywords_10(self):
     """extract_keywords() for input "10152.txt" must yield exactly the set below."""
     input_folder = os.path.join(self.script_folder, "test")
     extractor = KeywordExtractor(input_folder, "10152.txt",
                                  self.output_folder)
     expected_keywords = {
         'incidente',
         'contadino',
         'investito da un trattore',
         'Kuppelwies',
     }
     self.assertEqual(extractor.extract_keywords(), expected_keywords)
Beispiel #3
0
    def test_it_fill_dictionaries_with_treetagger(self):
        """
        Exercises KeywordExtractor._fill_dictionaries_with_treetagger with an
        Italian sentence, then verifies every dictionary and set the call
        populated on the extractor instance.
        """
        input_folder = os.path.join(self.script_folder, "test")
        extractor = KeywordExtractor(input_folder, "small-mixed2.txt",
                                     self.output_folder)

        sentence = 'Ieri il primario di Casa Basaglia Lorenzo Toresini è andato in pensione.'
        extractor._fill_dictionaries_with_treetagger(
            sentence, 1.0, ('il', 'di', 'essere', 'casa', 'in'))

        expected_noun_lemmas = {
            'primario': 1.0,
            'Basaglia': 1.0,
            'Lorenzo': 1.0,
            'Toresini': 1.0,
            'pensione': 1.0,
            'andare': 1.0,
        }
        expected_token_to_lemma = {
            'Basaglia': 'basaglia',
            'Casa': 'casa',
            'Ieri': 'ieri',
            'Lorenzo': 'lorenzo',
            'Toresini': 'toresini',
            'andato': 'andare',
            'di': 'di',
            'il': 'il',
            'in': 'in',
            'pensione': 'pensione',
            'primario': 'primario',
            'è': 'essere',
        }
        expected_pos = {
            'Basaglia': {'NOM'},
            'Casa': {'NPR'},
            'Ieri': {'ADV'},
            'Lorenzo': {'NPR'},
            'Toresini': {'NOM'},
            'andare': {'VER:pper'},
            'andato': {'VER:pper'},
            'di': {'PRE'},
            'essere': {'VER:pres'},
            'il': {'DET:def'},
            'in': {'PRE'},
            'pensione': {'NOM'},
            'primario': {'NOM'},
            'è': {'VER:pres'},
        }
        expected_proper_nouns = {'Basaglia', 'Lorenzo', 'Toresini'}

        self.assertEqual(extractor.noun_lemma_dict, expected_noun_lemmas)
        self.assertEqual(extractor.token_to_lemma_dict_original_case,
                         expected_token_to_lemma)
        self.assertEqual(extractor.lemma_token_to_POS, expected_pos)
        self.assertEqual(extractor.title_noun_lemmas_dict, {})
        self.assertEqual(extractor.tree_taggers_proper_nouns,
                         expected_proper_nouns)
Beispiel #4
0
 def test_de_extract_keywords_30(self):
     """extract_keywords() for input "21870.txt" must yield exactly the set below."""
     input_folder = os.path.join(self.script_folder, "test")
     extractor = KeywordExtractor(input_folder, "21870.txt",
                                  self.output_folder)
     expected_keywords = {
         'Greta Marcolongo',
         'Live-Musik',
         'Andrea Maffei',
         'Fußball-Übertragungen',
     }
     self.assertEqual(extractor.extract_keywords(), expected_keywords)
Beispiel #5
0
 def test_it_extract_keywords_4(self):
     """extract_keywords() for input "1028.txt" must yield exactly the set below."""
     input_folder = os.path.join(self.script_folder, "test")
     extractor = KeywordExtractor(input_folder, "1028.txt",
                                  self.output_folder)
     expected_keywords = {
         'dialogo',
         'Richard Theiner',
         'Svp',
         'autonomia integrale',
         'sorriso degli italiani',
     }
     self.assertEqual(extractor.extract_keywords(), expected_keywords)
Beispiel #6
0
 def test_it_extract_keywords_13(self):
     """extract_keywords() for input "10187.txt" must yield exactly the set below."""
     input_folder = os.path.join(self.script_folder, "test")
     extractor = KeywordExtractor(input_folder, "10187.txt",
                                  self.output_folder)
     expected_keywords = {
         'Cisl Alto Adige',
         'imposta sul valore aggiunto',
         "aumento dell'Iva",
         'euro',
         'Freiheitlichen',
     }
     self.assertEqual(extractor.extract_keywords(), expected_keywords)
Beispiel #7
0
 def test_it_extract_keywords_11(self):
     """extract_keywords() for input "10157.txt" must yield exactly the set below."""
     input_folder = os.path.join(self.script_folder, "test")
     extractor = KeywordExtractor(input_folder, "10157.txt",
                                  self.output_folder)
     expected_keywords = {
         'politico',
         'Andreas Perugini',
         'MoVimento 5 Stelle',
         'presentato',
         'programma',
         'estromissione',
     }
     self.assertEqual(extractor.extract_keywords(), expected_keywords)
Beispiel #8
0
 def test_it_extract_keywords_8(self):
     """extract_keywords() for input "10046.txt" must yield exactly the set below."""
     input_folder = os.path.join(self.script_folder, "test")
     extractor = KeywordExtractor(input_folder, "10046.txt",
                                  self.output_folder)
     expected_keywords = {
         'Alessandro Vicentini',
         'Bolzano',
         'È una questione strutturale',
         'La crisi morde',
         'auto',
     }
     self.assertEqual(extractor.extract_keywords(), expected_keywords)
Beispiel #9
0
 def test_de_extract_keywords_52(self):
     """extract_keywords() for input "22051.txt" must yield exactly the set below."""
     input_folder = os.path.join(self.script_folder, "test")
     extractor = KeywordExtractor(input_folder, "22051.txt",
                                  self.output_folder)
     expected_keywords = {
         'Mädchen',
         'Heterobaby',
         'zwanzigminütigen Kurzfilm',
         'Welt',
         'homosexuellen Menschen',
         'Jungs',
     }
     self.assertEqual(extractor.extract_keywords(), expected_keywords)
Beispiel #10
0
 def test_it_extract_keywords_20(self):
     """extract_keywords() for input "10275.txt" must yield exactly the set below."""
     input_folder = os.path.join(self.script_folder, "test")
     extractor = KeywordExtractor(input_folder, "10275.txt",
                                  self.output_folder)
     expected_keywords = {
         'Karl Zeller',
         'Svp',
         'Eva Klotz',
         "accordo",
         'rifugio',
         'tedesco',
         'provocazione',
         'funzionario',
         'mozione',
         'difenda',
     }
     self.assertEqual(extractor.extract_keywords(), expected_keywords)
Beispiel #11
0
 def test_de_extract_keywords_35(self):
     """extract_keywords() for input "21921.txt" must yield exactly the set below."""
     input_folder = os.path.join(self.script_folder, "test")
     extractor = KeywordExtractor(input_folder, "21921.txt",
                                  self.output_folder)
     expected_keywords = {
         'Karl Zeller',
         'SVP-Senator',
         'Abänderungsantrag',
         'RAI-Sitze der sprachlichen Minderheiten',
     }
     self.assertEqual(extractor.extract_keywords(), expected_keywords)
Beispiel #12
0
 def test_de_extract_keywords_32(self):
     """extract_keywords() for input "21905.txt" must yield exactly the set below."""
     input_folder = os.path.join(self.script_folder, "test")
     extractor = KeywordExtractor(input_folder, "21905.txt",
                                  self.output_folder)
     expected_keywords = {
         'Sexarbeiterin',
         'Day',
         'Fotostrecke',
         'Problem',
         'Xenia',
         'internationale Hurentag',
     }
     self.assertEqual(extractor.extract_keywords(), expected_keywords)
Beispiel #13
0
 def test_de_extract_keywords_47(self):
     """extract_keywords() for input "22003.txt" must yield exactly the set below."""
     input_folder = os.path.join(self.script_folder, "test")
     extractor = KeywordExtractor(input_folder, "22003.txt",
                                  self.output_folder)
     expected_keywords = {
         'Freiheitlichen',
         'SVP und PD',
         'Volksabstimmung',
         'Seilbahnprojekt',
         'Brixen',
     }
     self.assertEqual(extractor.extract_keywords(), expected_keywords)
Beispiel #14
0
 def test_de_extract_keywords_48(self):
     """extract_keywords() for input "22008.txt" must yield exactly the set below."""
     input_folder = os.path.join(self.script_folder, "test")
     extractor = KeywordExtractor(input_folder, "22008.txt",
                                  self.output_folder)
     expected_keywords = {
         'Politikerrenten',
         'Landeshauptmann Arno Kompatscher',
         'Freiheitlichen',
         'Südtiroler Frühling',
     }
     self.assertEqual(extractor.extract_keywords(), expected_keywords)
Beispiel #15
0
 def test_de_extract_keywords_43(self):
     """extract_keywords() for input "21984.txt" must yield exactly the set below."""
     input_folder = os.path.join(self.script_folder, "test")
     extractor = KeywordExtractor(input_folder, "21984.txt",
                                  self.output_folder)
     expected_keywords = {
         'Andreas Pöder',
         'Regionalratsabgeordnete',
         'Movimento 5 Stelle',
         'Paul Köllensperger',
     }
     self.assertEqual(extractor.extract_keywords(), expected_keywords)
Beispiel #16
0
 def test_de_extract_keywords_44(self):
     """extract_keywords() for input "21986.txt" must yield exactly the set below."""
     input_folder = os.path.join(self.script_folder, "test")
     extractor = KeywordExtractor(input_folder, "21986.txt",
                                  self.output_folder)
     expected_keywords = {
         'René Benkos',
         'Busbahnhofsareals',
         'Willi Hüsler',
         'Erlebnishaus Südtirol',
         'Kaufhausprojekt',
         'Boris Podrecca',
     }
     self.assertEqual(extractor.extract_keywords(), expected_keywords)
Beispiel #17
0
 def test_de_extract_keywords_40(self):
     """extract_keywords() for input "21965.txt" must yield exactly the set below."""
     input_folder = os.path.join(self.script_folder, "test")
     extractor = KeywordExtractor(input_folder, "21965.txt",
                                  self.output_folder)
     expected_keywords = {
         'Freiheitlichen',
         'Stocker',
         'Landtagswahlen',
         'Wählerstimme',
         'Pius',
         'Ulli',
         'SVP',
     }
     self.assertEqual(extractor.extract_keywords(), expected_keywords)
Beispiel #18
0
 def test_de_extract_keywords_53(self):
     """extract_keywords() for input "22056.txt" must yield exactly the set below."""
     input_folder = os.path.join(self.script_folder, "test")
     extractor = KeywordExtractor(input_folder, "22056.txt",
                                  self.output_folder)
     expected_keywords = {
         'Frau',
         'schwer',
         'Kuh',
         'Franz',
         'Jungbäuerin',
         'Mölten',
         'Sattlerhüttenwirt',
     }
     self.assertEqual(extractor.extract_keywords(), expected_keywords)
Beispiel #19
0
 def test_detect_german_in_italian(self):
     """_detect_german_in_italian() must return the expected code for each sample sentence."""
     input_folder = os.path.join(self.script_folder, "test")
     extractor = KeywordExtractor(input_folder, "small-mixed.txt",
                                  self.output_folder)
     # (sentence, expected return value), checked in this order.
     cases = (
         ('Bernardo Magnagi dice spesso: Gesundheit und Danke.', "de"),
         ('Heinrich Hund dice spesso: mio dio!', "it"),
     )
     for sentence, expected_code in cases:
         detected = extractor._detect_german_in_italian(sentence)
         self.assertEqual(detected, expected_code)
Beispiel #20
0
 def test_it_extract_keywords_24(self):
     """extract_keywords() for input "10293.txt" must yield exactly the set below."""
     input_folder = os.path.join(self.script_folder, "test")
     extractor = KeywordExtractor(input_folder, "10293.txt",
                                  self.output_folder)
     expected_keywords = {
         'Giunta comunale',
         'Bolzano',
         'Mercatino di Natale',
         'turismo',
         'Alto Adige',
         'gennaio',
         'ambientalista',
     }
     self.assertEqual(extractor.extract_keywords(), expected_keywords)
Beispiel #21
0
 def test_it_extract_keywords_22(self):
     """extract_keywords() for input "10288.txt" must yield exactly the set below."""
     input_folder = os.path.join(self.script_folder, "test")
     extractor = KeywordExtractor(input_folder, "10288.txt",
                                  self.output_folder)
     expected_keywords = {
         'Bizzo',
         'tedesco',
         'italiano',
         'quotidiano',
         'Innovation Festival',
         'Alto Adige',
     }
     self.assertEqual(extractor.extract_keywords(), expected_keywords)
Beispiel #22
0
 def test_it_extract_keywords_15(self):
     """extract_keywords() for input "10212.txt" must yield exactly the set below."""
     input_folder = os.path.join(self.script_folder, "test")
     extractor = KeywordExtractor(input_folder, "10212.txt",
                                  self.output_folder)
     expected_keywords = {
         'Bolzano',
         'quartiere',
         'partecipante',
         'bolzanobici around the world',
         'dedicata alle due ruote',
         'popolare',
         'iniziativa',
     }
     self.assertEqual(extractor.extract_keywords(), expected_keywords)
Beispiel #23
0
 def test_de_extract_keywords_50(self):
     """Each keyword listed below must be among the keywords extracted for "22024.txt"."""
     input_folder = os.path.join(self.script_folder, "test")
     extractor = KeywordExtractor(input_folder, "22024.txt",
                                  self.output_folder)
     extracted = extractor.extract_keywords()
     # Membership only: additional extracted keywords are tolerated.
     required_keywords = (
         'rechtextreme Bewegung',
         'rechtsradikale Bewegung',
         'DIGOS-Ermittlungen',
         'Luigi Spagnoli',
         'legge Scelba',
         'Socialismo Nazionale',
     )
     for keyword in required_keywords:
         self.assertIn(keyword, extracted)
Beispiel #24
0
 def test_de_extract_keywords_41(self):
     """extract_keywords() for input "21969.txt" must yield exactly the set below."""
     input_folder = os.path.join(self.script_folder, "test")
     extractor = KeywordExtractor(input_folder, "21969.txt",
                                  self.output_folder)
     expected_keywords = {
         'Bewerbungsdossiers',
         'Giorgio Orsoni',
         'Kulturhauptstadtregion',
         'Christian Tommasini',
         'Alberto Stenico',
         'Luis Durnwalder',
         'Venedig',
         'Nordest',
     }
     self.assertEqual(extractor.extract_keywords(), expected_keywords)
Beispiel #25
0
 def test_it_extract_keywords_16(self):
     """extract_keywords() for input "10225.txt" must yield exactly the set below."""
     input_folder = os.path.join(self.script_folder, "test")
     extractor = KeywordExtractor(input_folder, "10225.txt",
                                  self.output_folder)
     expected_keywords = {
         'pulizia linguistica',
         'Florian Kronbichler',
         'quota Svp',
         'polemica',
         'risolve',
         'teoria',
         'soluzione',
         'rifugi di montagna',
         'ribadisce',
     }
     self.assertEqual(extractor.extract_keywords(), expected_keywords)
Beispiel #26
0
    def test_it_extract_keywords(self):
        """Each keyword listed below must be among the keywords extracted for "1008-it.txt".

        Only membership is checked; additional extracted keywords are
        tolerated.
        """
        key_word_extractor = KeywordExtractor(
            os.path.join(self.script_folder, "test"), "1008-it.txt",
            self.output_folder)
        key_words_set = key_word_extractor.extract_keywords()

        # assertIn (rather than assertTrue(x in s)) reports both the missing
        # keyword and the extracted collection when the check fails.
        required_keywords = (
            'pensione',
            'struttura',
            'don Chisciotte',
            'Franco Basaglia',
            'Lorenzo Toresini',
        )
        for keyword in required_keywords:
            self.assertIn(keyword, key_words_set)
Beispiel #27
0
 def test_it_extract_keywords_21(self):
     """extract_keywords() for input "10286.txt" must yield exactly the set below."""
     input_folder = os.path.join(self.script_folder, "test")
     extractor = KeywordExtractor(input_folder, "10286.txt",
                                  self.output_folder)
     expected_keywords = {
         'Arno Kompatscher',
         'tre domande',
         'entrature particolari',
         'il prossimo Landeshauptmann',
         'democrazia diretta',
         'firma',
         'legge provinciale',
         'Svp',
     }
     self.assertEqual(extractor.extract_keywords(), expected_keywords)
Beispiel #28
0
 def test_it_extract_keywords_28(self):
     """extract_keywords() for input "10328.txt" must yield exactly the set below."""
     input_folder = os.path.join(self.script_folder, "test")
     extractor = KeywordExtractor(input_folder, "10328.txt",
                                  self.output_folder)
     expected_keywords = {
         'studio Besier',
         'comitato proALTvor',
         'Tiziana Campagnoli',
         'Bressanone',
         'Provincia',
         'Comune',
         'Stephan Besier',
         'tecnici',
         'incontro',
         'interrompe',
         'ancora valido',
         'sindaco',
     }
     self.assertEqual(extractor.extract_keywords(), expected_keywords)
Beispiel #29
0
 def test_it_extract_keywords_9(self):
     """extract_keywords() for input "10053.txt" must yield exactly the set below."""
     input_folder = os.path.join(self.script_folder, "test")
     extractor = KeywordExtractor(input_folder, "10053.txt",
                                  self.output_folder)
     expected_keywords = {
         'dossier',
         'Bolzano',
         'sito',
         'Venezia e Nordest',
         'capitale europea della cultura',
         'candidatura',
         'caldo involucro',
         'raccomandazione',
     }
     self.assertEqual(extractor.extract_keywords(), expected_keywords)
Beispiel #30
0
    def test_de_fill_dictionaries_with_treetagger(self):
        """
        Exercises KeywordExtractor._fill_dictionaries_with_treetagger with a
        German sentence, then verifies every dictionary and set the call
        populated on the extractor instance.
        """
        input_folder = os.path.join(self.script_folder, "test")
        extractor = KeywordExtractor(input_folder, "small-mixed.txt",
                                     self.output_folder)

        sentence = 'Antworten auf diese Fragen gab es aus dem Passeiertal bereits einige.'
        extractor._fill_dictionaries_with_treetagger(
            sentence, 1.0, ('auf', 'dem', 'es', 'aus'))

        expected_noun_lemmas = {
            'Antwort': 1.0,
            'Frage': 1.0,
            'Passeiertal': 1.0,
            'geben': 1.0,
        }
        expected_token_to_lemma = {
            'Fragen': 'frage',
            'bereits': 'bereits',
            'es': 'es',
            'aus': 'aus',
            'gab': 'geben',
            'auf': 'auf',
            'Passeiertal': 'passeiertal',
            'Antworten': 'antwort',
            'diese': 'dies',
            'einige': 'einige',
            'dem': 'die',
        }
        expected_pos = {
            'Frage': {'NN'},
            'die': {'ART'},
            'bereits': {'ADV'},
            'einige': {'PIS'},
            'Antworten': {'NN'},
            'auf': {'APPR'},
            'diese': {'PDAT'},
            'dem': {'ART'},
            'Fragen': {'NN'},
            'geben': {'VVFIN'},
            'Antwort': {'NN'},
            'aus': {'APPR'},
            'gab': {'VVFIN'},
            'Passeiertal': {'NN'},
            'dies': {'PDAT'},
            'es': {'PPER'},
        }

        self.assertEqual(extractor.noun_lemma_dict, expected_noun_lemmas)
        self.assertEqual(extractor.token_to_lemma_dict_original_case,
                         expected_token_to_lemma)
        self.assertEqual(extractor.lemma_token_to_POS, expected_pos)
        self.assertEqual(extractor.title_noun_lemmas_dict, {})
        self.assertEqual(extractor.tree_taggers_proper_nouns, set())