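# Standard-library imports needed by the code below (Python 2):
import codecs
import itertools
from HTMLParser import HTMLParser

# The project-internal classes used here (LoadExternalLists, TextSegmentor,
# SentenceSplitter, CountTermWeights, FindProperNouns,
# SymmetricalSummarizationWeightCount) are assumed to be importable from the
# surrounding package; their module paths are not shown in this file.
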
class SUMMARIZER(object):

    def __init__(self, language):
        
        self.language = language

        print "Loading external word lists..."
        print
        self.external_lst = LoadExternalLists()
        self.titled_stopwords = self.external_lst.loadTitledStopwords()
        self.ABBREVIATIONS = self.external_lst.loadAbbreviations()

        if self.language == 'en':
            self.stopwords = self.external_lst.loadStopWordsEN()
            self.VERBTRANSFORMS = self.external_lst.loadVerbForms()
            self.NOUNTRANSFORMS = self.external_lst.loadNounforms()
            self.terms_dict = self.external_lst.loadCorpusEN()
            self.lexicon_de = ''
            self.ger_nn = ''
            self.ger_ne = ''

        elif self.language == 'ru':
            self.stopwords = self.external_lst.loadStopWordsRU()
            self.terms_dict = self.external_lst.loadCorpusRU()
            self.VERBTRANSFORMS = ''
            self.NOUNTRANSFORMS = ''
            self.lexicon_de = ''
            self.ger_nn = ''
            self.ger_ne = ''

        elif self.language == 'de':
            self.lexicon_de = self.external_lst.loadLexiconDE()
            self.stopwords = self.external_lst.loadStopWordsDE()
            self.ger_nn = self.external_lst.loadGermanNN()
            self.ger_ne = self.external_lst.loadGermanNE()
            self.terms_dict = self.external_lst.loadCorpusDE()
            self.VERBTRANSFORMS = ''
            self.NOUNTRANSFORMS = ''

        else:
            raise ValueError("Unsupported language: %r (expected 'en', 'ru' or 'de')" % language)

        # sentence splitter/stemmer (sentences are reduced to word stems)
        self.stem_sents = SentenceSplitter(self.stopwords, self.VERBTRANSFORMS, self.NOUNTRANSFORMS, self.lexicon_de, self.language)


    def summarize(self, text):

        # the article to be processed (HTML entities unescaped)
        OPENTEXT = HTMLParser().unescape(text)

        # split the input text into LIST_OF_SENTENCES (TTL holds the title, if any)
        textsegmentor = TextSegmentor(self.titled_stopwords, self.ABBREVIATIONS, self.language)
        LIST_OF_SENTENCES, TTL = textsegmentor.segment(OPENTEXT)
        
        # flatten all sub-lists into one plain list ALLSENTENCES for sentence-level statistics
        ALLSENTENCES = list(itertools.chain.from_iterable(LIST_OF_SENTENCES))

        # the symmetrical summarization method needs at least 3 sentences
        if len(ALLSENTENCES) >= 3:
            
            # stem the sentences:
            # a list of sentences as word stems, grouped by paragraph
            STEMMED_SENTENCES = self.stem_sents.tokenizeListParagraphs(LIST_OF_SENTENCES)

            # stem the title
            if len(TTL) > 0:
                TITLE_PAIRS = list(itertools.chain.from_iterable(self.stem_sents.tokenizeListSentences(TTL)))
                TITLE = [pair[0] for pair in TITLE_PAIRS]

            else:
                TITLE = []
            
            # list of stemmed sentences with paragraph boundaries removed
            NO_PARAGRAPHS = list(itertools.chain.from_iterable(STEMMED_SENTENCES))
            # one big list of all word stems for frequency counting (TF/IDF);
            # STEMMED_SENTENCES is nested paragraph -> sentence -> (stem, word) pairs,
            # so it is flattened twice to get the flat list of pairs
            BIG_LIST_OF_PAIRS = list(itertools.chain.from_iterable(itertools.chain.from_iterable(STEMMED_SENTENCES)))
            BIG_LIST_OF_STEMS = [pair[0] for pair in BIG_LIST_OF_PAIRS]


            # total number of stems in the text
            TOTAL_STEMS_IN_TEXT = len(BIG_LIST_OF_STEMS)
            # total number of sentences in the text
            TOTAL_SENTS_IN_TEXT = len(ALLSENTENCES)


            if len(BIG_LIST_OF_STEMS) > 0:

                w_count = CountTermWeights(self.language)
                
                # list of (word, frequency) tuples, truncated at the average frequency
                TOTAL_STEM_COUNT, ABSOLUTE_COUNT = w_count.simpleTermFreqCount(BIG_LIST_OF_STEMS)

                # список "имён собственных"
                PROPER_NOUNS, STEMMED_PNN = FindProperNouns(self.language).lookForProper(ALLSENTENCES, self.stopwords, self.VERBTRANSFORMS, self.NOUNTRANSFORMS, self.lexicon_de, self.ger_nn, self.ger_ne)

                # list of terms with their weight coefficients (tuples)
                SORTED_TFIDF = w_count.countPureTFIDF(TOTAL_STEM_COUNT, self.terms_dict)
                FINAL_SORTED_TFIDF = w_count.countFinalWeights(SORTED_TFIDF, TITLE, STEMMED_SENTENCES, ALLSENTENCES, TOTAL_STEMS_IN_TEXT, TOTAL_SENTS_IN_TEXT, self.stopwords, self.VERBTRANSFORMS, self.NOUNTRANSFORMS, STEMMED_PNN, self.lexicon_de)
                KEYWORDS = w_count.showKeywords(BIG_LIST_OF_PAIRS, FINAL_SORTED_TFIDF, ABSOLUTE_COUNT, PROPER_NOUNS)
                
                
                # instance of the class that computes symmetrical relations between sentences;
                # compute the weight of each sentence
                symmetry = SymmetricalSummarizationWeightCount()
                # per-sentence dictionaries with word frequency counts
                S_with_termfreqs = symmetry.countTermsInsideSents(NO_PARAGRAPHS)
                SYMMETRICAL_WEIGHTS = symmetry.countFinalSymmetryWeight(FINAL_SORTED_TFIDF, S_with_termfreqs, TOTAL_STEMS_IN_TEXT, TOTAL_SENTS_IN_TEXT, STEMMED_PNN)
                ORIGINAL_SENTENCES = symmetry.convertSymmetryToOrdinary(SYMMETRICAL_WEIGHTS, ALLSENTENCES)

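                # select the final summary sentences: q holds the chosen
                # (sentence, weight, info) triples written out below;
                # rate is the number of sentences kept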
                q, rate = symmetry.selectFinalSents(ORIGINAL_SENTENCES)

                # KWIS = KeywordsInSummary()
                # kwis = KWIS.showKWIS(q, KEYWORDS)

            else:
                # no stems to work with: fall back to empty results so the
                # saving step below does not fail on undefined names
                q, rate, KEYWORDS = [], 0, []
                print "There are no words to process!"

        else:
            # fewer than 3 sentences: fall back to empty results so the
            # saving step below does not fail on undefined names
            q, rate, KEYWORDS = [], 0, []
            TOTAL_SENTS_IN_TEXT = len(ALLSENTENCES)

            print "Text should be at least 3 sentences long."


        #### saving #####
        kw = True  # also write the keyword table below the summary
        with codecs.open("output.html", 'w', 'utf-16') as outfile:
            outfile.write("<html><body>" +'\n')
            outfile.write("<style>.beta{position:absolute;left:42px;right:42px;top:10px;}</style>" +'\n')
            outfile.write("<div class='beta'>" +'\n')
            outfile.write("<p style='font-family:verdana'><b>"+"Summary of the given article </b>")
            for ttl in TTL:
                outfile.write("<b>"+"'" + ttl + "'" + "</b></p>" + '\n'+'\n')
            outfile.write("<table style='font-family:Calibri' align='justify'><td>"+'\n')
            # one table row per selected sentence: text, then weight and extra info
            for sent in q:
                outfile.write("<tr>")
                outfile.write("<td>")
                outfile.write(sent[0])
                outfile.write("</td>")
                outfile.write("<td>")
                outfile.write(str(round(sent[1], 3)) + '\t')
                outfile.write(str(sent[2]))
                outfile.write("</td>")
                outfile.write("</tr>" + '\n')
                outfile.write('\n' + '\n')
            outfile.write("</td></table>"+'\n')
            outfile.write("<p style='font-family:Calibri'><b>Number of sentences in the text: </b>" + str(TOTAL_SENTS_IN_TEXT) + "</p>" + '\n'+'\n')
            outfile.write("<p style='font-family:Calibri'><b>The rate of original text compression: </b>" + str(rate) + " sentences</p>" + '\n')
            
            if kw:
                outfile.write("<p style='font-family:Calibri'><b>Keywords of the artice: </b></p>")
                outfile.write("<table style='font-family:Calibri'><td>"+'\n')
                for key, rel, weight in KEYWORDS:
                    outfile.write("<tr>")
                    outfile.write("<td>")
                    outfile.write(", ".join(key))
                    outfile.write("</td>")
                    outfile.write("<td>")
                    outfile.write(str(rel))
                    outfile.write("</td>")
                    outfile.write("<td>")
                    outfile.write(str(round(weight, 3)))
                    outfile.write("</td>")
                    outfile.write("</tr>"+'\n')
                outfile.write("</td></table>"+'\n')
            outfile.write("</div>")
            outfile.write("</body></html>")