def edmunson(self, text):
    """Summarize *text* with sumy's Edmundson summarizer.

    The summary length is a percentage of the document's sentence
    count, taken from the GUI slider (``self.scale``).

    :param text: plain text to summarize.
    :return: the selected sentences joined into a single string, each
        prefixed with a space (empty string when the slider is at 0).
    """
    language = "german"

    # Percentage from the slider.  Guard against a zero setting: the
    # original divided 100 by this value and would crash with a
    # ZeroDivisionError.
    percent = self.scale.get()
    if not percent:
        return ""

    # Tokenize the text and attach a stemmer to the summarizer.
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    summarizer = Summarizer(Stemmer(language))

    # The bonus/stigma/null word lists are not really used here, but
    # the summarizer rejects empty input, hence the dummy entries.
    summarizer.stop_words = get_stop_words(language)
    summarizer.bonus_words = ["nsdgdf"]
    summarizer.stigma_words = ["mtrtf"]
    summarizer.null_words = ["zngg"]

    # Count the sentences directly instead of running a full
    # summarization pass with a 10-billion sentence budget just to
    # count the output (the original looped over summarizer(...) once
    # only for counting).
    total = len(parser.document.sentences)

    # Number of sentences corresponding to the requested percentage
    # (equivalent to the original round(count / (100 / percent))).
    sentence_number = round(total * percent / 100)

    # Join the selected sentences into one text; str.join avoids the
    # quadratic "summary +=" build-up of the original.
    return "".join(
        " " + str(sentence)
        for sentence in summarizer(parser.document, sentence_number)
    )
Beispiel #2
0
def summarize(text, sentence_count, bonus_words, language='english'):
    '''Summarize *text* with the Edmundson algorithm.

    :param text: plain text to summarize.
    :param sentence_count: number of sentences to return.
    :param bonus_words: words whose sentences receive a scoring bonus.
    :param language: tokenizer/stemmer/stop-word language
        (default ``'english'``).
    :return: tuple of the selected sumy ``Sentence`` objects.
    '''
    summarizer = EdmundsonSummarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)
    summarizer.bonus_words = bonus_words
    # Stigma words are unused, but the summarizer rejects an empty
    # list, hence the nonsense placeholder.
    summarizer.stigma_words = ['zdfgthdvndadv']
    # Honour the ``language`` parameter instead of hard-coding
    # 'english'; NLTK's stopwords corpus uses the same full language
    # names (e.g. 'english', 'german'), and the default is unchanged.
    summarizer.null_words = stopwords.words(language)
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    return summarizer(parser.document, sentence_count)
Beispiel #3
0
def edmundson_summarizer(text, stemmer, language, sentences_count):
    """Summarize *text* with sumy's Edmundson summarizer.

    :param text: plain text to summarize.
    :param stemmer: sumy ``Stemmer`` instance to use.
    :param language: tokenizer / stop-word language.
    :param sentences_count: number of sentences in the summary.
    :return: the selected sentences joined with newlines.
    """
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    # Renamed from ``summarizer_luhn`` — this is the Edmundson
    # algorithm, not Luhn.
    summarizer = EdmundsonSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)
    # Hard-coded word lists kept as-is: the Edmundson summarizer
    # requires non-empty bonus/stigma/null collections.
    summarizer.bonus_words = ("computing", "learning", "mobile")
    summarizer.stigma_words = ("another", "and", "some", "next")
    summarizer.null_words = ("another", "and", "some", "next")

    # Join directly instead of the original append-to-list loop with
    # its redundant ``a = sentence`` temporary.
    return "\n".join(
        str(sentence)
        for sentence in summarizer(parser.document, sentences_count)
    )
def _read_word_list(path):
    """Read a newline-separated word-list file into a list of words."""
    # Read-only access is enough; the original opened with "r+" and
    # also called f.close() redundantly inside the ``with`` block.
    with open(path) as f:
        return [line.strip() for line in f]


def summarize(srt_file, summarizer, n_sentences, language, bonusWords,
              stigmaWords):
    """Summarize subtitles and return the selected time ranges.

    Converts the srt file to a plain-text document and passes it to the
    sumy summarization library.

    :param srt_file: parsed subtitle items, indexable by sentence index.
    :param summarizer: summarizer key; ``'ED'`` selects Edmundson,
        anything else is looked up in ``SUMMARIZERS``.
    :param n_sentences: number of sentences to keep.
    :param language: tokenizer / stemmer / stop-word language.
    :param bonusWords: path to the bonus-word list file (Edmundson only).
    :param stigmaWords: path to the stigma-word list file (Edmundson only).
    :return: ``(ranges, subtitles)`` — selected time ranges and the
        corresponding subtitle items.
    """
    parser = PlaintextParser.from_string(srt_to_doc(srt_file),
                                         Tokenizer(language))

    if summarizer == 'ED':
        summarizer = EdmundsonSummarizer()
        summarizer.bonus_words = _read_word_list(bonusWords)
        summarizer.stigma_words = _read_word_list(stigmaWords)
        summarizer.null_words = get_stop_words(language)
    else:
        stemmer = Stemmer(language)
        summarizer = SUMMARIZERS[summarizer](stemmer)
        summarizer.stop_words = get_stop_words(language)

    # Raw string avoids the invalid-escape SyntaxWarning of
    # "\(([0-9]+)\)"; compiled once instead of on every sentence.
    index_re = re.compile(r"\(([0-9]+)\)")

    ret = []
    summarizedSubtitles = []
    for sentence in summarizer(parser.document, n_sentences):
        # Each summarized sentence carries its original index,
        # e.g. "(42) ..." — produced by srt_to_doc.
        index = int(index_re.findall(str(sentence))[0])
        # Map the index back to the subtitle it came from.
        item = srt_file[index]
        summarizedSubtitles.append(item)
        # Add the selected subtitle's time range to the result.
        ret.append(srt_item_to_range(item))

    return ret, summarizedSubtitles
Beispiel #5
0
def Edmundson(rsc_file, dst_file, count):
    """Summarize a Chinese text file and write the result to *dst_file*.

    Each selected sentence is written on its own line (UTF-8) and also
    printed to stdout.

    :param rsc_file: path of the UTF-8 source text file.
    :param dst_file: path of the output file to (over)write.
    :param count: number of sentences to keep.
    """
    language = "chinese"
    parser = PlaintextParser.from_file(rsc_file,
                                       Tokenizer(language),
                                       encoding='utf-8')

    # Edmundson algorithm with a language-specific stemmer.
    # (The original comments mislabelled this as the Luhn algorithm.)
    summarizer = EdmundsonSummarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)

    with open(dst_file, 'w', encoding='utf-8') as out:
        for sentence in summarizer(parser.document, count):
            out.write(str(sentence))
            out.write('\n')
            print(sentence)
Beispiel #6
0
def main(req: func.HttpRequest) -> func.HttpResponse:
    """Azure HTTP trigger: summarize the HTML body of the request.

    The summary length scales logarithmically with the number of full
    stops in the extracted plain text.

    :param req: incoming HTTP request whose body is HTML.
    :return: HTTP response containing the concatenated summary.
    """
    logging.info('Python HTTP trigger function processed a request.')
    text = str(req.get_body())

    # Strip HTML tags; summarize the plain text only.
    soup = BeautifulSoup(text, features="lxml")
    souped = soup.get_text()

    # The original called math.log2(souped.count('.')) unguarded:
    # it raises ValueError when the text contains no '.' and yields a
    # 0-sentence (empty) summary for exactly one '.'.  Clamp to at
    # least one sentence.
    periods = souped.count('.')
    sentences_count = max(1, math.log2(periods)) if periods else 1

    parser = PlaintextParser.from_string(souped, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    # str.join avoids the quadratic "ret +=" build-up of the original.
    ret = "".join(str(sentence)
                  for sentence in summarizer(parser.document,
                                             sentences_count))

    # str(bytes) leaves escape sequences like "\xc3" in the text;
    # strip backslash escapes of exactly three word characters.
    return func.HttpResponse(re.sub(r'\\\w{3}', '', ret))
    # NOTE(review): everything below is unreachable — it follows the
    # ``return`` of main() above and looks like a second, unrelated
    # scraped snippet fused into this function.  ``sys.setdefaultencoding``
    # does not exist on Python 3 at all; this line would raise
    # AttributeError even if reached.
    sys.setdefaultencoding('utf8')
    """
    nltk.data.path.append('/home/kariminf/Data/NLTK/')



    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
    """

    # NOTE(review): ``SIZE_FILE`` and ``sizes`` are not defined anywhere
    # in this file — presumably module-level globals in the original
    # source; confirm before reusing.  The ``break`` line below is
    # tab-indented while the rest uses spaces, which is a TabError under
    # Python 3; re-indent with spaces to make this parse.
    file = open(SIZE_FILE, 'r')
    while 1:
        line = file.readline()
        if line == '':
			break;
        parts = line.split(",")
        sizes[parts[0]] = int(parts[1])
    file.close()

    nltk.data.path.append('/home/kariminf/Data/NLTK/')
    # NOTE(review): ``eval`` shadows the builtin; the loop appears to
    # iterate file names keyed in ``sizes`` — TODO confirm against the
    # original source.  The ``txt_path`` line is tab-indented (TabError),
    # and ``extract`` is not defined in this file.
    for eval in sizes:
    	txt_path = "src/body/text/en/" + eval
        parser = PlaintextParser.from_file(txt_path, Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        summary = extract(summarizer, sizes[eval])
        fout = open("baselines/EdmundsonSummarizer/en/" + eval[:-9] + ".txt", "w")
        fout.write(summary)
        fout.close()