Code example #1
File: sum.py Project: kalki7/CaseSum
def sumbasic(parser, sentence_count):
    # `language` is assumed to be a module-level setting in the source project
    summarizer = SumBasicSummarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)
    summary = summarizer(parser.document, sentence_count)
    temp = ''
    for sentence in summary:
        temp = temp + str(sentence)
    return temp
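A minimal call site for the function above might look like this sketch; the imports follow sumy's API as used elsewhere on this page, and the module-level `language` is an assumption:

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.sum_basic import SumBasicSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

language = "english"
parser = PlaintextParser.from_string("Some long document text ...", Tokenizer(language))
print(sumbasic(parser, 5))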
Code example #2
def summarize_all(text):
    # Lsa, Luhn, LexRank, TxtRank, SumBasic and KL are assumed to be import
    # aliases for the corresponding sumy summarizer classes.
    LANGUAGE = "english"
    SENTENCES_COUNT = 1


    stemmer = Stemmer(LANGUAGE)

    lsaSummarizer = Lsa(stemmer)
    lsaSummarizer.stop_words = get_stop_words(LANGUAGE)
    luhnSummarizer = Luhn(stemmer)
    luhnSummarizer.stop_words = get_stop_words(LANGUAGE)
    # edmundsonSummarizer.bonus_words = get_bonus_words

    lexrankSummarizer = LexRank(stemmer)
    lexrankSummarizer.stop_words = get_stop_words(LANGUAGE)

    textrankSummarizer = TxtRank(stemmer)
    textrankSummarizer.stop_words = get_stop_words(LANGUAGE)

    sumbasicSummarizer = SumBasic(stemmer)
    sumbasicSummarizer.stop_words = get_stop_words(LANGUAGE)


    klSummarizer = KL(stemmer)
    klSummarizer.stop_words = get_stop_words(LANGUAGE)

    parser = HtmlParser.from_string(text, 0, Tokenizer(LANGUAGE))

    allvariations = []

    for sentence in lsaSummarizer(parser.document, SENTENCES_COUNT):
        # print("Summarizing text via LSA: ")
        print(str(sentence))
        allvariations.append(sentence)
    for sentence in luhnSummarizer(parser.document, SENTENCES_COUNT):
        #print("Summarizing text via Luhn: ")
        print(str(sentence))
        allvariations.append(sentence)
    for sentence in lexrankSummarizer(parser.document, SENTENCES_COUNT):
        #print("Summarizing text via Lexrank: ")
        print(str(sentence))
        allvariations.append(sentence)
    for sentence in textrankSummarizer(parser.document, SENTENCES_COUNT):
        #print("Summarizing text via Textrank: ")
        print(str(sentence))
        allvariations.append(sentence)
    for sentence in sumbasicSummarizer(parser.document, SENTENCES_COUNT):
        #print("Summarizing text via Sumbasic: ")
        print(str(sentence))
        allvariations.append(sentence)
    for sentence in klSummarizer(parser.document, SENTENCES_COUNT):
        #print("Summarizing text via klSum: ")
        print(str(sentence))
        allvariations.append(sentence)
    return allvariations
Code example #3
File: summarize.py Project: MTATrooper/TextSum_sumy
def sumbasic_summarizer(text, stemmer, language, sentences_count):
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    summarizer_sumbasic = SumBasicSummarizer(stemmer)
    summarizer_sumbasic.stop_words = get_stop_words(language)
    sentences = []
    for sentence in summarizer_sumbasic(parser.document, sentences_count):
        sentences.append(str(sentence))
    return "\n".join(sentences)
Code example #4
File: api.py Project: ab93/Text-Summarization
def sumbasicReferenceSummary(path):
    sentencesList = []
    parser = PlaintextParser.from_file(path, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = SumBasicSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        #print(sentence._text)
        sentencesList.append(sentence._text)

    return sentencesList
Code example #5
def SumBasic(rsc_file, dst_file, count):
    language = "chinese"
    parser = PlaintextParser.from_file(rsc_file,
                                       Tokenizer(language),
                                       encoding='utf-8')
    stemmer = Stemmer(language)  # stemmer for the chosen language

    summarizer = SumBasicSummarizer(stemmer)  # SumBasic algorithm
    summarizer.stop_words = get_stop_words(language)
    with open(dst_file, 'w', encoding='utf-8') as f:
        for sentence in summarizer(parser.document, count):
            f.write(str(sentence))
            f.write('\n')
            print(sentence)
Code example #6
File: api-server.py Project: adamisntdead/hasty
def summary(url):
  parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
  stemmer = Stemmer(LANGUAGE)

  summarizer = Summarizer(stemmer)
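  # `Summarizer` above is whichever sumy summarizer class this module imported under that name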
  summarizer.stop_words = get_stop_words(LANGUAGE)

  res = []

  for sentence in summarizer(parser.document, SENTENCES_COUNT):
    print(type(sentence))
    res.append(sentence._text)

  return res
Code example #7
def summarize(test_path, decoder_path):
    summarizers = {
        'lexrank': LexRankSummarizer(),
        'lsa': LsaSummarizer(),
        'sumbasic': SumBasicSummarizer(),
        'textrank': TextRankSummarizer()
    }
    for each in ['lexrank', 'lsa', 'sumbasic', 'textrank']:
        print("###################### %s #######################" % each)
        files = list(iter_files(test_path))
        dec_dir = join(decoder_path, each, 'output')
        if not os.path.exists(dec_dir):
            os.makedirs(dec_dir)

        summarizer = summarizers[each]
        for file in tqdm(files):
            name = os.path.basename(file)
            name, _ = os.path.splitext(name)
            save_path = join(dec_dir, name + '.dec')
            article = ' '.join(json.load(open(file))['article'])
            article = PlaintextParser.from_string(article,
                                                  Tokenizer('english'))
            output = summarizer(article.document, sentences_count=5)
            output = [each._text for each in output]
            with open(save_path, 'w') as f:
                f.write('\n'.join(output))
Code example #8
def summarize_url(url, summarizer):
    # E.G. url = "http://www.cnn.com/2016/06/12/politics/hillary-clinton-bernie-sanders-meeting-tuesday/index.html"
    print('Summarizing', url)
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    if summarizer == 'luhn':
        summarizer = LuhnSummarizer(stemmer)
    elif summarizer == 'edmundson':
        summarizer = ESummarizer(stemmer)
    elif summarizer == 'lsa':
        summarizer = LsaSummarizer(stemmer)
    elif summarizer == 'lex':
        summarizer = LexSummarizer(stemmer)
    elif summarizer == 'text':
        summarizer = TextSummarizer(stemmer)
    elif summarizer == 'sb':
        summarizer = SumBasicSummarizer(stemmer)
    else:
        summarizer = KLSummarizer(stemmer)

    summarizer.stop_words = get_stop_words(LANGUAGE)
    print(summarizer)

    sentences = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
        sentences.append(str(sentence))

    return sentences
Code example #9
def find_relevant_quote(book_id, chapter, num_sentences=1, technique='luhn'):
    """
    Create an extractive summary for a chapter of the book.

    Parameters:
    book_id: (str) the book identifier
    chapter: (int) the chapter number to summarize
    num_sentences: (int) how many sentences to extract
    technique: (str) which sumy summarizer to use (defaults to Luhn)

    Returns:
    sentences: the extracted sentences
    """
    chapter_filename = get_data_filename(book_id, 'book_chapters', chapter)
    parser = PlaintextParser.from_file(chapter_filename, Tokenizer("english"))
    if technique == 'lsa':
        summarizer = LsaSummarizer()
    elif technique == 'lexrank':
        summarizer = LexRankSummarizer()
    elif technique == 'textrank':
        summarizer = TextRankSummarizer()
    elif technique == 'kl':
        summarizer = KLSummarizer()
    elif technique == 'random':
        summarizer = RandomSummarizer()
    elif technique == 'reduction':
        summarizer = ReductionSummarizer()
    elif technique == 'sumbasic':
        summarizer = SumBasicSummarizer()
    else:
        summarizer = LuhnSummarizer()
    summary = summarizer(parser.document, num_sentences)
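    # summarizer() returns a tuple of sumy Sentence objects; str(sentence) yields the text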
    return summary
Code example #10
    def summarize(self, corpus, length, algorithm):
        parser = PlaintextParser.from_string(corpus, Tokenizer(self.LANGUAGE))

        if algorithm == "textrank":
            summarizer = TextRankSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "lexrank":
            summarizer = LexRankSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "luhn":
            summarizer = LuhnSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "edmundson":
            summarizer = EdmundsonSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "kl":
            summarizer = KLSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "lsa":
            summarizer = LsaSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "sumbasic":
            summarizer = SumBasicSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "random":
            summarizer = RandomSummarizer(Stemmer(self.LANGUAGE))
        else:
            raise NotImplementedError("Summary algorithm is not available")

        summarizer.stop_words = get_stop_words(self.LANGUAGE)
        summary = " ".join(
            [obj._text for obj in summarizer(parser.document, length)])

        return summary
Code example #11
def SumBasicSummary(document, sentences):
    parser = PlaintextParser.from_string(document, Tokenizer("english"))
    summarizer = SumBasicSummarizer()
    summary = summarizer(parser.document, sentences)
    # for sentence in summary:
    #     print(sentence)
    return summary
Code example #12
    def summarize_with_info(self, corpus, length, algorithm):
        parser = PlaintextParser.from_string(corpus, Tokenizer(self.LANGUAGE))

        if algorithm == "textrank":
            summarizer = TextRankSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "lexrank":
            summarizer = LexRankSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "luhn":
            summarizer = LuhnSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "edmundson":
            summarizer = EdmundsonSummarizer(Stemmer(self.LANGUAGE))
            summarizer.bonus_words = parser.significant_words
            summarizer.stigma_words = parser.stigma_words
        elif algorithm == "kl":
            summarizer = KLSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "lsa":
            summarizer = LsaSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "sumbasic":
            summarizer = SumBasicSummarizer(Stemmer(self.LANGUAGE))
        elif algorithm == "random":
            summarizer = RandomSummarizer(Stemmer(self.LANGUAGE))
        else:
            raise NotImplementedError("Summary algorithm is not available")

        summarizer.stop_words = get_stop_words(self.LANGUAGE)

        return summarizer(parser.document, length)
Code example #13
def SumBasicSummary(document, sentences):
    parser = PlaintextParser.from_string(document, Tokenizer("english"))
    summarizer = SumBasicSummarizer()
    summary = summarizer(parser.document, sentences)
    results = []
    for sentence in summary:
        results.append(str(sentence))
    return results
Code example #14
    def __init__(self):
        """
        The Oracle summariser is not an actual, usable summariser. It extracts the best possible
        sentences from the paper by comparing them to the gold summaries, and so represents the
        high-water mark in what ROUGE score it is possible for a summariser to achieve.
        """
        self.summary_length = 10
        self.summariser = SumBasicSummarizer()
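The oracle selection described in the docstring can be sketched as a greedy pick of the sentences that overlap most with a gold summary; plain unigram overlap stands in for ROUGE here, and every name below is illustrative rather than part of the project above:

def oracle_select(sentences, gold_summary, k=10):
    # score each candidate sentence by unigram overlap with the gold summary
    gold_words = set(gold_summary.lower().split())

    def overlap(sentence):
        words = set(sentence.lower().split())
        return len(words & gold_words) / max(len(words), 1)

    # keep the k highest-scoring sentences, restored to document order
    best = sorted(range(len(sentences)), key=lambda i: overlap(sentences[i]), reverse=True)[:k]
    return [sentences[i] for i in sorted(best)]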
Code example #15
def generate_benchmark_summary(filename, num_summary):

    parser = PlaintextParser.from_file(
        'data/text_summary/' + filename + '.txt', Tokenizer("english"))

    print('=========== Basic Sum ============')
    Basic_Sum_sentences = []
    summarizer = SumBasicSummarizer()
    summary = summarizer(parser.document,
                         num_summary)  # summarize the document with num_summary sentences
    for sentence in summary:
        print(sentence)
        Basic_Sum_sentences.append(str(sentence))

    print('=========== LSA ============')
    LSA_sentences = []
    summarizer = LsaSummarizer()
    summary = summarizer(parser.document, num_summary)
    for sentence in summary:
        print(sentence)
        LSA_sentences.append(str(sentence))

    print('=========== LexRank ============')
    LexRank_sentences = []
    summarizer = LexRankSummarizer()
    summary = summarizer(parser.document, num_summary)
    for sentence in summary:
        print(sentence)
        LexRank_sentences.append(str(sentence))

    print('=========== KL Divergence ============')
    KL_sentences = []
    summarizer = KLSummarizer()
    summary = summarizer(parser.document, num_summary)
    for sentence in summary:
        print(sentence)
        KL_sentences.append(str(sentence))

    print('=========== Luhn ============')
    Luhn_sentences = []
    summarizer = LuhnSummarizer()
    summary = summarizer(parser.document, num_summary)
    for sentence in summary:
        print(sentence)
        Luhn_sentences.append(str(sentence))

    return Basic_Sum_sentences, LSA_sentences, LexRank_sentences, KL_sentences, Luhn_sentences
Code example #16
    def __init__(self, num_sentence, trim_len=5000):
        self.num_sentence = num_sentence
        self.trim_len = trim_len
        self.tokenizer = Tokenizer('english')

        self.summarizers = [
            LexRankSummarizer(),
            LsaSummarizer(),
            SumBasicSummarizer()
        ]
        self.num_summarizers = len(self.summarizers)
Code example #17
File: baseline.py Project: lightingghost/hier_lstm
def basic_sum(file, test_ratio=0.10, israndom=True):
    # extract test files
    file_lines = file.read().splitlines()
    nsamples = len(file_lines)
    ntests = int(nsamples * test_ratio)
    if israndom:
        seq = np.random.permutation(nsamples)
    else:
        seq = np.arange(nsamples)
    
    # summarizer; `Summarizer` is assumed to be an import alias for one of sumy's summarizer classes
    stemmer = Stemmer(_language)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(_language)
    
    # rouge
    rouge = Rouge155()
    
    scores = defaultdict(list)
    for i in range(ntests):
        line = file_lines[seq[i]]
        sample = json.loads(line)
        content = sample['content']
        title = sample['title']
        ref_text = {'A': title}
        doc = ' '.join(content)
        parser = PlaintextParser.from_string(doc, Tokenizer(_language))
        sum_sents = summarizer(parser.document, _sent_count)
        if len(sum_sents) != _sent_count:
            continue
        summary = str(sum_sents[0])
        score = rouge.score_summary(summary, ref_text)
        for k, v in score.items():
            scores[k].append(v)
        print('{} / {} processed.'.format(i, ntests), end='\r')
    result = {}
    for k, v in scores.items():
        result[k] = mean(v)
    return result
Code example #18
def evaluate_summary(file_name, input_dir, sent_count, lingua_franca_summary, show_summaries):
	method_name = inspect.stack()[0][3]
	try:
		process_logger.debug("in "+ method_name +" method")
		file_model_summary = open(input_dir + file_name +".model", "r")
		model_summary = file_model_summary.read()

		rouge_scores_dict = {}
		rouge_scores = rouge_evaluation(lingua_franca_summary, model_summary)
		rouge_scores_dict[">>LINGUA FRANCA"] = rouge_scores
		file_summary = open("Test System Summary/" + file_name + "-" + "LINGUA FRANCA" + ".txt", "w")
		file_summary.write(lingua_franca_summary)

		LANGUAGE = "english"
		parser = PlaintextParser.from_file(input_dir + file_name + ".txt", Tokenizer(LANGUAGE))
		stemmer = Stemmer(LANGUAGE)
		
		lsa_summarizer = LsaSummarizer(stemmer)
		rouge_scores = sumy_summarizers("LSA", lsa_summarizer, parser.document, sent_count, model_summary, show_summaries, file_name)
		rouge_scores_dict["LSA"] = rouge_scores		

		lex_summarizer = LexRankSummarizer(stemmer)
		rouge_scores = sumy_summarizers("LEX RANK", lex_summarizer, parser.document, sent_count, model_summary, show_summaries, file_name)
		rouge_scores_dict["LEX RANK"] = rouge_scores

		luhn_summarizer = LuhnSummarizer(stemmer)
		rouge_scores = sumy_summarizers("LUHN", luhn_summarizer, parser.document, sent_count, model_summary, show_summaries, file_name)
		rouge_scores_dict["LUHN"] = rouge_scores
		
		text_rank_summarizer = TextRankSummarizer(stemmer)
		rouge_scores = sumy_summarizers("TEXT RANK", text_rank_summarizer, parser.document, sent_count, model_summary, show_summaries, file_name)
		rouge_scores_dict["TEXT RANK"] = rouge_scores
		
		sum_basic_summarizer = SumBasicSummarizer(stemmer)
		rouge_scores = sumy_summarizers("SUM BASIC", sum_basic_summarizer, parser.document, sent_count, model_summary, show_summaries, file_name)
		rouge_scores_dict["SUM BASIC"] = rouge_scores
		
		kl_summarizer = KLSummarizer(stemmer)
		rouge_scores = sumy_summarizers("KL SUM", kl_summarizer, parser.document, sent_count, model_summary, show_summaries, file_name)
		rouge_scores_dict["KL SUM"] = rouge_scores
		
		# score_reader(rouge_scores_dict)
		df_rouge, summarizer_list = process_rouge_scores(rouge_scores_dict)

		return df_rouge, summarizer_list

	except Exception as Ex:
		error_logger.error("Exception occurred in " + method_name + " | Exception: " + str(Ex))
		return None, None
Code example #19
def sum_basic(nb_sentences):
    """ Sumbasic function for automatic summarization.

    INPUT:
    -----
    nb_sentences    the number of sentences for the final event summaries.

    OUTPUT:
    ------
    Done    It means that the methods has finished and the result is available
            in the folder called 'SumBasicResult'.
    """

    # Create the final folder that will contain event summaries
    try:
        path = "SumBasicResult/" + str(nb_sentences)
        os.system("mkdir SumBasicResult")
        os.mkdir(path)
    except OSError:
        print("Folder SumBasicResult already created !")
    # Construct the path to the post-processed events collection
    event_collection = glob.glob1("FinalCollection", "*.txt")
    event_collection = [
        "FinalCollection/" + event for event in event_collection
    ]
    for event in event_collection:
        # Define a Plaintext parser for event text files
        parser = PlaintextParser.from_file(event, Tokenizer("english"))
        # Get the event id
        var = os.path.splitext(event)[0]
        id_event = var.replace("FinalCollection/", "")
        # Get the event total number of lines (sentences)
        with open("FinalCollection/" + id_event + ".txt",
                  encoding="utf-8") as file:
            nb_lines = len(file.readlines())
        # Summarize the document using Sumbasic method. We keep at the end nb_sentences sentences.
        summarizer = SumBasicSummarizer()
        summary = summarizer(parser.document, min(nb_sentences, nb_lines))
        event_summary = open(
            'SumBasicResult/' + str(nb_sentences) + '/' + id_event + '.txt',
            "wb")
        for sentence in summary:
            event_summary.write((str(sentence) + "\r\n").encode('utf-8', 'ignore'))
        event_summary.close()
    return "Done"
Code example #20
    def choose_summarizer(self, summarizer_string: str):
        logging.debug("Changing summarizer to: {}".format(summarizer_string))
        if summarizer_string == "LexRank":  # LexRank
            self.summarizer = LexRankSummarizer(stemmer)

        elif summarizer_string == "TextRank":  # TextRank
            self.summarizer = TextRankSummarizer(stemmer)

        elif summarizer_string == "Luhn":  # Luhn
            self.summarizer = LuhnSummarizer(stemmer)

        elif summarizer_string == "LSA":  # LSA
            self.summarizer = LsaSummarizer(stemmer)

        elif summarizer_string == "SumBasic":  # SumBasic
            self.summarizer = SumBasicSummarizer(stemmer)

        # allow summarizer to take stop words into account
        self.summarizer.stop_words = get_stop_words(LANGUAGE)
Code example #21
    def get_summarizers(self, names):
        """Retrieves sumy summarizers algorithms

            Parameters:
            names (list): list of summarizer algorithm names

            Returns:
            dict: summarizers keyed by algorithm name

        """
        summarizers = {}
        for name in names:
            if name == "random":
                from sumy.summarizers.random import RandomSummarizer
                summarizers["random"] = RandomSummarizer(null_stemmer)
            elif name == "luhn":
                from sumy.summarizers.luhn import LuhnSummarizer
                summarizers["luhn"] = LuhnSummarizer(stemmer=null_stemmer)
            elif name == "lsa":
                from sumy.summarizers.lsa import LsaSummarizer
                summarizers["lsa"] = LsaSummarizer(stemmer=null_stemmer)
            elif name == "lexrank":
                from sumy.summarizers.lex_rank import LexRankSummarizer
                summarizers["lexrank"] = LexRankSummarizer(null_stemmer)
            elif name == "textrank":
                from sumy.summarizers.text_rank import TextRankSummarizer
                summarizers["textrank"] = TextRankSummarizer(null_stemmer)
            elif name == "sumbasic":
                from sumy.summarizers.sum_basic import SumBasicSummarizer
                summarizers["sumbasic"] = SumBasicSummarizer(null_stemmer)
            elif name == "kl-sum":
                from sumy.summarizers.kl import KLSummarizer
                summarizers["kl-sum"] = KLSummarizer(null_stemmer)
            elif name == "reduction":
                from sumy.summarizers.reduction import ReductionSummarizer
                summarizers["reduction"] = ReductionSummarizer(null_stemmer)

        for _, summarizer in summarizers.items():
            summarizer.stop_words = frozenset(
                self.stop_words._get_stop_words(custom_stop_words=[]))

        return summarizers
Code example #22
    def __init__(self, method=None, nltk_directory=None, language=None):
        if (language):
            logger.info("Setting language to " + language)
            LANGUAGE = language
        else:
            LANGUAGE = "english"
        # Set the location of the nltk data directory for tokenizers, etc.
        if nltk_directory:
            nltk.data.path.append(nltk_directory)
            logger.info(nltk.data.path)
        try:
            self.stemmer = Stemmer(LANGUAGE)
        except Exception:
            logger.exception("Error loading nltk stemmer")
            raise Exception("Error loading nltk stemmer")

        self.summarizer = Summarizer(self.stemmer)  # default
        if method:
            if (method == 'luhn'):
                logger.info("Using the Luhn summarizer!")
                self.summarizer = LuhnSummarizer(self.stemmer)
            elif (method == 'edmundson'):
                logger.info("Using the Edmundson summarizer!")
                self.summarizer = EdmundsonSummarizer(self.stemmer)
            elif (method == 'lsa'):
                logger.info("Using the LSA summarizer!")
                self.summarizer = LsaSummarizer(self.stemmer)
            elif (method == 'text_rank'):
                logger.info("Using the Text Rank summarizer!")
                self.summarizer = TextRankSummarizer(self.stemmer)
            elif (method == 'sum_basic'):
                logger.info("Using the Sum Basic summarizer!")
                self.summarizer = SumBasicSummarizer(self.stemmer)
            elif (method == 'kl'):
                logger.info("Using the KL summarizer!")
                self.summarizer = KLSummarizer(self.stemmer)
            elif (method == 'lex_rank'):
                logger.info("Using the LexRank summarizer!")
                self.summarizer = LexRankSummarizer(self.stemmer)
        #print(method)
        self.summarizer.stop_words = get_stop_words(LANGUAGE)
Code example #23
def summary_benchmarks(sentences_string):
    '''
    :param sentences_string: all sentences as one string, already tokenized
    :return: None; the benchmark summaries are printed
    '''
    parser = PlaintextParser.from_string(sentences_string,
                                         Tokenizer("english"))

    print('=========== Basic Sum ============')
    summarizer = SumBasicSummarizer()
    summary = summarizer(parser.document, 3)  # summarize the document with 3 sentences
    for sentence in summary:
        print(sentence)

    print('=========== LSA ============')
    summarizer = LsaSummarizer()
    summary = summarizer(parser.document, 3)
    for sentence in summary:
        print(sentence)

    print('=========== LexRank ============')
    summarizer = LexRankSummarizer()
    summary = summarizer(parser.document, 3)
    for sentence in summary:
        print(sentence)

    print('=========== KL Divergence ============')
    summarizer = KLSummarizer()
    summary = summarizer(parser.document, 3)
    for sentence in summary:
        print(sentence)

    print('=========== Luhn ============')
    summarizer = LuhnSummarizer()
    summary = summarizer(parser.document, 3)
    for sentence in summary:
        print(sentence)
Code example #24
    def __init__(self, name):

        self.stemmer = Stemmer('english')
        self.name = name

        if name == "TextRankSummarizer":
            self.summarizer = TextRankSummarizer(self.stemmer)
        elif name == "LsaSummarizer":
            self.summarizer = LsaSummarizer(self.stemmer)
        elif name == "LuhnSummarizer":
            self.summarizer = LuhnSummarizer(self.stemmer)
        elif name == "LexRankSummarizer":
            setattr(LexRankSummarizer, 'rate_sentences', rate_sentences)
            self.summarizer = LexRankSummarizer(self.stemmer)

        elif name == "SumBasicSummarizer":
            self.summarizer = SumBasicSummarizer(self.stemmer)
        elif name == "KLSummarizer":
            self.summarizer = KLSummarizer(self.stemmer)

        #summarizer = EdmundsonSummarizer(stemmer)
        self.summarizer.stop_words = get_stop_words('english')
Code example #25
File: test_sum_basic.py Project: JyothsnaKS/sumy
    def _build_summarizer(self, stop_words):
        summarizer = SumBasicSummarizer()
        summarizer.stop_words = stop_words
        return summarizer
Code example #26
def _build_summarizer(stop_words, stemmer=None):
    summarizer = SumBasicSummarizer() if stemmer is None else SumBasicSummarizer(stemmer)
    summarizer.stop_words = stop_words
    return summarizer
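A call site for this helper might look like the following sketch, reusing sumy's Stemmer and get_stop_words as in the other examples on this page:

from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

summarizer = _build_summarizer(get_stop_words("english"), Stemmer("english"))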
Code example #27
import os

# The excerpt omits its sumy imports; the following are assumed, with `Summarizer`
# presumed to be SumBasicSummarizer given the output folder name.
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.sum_basic import SumBasicSummarizer as Summarizer
from sumy.utils import get_stop_words


#create folder
def createFolder(directory):
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
    except OSError:
        print('Error: Creating directory. ' + directory)


LANGUAGE = "bangla"
SENTENCES_COUNT = 2

if __name__ == "__main__":

    createFolder('Dataset/NCTB/SumBasicSummary/')
    for i in range(1, 140):
        serial_no = str(i)
        path = "Dataset/NCTB/Source/" + serial_no + ".txt"
        parser = PlaintextParser.from_file(path, Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        summary = ""
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            summary = summary + " " + str(sentence)
        with open('Dataset/NCTB/SumBasicSummary/' + serial_no + '.txt', 'w') as fi:
            fi.write(summary)
Code example #28
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer 
from sumy.summarizers.sum_basic import SumBasicSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
import sys


LANGUAGE = "english"
SENTENCES_COUNT = int(sys.argv[2])
text_file = sys.argv[1]


if __name__ == "__main__":
    
    parser = PlaintextParser.from_file(text_file, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = SumBasicSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)

Code example #29
File: run_sumy.py Project: abiraja2004/awesome_nlp
def run_SumBasic(stemmer, document, n):
    sumbasic = SumBasicSummarizer(stemmer)
    sumbasic.stop_words = get_stop_words(LANGUAGE)
    print("SumBasic: {}".format(n))
    res = sumbasic(document, SENTENCES_COUNT)
    return " ".join(str(res[0]).split()[:n])
Code example #30
File: extractive.py Project: martinhartt/HGfGT
from sumy.nlp.tokenizers import Tokenizer
import sys

# The excerpt omits these imports; they are assumed for the names used below,
# following the module paths shown in the other examples on this page.
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.sum_basic import SumBasicSummarizer
from sumy.summarizers.kl import KLSummarizer


def leadSummariser(document, no_of_sents):
    for sent in document.sentences[:no_of_sents]:
        yield str(sent)


summarisers = {
    "lead": leadSummariser,
    "luhn": LuhnSummarizer(),
    "lsa": LsaSummarizer(),
    "lex_rank": LexRankSummarizer(),
    "text_rank": TextRankSummarizer(),
    "sum_basic": SumBasicSummarizer(),
    "kl": KLSummarizer()
}
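# Every entry above is callable as summariser(document, sentences_count);
# "lead" yields plain strings while the sumy summarizers return Sentence objects.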

tokenizer = Tokenizer("english")


def to_words(text):
    return text.split(" ")


def extractive(article, title=None):
    raw = article.replace(' <sb>', '').strip()

    parser = PlaintextParser.from_string(raw, tokenizer)
Code example #31
    def __summarize(self, parser):
        summarizer = SumBasicSummarizer(Stemmer(self.__language))
        summarizer.stop_words = get_stop_words(self.__language)
        final_sentences = summarizer(parser.document, self.__sentences_count)
        return self.__join_sentences(final_sentences)