def test_text_as_bytes_raises_exception(self):
     # Test that summarization raises an exception when the text is not
     # a unicode object (Python 3 str).
     text = get_text_from_test_data("spanish.txt")
     bytes = text.encode(encoding="utf-8")
     with self.assertRaises(ValueError):
         summarize(bytes, language="spanish")
def text_summarize():
    neg = "C:/Users/Dante/PycharmProjects/Compiler/Datasets/aclImdb/test/neg/"
    pos = "C:/Users/Dante/PycharmProjects/Compiler/Datasets/aclImdb/test/pos/"
    outfile = open(
        "C:/Users/Dante/PycharmProjects/Compiler/Datasets/aclImdb/train/res.csv",
        "w+",
        newline='')
    count = 0

    for files in glob.glob(neg + "*.txt"):
        count = count + 1
        with open(files, errors='ignore') as infile:
            text = infile.read()
        res = summarize(text, ratio=0.2)
        temp = res.replace('\n', '')  # collapse the summary onto a single line
        CSVWriter = csv.writer(outfile)
        CSVWriter.writerow(['0', temp])  # 0 = negative review

    for files in glob.glob(pos + "*.txt"):
        count = count + 1
        with open(files, errors='ignore') as infile:
            text = infile.read()
        res = summarize(text, ratio=0.2)
        temp = res.replace('\n', '')
        CSVWriter = csv.writer(outfile)
        CSVWriter.writerow(
            ['1', temp]
        )  # 1 = positive review; writes a row with the label and the summarized review to the CSV file

    outfile.close()
Example #3
def dashboard(request):
    if request.user.is_authenticated():
        r = requests.get(
            'https://newsapi.org/v1/articles?source=mirror&apiKey=6df0769e0d6244aaa00768c02f123fb2'
        )
        response_dictionary = {}
        response_dictionary["source"] = "mirror"
        data = json.loads(r.text)
        list_of_urls = []
        article_data = []
        main_dict = {}
        i = 0
        for item in data["articles"]:
            mid_dictionary = {}
            if isinstance(item["title"], unicode):
                ti = unicodedata.normalize('NFKD', item["title"]).encode(
                    'ascii', 'ignore')
            else:
                ti = item["title"]
            n = len(ti)
            if (n > 10):
                mid_dictionary["title"] = ti
                sc = Scraper()
                print item
                mid_dictionary["data"] = sc.scrape_mirror(item["url"])
                article_data.append(mid_dictionary)
                mid_dictionary["image"] = (item["urlToImage"])
                print mid_dictionary["image"]
                if isinstance(mid_dictionary["data"], unicode):
                    st = unicodedata.normalize('NFKD',
                                               mid_dictionary["data"]).encode(
                                                   'ascii', 'ignore')
                else:
                    st = mid_dictionary["data"]

                try:

                    print "********Summary******"
                    summary = summarizer.summarize(st, words=50)
                    print summary.encode('ascii', 'ignore')
                    print "---------Summary---------"
                    mid_dictionary["summary"] = summarizer.summarize(st,
                                                                     words=50)
                except ZeroDivisionError:
                    mid_dictionary["summary"] = st

                print mid_dictionary
                main_dict["article_" + str(i)] = mid_dictionary
                i = i + 1

        response_dictionary["articles"] = article_data

        print main_dict
        for key, value in main_dict.iteritems():
            print "***************"

        return render(request, 'accounts/dashboard.html',
                      {'main_dict': main_dict})
    else:
        return HttpResponseRedirect("/login")
Example #4
 def test_text_as_bytes_raises_exception(self):
     # Test that summarization raises an exception when the text is not
     # a unicode object (Python 3 str).
     text = get_text_from_test_data("spanish.txt")
     bytes = text.encode(encoding="utf-8")
     with self.assertRaises(ValueError):
         summarize(bytes, language="spanish")
Example #5
def Summarizer(text, ratio, word):
    language = detect(text)
    if word == "Disabled":
        if language == "en":
            from summa.summarizer import summarize
            smzTxt = summarize(text, ratio=float(ratio))
        else:
            from textrank.summa import summarizer
            smzTxt = summarizer.summarize(text, ratio=float(ratio))
    else:
        if language == "en":
            from summa.summarizer import summarize

            smzTxt = summarize(text, words=int(word))
        else:
            from textrank.summa import summarizer
            smzTxt = summarizer.summarize(text, words=int(word))

    outputText = ",".join(smzTxt).replace(",", "")
    if outputText == "":
        return None
    # with codecs.open("temp.txt", 'w', 'utf-8', 'ignore') as f:
    # 	f.write(outputText)
    # 	f.close()
    # f = open("temp.txt", "w").close()
    # 	f = open("temp.txt", "w+")
    return outputText
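A minimal usage sketch for the wrapper above (the sample text and argument values are illustrative, not from the original project; it assumes Summarizer and langdetect's detect are importable):

sample = ("Automatic summarization reduces a document to its most important sentences. "
          "It is widely used to cope with information overload.")
print(Summarizer(sample, ratio=0.3, word="Disabled"))  # ratio-based summary
print(Summarizer(sample, ratio=0.3, word=40))          # capped at roughly 40 words
# Note: the function returns None when the underlying summarizer produces an empty
# string, which can happen for very short inputs like this one.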
Example #6
def dashboard(request):
    if request.user.is_authenticated():
        r = requests.get('https://newsapi.org/v1/articles?source=mirror&apiKey=6df0769e0d6244aaa00768c02f123fb2')
        response_dictionary = {}
        # response_dictionary["status"] = r
        response_dictionary["source"] = "mirror"
        data = json.loads(r.text)
        # print data["articles"]
        list_of_urls = []
        article_data = []
        main_dict = {}
        i = 0
        for item in data["articles"]:
            mid_dictionary = {}
            mid_dictionary["title"] = item["title"]
            #mid_dictionary["url"] = item["url"]
            #mid_dictionary["urlToImage"] = item["urlToImage"]
            sc = Scraper()
            mid_dictionary["data"] = sc.scrape_mirror(item["url"])
            article_data.append(mid_dictionary)
            print type(mid_dictionary["data"])
            if isinstance(mid_dictionary["data"], unicode):
                st = unicodedata.normalize('NFKD', mid_dictionary["data"]).encode('ascii', 'ignore')
            else:
                st = mid_dictionary["data"]
            if len(st) > 250:
                print "********Summary******"
                print summarizer.summarize(st, words=50)
                print "---------Summary---------"
                mid_dictionary["summary"] = summarizer.summarize(st, words=50)
            else:
                mid_dictionary["summary"] = st

            main_dict["article_" + str(i)] = mid_dictionary
            i = i + 1
        
        response_dictionary["articles"] = article_data
        
        # news_dictionary = {}
        #print main_dict
        for key,value in main_dict.iteritems():
            print "***************"
            # print key
            
            
            print value["title"]
        # # print news_dictionary
        # list_of_urls = []
        # return render(request, 'newslist.html', {'newslist':response_dictionary})
        return render(request,
                'accounts/dashboard.html',
                {'main_dict': main_dict}
                )
    else:
         return HttpResponseRedirect("/login")
Example #7
def my_form_post1():
    summ = ""
    if len(request.get_json(silent=True)) > 0:
        data = request.get_json(silent=True)
        text = data.get('fileText')
        method = data.get('method')
        if method == 1:
            summ = summarizer.summarize(text)
        elif method == 2:
            summ = summarizer.summarize(text, ratio=0.4)

    return Response(summ)
Example #8
def summarize(text, file=False, output_file=None):
	summary = ''
	if(file == True):
		f = open(text).read()
		summary = summarizer.summarize(f, words = 50)
	else:
		summary = summarizer.summarize(text, words = 50)

	if(output_file is not None):
		f = open(output_file, 'w')
		f.write(summary)
		f.close()

	return summary
Example #9
    def SummaryTheText(self, SectionalText, CallNumber=0):
        '''
        ----------
        Function
        ----------
        * Receives the sectional text
        * Cleans the sectional text
        * Summarises it based on text size
        * Saves the summary to a Summary.txt file for each section

        --------
        INPUT
        --------
        SectionalText = Section-wise text for summarization
        CallNumber = Used to iterate over different output files, differentiated by
                     number (Default = 0)
        
        -------
        RETURN
        -------
        None
        
        '''
        OutputFileNamePAth = 'Summary' + str(CallNumber) + '.txt'
        logging.info("Inside SummaryCreator")
        logging.info("Creating Summary by opening Summary%s.txt", CallNumber)
        with open(OutputFileNamePAth, 'w') as f:

            text = re.sub("∑", " \n", SectionalText)
            text = unidecode(text)
            text = re.sub(r'[^\x00-\x7F]+', '', text)

            if (len(text) <= 300):
                text = summarize('Test', text, count=4)
                text = "\n".join(text)
                text = re.sub('IntroductionThis', 'Introduction This', text)
            else:
                text = summarizer.summarize(text, ratio=0.5)
            print(len(SectionalText))
            print("Summary -->")
            print(text)
            if (len(text) == 0):
                text = summarize('Test', SectionalText, count=4)
                text = "\n".join(text)
                text = summarizer.summarize(text, ratio=0.6)
            text = re.sub("\n", "", text)
            f.write(text)
        f.close()
        logging.info("Summary Created")
        logging.info("Exiting SummaryCreator")
def convert_pdf_to_txt(path, pages=None):
    if not pages:
        pagenums = set()
    else:
        pagenums = set(pages)
    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    infile = open(path, 'rb')
    for page in PDFPage.get_pages(infile, pagenums):
        interpreter.process_page(page)
    infile.close()
    converter.close()
    text = output.getvalue()
    output.close()
    sample = text

    stop_words = set(stopwords.words('english'))#stopwords

    word_tokens = word_tokenize(sample)

    # keep only the non-stopword tokens
    filtered_sentence = [w for w in word_tokens if w not in stop_words]
    new_sentence = ' '.join(filtered_sentence)
    print(keywords.keywords(text))  # keywords of the original text
    txt = summarizer.summarize(text, ratio=0.2)  # summary of the original text (call it NEW)
    print(txt)
    print(summarizer.summarize(txt, ratio=0.2))  # summary of NEW
    summarylist = summarizer.summarize(text, split=True)  # NEW as a list of sentences
    summarystring = " ".join(summarylist)  # NEW as a single string

    b = keywords.keywords(summarystring, split=True)  # keywords obtained from the string of NEW

    keywordlist1 = keywords.keywords(text, split=True)  # list of keywords
    keywordstring = " ".join(keywordlist1)  # keywords as a single string
    keywordlist2 = keywords.keywords(keywordstring, split=True)
    tokens = nltk.word_tokenize(keywordstring)
    print({stem(t) for t in tokens})       # set of unique stemmed keywords
    print(len({stem(t) for t in tokens}))  # number of unique stemmed keywords


    return text
Example #11
    def test_text_summarization_on_single_input_sentence_with_split_is_empty_list(self):
        text = get_text_from_test_data("unrelated.txt")

        # Keeps the first sentence only.
        text = text.split('\n')[0]

        self.assertEqual(summarize(text, split=True), [])
def summarize_news(parent_comment_body):
    summary = '>'  # Summary must be quoted
    summary += summarizer.summarize(parent_comment_body, language='spanish')
    summary = summary.replace(
        '\n', '\n>\n>'
    )  # Reddit uses Markdown, where paragraphs are separated by two line breaks
    return summary
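A small illustration of the quoting step above (the sentences are made up; only the leading '>' and the replace call come from the function):

quoted = '>' + "Primera frase del resumen.\nSegunda frase."
print(quoted.replace('\n', '\n>\n>'))
# >Primera frase del resumen.
# >
# >Segunda frase.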
Example #13
def viewSumma(request):
  url_topull = request.GET.get('url', 'https://en.wikipedia.org/wiki/Machine_learning')

  scraped_data = urllib.request.urlopen(url_topull)  
  article = scraped_data.read()

  parsed_article = bs.BeautifulSoup(article,'lxml')

  paragraphs = parsed_article.find_all('p')

  article_text = ""

  for p in paragraphs:  
      article_text += p.text

  summary = summarizer.summarize(article_text,ratio=0.05)

  print("Data pull done")

  print("==================================SUMMARY===================================")
  print (summary)

  print("==================================KEYWORDS===================================")
  print (keywords.keywords(article_text,ratio=0.5))

  context = {
      'title': 'Summa',
      'summary': [summary],
  }
  return render(request, 'home.html', context)
Example #14
    def summarize_by_ids(cls,
                         ids,
                         opinion_type,
                         ratio=0.3,
                         words=None,
                         keywords_words=None):

        if opinion_type:
            op_list = OpinionService.get_by_ids(ids)
        else:
            op_list = EntradaService.get_by_ids(ids)

        for op in op_list:
            op.resumen = summarizer.summarize(op.content,
                                              language='spanish',
                                              ratio=ratio,
                                              words=words)
            op.keywords = keywords.keywords(op.content,
                                            language='spanish',
                                            split=True,
                                            ratio=ratio,
                                            words=keywords_words)
            if op.keywords == "":
                op.keywords = "Opss!! No ha sido posible extraer palabras claves."

            if op.resumen == "":
                op.resumen = "El texto es breve o no se ha podido generar un resumen. Lo sentimos."

        OpinionService.save_opinions(op_list)

        return op_list
Example #15
    def test_text_summarization_on_single_input_sentence_with_split_is_empty_list(self):
        text = get_text_from_test_data("unrelated.txt")

        # Keeps the first sentence only.
        text = text.split('\n')[0]

        self.assertEqual(summarize(text, split=True), [])
Example #16
    def process_document(self, doc: Document,
                         text_process_params: TextProcessParams):
        doc.keywords.clear()
        doc.entities.clear()

        parse_sentences(doc)
        main_lang = doc.main_lang
        summa_lang = self._land_dic[main_lang]

        # Extract keywords
        text = doc.text
        kw_num = text_process_params.keywords_number
        if kw_num <= 0:
            kw_num = 10
        kwrds = keywords.keywords(text, words=kw_num, language=summa_lang)

        # Extract summary
        ratio = text_process_params.summary_size.calculate_ratio(
            len(doc.sentences))
        summary = summarizer.summarize(text, ratio=ratio, language=summa_lang)
        # print(kwrds)
        # print(summary)
        for kw in kwrds.splitlines():
            doc.keywords.append(Entity(kw, EntityKind.KEYWORD))
        for s in summary.splitlines():
            doc.summary.append(Entity(s, EntityKind.SUMMARY_SENTENCE))
Example #17
def gen_summary(text, max_length):
    """Clean sentence"""
    sentence = summarize(text)
    bow = sentence
    bow = bow.lower()
    bow = bow.split()
    #bow = bow + keywords(text,split = True)
    bow = bow + text.lower().split()
    new_text = []
    contractions = get_contractions()
    for word in bow:
        if word in contractions:
            new_text.append(contractions[word])
        else:
            new_text.append(word)
    text = " ".join(new_text)
    # Format words and remove unwanted characters
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)
    text = text.split(' ')
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops]
    text = ['GO'] + text
    text = empty_remover(text)
    if len(text) >= max_length:
        text = text[0:max_length]
    else:
        text = text + ["PAD"] * (max_length - len(text))
        text = text[0:max_length]
    return ' '.join(text)
Example #18
def extract(url):
    article = Article(url=url, config=config)
    article.download()
    article.parse()
    article.nlp()

    try:
        text_rank_summary = summarizer.summarize(article.text)
    except:
        text_rank_summary = ''

    return dict(
        title=article.title,
        image=article.top_image,
        authors=article.authors,
        tags=article.keywords,
        text_rank_summary=text_rank_summary,
        summary=article.summary,
        description=article.text,
        publish_date=article.publish_date,
        meta_img=article.meta_img,
        meta_description=article.meta_description,
        meta_keywords=article.meta_keywords,
    )
def textrank():
    """
    Generate provisional extractive summaries using the TextRank strategy
    """

    textrank_data = []
    with open('summary_data.json', "r") as json_file:
        summary = json.load(json_file)
        summary = summary["data"]
        for d in summary:
            # Parse the id of the corresponding article
            doc_id = str(d["document_id"]).strip().split(".")

            # Parse the body of the corresponding article
            with open('./news/data/' + str(doc_id[0])+".json") as \
                    j_file:
                origin_data = json.load(j_file)
                for data in origin_data["document"]:
                    if data["id"] == d["document_id"]:
                        paragraph = data["paragraph"]
                        preprocess_text = " ".join(
                            str(sentence["form"]) for sentence in paragraph)
                        textrank_summary = summarizer.summarize(
                            preprocess_text, ratio=0.2)
                        textrank_data.append(
                            (preprocess_text, textrank_summary))
                        break

    return textrank_data
Example #20
    def test_text_summarization_on_short_input_text_is_not_empty_string(self):
        text = get_text_from_test_data("unrelated.txt")

        # Keeps the first 8 sentences to make the text shorter.
        text = "\n".join(text.split('\n')[:8])

        self.assertNotEqual(summarize(text), "")
Example #21
def create_another(max_chars):
    for filename in os.listdir(TREES_SOURCE_PATH):
        chars = 0
        if filename.endswith('.xml'):
            name = re.search('topic_(.*)\.xml', filename)
            path = SUM_PATH + name.group(1)
            input_text = ''
            nugget_data = get_nuggets_from_file(name.group(1))
            xmldoc = ET.parse(TREES_SOURCE_PATH + 'topic_' + name.group(1) +
                              '.xml')
            root = xmldoc.getroot()

            for nugget in root.iter('Nugget'):
                input_text = input_text + ' ' + nugget_data.get(
                    nugget.get('id'))[0]

            with open(path + '_Group5.txt', 'w', encoding='utf8') as summary:
                summary.write("====================== General Summary of " +
                              name.group(1) + " ======================\n")
                s = summarizer.summarize(input_text, words=90, split=True)

                for sentence in s:

                    if (chars + len(sentence) <= max_chars):
                        summary.write('\n' + sentence)
                        chars += len(sentence)
Example #22
def textrank_summ_keys(text):
    keywords2 = list()
    for words in summarize(text, words=8).split('\n'):
        keywords2.extend(words.split(" "))
    keywords2 = set(keywords2)

    return " ".join(list(keywords2))
def generateTextRankSummary(json_file):

    t0 = time.time()

    with open(json_file) as file:
        data = json.load(file)

    noise = ['People', 'Photo:', 'More:', 'Subscribe', 'SUBSCRIBE', 'SHARE', 'Tags', 'RELATED:', 'MORE:', '100-year floodplain', 'Image']
    sentences_list = ['' if any([term in articles['Sentences'] for term in noise]) else articles['Sentences'] for articles in data]

    # Reducing data set size:
    # sentences_list = sentences_list[:len(sentences_list)]
    print('Number of sentences:', len([sentence for sentence in sentences_list if sentence]))
    sentences = (' ').join(sentences_list)

    print('Reading JSON file - time elapsed:', time.time() - t0)

    url = 'All articles'
    text = sentences
    print('----------------------------------')
    t0 = time.time()
    output_sentences = summarizer.summarize(sentences, words=100, scores=True)
    output_sentences = sorted(output_sentences, key=lambda x : x[1], reverse=True)
    for idx, output in enumerate(output_sentences):
        print('Sentence number:', idx, 'Score:', output[1])
        print(output[0])
    print('----------------------------------')
    print('Summary complete - time elapsed:', time.time() - t0)
Example #24
    def test_text_summarization_on_short_input_text_with_split_is_not_empty_list(self):
        text = get_text_from_test_data("unrelated.txt")

        # Keeps the first 8 sentences to make the text shorter.
        text = "\n".join(text.split('\n')[:8])

        self.assertNotEqual(summarize(text, split=True), [])
Example #25
def get_key(sentence):
    key_sent = summarize(sentence, ratio=0.1)   # ratio: fraction of all sentences to extract as key sentences
    key_word = keywords.keywords(sentence, words=15)  # extract 15 keywords

    key_sent_noun = morpheme(key_sent)  # extract only the nouns from the key sentences
    key_word_noun = morpheme(key_word)  # extract only the nouns from the keywords
    #print("key_sent_noun : ", key_sent_noun)
    #print("key_word_noun : ", key_word_noun)


    # Count how often nouns from the key sentences overlap with the keyword nouns
    word_cnt = {}
    for noun in key_sent_noun:
        if noun in key_word_noun:
            if noun in word_cnt.keys():
                word_cnt[noun] += 1
            else:
                word_cnt[noun] = 1

    # Sort by frequency, highest first
    sorted_word_cnt = sorted(word_cnt.items(), reverse=True, key=lambda item: item[1])  # list of (word, count)
    #print("sorted : ", sorted_word_cnt)

    num = min(5, len(sorted_word_cnt))      # keep at most 5 key words
    final_key_word = []
    final_key_word.append([sorted_word_cnt[i][0] for i in range(num)])
    final_key_word = final_key_word[0]
    #print("final: ", final_key_word)

    final_key_sentence = key_sent.split('.')[0]     # keep a single key sentence

    return final_key_sentence, final_key_word
def get_result_list(lines):
    from summa import summarizer
    from summa import keywords
    import nltk
    nltk.download('punkt', download_dir='./nltk_data')
    nltk.data.path.append("./nltk_data")
    result_list = []
    for line in lines:
        json_data = json.loads(line)
        text = json_data["text"]
        abstract = summarizer.summarize(text)
        keyword = keywords.keywords(text, split=True)
        name = json_data['company']
        tags_words = list(map(lambda x: x[1:], json_data['tags']))
        abstract_words = list(
            map(lambda x: x.lower(), nltk.tokenize.word_tokenize(abstract)))
        title_words = list(
            map(lambda x: x.lower(),
                nltk.tokenize.word_tokenize(json_data['title'])))
        if abstract != '' and name not in abstract_words and name not in title_words and name not in tags_words:
            continue
        json_data['abstract'] = abstract
        json_data['keywords'] = keyword
        result_list.append(json.dumps(json_data))
    return result_list
Example #27
def get_embeddings(model, text_list: np.array, summarize=True, it_user=False):

    if it_user:
        text = text_list[0]
        text_list = text_list[1:]

    else:
        text = ""

    for part in text_list:
        if summarize:
            summary_res = summarizer.summarize(
                part,
                language="russian",
            )
            if summary_res != "":
                text += summary_res
            else:
                text += part
        else:
            text += part

    vector = model.get_sentence_vector(text)

    return vector
 def add_summary(add, row):
     try:
         add["summary"] = summarize(row["text"],
                                    words=50,
                                    language='finnish')
     except Exception:
         add["summary"] = ""
Example #29
    def generate(self, demand, candidates):
        from summa import summarizer
        result = {"demand": demand, "candidates": candidates}
        if len(candidates) > 0:
            cur_time = time.time()

            sentences = candidates["sentence"].tolist()
            totalDemand = sum(demand.values())
            totalSentences = len(sentences)
            ratio = 1.0 * totalDemand / totalSentences + EPS
            summary = summarizer.summarize(" . ".join(sentences), ratio=ratio)
            sentences = [
                s.strip() for s in summary.strip().split(".") if len(s) > 0
            ]

            result["selected_sentences"] = []
            result["selected_aspects"] = []
            result["selected_reviews"] = []
            for _, row in candidates.iterrows():
                if row["sentence"] in sentences:
                    if result["selected_sentences"].count(
                            row["sentence"]) >= sentences.count(
                                row["sentence"]):
                        continue
                    result["selected_sentences"].append(row["sentence"])
                    result["selected_aspects"].append(row["aspect"])
                    if row["id"] not in result["selected_reviews"]:
                        result["selected_reviews"].append(row["id"])
            result["solve_time"] = time.time() - cur_time

        return result
Example #30
def textrank(text):
    textString = ""
    for comment in text:
        if (comment != "[removed]" and comment != "[deleted]"):
            textString += " " + comment
    message = summarize(textString, ratio=0.1)
    return message
Example #31
def extract_text_info(text):
    sentences = summarizer.summarize(text, ratio=0.2)
    sentences = sentences.replace("\n", "")
    time_chars = max(int(round(len(sentences.split()) / 200, 0)), 1)
    sentences = sentences.split(".")
    highlight_lines = '$$'.join([str(x) for x in sentences])
    return highlight_lines, time_chars
Example #32
    def test_text_summarization_on_single_input_sentence_is_empty_string(self):
        text = get_text_from_test_data("unrelated.txt")

        # Keeps the first sentence only.
        text = text.split('\n')[0]

        self.assertEqual(summarize(text), "")
Example #33
def get_content(link):
    page = requests.get(link)
    soup = BeautifulSoup(page.content, "html.parser")

    content = soup.find(class_="main-content")

    headline = content.find("h1", {"itemprop": "headline"}).get_text()
    author = content.find("span", {"itemprop": "author"}).get_text()
    when = content.find(class_="p-published-time").get_text()

    text = content.find("div", {"itemprop": "articleBody"})
    paragraphs = []

    for paragraph in text.find_all("p"):
        paragraphs.append(paragraph.get_text()[1:])

    formatted = format(paragraphs)

    content = "```"

    content += "-" * 72 + "\n"
    content += "HEADLINE: " + headline + "\nAUTHOR: " + author + "\nWHEN: " + when + "\n\n"
    content += "FULL ARTICLE:\n"
    content += formatted + "\n"
    content += "SUMMARY:\n"
    content += summarizer.summarize(
        formatted.replace("\n", " ").replace("-", "")) + "\n"

    content += "-" * 72 + "\n\n```"

    return content
Example #34
    def test_text_summarization_on_single_input_sentence_is_empty_string(self):
        text = get_text_from_test_data("unrelated.txt")

        # Keeps the first sentence only.
        text = text.split('\n')[0]

        self.assertEqual(summarize(text), "")
Example #35
    def test_reference_text_summarization_wstopwords(self):
        text = get_text_from_test_data("mihalcea_tarau.txt")
        additional_stoplist = get_text_from_test_data("mihalcea_tarau.sw.txt").strip().split(",")
        # Makes a summary of the text.
        generated_summary = summarize(text,additional_stopwords=additional_stoplist)

        # To be compared to the method reference.
        summary = get_text_from_test_data("mihalcea_tarau.summ.txt")

        self.assertEqual(generated_summary, summary)
Example #36
    def test_reference_text_summarization(self):
        text = get_text_from_test_data("mihalcea_tarau.txt")

        # Makes a summary of the text.
        generated_summary = summarize(text)

        # To be compared to the method reference.
        summary = get_text_from_test_data("mihalcea_tarau.summ.txt")

        self.assertEqual(generated_summary, summary)
Example #37
    def test_reference_text_summarization_with_split(self):
        text = get_text_from_test_data("mihalcea_tarau.txt")

        # Makes a summary of the text as a list.
        generated_summary = summarize(text, split=True)

        # To be compared to the method reference.
        summary = get_text_from_test_data("mihalcea_tarau.summ.txt")
        summary = summary.split("\n")

        self.assertSequenceEqual(generated_summary, summary)
Example #38
    def test_corpus_summarization_ratio(self):
        text = get_text_from_test_data("mihalcea_tarau.txt")

        sentences = text.split('\n')

        # Makes summaries of the text using different ratio parameters.
        for x in range(1, 10):
            ratio = x / float(10)
            selected_sentences = summarize(text, ratio=ratio, split=True)
            expected_summary_length = int(len(sentences) * ratio)

            self.assertEqual(len(selected_sentences), expected_summary_length)
Example #39
 def test_summary_from_unrelated_sentences_and_split_is_not_empty_list(self):
     # Tests that the summarization of a text with unrelated sentences is not an empty list.
     text = get_text_from_test_data("unrelated.txt")
     self.assertNotEqual(summarize(text, split=True), [])
 def resumir(self, texto, lenguaje="spanish", ratio=0.25):
     if not (lenguaje):
         return summarizer.summarize(texto, language="spanish", ratio=ratio)
     else:
         return summarizer.summarize(texto, language=lenguaje)
Example #41
 def test_empty_text_summarization_with_split_is_empty_list(self):
     self.assertEqual(summarize("", split=True), [])
Example #42
 def test_arabic(self):
     # Test the summarization module for the Arabic language.
     text = get_text_from_test_data("arabic.txt")
     self.assertIsNotNone(summarize(text, language='arabic'))
Example #43
 def test_polish(self):
     # Test the summarization module for the Polish language.
     text = get_text_from_test_data("polish.txt")
     self.assertIsNotNone(summarize(text, language="polish"))
Example #44
 def test_few_distinct_words_summarization_wstopwords_with_split_is_empty_list(self):
     text = get_text_from_test_data("few_distinct_words.txt")
     additional_stoplist = ["here","there"]
     self.assertEqual(summarize(text, split=True, additional_stopwords=additional_stoplist), [])
#!/usr/bin/python3
# coding: utf-8
# pip install summa
text = """Automatic summarization is the process of reducing a text document with a
          computer program in order to create a summary that retains the most important points
          of the original document. As the problem of information overload has grown, and as
          the quantity of data has increased, so has interest in automatic summarization.
          Technologies that can make a coherent summary take into account variables such as
          length, writing style and syntax. An example of the use of summarization technology
          is search engines such as Google. Document summarization is another."""

from summa import summarizer
res = summarizer.summarize(text); print(type(res), res)  # <class 'str'>; extracts key sentences
# 'Automatic summarization is the process of reducing a text document with a computer
# program in order to create a summary that retains the most important points of the
# original document.'
sum_sens = summarizer.summarize(text, split=True); print(sum_sens)  # output as a list of sentences
# ['Automatic summarization is the process of reducing a text document with a', 'Document summarization is another.']

## for sentences
from summa.preprocessing.textcleaner import clean_text_by_sentences
res = clean_text_by_sentences(text); print(res, type(res))  # <class 'list'>
# [Original unit: 'Automatic summarization is the process of reducing a text document with a' *-*-*-* Processed unit: 'automat summar process reduc text document', Original unit: 'computer program in order to create a summary that retains the most important points' *-*-*-* Processed unit: 'program order creat summari retain import point', Original unit: 'of the original document.' *-*-*-* Processed unit: 'origin document', Original unit: 'As the problem of information overload has grown, and as' *-*-*-* Processed unit: 'problem inform overload grown', Original unit: 'the quantity of data has increased, so has interest in automatic summarization.' *-*-*-* Processed unit: 'quantiti data increas automat summar', Original unit: 'Technologies that can make a coherent summary take into account variables such as' *-*-*-* Processed unit: 'technolog make coher summari account variabl', Original unit: 'length, writing style and syntax.' *-*-*-* Processed unit: 'length write style syntax', Original unit: 'An example of the use of summarization technology' *-*-*-* Processed unit: 'exampl use summar technolog', Original unit: 'is search engines such as Google.' *-*-*-* Processed unit: 'search engin googl', Original unit: 'Document summarization is another.' *-*-*-* Processed unit: 'document summar']
print([sentence.token for sentence in res])  # ['automat summar process reduc text document', 'program order creat summari retain import point', 'origin document', 'problem inform overload grown', 'quantiti data increas automat summar', 'technolog make coher summari account variabl', 'length write style syntax', 'exampl use summar technolog', 'search engin googl', 'document summar']
sentences = [sentence.text for sentence in res]; print(sentences)  # original text
# ['Automatic summarization is the process of reducing a text document with a', 'computer program in order to create a summary that retains the most important points', 'of the original document.', 'As the problem of information overload has grown, and as', 'the quantity of data has increased, so has interest in automatic summarization.', 'Technologies that can make a coherent summary take into account variables such as', 'length, writing style and syntax.', 'An example of the use of summarization technology', 'is search engines such as Google.', 'Document summarization is another.']
print([sentences.index(sen) for sen in sum_sens])  # [0, 9]
graph = summarizer.get_graph(text)
print(graph.nodes())  # ['automat summar process reduc text document', 'program order creat summari retain import point', 'origin document', 'problem inform overload grown', 'quantiti data increas automat summar', 'technolog make coher summari account variabl', 'length write style syntax', 'exampl use summar technolog', 'search engin googl', 'document summar']

## keywords
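# (The keyword-extraction part of this script is truncated here; the lines below are a
#  minimal sketch using the same summa keywords API shown in the other examples.)
from summa import keywords
print(keywords.keywords(text))               # newline-separated keywords as one string
print(keywords.keywords(text, split=True))   # the same keywords as a list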
Example #46
 def test_few_distinct_words_summarization_with_split_is_empty_list(self):
     text = get_text_from_test_data("few_distinct_words.txt")
     self.assertEqual(summarize(text, split=True), [])
def summarize(sentences, pub):
    try:
        pub.summary = summarizer.summarize(sentences, words=100)
    except Exception as e:
        print "Line number " + get_linenumber()
        print e
Example #48
 def test_summary_from_unrelated_sentences_is_not_empty_string(self):
     # Tests that the summarization of a text with unrelated sentences is not an empty string.
     text = get_text_from_test_data("unrelated.txt")
     self.assertNotEqual(summarize(text), "")
def resumir(texto,lenguaje='spanish'):
    if not(lenguaje):
        return summarizer.summarize(texto, language='spanish')
    else:
        return summarizer.summarize(texto, language=lenguaje)
def resumir(texto,lenguaje='spanish',ratio=0.2):
    if not(lenguaje):
        return summarizer.summarize(texto, language='spanish',ratio=ratio)
    else:
        return summarizer.summarize(texto, language=lenguaje,ratio=ratio)
Example #51
 def test_empty_text_summarization_is_empty_string(self):
     self.assertEqual(summarize(""), "")
Example #52
 def test_few_distinct_words_summarization_wstopwords_is_empty_string(self):
     text = get_text_from_test_data("few_distinct_words.txt")
     additional_stoplist = ["here","there"]
     self.assertEqual(summarize(text, additional_stopwords=additional_stoplist), "")
            resultList.extend([elements[i].childNodes[0].nodeValue])
            materias.append(resultList[i])

target_names=GeneraTarget(materias)  
############################################################################
############################################################################
def leerTags(path,tag):
    midom=parse(path)
    elements = midom.getElementsByTagName(tag)
    resultList1 = []

    if len(elements) > 0:
        for i in range(0,len(elements)):
            resultList1.extend([elements[i].childNodes[0].nodeValue])
    return resultList1
"""
summarizer.summarize(texto, language=lenguaje, ratio=ratio, words=words)
"""    
def resumir(texto,lenguaje='spanish',ratio=0.2):
    if not(lenguaje):
        return summarizer.summarize(texto, language='spanish',ratio=ratio)
    else:
        return summarizer.summarize(texto, language=lenguaje,ratio=ratio)
############################################################################
############################################################################   
        
 
"""
Function in charge of transforming the subjects (materias) for input
"""
def transformaMaterias(materiasF):
Example #54
 def test_spanish(self):
     # Test the summarization module with accented characters.
     text = get_text_from_test_data("spanish.txt")
     self.assertIsNotNone(summarize(text, language="spanish"))
Example #55
    sys.setdefaultencoding('utf8')
    """
    nltk.data.path.append('/home/kariminf/Data/NLTK/')



    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
    """

    file = open(SIZE_FILE, 'r')
    while 1:
        line = file.readline()
        if line == '':
            break
        parts = line.split(",")
        sizes[parts[0]] = int(parts[1])
    file.close()

    #nltk.data.path.append('/home/kariminf/Data/NLTK/')
    for eval in sizes:
        txt_path = "src/body/text/en/" + eval
        print(txt_path)
        text = readTextFile(txt_path)
        sentences = summarize(text, language=LANGUAGE, split=True)
        summary = extract(sentences, sizes[eval])
        fout = open("baselines/summa_textrank/en/" + eval[:-9] + ".txt", "w")
        fout.write(summary)
        fout.close()
Example #56
 def test_few_distinct_words_summarization_is_empty_string(self):
     text = get_text_from_test_data("few_distinct_words.txt")
     self.assertEqual(summarize(text), "")