def test_text_as_bytes_raises_exception(self):
     # Test that summarization raises an exception when the text is not
     # a unicode object (Python 3 str).
     text = get_text_from_test_data("spanish.txt")
     bytes = text.encode(encoding="utf-8")
     with self.assertRaises(ValueError):
         summarize(bytes, language="spanish")
def text_summarize():
    neg = "C:/Users/Dante/PycharmProjects/Compiler/Datasets/aclImdb/test/neg/"
    pos = "C:/Users/Dante/PycharmProjects/Compiler/Datasets/aclImdb/test/pos/"
    outfile = open(
        "C:/Users/Dante/PycharmProjects/Compiler/Datasets/aclImdb/train/res.csv",
        "w+",
        newline='')
    count = 0

    for files in glob.glob(neg + "*.txt"):
        count = count + 1
        with open(files, errors='ignore') as infile:
            text = infile.read()
        res = summarize(text, ratio=0.2)
        temp = res.replace('\n', '')  # collapse the summary onto a single line
        CSVWriter = csv.writer(outfile)
        CSVWriter.writerow(['0', temp])  # 0 = negative review

    for files in glob.glob(pos + "*.txt"):
        count = count + 1
        with open(files, errors='ignore') as infile:
            text = infile.read()
        res = summarize(text, ratio=0.2)
        temp = res.replace('\n', '')
        CSVWriter = csv.writer(outfile)
        CSVWriter.writerow(
            ['1', temp]
        )  # 1 = positive review; writes a row with the label and the summarized review to the CSV file

    outfile.close()
Example #3
def dashboard(request):
    if request.user.is_authenticated():
        r = requests.get(
            'https://newsapi.org/v1/articles?source=mirror&apiKey=6df0769e0d6244aaa00768c02f123fb2'
        )
        response_dictionary = {}
        response_dictionary["source"] = "mirror"
        data = json.loads(r.text)
        list_of_urls = []
        article_data = []
        main_dict = {}
        i = 0
        for item in data["articles"]:
            mid_dictionary = {}
            if isinstance(item["title"], unicode):
                ti = unicodedata.normalize('NFKD', item["title"]).encode(
                    'ascii', 'ignore')
            else:
                ti = item["title"]
            n = len(ti)
            if (n > 10):
                mid_dictionary["title"] = ti
                sc = Scraper()
                print item
                mid_dictionary["data"] = sc.scrape_mirror(item["url"])
                article_data.append(mid_dictionary)
                mid_dictionary["image"] = (item["urlToImage"])
                print mid_dictionary["image"]
                if isinstance(mid_dictionary["data"], unicode):
                    st = unicodedata.normalize('NFKD',
                                               mid_dictionary["data"]).encode(
                                                   'ascii', 'ignore')
                else:
                    st = mid_dictionary["data"]

                try:

                    print "********Summary******"
                    summary = summarizer.summarize(st, words=50)
                    print summary.encode('ascii', 'ignore')
                    print "---------Summary---------"
                    mid_dictionary["summary"] = summarizer.summarize(st,
                                                                     words=50)
                except ZeroDivisionError:
                    mid_dictionary["summary"] = st

                print mid_dictionary
                main_dict["article_" + str(i)] = mid_dictionary
                i = i + 1

        response_dictionary["articles"] = article_data

        print main_dict
        for key, value in main_dict.iteritems():
            print "***************"

        return render(request, 'accounts/dashboard.html',
                      {'main_dict': main_dict})
    else:
        return HttpResponseRedirect("/login")
Example #4
 def test_text_as_bytes_raises_exception(self):
     # Test that summarization raises an exception when the text is not
     # a unicode object (Python 3 str).
     text = get_text_from_test_data("spanish.txt")
     bytes = text.encode(encoding="utf-8")
     with self.assertRaises(ValueError):
         summarize(bytes, language="spanish")
Example #5
def Summarizer(text, ratio, word):
    language = detect(text)
    if word == "Disabled":
        if language == "en":
            from summa.summarizer import summarize
            smzTxt = summarize(text, ratio=float(ratio))
        else:
            from textrank.summa import summarizer
            smzTxt = summarizer.summarize(text, ratio=float(ratio))
    else:
        if language == "en":
            from summa.summarizer import summarize

            smzTxt = summarize(text, words=int(word))
        else:
            from textrank.summa import summarizer
            smzTxt = summarizer.summarize(text, words=int(word))

    outputText = ",".join(smzTxt).replace(",", "")
    if outputText == "":
        return None
    # with codecs.open("temp.txt", 'w', 'utf-8', 'ignore') as f:
    # 	f.write(outputText)
    # 	f.close()
    # f = open("temp.txt", "w").close()
    # 	f = open("temp.txt", "w+")
    return outputText
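A minimal usage sketch for the wrapper above (the sample text and argument values are illustrative, not from the original project; it assumes Summarizer and langdetect's detect are importable):

sample = ("Automatic summarization reduces a document to its most important sentences. "
          "It is widely used to cope with information overload.")
print(Summarizer(sample, ratio=0.3, word="Disabled"))  # ratio-based summary
print(Summarizer(sample, ratio=0.3, word=40))          # capped at roughly 40 words
# Note: the function returns None when the underlying summarizer produces an empty
# string, which can happen for very short inputs like this one.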
Example #6
def dashboard(request):
    if request.user.is_authenticated():
        r = requests.get('https://newsapi.org/v1/articles?source=mirror&apiKey=6df0769e0d6244aaa00768c02f123fb2')
        response_dictionary = {}
        # response_dictionary["status"] = r
        response_dictionary["source"] = "mirror"
        data = json.loads(r.text)
        # print data["articles"]
        list_of_urls = []
        article_data = []
        main_dict = {}
        i = 0
        for item in data["articles"]:
            mid_dictionary = {}
            mid_dictionary["title"] = item["title"]
            #mid_dictionary["url"] = item["url"]
            #mid_dictionary["urlToImage"] = item["urlToImage"]
            sc = Scraper()
            mid_dictionary["data"] = sc.scrape_mirror(item["url"])
            article_data.append(mid_dictionary)
            print type(mid_dictionary["data"])
            if isinstance(mid_dictionary["data"], unicode):
                st = unicodedata.normalize('NFKD', mid_dictionary["data"]).encode('ascii', 'ignore')
            else:
                st = mid_dictionary["data"]
            if len(st) > 250:
                print "********Summary******"
                print summarizer.summarize(st, words=50)
                print "---------Summary---------"
                mid_dictionary["summary"] = summarizer.summarize(st, words=50)
            else:
                mid_dictionary["summary"] = st

            main_dict["article_" + str(i)] = mid_dictionary
            i = i + 1
        
        response_dictionary["articles"] = article_data
        
        # news_dictionary = {}
        #print main_dict
        for key,value in main_dict.iteritems():
            print "***************"
            # print key
            
            
            print value["title"]
        # # print news_dictionary
        # list_of_urls = []
        # return render(request, 'newslist.html', {'newslist':response_dictionary})
        return render(request,
                'accounts/dashboard.html',
                {'main_dict': main_dict}
                )
    else:
         return HttpResponseRedirect("/login")
Example #7
def my_form_post1():
    summ = ""
    if len(request.get_json(silent=True)) > 0:
        data = request.get_json(silent=True)
        text = data.get('fileText')
        method = data.get('method')
        if method == 1:
            summ = summarizer.summarize(text)
        elif method == 2:
            summ = summarizer.summarize(text, ratio=0.4)

    return Response(summ)
Example #8
def summarize(text, file=False, output_file=None):
	summary = ''
	if(file == True):
		f = open(text).read()
		summary = summarizer.summarize(f, words = 50)
	else:
		summary = summarizer.summarize(text, words = 50)

	if(output_file is not None):
		f = open(output_file, 'w')
		f.write(summary)
		f.close()

	return summary
Example #9
    def SummaryTheText(self, SectionalText, CallNumber=0):
        '''
        ----------
        Function
        ----------
        * Receives the sectional text
        * Cleans the sectional text
        * Summarises it based on text size
        * Saves the summary to a Summary.txt file for each section

        --------
        INPUT
        --------
        SectionalText = Section-wise text for summarization
        CallNumber = Used to iterate over different output files, differentiated by
                     number (Default = 0)
        
        -------
        RETURN
        -------
        None
        
        '''
        OutputFileNamePAth = 'Summary' + str(CallNumber) + '.txt'
        logging.info("Inside SummaryCreator")
        logging.info("Creating Summary by opening Summary%s.txt", CallNumber)
        with open(OutputFileNamePAth, 'w') as f:

            text = re.sub("∑", " \n", SectionalText)
            text = unidecode(text)
            text = re.sub(r'[^\x00-\x7F]+', '', text)

            if (len(text) <= 300):
                text = summarize('Test', text, count=4)
                text = "\n".join(text)
                text = re.sub('IntroductionThis', 'Introduction This', text)
            else:
                text = summarizer.summarize(text, ratio=0.5)
            print(len(SectionalText))
            print("Summary -->")
            print(text)
            if (len(text) == 0):
                text = summarize('Test', SectionalText, count=4)
                text = "\n".join(text)
                text = summarizer.summarize(text, ratio=0.6)
            text = re.sub("\n", "", text)
            f.write(text)
        f.close()
        logging.info("Summary Created")
        logging.info("Exiting SummaryCreator")
def convert_pdf_to_txt(path, pages=None):
    if not pages:
        pagenums = set()
    else:
        pagenums = set(pages)
    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    infile = open(path, 'rb')
    for page in PDFPage.get_pages(infile, pagenums):
        interpreter.process_page(page)
    infile.close()
    converter.close()
    text = output.getvalue()
    output.close()
    sample = text

    stop_words = set(stopwords.words('english'))#stopwords

    word_tokens = word_tokenize(sample)

    # keep only the non-stopword tokens
    filtered_sentence = [w for w in word_tokens if w not in stop_words]
    new_sentence = ' '.join(filtered_sentence)
    print(keywords.keywords(text))  # keywords of the original text
    txt = summarizer.summarize(text, ratio=0.2)  # summary of the original text (call it NEW)
    print(txt)
    print(summarizer.summarize(txt, ratio=0.2))  # summary of NEW
    summarylist = summarizer.summarize(text, split=True)  # NEW as a list of sentences
    summarystring = " ".join(summarylist)  # NEW as a single string

    b = keywords.keywords(summarystring, split=True)  # keywords obtained from the string of NEW

    keywordlist1 = keywords.keywords(text, split=True)  # list of keywords
    keywordstring = " ".join(keywordlist1)  # keywords as a single string
    keywordlist2 = keywords.keywords(keywordstring, split=True)
    tokens = nltk.word_tokenize(keywordstring)
    print({stem(t) for t in tokens})       # set of unique stemmed keywords
    print(len({stem(t) for t in tokens}))  # number of unique stemmed keywords


    return text
Example #11
    def test_text_summarization_on_single_input_sentence_with_split_is_empty_list(self):
        text = get_text_from_test_data("unrelated.txt")

        # Keeps the first sentence only.
        text = text.split('\n')[0]

        self.assertEqual(summarize(text, split=True), [])
def summarize_news(parent_comment_body):
    summary = '>'  # Summary must be quoted
    summary += summarizer.summarize(parent_comment_body, language='spanish')
    summary = summary.replace(
        '\n', '\n>\n>'
    )  # Reddit uses Markdown, where paragraphs are separated by two line breaks
    return summary
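A small illustration of the quoting step above (the sentences are made up; only the leading '>' and the replace call come from the function):

quoted = '>' + "Primera frase del resumen.\nSegunda frase."
print(quoted.replace('\n', '\n>\n>'))
# >Primera frase del resumen.
# >
# >Segunda frase.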
Example #13
def viewSumma(request):
  url_topull = request.GET.get('url', 'https://en.wikipedia.org/wiki/Machine_learning')

  scraped_data = urllib.request.urlopen(url_topull)  
  article = scraped_data.read()

  parsed_article = bs.BeautifulSoup(article,'lxml')

  paragraphs = parsed_article.find_all('p')

  article_text = ""

  for p in paragraphs:  
      article_text += p.text

  summary = summarizer.summarize(article_text,ratio=0.05)

  print("Data pull done")

  print("==================================SUMMARY===================================")
  print (summary)

  print("==================================KEYWORDS===================================")
  print (keywords.keywords(article_text,ratio=0.5))

  context = {
      'title': 'Summa',
      'summary': [summary],
  }
  return render(request, 'home.html', context)
Example #14
    def summarize_by_ids(cls,
                         ids,
                         opinion_type,
                         ratio=0.3,
                         words=None,
                         keywords_words=None):

        if opinion_type:
            op_list = OpinionService.get_by_ids(ids)
        else:
            op_list = EntradaService.get_by_ids(ids)

        for op in op_list:
            op.resumen = summarizer.summarize(op.content,
                                              language='spanish',
                                              ratio=ratio,
                                              words=words)
            op.keywords = keywords.keywords(op.content,
                                            language='spanish',
                                            split=True,
                                            ratio=ratio,
                                            words=keywords_words)
            if op.keywords == "":
                op.keywords = "Opss!! No ha sido posible extraer palabras claves."

            if op.resumen == "":
                op.resumen = "El texto es breve o no se ha podido generar un resumen. Lo sentimos."

        OpinionService.save_opinions(op_list)

        return op_list
Example #15
    def test_text_summarization_on_single_input_sentence_with_split_is_empty_list(self):
        text = get_text_from_test_data("unrelated.txt")

        # Keeps the first sentence only.
        text = text.split('\n')[0]

        self.assertEqual(summarize(text, split=True), [])
Example #16
    def process_document(self, doc: Document,
                         text_process_params: TextProcessParams):
        doc.keywords.clear()
        doc.entities.clear()

        parse_sentences(doc)
        main_lang = doc.main_lang
        summa_lang = self._land_dic[main_lang]

        # Extract keywords
        text = doc.text
        kw_num = text_process_params.keywords_number
        if kw_num <= 0:
            kw_num = 10
        kwrds = keywords.keywords(text, words=kw_num, language=summa_lang)

        # Extract summary
        ratio = text_process_params.summary_size.calculate_ratio(
            len(doc.sentences))
        summary = summarizer.summarize(text, ratio=ratio, language=summa_lang)
        # print(kwrds)
        # print(summary)
        for kw in kwrds.splitlines():
            doc.keywords.append(Entity(kw, EntityKind.KEYWORD))
        for s in summary.splitlines():
            doc.summary.append(Entity(s, EntityKind.SUMMARY_SENTENCE))
Example #17
def gen_summary(text, max_length):
    """Clean sentence"""
    sentence = summarize(text)
    bow = sentence
    bow = bow.lower()
    bow = bow.split()
    #bow = bow + keywords(text,split = True)
    bow = bow + text.lower().split()
    new_text = []
    contractions = get_contractions()
    for word in bow:
        if word in contractions:
            new_text.append(contractions[word])
        else:
            new_text.append(word)
    text = " ".join(new_text)
    # Format words and remove unwanted characters
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)
    text = text.split(' ')
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops]
    text = ['GO'] + text
    text = empty_remover(text)
    if len(text) >= max_length:
        text = text[0:max_length]
    else:
        text = text + ["PAD"] * (max_length - len(text))
        text = text[0:max_length]
    return ' '.join(text)
Example #18
def extract(url):
    article = Article(url=url, config=config)
    article.download()
    article.parse()
    article.nlp()

    try:
        text_rank_summary = summarizer.summarize(article.text)
    except:
        text_rank_summary = ''

    return dict(
        title=article.title,
        image=article.top_image,
        authors=article.authors,
        tags=article.keywords,
        text_rank_summary=text_rank_summary,
        summary=article.summary,
        description=article.text,
        publish_date=article.publish_date,
        meta_img=article.meta_img,
        meta_description=article.meta_description,
        meta_keywords=article.meta_keywords,
    )
def textrank():
    """
    Generate provisional extractive summaries using the TextRank strategy
    """

    textrank_data = []
    with open('summary_data.json', "r") as json_file:
        summary = json.load(json_file)
        summary = summary["data"]
        for d in summary:
            # Parse the id of the corresponding article
            doc_id = str(d["document_id"]).strip().split(".")

            # Parse the body of the corresponding article
            with open('./news/data/' + str(doc_id[0])+".json") as \
                    j_file:
                origin_data = json.load(j_file)
                for data in origin_data["document"]:
                    if data["id"] == d["document_id"]:
                        paragraph = data["paragraph"]
                        preprocess_text = " ".join(
                            str(sentence["form"]) for sentence in paragraph)
                        textrank_summary = summarizer.summarize(
                            preprocess_text, ratio=0.2)
                        textrank_data.append(
                            (preprocess_text, textrank_summary))
                        break

    return textrank_data
Example #20
    def test_text_summarization_on_short_input_text_is_not_empty_string(self):
        text = get_text_from_test_data("unrelated.txt")

        # Keeps the first 8 sentences to make the text shorter.
        text = "\n".join(text.split('\n')[:8])

        self.assertNotEqual(summarize(text), "")
Example #21
def create_another(max_chars):
    for filename in os.listdir(TREES_SOURCE_PATH):
        chars = 0
        if filename.endswith('.xml'):
            name = re.search('topic_(.*)\.xml', filename)
            path = SUM_PATH + name.group(1)
            input_text = ''
            nugget_data = get_nuggets_from_file(name.group(1))
            xmldoc = ET.parse(TREES_SOURCE_PATH + 'topic_' + name.group(1) +
                              '.xml')
            root = xmldoc.getroot()

            for nugget in root.iter('Nugget'):
                input_text = input_text + ' ' + nugget_data.get(
                    nugget.get('id'))[0]

            with open(path + '_Group5.txt', 'w', encoding='utf8') as summary:
                summary.write("====================== General Summary of " +
                              name.group(1) + " ======================\n")
                s = summarizer.summarize(input_text, words=90, split=True)

                for sentence in s:

                    if (chars + len(sentence) <= max_chars):
                        summary.write('\n' + sentence)
                        chars += len(sentence)
Example #22
def textrank_summ_keys(text):
    keywords2 = list()
    for words in summarize(text, words=8).split('\n'):
        keywords2.extend(words.split(" "))
    keywords2 = set(keywords2)

    return " ".join(list(keywords2))
def generateTextRankSummary(json_file):

    t0 = time.time()

    with open(json_file) as file:
        data = json.load(file)

    noise = ['People', 'Photo:', 'More:', 'Subscribe', 'SUBSCRIBE', 'SHARE', 'Tags', 'RELATED:', 'MORE:', '100-year floodplain', 'Image']
    sentences_list = ['' if any([term in articles['Sentences'] for term in noise]) else articles['Sentences'] for articles in data]

    # Reducing data set size:
    # sentences_list = sentences_list[:len(sentences_list)]
    print('Number of sentences:', len([sentence for sentence in sentences_list if sentence]))
    sentences = (' ').join(sentences_list)

    print('Reading JSON file - time elapsed:', time.time() - t0)

    url = 'All articles'
    text = sentences
    print('----------------------------------')
    t0 = time.time()
    output_sentences = summarizer.summarize(sentences, words=100, scores=True)
    output_sentences = sorted(output_sentences, key=lambda x : x[1], reverse=True)
    for idx, output in enumerate(output_sentences):
        print('Sentence number:', idx, 'Score:', output[1])
        print(output[0])
    print('----------------------------------')
    print('Summary complete - time elapsed:', time.time() - t0)
Example #24
    def test_text_summarization_on_short_input_text_with_split_is_not_empty_list(self):
        text = get_text_from_test_data("unrelated.txt")

        # Keeps the first 8 sentences to make the text shorter.
        text = "\n".join(text.split('\n')[:8])

        self.assertNotEqual(summarize(text, split=True), [])
Example #25
def get_key(sentence):
    key_sent = summarize(sentence, ratio=0.1)   # ratio: fraction of all sentences to extract as key sentences
    key_word = keywords.keywords(sentence, words=15)  # extract 15 keywords

    key_sent_noun = morpheme(key_sent)  # extract only the nouns from the key sentences
    key_word_noun = morpheme(key_word)  # extract only the nouns from the keywords
    #print("key_sent_noun : ", key_sent_noun)
    #print("key_word_noun : ", key_word_noun)


    # Count how often nouns from the key sentences overlap with the keyword nouns
    word_cnt = {}
    for noun in key_sent_noun:
        if noun in key_word_noun:
            if noun in word_cnt.keys():
                word_cnt[noun] += 1
            else:
                word_cnt[noun] = 1

    # Sort by frequency, highest first
    sorted_word_cnt = sorted(word_cnt.items(), reverse=True, key=lambda item: item[1])  # list of (word, count)
    #print("sorted : ", sorted_word_cnt)

    num = min(5, len(sorted_word_cnt))      # keep at most 5 key words
    final_key_word = []
    final_key_word.append([sorted_word_cnt[i][0] for i in range(num)])
    final_key_word = final_key_word[0]
    #print("final: ", final_key_word)

    final_key_sentence = key_sent.split('.')[0]     # keep a single key sentence

    return final_key_sentence, final_key_word
def get_result_list(lines):
    from summa import summarizer
    from summa import keywords
    import nltk
    nltk.download('punkt', download_dir='./nltk_data')
    nltk.data.path.append("./nltk_data")
    result_list = []
    for line in lines:
        json_data = json.loads(line)
        text = json_data["text"]
        abstract = summarizer.summarize(text)
        keyword = keywords.keywords(text, split=True)
        name = json_data['company']
        tags_words = list(map(lambda x: x[1:], json_data['tags']))
        abstract_words = list(
            map(lambda x: x.lower(), nltk.tokenize.word_tokenize(abstract)))
        title_words = list(
            map(lambda x: x.lower(),
                nltk.tokenize.word_tokenize(json_data['title'])))
        if abstract != '' and name not in abstract_words and name not in title_words and name not in tags_words:
            continue
        json_data['abstract'] = abstract
        json_data['keywords'] = keyword
        result_list.append(json.dumps(json_data))
    return result_list
Example #27
def get_embeddings(model, text_list: np.array, summarize=True, it_user=False):

    if it_user:
        text = text_list[0]
        text_list = text_list[1:]

    else:
        text = ""

    for part in text_list:
        if summarize:
            summary_res = summarizer.summarize(
                part,
                language="russian",
            )
            if summary_res != "":
                text += summary_res
            else:
                text += part
        else:
            text += part

    vector = model.get_sentence_vector(text)

    return vector
 def add_summary(add, row):
     try:
         add["summary"] = summarize(row["text"],
                                    words=50,
                                    language='finnish')
     except Exception:
         add["summary"] = ""
Example #29
    def generate(self, demand, candidates):
        from summa import summarizer
        result = {"demand": demand, "candidates": candidates}
        if len(candidates) > 0:
            cur_time = time.time()

            sentences = candidates["sentence"].tolist()
            totalDemand = sum(demand.values())
            totalSentences = len(sentences)
            ratio = 1.0 * totalDemand / totalSentences + EPS
            summary = summarizer.summarize(" . ".join(sentences), ratio=ratio)
            sentences = [
                s.strip() for s in summary.strip().split(".") if len(s) > 0
            ]

            result["selected_sentences"] = []
            result["selected_aspects"] = []
            result["selected_reviews"] = []
            for _, row in candidates.iterrows():
                if row["sentence"] in sentences:
                    if result["selected_sentences"].count(
                            row["sentence"]) >= sentences.count(
                                row["sentence"]):
                        continue
                    result["selected_sentences"].append(row["sentence"])
                    result["selected_aspects"].append(row["aspect"])
                    if row["id"] not in result["selected_reviews"]:
                        result["selected_reviews"].append(row["id"])
            result["solve_time"] = time.time() - cur_time

        return result
Example #30
def textrank(text):
    textString = ""
    for comment in text:
        if (comment != "[removed]" and comment != "[deleted]"):
            textString += " " + comment
    message = summarize(textString, ratio=0.1)
    return message
Example #31
def extract_text_info(text):
    sentences = summarizer.summarize(text, ratio=0.2)
    sentences = sentences.replace("\n", "")
    time_chars = max(int(round(len(sentences.split()) / 200, 0)), 1)
    sentences = sentences.split(".")
    highlight_lines = '$$'.join([str(x) for x in sentences])
    return highlight_lines, time_chars
Example #32
    def test_text_summarization_on_single_input_sentence_is_empty_string(self):
        text = get_text_from_test_data("unrelated.txt")

        # Keeps the first sentence only.
        text = text.split('\n')[0]

        self.assertEqual(summarize(text), "")
Example #33
def get_content(link):
    page = requests.get(link)
    soup = BeautifulSoup(page.content, "html.parser")

    content = soup.find(class_="main-content")

    headline = content.find("h1", {"itemprop": "headline"}).get_text()
    author = content.find("span", {"itemprop": "author"}).get_text()
    when = content.find(class_="p-published-time").get_text()

    text = content.find("div", {"itemprop": "articleBody"})
    paragraphs = []

    for paragraph in text.find_all("p"):
        paragraphs.append(paragraph.get_text()[1:])

    formatted = format(paragraphs)

    content = "```"

    content += "-" * 72 + "\n"
    content += "HEADLINE: " + headline + "\nAUTHOR: " + author + "\nWHEN: " + when + "\n\n"
    content += "FULL ARTICLE:\n"
    content += formatted + "\n"
    content += "SUMMARY:\n"
    content += summarizer.summarize(
        formatted.replace("\n", " ").replace("-", "")) + "\n"

    content += "-" * 72 + "\n\n```"

    return content
Example #34
    def test_text_summarization_on_single_input_sentence_is_empty_string(self):
        text = get_text_from_test_data("unrelated.txt")

        # Keeps the first sentence only.
        text = text.split('\n')[0]

        self.assertEqual(summarize(text), "")
Example #35
    def test_reference_text_summarization_wstopwords(self):
        text = get_text_from_test_data("mihalcea_tarau.txt")
        additional_stoplist = get_text_from_test_data("mihalcea_tarau.sw.txt").strip().split(",")
        # Makes a summary of the text.
        generated_summary = summarize(text,additional_stopwords=additional_stoplist)

        # To be compared to the method reference.
        summary = get_text_from_test_data("mihalcea_tarau.summ.txt")

        self.assertEqual(generated_summary, summary)
Example #36
    def test_reference_text_summarization(self):
        text = get_text_from_test_data("mihalcea_tarau.txt")

        # Makes a summary of the text.
        generated_summary = summarize(text)

        # To be compared to the method reference.
        summary = get_text_from_test_data("mihalcea_tarau.summ.txt")

        self.assertEqual(generated_summary, summary)
Example #37
    def test_reference_text_summarization_with_split(self):
        text = get_text_from_test_data("mihalcea_tarau.txt")

        # Makes a summary of the text as a list.
        generated_summary = summarize(text, split=True)

        # To be compared to the method reference.
        summary = get_text_from_test_data("mihalcea_tarau.summ.txt")
        summary = summary.split("\n")

        self.assertSequenceEqual(generated_summary, summary)
Example #38
    def test_corpus_summarization_ratio(self):
        text = get_text_from_test_data("mihalcea_tarau.txt")

        sentences = text.split('\n')

        # Makes summaries of the text using different ratio parameters.
        for x in range(1, 10):
            ratio = x / float(10)
            selected_sentences = summarize(text, ratio=ratio, split=True)
            expected_summary_length = int(len(sentences) * ratio)

            self.assertEqual(len(selected_sentences), expected_summary_length)
Example #39
 def test_summary_from_unrelated_sentences_and_split_is_not_empty_list(self):
     # Tests that the summarization of a text with unrelated sentences is not an empty list.
     text = get_text_from_test_data("unrelated.txt")
     self.assertNotEqual(summarize(text, split=True), [])
 def resumir(self, texto, lenguaje="spanish", ratio=0.25):
     if not (lenguaje):
         return summarizer.summarize(texto, language="spanish", ratio=ratio)
     else:
         return summarizer.summarize(texto, language=lenguaje)
Example #41
 def test_empty_text_summarization_with_split_is_empty_list(self):
     self.assertEqual(summarize("", split=True), [])
Example #42
 def test_arabic(self):
     # Test the summarization module for the Arabic language.
     text = get_text_from_test_data("arabic.txt")
     self.assertIsNotNone(summarize(text, language='arabic'))
Example #43
 def test_polish(self):
     # Test the summarization module for the Polish language.
     text = get_text_from_test_data("polish.txt")
     self.assertIsNotNone(summarize(text, language="polish"))
Example #44
 def test_few_distinct_words_summarization_wstopwords_with_split_is_empty_list(self):
     text = get_text_from_test_data("few_distinct_words.txt")
     additional_stoplist = ["here","there"]
     self.assertEqual(summarize(text, split=True, additional_stopwords=additional_stoplist), [])
#!/usr/bin/python3
# coding: utf-8
# pip install summa
text = """Automatic summarization is the process of reducing a text document with a
          computer program in order to create a summary that retains the most important points
          of the original document. As the problem of information overload has grown, and as
          the quantity of data has increased, so has interest in automatic summarization.
          Technologies that can make a coherent summary take into account variables such as
          length, writing style and syntax. An example of the use of summarization technology
          is search engines such as Google. Document summarization is another."""

from summa import summarizer
res = summarizer.summarize(text); print(type(res), res)  # <class 'str'>; extracts key sentences
# 'Automatic summarization is the process of reducing a text document with a computer
# program in order to create a summary that retains the most important points of the
# original document.'
sum_sens = summarizer.summarize(text, split=True); print(sum_sens)  # output as a list of sentences
# ['Automatic summarization is the process of reducing a text document with a', 'Document summarization is another.']

## for sentences
from summa.preprocessing.textcleaner import clean_text_by_sentences
res = clean_text_by_sentences(text); print(res, type(res))  # <class 'list'>
# [Original unit: 'Automatic summarization is the process of reducing a text document with a' *-*-*-* Processed unit: 'automat summar process reduc text document', Original unit: 'computer program in order to create a summary that retains the most important points' *-*-*-* Processed unit: 'program order creat summari retain import point', Original unit: 'of the original document.' *-*-*-* Processed unit: 'origin document', Original unit: 'As the problem of information overload has grown, and as' *-*-*-* Processed unit: 'problem inform overload grown', Original unit: 'the quantity of data has increased, so has interest in automatic summarization.' *-*-*-* Processed unit: 'quantiti data increas automat summar', Original unit: 'Technologies that can make a coherent summary take into account variables such as' *-*-*-* Processed unit: 'technolog make coher summari account variabl', Original unit: 'length, writing style and syntax.' *-*-*-* Processed unit: 'length write style syntax', Original unit: 'An example of the use of summarization technology' *-*-*-* Processed unit: 'exampl use summar technolog', Original unit: 'is search engines such as Google.' *-*-*-* Processed unit: 'search engin googl', Original unit: 'Document summarization is another.' *-*-*-* Processed unit: 'document summar']
print([sentence.token for sentence in res])  # ['automat summar process reduc text document', 'program order creat summari retain import point', 'origin document', 'problem inform overload grown', 'quantiti data increas automat summar', 'technolog make coher summari account variabl', 'length write style syntax', 'exampl use summar technolog', 'search engin googl', 'document summar']
sentences = [sentence.text for sentence in res]; print(sentences)  # original text
# ['Automatic summarization is the process of reducing a text document with a', 'computer program in order to create a summary that retains the most important points', 'of the original document.', 'As the problem of information overload has grown, and as', 'the quantity of data has increased, so has interest in automatic summarization.', 'Technologies that can make a coherent summary take into account variables such as', 'length, writing style and syntax.', 'An example of the use of summarization technology', 'is search engines such as Google.', 'Document summarization is another.']
print([sentences.index(sen) for sen in sum_sens])  # [0, 9]
graph = summarizer.get_graph(text)
print(graph.nodes())  # ['automat summar process reduc text document', 'program order creat summari retain import point', 'origin document', 'problem inform overload grown', 'quantiti data increas automat summar', 'technolog make coher summari account variabl', 'length write style syntax', 'exampl use summar technolog', 'search engin googl', 'document summar']

## keywords
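# (The keyword-extraction part of this script is truncated here; the lines below are a
#  minimal sketch using the same summa keywords API shown in the other examples.)
from summa import keywords
print(keywords.keywords(text))               # newline-separated keywords as one string
print(keywords.keywords(text, split=True))   # the same keywords as a list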
Example #46
 def test_few_distinct_words_summarization_with_split_is_empty_list(self):
     text = get_text_from_test_data("few_distinct_words.txt")
     self.assertEqual(summarize(text, split=True), [])
def summarize(sentences, pub):
    try:
        pub.summary = summarizer.summarize(sentences, words=100)
    except Exception as e:
        print "Line number " + get_linenumber()
        print e
Example #48
 def test_summary_from_unrelated_sentences_is_not_empty_string(self):
     # Tests that the summarization of a text with unrelated sentences is not an empty string.
     text = get_text_from_test_data("unrelated.txt")
     self.assertNotEqual(summarize(text), "")
def resumir(texto,lenguaje='spanish'):
    if not(lenguaje):
        return summarizer.summarize(texto, language='spanish')
    else:
        return summarizer.summarize(texto, language=lenguaje)
def resumir(texto,lenguaje='spanish',ratio=0.2):
    if not(lenguaje):
        return summarizer.summarize(texto, language='spanish',ratio=ratio)
    else:
        return summarizer.summarize(texto, language=lenguaje,ratio=ratio)
Example #51
 def test_empty_text_summarization_is_empty_string(self):
     self.assertEqual(summarize(""), "")
Example #52
 def test_few_distinct_words_summarization_wstopwords_is_empty_string(self):
     text = get_text_from_test_data("few_distinct_words.txt")
     additional_stoplist = ["here","there"]
     self.assertEqual(summarize(text, additional_stopwords=additional_stoplist), "")
            resultList.extend([elements[i].childNodes[0].nodeValue])
            materias.append(resultList[i])

target_names=GeneraTarget(materias)  
############################################################################
############################################################################
def leerTags(path,tag):
    midom=parse(path)
    elements = midom.getElementsByTagName(tag)
    resultList1 = []

    if len(elements) > 0:
        for i in range(0,len(elements)):
            resultList1.extend([elements[i].childNodes[0].nodeValue])
    return resultList1
"""
summarizer.summarize(texto, language=lenguaje, ratio=ratio, words=words)
"""    
def resumir(texto,lenguaje='spanish',ratio=0.2):
    if not(lenguaje):
        return summarizer.summarize(texto, language='spanish',ratio=ratio)
    else:
        return summarizer.summarize(texto, language=lenguaje,ratio=ratio)
############################################################################
############################################################################   
        
 
"""
Function in charge of transforming the subjects (materias) for input
"""
def transformaMaterias(materiasF):
Example #54
 def test_spanish(self):
     # Test the summarization module with accented characters.
     text = get_text_from_test_data("spanish.txt")
     self.assertIsNotNone(summarize(text, language="spanish"))
Example #55
    sys.setdefaultencoding('utf8')
    """
    nltk.data.path.append('/home/kariminf/Data/NLTK/')



    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
    """

    file = open(SIZE_FILE, 'r')
    while 1:
        line = file.readline()
        if line == '':
            break
        parts = line.split(",")
        sizes[parts[0]] = int(parts[1])
    file.close()

    #nltk.data.path.append('/home/kariminf/Data/NLTK/')
    for eval in sizes:
        txt_path = "src/body/text/en/" + eval
        print(txt_path)
        text = readTextFile(txt_path)
        sentences = summarize(text, language=LANGUAGE, split=True)
        summary = extract(sentences, sizes[eval])
        fout = open("baselines/summa_textrank/en/" + eval[:-9] + ".txt", "w")
        fout.write(summary)
        fout.close()
Example #56
 def test_few_distinct_words_summarization_is_empty_string(self):
     text = get_text_from_test_data("few_distinct_words.txt")
     self.assertEqual(summarize(text), "")