コード例 #1
0
    def summarize(self, text, n_sents=3):
        """Summarize a given text and return its top-ranked sentences.

        Args:
            text: Document text to summarize.
            n_sents: Number of top sentences to select (default 3).

        Returns:
            On success, a dict with the keys ``"summary"`` and ``"message"``.
            A plain string describing the problem when ``text`` is empty,
            when ``self.lang_code`` is not in ``self.valid_langs``, when the
            length reported by ``Utility.get_doc_length`` is not greater than
            ``self.n_words``, or when ``self.method`` is neither ``"LSA"``
            nor ``"TEXT_RANK"``.
            ``None`` when any exception is raised; the exception is logged.
        """
        try:
            # Guard clauses: reject unusable input before doing any work.
            if not text:
                return "required textual content"
            if self.lang_code not in self.valid_langs:
                return "language not supported"
            # NOTE(review): the message says "at least", but a text with
            # exactly n_words is rejected as well -- confirm which is meant.
            if Utility.get_doc_length(text) <= self.n_words:
                return "required at least {} words".format(self.n_words)

            # generate sentences, normalized sentences from text
            sents, norm_sents = self.p.text_preprocessing(text)
            # generate doc-term-matrix, term-doc-matrix
            dt_matrix = self.generate_doc_term_matrix(norm_sents)
            td_matrix = self.generate_term_doc_matrix(dt_matrix)

            if self.method == "LSA":
                lsa = LSA(self.k, td_matrix)
                # remove singular values below the given threshold
                singular_values = lsa.filter_singular_values(
                    lsa.s, self.sv_threshold)
                # get salience scores from the remaining singular values
                # and the topic-document matrix
                salience_scores = lsa.get_salience_scores(
                    singular_values, lsa.vt)
                # get the top sentence indices for summarization
                top_sentence_indices = lsa.get_top_sent_indices(
                    salience_scores, n_sents)
            elif self.method == "TEXT_RANK":
                tr = TextRank(dt_matrix, td_matrix)
                # build similarity graph
                similarity_graph = tr.get_similarity_graph(
                    tr.similiarity_matrix)
                # compute pagerank scores for all sentences
                ranked_sents = tr.rank_sentences(similarity_graph)
                # get the top sentence indices for summarization
                top_sentence_indices = tr.get_top_sentence_indices(
                    ranked_sents, n_sents)
            else:
                return "no method found"

            summary = self.generate_summary(sents, top_sentence_indices)

            # apply cleaning for readability
            summary = Utility.remove_multiple_whitespaces(summary)
            summary = Utility.remove_trailing_whitespaces(summary)
            return {"summary": summary, "message": "successful"}
        except Exception:
            # Best-effort API: log the failure and signal it with None.
            logging.error("exception occured", exc_info=True)
            return None
コード例 #2
0
    def summarizeText(self, text, lines):
        """Summarize ``text`` in two passes and return the final result.

        The text is split at its midpoint, each half is summarized with
        ``TextRank(...).summarize(10)``, and the two partial results are
        concatenated and summarized once more.

        Args:
            text: Sliceable text to summarize.
            lines: Argument forwarded to the final ``summarize`` call
                (presumably the number of summary lines -- confirm against
                ``TextRank.summarize``).

        Returns:
            Whatever ``TextRank.summarize(lines)`` returns for the combined
            partial summaries.
        """
        midpoint = len(text) // 2
        first_half = text[:midpoint]
        # Slice to the end of the text: the previous upper bound of
        # ``length - 1`` silently dropped the final character.
        second_half = text[midpoint:]

        first_summary = TextRank(first_half).summarize(10)
        second_summary = TextRank(second_half).summarize(10)

        return TextRank(first_summary + second_summary).summarize(lines)
コード例 #3
0
def select_algorithm(algo, text, num):
    """Run the summarizer selected by ``algo`` and return its result.

    Args:
        algo: Name of the algorithm: ``'Wordfreq'``, ``'TextRank'`` or
            ``'TF_IDF'``.
        text: First argument passed to the selected summarizer class.
        num: Second argument passed to the selected summarizer class.

    Returns:
        The value returned by the summarizer's ``summarize_text()``.

    Raises:
        ValueError: If ``algo`` is not one of the supported names.
    """
    if algo == 'Wordfreq':
        summarizer = WordFrequency(text, num)
    elif algo == 'TextRank':
        summarizer = TextRank(text, num)
    elif algo == 'TF_IDF':
        summarizer = TF_IDF(text, num)
    else:
        # Without this branch an unknown name fell through to the return
        # statement and failed with an opaque UnboundLocalError.
        raise ValueError(
            "unknown algorithm {!r}; expected 'Wordfreq', 'TextRank' "
            "or 'TF_IDF'".format(algo))
    return summarizer.summarize_text()


#def download_file():
#	filename = 'summary.pdf'
#	file_dir = 'files'
#	return send_from_directory(file_dir, filename=filename))
コード例 #4
0
    def summarizeTextList(self, textList, lines):
        """Summarize a list of sentences in two passes.

        The list is split at its midpoint, each half is joined into one
        string and summarized with ``TextRank(...).summarize(10)``, and the
        two partial results are concatenated and summarized once more.

        Args:
            textList: Sequence of sentence strings.
            lines: Argument forwarded to the final ``summarize`` call
                (presumably the number of summary lines -- confirm against
                ``TextRank.summarize``).

        Returns:
            Whatever ``TextRank.summarize(lines)`` returns for the combined
            partial summaries.
        """
        midpoint = len(textList) // 2
        first_half = textList[:midpoint]
        # Slice to the end of the list: the previous upper bound of
        # ``length - 1`` silently dropped the final sentence.
        second_half = textList[midpoint:]

        # Every sentence is followed by one space, including the last one,
        # which matches the text the former ``+=`` loops produced.
        text1 = ''.join(sentence + ' ' for sentence in first_half)
        text2 = ''.join(sentence + ' ' for sentence in second_half)

        first_summary = TextRank(text1).summarize(10)
        second_summary = TextRank(text2).summarize(10)

        return TextRank(first_summary + second_summary).summarize(lines)
コード例 #5
0
def top_topics_from_lm():
    """Score phrases of the articles the LM classifier flags and save them.

    Steps, all driven by helpers defined elsewhere:

    1. Load the pickled classifier, the vocabulary and the articles.
    2. Predict a label per document; the ids with a truthy label are
       saved under ``"cont_ids"``.
    3. For every flagged article, run ``top_phrase_by_scorer`` in a worker
       pool and sum the returned scores per phrase.
    4. Print the 300 highest-scoring phrases and save them to
       ``"cont_topics_lm.pickle"``.
    """
    print("top_topics_from_lm")
    LM = load_pickled_data("LM_classifier_10.pickle")
    voca = load_voca()
    corpus_info = CorpusInfo()
    articles = load_articles()
    ids, docs = zip(*articles)

    print("Predicting")
    labels = LM.predict_parallel(docs)
    # A set comprehension also copes with the case where no document is
    # flagged; unpacking ``zip(*filter(...))`` raised ValueError there.
    cont_ids = {doc_id for doc_id, label in zip(ids, labels) if label}
    # Scale to a real percentage; the ratio alone was printed before.
    print("{:.2f}% are controversial".format(
        100 * len(cont_ids) / len(docs)))
    save_pickle_data(cont_ids, "cont_ids")

    print("Initialize TextRank")
    text_rank = TextRank(docs, voca)
    scorer = LM.token_odd

    payloads = [
        (doc, scorer, corpus_info, text_rank, 4)
        for doc_id, doc in articles
        if doc_id in cont_ids
    ]

    n_thread = 30
    pool = Pool(n_thread)
    g_phrase_score = Counter()
    print("Mapping")
    try:
        for phrase_score in pool.map(top_phrase_by_scorer, payloads):
            for phrase, score in phrase_score:
                g_phrase_score[phrase] += score
    finally:
        # Release the workers even when a task fails.
        pool.close()
        pool.join()

    textrize = get_textrizer_plain(voca)

    result = []
    for phrase, score in g_phrase_score.most_common(300):
        plain_phrase = textrize(str2arr(phrase))
        print("{}\t{}".format(plain_phrase, score))
        result.append((plain_phrase, score))
    save_pickle_data(result, "cont_topics_lm.pickle")
コード例 #6
0
    text = re.sub("\u3000", "", text)

    text_list.append(text)

# Use the second collected text as the sample document.
sample_text = text_list[1]

# Set up the TextRank parameters.
# NOTE(review): presumably allowPOS lists the part-of-speech tags to keep
# ("n" looks like nouns) and span is the co-occurrence window size --
# confirm against the TextRank class.

allowPOS = ["n"]
stopwords = ["为了"]
span = 3

# This part runs the TextRank algorithm.

## Initialization
tr_keyword = TextRank(allowPOS, stopwords, span)

## Run TextRank on the sample text (the return value is not stored here)
tr_keyword.text_rank(sample_text, 10)

# Text relationship study

## Cut the text into a list of words
word_pair = tr_keyword._cut(sample_text)

## Build the co-occurrence graph from the cut words, then convert it to a
## matrix with normalization enabled
co_graph = tr_keyword.co_occurance_matrix(word_pair)
df = tr_keyword.co_occur_graph_to_matrix(co_graph, normalization=True)

## Visualize the text graph
コード例 #7
0
class TestTextRank(unittest.TestCase):
    """Tests for ``TextRank`` HTML processing, scoring and summarization."""

    def setUp(self):
        """Create the ``TextRank`` instance used by each test."""
        self.text_rank = TextRank()

    def test_process_html(self):
        """``process_html`` extracts the title, body text and language."""
        article_html = utils.get_article_contents("article1.html")
        expected_article_text = utils.get_article_contents("article1.txt")
        article = self.text_rank.process_html(article_html)

        self.assertEqual(
            "Poll finds Raptors’ playoff run has attracted new fans across Canada",
            article.title,
        )
        self.assertEqual(expected_article_text, article.text)
        self.assertEqual("en", article.config.get_language())

    def test_summarize_from_html(self):
        """``summarize_from_html`` returns a non-empty summary."""
        article_html = utils.get_article_contents("article2.html")

        summary = self.text_rank.summarize_from_html(article_html, 15)
        self.assertTrue(summary)

    def test_evaluate_newspaper_summary_deterministic(self):
        """The three best-scored sentences are the expected ones, in order."""
        article = utils.get_article_contents("article2.txt")
        sentences = tokenize.sent_tokenize(article)

        scores = self.text_rank.evaluate_newspaper_summary(
            "What's inside the Barcode?", article, sentences, "en")

        # Each key's second element is the sentence; rank by score,
        # highest first.
        ranked_sentences = sorted(((v, k[1]) for k, v in scores.items()),
                                  reverse=True)
        top_sentences = [sentence for _, sentence in ranked_sentences[:3]]
        self.assertListEqual(
            [
                "If the Scanner doesn’t find it, it will not acknowledge the EAN13 barcode.",
                "In this article, we’re gonna take an example of the EAN13 barcode.",
                "What’s inside the Barcode?",
            ],
            top_sentences,
        )

    def test_evaluate_newspaper_summary_returns_normalized_scores(self):
        """Scores from ``evaluate_newspaper_summary`` sum to 1."""
        article = utils.get_article_contents("article2.txt")
        sentences = tokenize.sent_tokenize(article)

        scores = self.text_rank.evaluate_newspaper_summary(
            "What's inside the Barcode?", article, sentences, "en")

        score_sum = sum(scores.values())
        # Summed scores may carry rounding error, so compare approximately
        # instead of demanding an exact 1.
        self.assertAlmostEqual(1, score_sum)

    def test_evaluate_textrank_summary_returns_normalized_scores(self):
        """Scores from ``evaluate_textrank_summary`` sum to 1."""
        # evaluate_textrank_summary depends heavily on word vectorizations
        # which are impractical to load on every test run, so this is all we can do
        article = utils.get_article_contents("article1.txt")
        sentences = tokenize.sent_tokenize(article)

        scores = self.text_rank.evaluate_textrank_summary(sentences)

        score_sum = sum(scores.values())
        # Summed scores may carry rounding error, so compare approximately
        # instead of demanding an exact 1.
        self.assertAlmostEqual(1, score_sum)

    def test_summarize_returns_15_percent_of_sentences(self):
        """A 15 percent summary has ceil(15%) of the sentences."""
        article = utils.get_article_contents("article1.txt")
        sentences = tokenize.sent_tokenize(article)

        all_top_sentences = self.text_rank.summarize("test title", article,
                                                     "en", 100)
        top_15p_sentences = self.text_rank.summarize("test title", article,
                                                     "en", 15)

        self.assertEqual(len(sentences), len(all_top_sentences))
        self.assertEqual(math.ceil(len(all_top_sentences) * 15 / 100),
                         len(top_15p_sentences))

    def test_summarize_one_sentence(self):
        """A text that only repeats the title yields an empty summary."""
        summary = self.text_rank.summarize("Hello world!", "Hello world!",
                                           "en", 100)

        self.assertListEqual([], summary)

    def test_summarize_default_language(self):
        """Passing ``None`` as the language still produces a summary."""
        summary = self.text_rank.summarize("Hello world!",
                                           "Hello world! Welcome.", None, 100)

        self.assertListEqual(["Welcome."], summary)
コード例 #8
0
 def setUp(self):
     """Create a ``TextRank`` instance and store it on ``self.text_rank``."""
     self.text_rank = TextRank()
コード例 #9
0
from flask_migrate import Migrate
from sqlalchemy import create_engine
from settings import Settings
from models import database, Feedback, Account
from account_service import AccountService
from feedback_service import FeedbackService
from text_rank import TextRank

# Debug mode is on only when the DEBUG environment variable equals "true"
# (case-insensitive); it defaults to off.
debug = os.environ.get("DEBUG", "false").lower() == "true"
# Deployment environment name; falls back to "dev" when ENV is unset.
ENV = os.environ.get("ENV", "dev")
# Base URL of the Datadog v1 HTTP API.
DD_API_URL = "https://api.datadoghq.com/api/v1/"

log = logging.getLogger("summarizer_server")

app = Flask(__name__)
# The TextRank instance is created and set up at import time, before any
# request is served.
textrank = TextRank()
textrank.setup()

# Database setup: load the settings into the Flask config and bind the
# database object to the app.
settings = Settings()
app.config.from_object(settings)
database.init_app(app)

# NOTE(review): presumably this creates any missing tables on the configured
# database at import time -- confirm against the `database` object in models.
engine = create_engine(settings.SQLALCHEMY_DATABASE_URI)
database.metadata.create_all(engine)

# Service objects, instantiated once at module level.
accountservice = AccountService()
feedbackservice = FeedbackService()


@app.route("/v1/")
コード例 #10
0
ファイル: skill_mining.py プロジェクト: vogali/skill-mining
 def load_text_rank(self):
     """Run TextRank and keep its node weights on this object.

     Creates a ``TextRank`` instance, calls ``generate_ranks()`` on it and
     copies the resulting ``node_weights`` to ``self.node_weights``.
     """
     tr = TextRank()
     tr.generate_ranks()
     self.node_weights = tr.node_weights
コード例 #11
0
from models import database, Feedback, User
from user_service import UserService
from feedback_service import FeedbackService
from text_rank import TextRank

# Debug mode is on only when the DEBUG environment variable equals "true"
# (case-insensitive); it defaults to off.
debug = os.environ.get("DEBUG", "false").lower() == "true"

log = logging.getLogger("summarizer_server")

# Create the Flask app, load its configuration from the Settings class and
# bind the database object to it.
app = Flask(__name__)
app.config.from_object(Settings)
database.init_app(app)

# Service objects, instantiated once at module level. TextRank is only
# constructed here; its setup() call is deferred to before_first_request.
userservice = UserService()
feedbackservice = FeedbackService()
textrank = TextRank()


# NOTE(review): Flask deprecated `before_first_request` in 2.2 and removed it
# in 2.3 -- confirm the pinned Flask version before upgrading.
@app.before_first_request
def before_first_request():
    """Call ``textrank.setup()``; registered as a before-first-request hook."""
    textrank.setup()


@app.route("/api/")
def index():
    """Return a fixed plain-text banner for the ``/api/`` route."""
    return "Summarizer API"


@app.route("/api/extract", methods=["POST"])
def extract():
    # TODO: call into summary algorithm
コード例 #12
0
ファイル: test.py プロジェクト: wikty/AutoAbstract
def test_text_rank(sentences):
    """Build a ``TextRank`` over ``sentences`` and return its ``rank()`` result."""
    return TextRank(sentences).rank()
コード例 #13
0
def text_rank_extract_abstract(sentences, k=5):
    """Build a newline-separated abstract from the first ``k`` ranked items.

    The first ``k`` entries of ``TextRank(sentences).rank()`` are re-ordered
    by their ``'index'`` value, and each entry's ``'sentence'`` pieces are
    concatenated into one line.

    Args:
        sentences: Input handed to ``TextRank``.
        k: Number of ranked entries to keep (default 5).

    Returns:
        The selected sentences joined with ``'\\n'``.
    """
    top_ranked = TextRank(sentences).rank()[:k]

    def by_index(entry):
        return entry['index']

    abstract_lines = []
    for entry in sorted(top_ranked, key=by_index):
        abstract_lines.append(''.join(entry['sentence']))
    return '\n'.join(abstract_lines)