Example #1
def _lexrank(cid):
    """
        Run LexRank on all sentences from all documents in a cluster.

    :param cid:
    :return: rank_records
    """
    _, processed_sents = dataset_parser.cid2sents(
        cid)  # 2d lists, docs => sents
    flat_processed_sents = list(
        itertools.chain(*processed_sents))  # 1d sent list

    lxr = LexRank(processed_sents, stopwords=STOPWORDS['en'])
    scores = lxr.rank_sentences(flat_processed_sents,
                                threshold=None,
                                fast_power_method=True)

    sid2score = dict()
    abs_idx = 0
    for doc_idx, doc in enumerate(processed_sents):
        for sent_idx, sent in enumerate(doc):
            sid = config.SEP.join((str(doc_idx), str(sent_idx)))
            score = scores[abs_idx]
            sid2score[sid] = score

            abs_idx += 1

    sid_score_list = rank_sent.sort_sid2score(sid2score)
    rank_records = rank_sent.get_rank_records(sid_score_list,
                                              sents=processed_sents,
                                              flat_sents=False)
    return rank_records
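For quick reference, here is a minimal, self-contained sketch of the LexRank calls this example builds on; the toy documents below are placeholders, not data from the original cluster, and the import form follows the one used elsewhere in these examples.

from lexrank import STOPWORDS, LexRank

# Toy 2-d structure: a list of documents, each a list of sentences,
# mirroring what dataset_parser.cid2sents returns in the example above.
toy_docs = [
    ['The cat sat on the mat.', 'The cat is black.'],
    ['Dogs chase cats.', 'The dog barked at the mailman.'],
]
flat_sents = [sent for doc in toy_docs for sent in doc]  # like flat_processed_sents

lxr = LexRank(toy_docs, stopwords=STOPWORDS['en'])
scores = lxr.rank_sentences(flat_sents, threshold=None, fast_power_method=True)

# One score per flattened sentence, in document order.
for sent, score in zip(flat_sents, scores):
    print(round(float(score), 3), sent)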
Example #2
def summary_nmf_method(file_folder, sumLen):

    sent_path = os.path.join(file_folder, 'sent_list.pkl')
    sent_list = joblib.load(sent_path)

    docs_path = os.path.join(file_folder, 'docs_list.pkl')
    docs = joblib.load(docs_path)

    n = len(sent_list)

    if (n > 1):
        GRS_sen = get_grs_score(file_folder)
        surface_score = get_surface_score(docs)
        # p=pagerank(docs)

        lxr = LexRank(docs)
        lx = lxr.rank_sentences(docs, threshold=None, fast_power_method=True)

        lxr_score = np.array(lx)
        maxLex = lxr_score.max()
        lxr_score = (100 * lxr_score) / maxLex

        total_score = []

        for i in range(n):
            t_sum = float(GRS_sen[i]) + float(surface_score[i]) + float(
                lxr_score[i])
            total_score.append(t_sum)

        copy_score = total_score.copy()
        top_list = get_top_list(copy_score, sumLen)

        summary_final = '<h3>Total Sentences: ' + str(
            len(total_score)) + '</h3>'
        summary_final += '<h3>Selected Sentences: ' + str(sumLen) + '</h3>'

        for i in range(n):
            if (total_score[i] in top_list):
                summary_final += '<p style="color:#00ff00">' + sent_list[
                    i] + '<br>' + str(total_score[i]) + '</p>'
            else:
                summary_final += '<p style="color:#ff0000">' + sent_list[
                    i] + '<br>' + str(total_score[i]) + '</p>'

        os.remove(sent_path)
        os.remove(docs_path)

        return summary_final
    elif (n == 1):
        os.remove(sent_path)
        os.remove(docs_path)

        return sent_list[0]
    else:
        os.remove(sent_path)
        os.remove(docs_path)

        return 'No adequate sentences found for summary.'
Example #3
    async def on_post(self, req, resp):
        data = await req.media()
        text = data["text"]
        doc = nlp(text)
        results = []
        start_id = 0
        end_id = 0
        documents = []
        for sent_id, sent in enumerate(doc.sents):
            tokens = []
            ent_id = -1  # keep ent_id defined for sentences with no entities
            for ent_id, ent in enumerate(sent.ents):
                # add the text from the start of the sentence (or previous entity) up to this entity
                end_id = ent.start_char
                if start_id != end_id:
                    tmp = {
                        'text': text[start_id:end_id],
                        'label': 'O'
                    }
                    tokens.append(tmp)

                start_id = ent.start_char
                end_id = ent.end_char
                tmp = {
                    'id': f"{sent_id}_{ent_id}",
                    'text': text[start_id:end_id],
                    'label': ent.label_
                }
                tokens.append(tmp)
                start_id = ent.end_char
            # add the text from the last entity to the end of the sentence
            tokens.append({
                'id': f"{sent_id}_{ent_id+1}",
                'text': text[start_id:sent.end_char],
                'label': 'O'
            })
            start_id = sent.end_char

            tmp = {
                'id': sent_id,
                'sent': tokens
            }
            results.append(tmp)

            extract_tokens = []
            for token in sent:
                if token.pos_ in ['PROPN', 'NOUN', 'VERB', 'ADJ']:
                    extract_tokens.append(token.lemma_)
            documents.append(" ".join(extract_tokens))

        lexrank = LexRank(documents)
        scores = lexrank.rank_sentences(documents, threshold=0.0)
        ranking = pd.Series(scores).rank(
            method="min", ascending=False).tolist()
        for i in range(len(ranking)):
            results[i]['rank'] = int(ranking[i])

        resp.media = results
Example #4
    def train(self):
        documents = []
        documents_dir = Path(self.training_dir)

        for file_path in documents_dir.files('*.txt'):
            with file_path.open(mode='rt', encoding='utf-8') as fp:
                documents.append(fp.readlines())

        self.lxr = LexRank(documents, stopwords=STOPWORDS['en'])
Example #5
def get_ranked_sentences_lexrank(filepath):
    raw_text = list()
    raw_text.append(read_file_to_array(filepath))
    lxr = LexRank(raw_text, stopwords=STOPWORDS['en'])
    sentences = sent_tokenize(read_file_to_string(filepath))
    summary_sentences = lxr.get_summary(sentences,
                                        summary_size=floor(len(sentences) / 2),
                                        threshold=.1)
    # return summary_sentences
    return sentences
Example #6
def rank_sentences(cluster_paths: List[str], noise_sentences: List[str]) -> List[str]:
    final_sentences = cluster_paths.copy()
    final_sentences.extend(noise_sentences)

    ranker = LexRank(final_sentences)
    sentence_scores = ranker.rank_sentences(final_sentences)

    scored_sentences = {}
    for idx, score in enumerate(sentence_scores):
        scored_sentences[final_sentences[idx]] = score

    sorted_scored_sentences = sorted(scored_sentences, key=scored_sentences.get)
    return sorted_scored_sentences
Example #7
    def condense(self, percent):
        # automatically sets percent to condense by if not specified
        if not percent:
            percent = self.get_optimal_condense_percent()

        # calculates number of sentences to return based on input
        num_sentences = int(len(self.sentences) * percent)
        if num_sentences < 1:
            num_sentences = 1
        elif num_sentences > len(self.sentences):
            num_sentences = len(self.sentences)

        # create corpus from docs
        dirname = os.path.dirname(__file__)
        # checks if dumped json exists; if yes, loads that
        if os.path.isfile(os.path.join(dirname, 'corpus.json')):
            with open(os.path.join(dirname, 'corpus.json'), 'r') as f:
                documents = json.load(f)
        # otherwise, creates new corpus based on files in training_data directory
        else:
            documents = make_corpus_from_files('training_data')

        lxr = LexRank(documents, stopwords=STOPWORDS['en'])

        # create summary
        sentences_to_return = lxr.get_summary(self.sentences,
                                              summary_size=num_sentences)

        # joins sentences to make text body

        # list for each paragraph
        output = [[] for _ in self.paragraphs]

        # copies self.paragraphs to prevent destructive edits
        paragraphs = [paragraph[:] for paragraph in self.paragraphs]

        for sentence in sentences_to_return:
            for i, paragraph in enumerate(paragraphs):
                if sentence in paragraph:
                    output[i].append(sentence)
                    paragraph.remove(sentence)
                    break

        # joins paragraph sentences with spaces
        output = [' '.join(paragraph) for paragraph in output]
        # joins paragraphs with newlines if paragraphs aren't empty
        output = '\n\n'.join([x for x in output if x.strip() != ''])

        return output
Example #8
    def extract_summary(self):
        data = read_json(self.full_path)
        articles = []
        abstracts = []

        for item in data:
            articles.append(item['article'])
            abstracts.append([item['abstract']])
        
        lxr = LexRank(articles, stopwords=STOPWORDS['en'])
 
        summaries = [lxr.get_summary(x, summary_size=self.summary_size, threshold=self.threshold) for x in tqdm(articles, desc="LexRank:")]
        res = test_rouge(summaries, abstracts, self.processors)

        return res
Example #9
def build_sim_items_e2e_tfidf_with_lexrank(cid,
                                           query,
                                           max_ns_doc=None,
                                           retrieved_dp=None,
                                           rm_dialog=True):
    """
        Initialize LexRank with document-wise organized sentences to get true IDF.

    :param cid:
    :param query:
    :param max_ns_doc:
    :param retrieved_dp:
    :param rm_dialog:
    :return:
    """
    if retrieved_dp:
        original_sents, processed_sents = load_retrieved_sentences(
            retrieved_dp=retrieved_dp, cid=cid)
    else:
        if 'tdqfs' in config.test_year:
            original_sents, processed_sents = dataset_parser.cid2sents_tdqfs(
                cid)
        else:
            original_sents, processed_sents = dataset_parser.cid2sents(
                cid, rm_dialog=rm_dialog,
                max_ns_doc=max_ns_doc)  # 2d lists, docs => sents

    lxr = LexRank(processed_sents, stopwords=STOPWORDS['en'])

    doc_sents = list(itertools.chain(*processed_sents))  # 1d sent list
    doc_sents = copy.deepcopy(
        doc_sents)  # avoid affecting the original doc_sents list
    doc_sents.append(query)

    sim_mat = lxr.get_tfidf_similarity_matrix(sentences=doc_sents)

    doc_sim_mat = sim_mat[:-1, :-1]
    rel_scores = sim_mat[-1, :-1]
    # logger.info('doc_sim_mat: {}, rel_scores: {}'.format(doc_sim_mat.shape, rel_scores.shape))

    sim_items = {
        'doc_sim_mat': doc_sim_mat,
        'rel_scores': rel_scores,
        'processed_sents': processed_sents,
        'original_sents': original_sents,
    }

    return sim_items
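The query-relevance trick above (append the query as the last "sentence", then slice the similarity matrix) can be isolated as follows. This is only a sketch: it assumes the same modified LexRank used in this repository, which exposes get_tfidf_similarity_matrix, and the toy documents and query are placeholders.

from lexrank import STOPWORDS, LexRank

docs = [
    ['Sentence one is about budgets.', 'Sentence two is about taxes.'],
    ['A third sentence about public spending.'],
]
query = 'what is said about taxes'

lxr = LexRank(docs, stopwords=STOPWORDS['en'])
sents = [s for doc in docs for s in doc] + [query]  # the query goes last

sim_mat = lxr.get_tfidf_similarity_matrix(sentences=sents)

doc_sim_mat = sim_mat[:-1, :-1]  # sentence-to-sentence similarities
rel_scores = sim_mat[-1, :-1]    # last row (minus the corner): query relevance per sentence
print(doc_sim_mat.shape, rel_scores.shape)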
Example #10
def test_lexrank():
    documents = []
    documents_dir = DATA_ROOT / 'bbc_politics'
    document_files = documents_dir.files()

    for file_path in document_files:
        with gzip.open(file_path, mode='rt', encoding='utf-8') as fp:
            documents.append(fp.readlines())

    lxr = LexRank(
        documents,
        stopwords=STOPWORDS['en'],
        keep_numbers=False,
        keep_emails=False,
        include_new_words=True,
    )

    sentences = [
        'One of David Cameron\'s closest friends and Conservative allies, '
        'George Osborne rose rapidly after becoming MP for Tatton in 2001.',

        'Michael Howard promoted him from shadow chief secretary to the '
        'Treasury to shadow chancellor in May 2005, at the age of 34.',

        'Mr Osborne took a key role in the election campaign and has been at '
        'the forefront of the debate on how to deal with the recession and '
        'the UK\'s spending deficit.',

        'Even before Mr Cameron became leader the two were being likened to '
        'Labour\'s Blair/Brown duo. The two have emulated them by becoming '
        'prime minister and chancellor, but will want to avoid the spats.',

        'Before entering Parliament, he was a special adviser in the '
        'agriculture department when the Tories were in government and later '
        'served as political secretary to William Hague.',

        'The BBC understands that as chancellor, Mr Osborne, along with the '
        'Treasury will retain responsibility for overseeing banks and '
        'financial regulation.',

        'Mr Osborne said the coalition government was planning to change the '
        'tax system \"to make it fairer for people on low and middle '
        'incomes\", and undertake \"long-term structural reform\" of the '
        'banking sector, education and the welfare state.',
    ]

    summary = lxr.get_summary(sentences, discretize=False)
    assert summary == [sentences[5]]
Example #11
class Rank():
    def __init__(self, training_dir=None):
        self.training_dir = training_dir
        self.lxr = None

    def train(self):
        documents = []
        documents_dir = Path(self.training_dir)

        for file_path in documents_dir.files('*.txt'):
            with file_path.open(mode='rt', encoding='utf-8') as fp:
                documents.append(fp.readlines())

        self.lxr = LexRank(documents, stopwords=STOPWORDS['en'])

    def remove_new_line(self, text):
        """
        Removes all new lines chars in text
        """
        return text.replace('\n', ' ')

    def sentence_tokenize(self, text):
        """
        sentence tokenize
        """
        return sent_tokenize(text)

    def sort_rankings(self, scores_sents):
        idx = np.argsort(scores_sents[0])[::-1]
        scores = np.array(scores_sents[0])[idx]
        sentences = np.array(scores_sents[1])[idx]
        return list(zip(scores, sentences))

    def rank(self, text):
        sentences = text
        sentences = self.remove_new_line(sentences)
        sentences = self.sentence_tokenize(sentences)
        sentences = self.lxr.get_summary(sentences,
                                         summary_size=6,
                                         threshold=.85)

        scores_cont = self.lxr.rank_sentences(
            sentences,
            threshold=None,
            fast_power_method=True,
        )

        return self.sort_rankings((scores_cont, sentences))
Example #12
def test_lexrank():
    documents = []
    documents_dir = DATA_ROOT / 'bbc_politics'
    document_files = documents_dir.files()

    for file_path in document_files:
        with gzip.open(file_path, mode='rt', encoding='utf-8') as fp:
            documents.append(fp.readlines())

    lxr = LexRank(
        documents,
        stopwords=STOPWORDS['en'],
        keep_numbers=False,
        keep_emails=False,
        include_new_words=True,
    )

    sentences = [
        'One of David Cameron\'s closest friends and Conservative allies, '
        'George Osborne rose rapidly after becoming MP for Tatton in 2001.',

        'Michael Howard promoted him from shadow chief secretary to the '
        'Treasury to shadow chancellor in May 2005, at the age of 34.',

        'Mr Osborne took a key role in the election campaign and has been at '
        'the forefront of the debate on how to deal with the recession and '
        'the UK\'s spending deficit.',

        'Even before Mr Cameron became leader the two were being likened to '
        'Labour\'s Blair/Brown duo. The two have emulated them by becoming '
        'prime minister and chancellor, but will want to avoid the spats.',

        'Before entering Parliament, he was a special adviser in the '
        'agriculture department when the Tories were in government and later '
        'served as political secretary to William Hague.',

        'The BBC understands that as chancellor, Mr Osborne, along with the '
        'Treasury will retain responsibility for overseeing banks and '
        'financial regulation.',

        'Mr Osborne said the coalition government was planning to change the '
        'tax system \"to make it fairer for people on low and middle '
        'incomes\", and undertake \"long-term structural reform\" of the '
        'banking sector, education and the welfare state.',
    ]

    summary = lxr.get_summary(sentences, threshold=None)
    assert summary == [sentences[5]]
Example #13
def lex_rank_summarize(article):
    sentences = sent_tokenize(article)
    summary = LexRank(sentences).get_summary(sentences,
                                             summary_size=int(
                                                 len(sentences) / 2),
                                             threshold=.1)
    return "\n".join(summary)
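If it helps, a hypothetical call site for the helper above; the article text is a placeholder, and the sent_tokenize/LexRank imports used by lex_rank_summarize are assumed to be in scope.

# Hypothetical usage; the article text is illustrative only.
article = (
    'LexRank builds a sentence similarity graph. '
    'It then scores sentences by eigenvector centrality. '
    'The top-scoring half of the sentences is returned as the summary.'
)
print(lex_rank_summarize(article))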
Example #14
    def __init__(self, all_messages):
        self.documents = {}
        for stream, data in all_messages.items():
            for topic, messages in data["topics"].items():
                text = "\n".join(
                    self.clean_content(message) for message in messages)
                self.documents[(stream, topic)] = [
                    sentence.strip() for sentence in text.splitlines()
                    if sentence.strip()
                ]
        self.summarizer = LexRank(
            self.documents.values(),
            keep_emails=True,
            keep_urls=True,
            stopwords=STOPWORDS["en"],
        )
Example #15
def summarize(text_file, cat, summary_size, threshold):
    if cat not in ['business', 'entertainment', 'politics', 'sport', 'tech']:
        raise ValueError('category must be one of business, entertainment, politics, sport, tech')
    if summary_size <= 0:
        raise ValueError('number of summary sentences must be greater than zero')
    if not 0.0 < threshold < 1.0:
        raise ValueError('summarize threshold must be between 0 and 1')
    # load parsing dataset
    documents = []
    documents_dir = Path('bbc/' + cat)
    for file_path in documents_dir.files('*.txt'):
        with file_path.open(mode='rt', encoding='utf-8') as fp:
            documents.append(fp.readlines())

    # initialize LexRank with dataset
    lxr = LexRank(documents, stopwords=STOPWORDS['en'])

    # Read STT result
    sentences = []
    with open(text_file, 'r') as f:
        sentences.extend(f.read().splitlines())

    # Spelling correction
    for index in range(len(sentences)):
        tmp = ''
        for word in sentences[index].split():
            tmp += spell(word)
            tmp += ' '
        sentences[index] = tmp

    # Non-English word removal
    for index in range(len(sentences)):
        sentences[index] = " ".join(
            w for w in nltk.wordpunct_tokenize(sentences[index]) if w.lower() in words or not w.isalpha())

    # STT translation after correction
    with open('output_text/' + text_file[:-4] + '_corrected.txt', 'a', encoding='utf-8') as f:
        for sentence in sentences:
            f.write(sentence + '\n')

    # get summary with classical LexRank algorithm
    summary = lxr.get_summary(sentences, summary_size, threshold)
    with open('output_text/' + text_file[:-4] + '_sum.txt', 'a', encoding='utf-8') as f:
        for sentence in summary:
            f.write(sentence + '\n')
Example #16
    def _make_lexrank_obj(self, stemming=True):
        idf_docs = [
            doc for summ in self.summarizers for doc in summ.topic.docs
        ]
        if stemming:
            idf_docs = [Summarizer._stemming(doc) for doc in idf_docs]
        lxr = LexRank(idf_docs, stopwords=STOPWORDS['en'])
        # print(lxr._calculate_idf())
        return lxr
Example #17
def summary_nmf_method(file_folder, sumLen):

    sent_path = os.path.join(file_folder, 'sent_list.pkl')
    sent_list = joblib.load(sent_path)

    docs_path = os.path.join(file_folder, 'docs_list.pkl')
    docs = joblib.load(docs_path)

    n = len(sent_list)

    if n > 1:
        GRS_sen = get_grs_score(file_folder)
        surface_score = get_surface_score(docs)
        # p=pagerank(docs)

        lxr = LexRank(docs)
        lx = lxr.rank_sentences(docs, threshold=None, fast_power_method=True)

        lxr_score = np.array(lx)
        maxLex = lxr_score.max()
        lxr_score = (100 * lxr_score) / maxLex

        total_score = []

        for i in range(n):
            t_sum = float(GRS_sen[i]) + float(surface_score[i]) + float(lxr_score[i])
            total_score.append(t_sum)

        copy_score = total_score.copy()
        top_list = get_top_list(copy_score, sumLen)

        summary_final = ''

        for i in range(n):
            if total_score[i] in top_list:
                summary_final += sent_list[i] + ' \n '

        return summary_final
    elif n == 1:
        return sent_list[0]
    else:
        return 'No adequate sentences found for summary.'
Example #18
def lexrank_summarize(corpus):
	list_of_summarization = []

	documents = [ split_sentences(sample.replace("story_separator_special_tag", "\n")) for sample in corpus ]
	print("[" + "Document Size: " + str(len(documents)) + "]")
	print("[" + time.strftime("%H:%M:%S", time.localtime()) + "]", "Begin building LexRank model...")	
	lxr = LexRank(documents, stopwords=STOPWORDS['en'])
	print("[" + time.strftime("%H:%M:%S", time.localtime()) + "]", "LexRank model successfully built...")

	for i in range(len(documents)):
		sample = documents[i]
		summary = lxr.get_summary(sample, summary_size=len(sample))
		articles = corpus[i].split("story_separator_special_tag")

		words_counter = 0
		summary_counter = 0
		tmp_summary = [ [] for _ in range(len(articles)) ]

		while words_counter < 500 and summary_counter < len(summary):
			flag = 0
			for j in range(len(articles)):
				if summary[summary_counter] in articles[j]:
					tmp_summary[j].append(summary[summary_counter])
					words_counter += len(summary[summary_counter].split(" "))
					flag = 1
			if flag == 0:
				print("[Error] Summary not in original sample.", summary[summary_counter], i)
			summary_counter += 1
			
		# print("words_counter, summary_counter, total summary", words_counter, summary_counter, len(summary))
		for k in range(len(tmp_summary)):
			tmp_summary[k] = " newline_char ".join(tmp_summary[k])
		list_of_summarization.append(" story_separator_special_tag ".join(tmp_summary))

		if i %100 == 0:
			print("------")
			print(i)
			print("------")
		# if i == 100:
		# 	break

	return list_of_summarization
Example #19
def init_lexrank(review_path):
    """Pass the reviews text file (reviews only)"""
    global tokenizer, lxr
    reviews = []
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    review_file = Path(review_path)
    with review_file.open(mode='rt', encoding='utf-8') as fp:
        reviews.append(fp.readlines())

    lxr = LexRank(reviews, stopwords=STOPWORDS['en'])
    return
Example #20
    def _make_lexrank_obj(self):
        idf_docs = [doc for summ in self.summarizers for doc in summ.topic.docs]
        # print('idf_docs before length: {}'.format(len(idf_docs)))
        seen = set()
        idf_docs = [doc for doc in idf_docs if doc.id not in seen and not seen.add(doc.id)] # uniqify list of docs
        # print('idf_docs after length: {}'.format(len(idf_docs)))

        if STEMMING:
            idf_docs = [Summarizer._stemming(doc) for doc in idf_docs]
        lxr = LexRank(idf_docs, stopwords=STOPWORDS['en'])
        # print(lxr._calculate_idf())
        return lxr
Example #21
def _score_graph_initially(sim_mat, rel_vec, cid, damp, abs2sid=None):
    # todo: check if feeding placeholder documents to init LexRank does no harm
    # _, processed_sents = dataset_parser.cid2sents(cid, rm_dialog=rm_dialog)  # 2d lists, docs => sents
    # lxr = LexRank(processed_sents, stopwords=STOPWORDS['en'])
    doc_place_holder = [['test sentence 1', 'test sentence 2'], ['test sentence 3']]
    lxr = LexRank(doc_place_holder, stopwords=STOPWORDS['en'])
    params = {
        'similarity_matrix': sim_mat,
        'threshold': None,
        'fast_power_method': True,
        'rel_vec': rel_vec,
        'damp': damp,
    }
    scores = lxr.rank_sentences_with_sim_mat(**params)

    sid2score = dict()

    for abs_idx, sc in enumerate(scores):
        sid2score[abs2sid[abs_idx]] = sc

    return sid2score
Example #22
def getResume(sentences, summary_size, threshold):
    documents = []
    documents_dir = Path('./db')
    stopwords = {}
    stopwords_dir = Path('./static/stopwords-id.txt')

    for file_path in documents_dir.files('*.txt'):
        with file_path.open(mode='rt', encoding='utf-8',
                            errors='ignore') as fp:
            documents.append(fp.readlines())

    # get the stpwords
    with stopwords_dir.open(mode='rt', encoding='utf-8') as stopFile:
        stopwords['id'] = set(stopFile.readlines())
        stopFile.close()

    lxr = LexRank(documents, stopwords=stopwords['id'])

    summary = lxr.get_summary(sentences,
                              summary_size=int(summary_size),
                              threshold=threshold)

    return summary
Example #23
class Summarizer:
    def __init__(self, all_messages):
        self.documents = {}
        for stream, data in all_messages.items():
            for topic, messages in data["topics"].items():
                text = "\n".join(
                    self.clean_content(message) for message in messages)
                self.documents[(stream, topic)] = [
                    sentence.strip() for sentence in text.splitlines()
                    if sentence.strip()
                ]
        self.summarizer = LexRank(
            self.documents.values(),
            keep_emails=True,
            keep_urls=True,
            stopwords=STOPWORDS["en"],
        )

    def get_summary(self, stream, topic, show_url_list=True):
        document = self.documents[(stream, topic)]
        threshold = 0.03
        summary_size = 2 if len(document) > 5 else 1
        fast_power_method = True
        lex_scores = self.summarizer.rank_sentences(
            document, threshold=threshold, fast_power_method=fast_power_method)

        sorted_ix = np.argsort(lex_scores)[::-1]
        url_list = self.get_url_list(document) if show_url_list else []
        return (
            [document[i] for i in sorted(sorted_ix[:summary_size])],
            url_list,
        )

    @staticmethod
    def get_url_list(document):
        links = [
            "[{link}]({link})".format(
                link=plain_link) if plain_link else md_link
            for sentence in document
            for (md_link, plain_link) in LINK_RE.findall(sentence)
        ]
        return links

    @staticmethod
    def clean_content(message):
        content = message["content"]
        return PUNCTUATION_RE.sub("\\1\n", content)
Example #24
from lexrank import STOPWORDS, LexRank
from path import Path

documents = []
documents_dir = Path('bbc/politics')

for file_path in documents_dir.files('*.txt'):
    with file_path.open(mode='rt', encoding='utf-8') as fp:
        documents.append(fp.readlines())

lxr = LexRank(documents, stopwords=STOPWORDS['en'])

sentences = [
    'One of David Cameron\'s closest friends and Conservative allies, '
    'George Osborne rose rapidly after becoming MP for Tatton in 2001.',

    'Michael Howard promoted him from shadow chief secretary to the '
    'Treasury to shadow chancellor in May 2005, at the age of 34.',

    'Mr Osborne took a key role in the election campaign and has been at '
    'the forefront of the debate on how to deal with the recession and '
    'the UK\'s spending deficit.',

    'Even before Mr Cameron became leader the two were being likened to '
    'Labour\'s Blair/Brown duo. The two have emulated them by becoming '
    'prime minister and chancellor, but will want to avoid the spats.',

    'Before entering Parliament, he was a special adviser in the '
    'agriculture department when the Tories were in government and later '
    'served as political secretary to William Hague.',
Example #25
#!/usr/bin/python3
# -*- coding: utf-8 -*-

from lexrank import LexRank
from path import Path
import sys

documents = []
documents_dir = Path('Dataset')

for file_path in documents_dir.files('*.txt'):
    with file_path.open(mode='rt', encoding='utf-8') as fp:
        x = fp.readlines()
        i = x.index('\n')
        x = x[:i]
        documents.append(x)

lxr = LexRank(documents)

with open('Dataset/' + sys.argv[1], 'r') as f:
    sentences = list(f)

# get summary with classical LexRank algorithm
summary = lxr.get_summary(sentences, summary_size=3, threshold=0.15)
print(summary[0], end='')
print(summary[1], end='')
print(summary[2], end='')
Example #26
import os
from shutil import copyfile

log("load reference doc")
documents = []
document_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "bbc")

for dirPath, dirNames, fileNames in os.walk(document_dir):
    for f in fileNames:
        try:
            with open(os.path.join(dirPath, f), "rt", encoding="utf8") as file:
                documents.append(file.readlines())
        except Exception as e:
            log("path: %s" % os.path.join(dirPath, f))

lxr = LexRank(documents, stopwords=STOPWORDS['en'])


# Build the comparison file from the wiki and copy it to the designated folder
def reference_file(file_group, task_number, sim_type, i):
    ref = [
        "brexit", "missile", "brexit", "brexit", "brexit", "brexit", "catalan",
        "catalan", "crimea", "crimea", "gravitational", "gravitational",
        "brexit", "hk", "catalan", "sewol", "syria", "syria", "turkish"
    ]

    wiki_file = ref[file_group]
    if not os.path.exists(
            "lexrank/{sim_type}/reference".format(sim_type=sim_type)):
        os.mkdir("lexrank/{sim_type}/reference".format(sim_type=sim_type))
Example #27
class ImportanceEstimationModel(object):

    def load_data(self, train_path, dev_path, test_path):
        train_data = json.load(open(train_path))
        dev_data   = json.load(open(dev_path))
        test_data  = json.load(open(test_path))

        self.train_data = train_data
        self.test_data  = test_data
        self.dev_data   = dev_data

        return self.train_data, self.dev_data, self.test_data

    def print_train_sample(self):
        sample = random.choice(self.train_data)
        claims = sample[0]
        conc   = sample[1]
        print('Conclusion:', conc)
        print('Claims:')
        for idx, claim in enumerate(claims):
            print(idx+1, '.', claim)

    def num_of_pos_tags_feature(self, claim):
        claim_annotated = self.nlp_parser.sentences_to_tags([claim])
        claim_pos_tags = set([x[1] for x in claim_annotated[0]])
        return len(claim_pos_tags)

    def num_of_ne_feature(self, claim):
        named_entities = self.nlp_parser.extract_named_entities(claim)
        return len(named_entities)


    def _build_tfidf_model(self, texts):
        tfidf = TfidfVectorizer()
        tfidf_model = tfidf.fit(texts)
        return tfidf_model

    def _build_lexrank_model(self, texts):
        self.lxrank = LexRank(texts, stopwords=STOPWORDS['en'])

    def _sentiment_features(self, claim):
        claim_words = nltk.word_tokenize(claim)

        num_of_positive_words = 0
        num_of_negative_words = 0
        num_of_neutral_words  = 0
        for word in claim_words:
            synsets = list(swn.senti_synsets(word))
            if len(synsets) == 0:
                num_of_neutral_words +=1
            else:
                syn = synsets[0]
                if syn.pos_score() > syn.neg_score():
                    num_of_positive_words +=1
                elif syn.pos_score() < syn.neg_score():
                    num_of_negative_words +=1
                else:
                    num_of_neutral_words+=1
        
        return num_of_positive_words, num_of_negative_words , num_of_neutral_words


    def _num_of_words_feature(self, claim):
        claim_words = nltk.word_tokenize(claim)
        return len(claim_words)

    def _tfidf_features(self, claim):
        claim_words = nltk.word_tokenize(claim)
        
        # Avg. tfidf
        tfidf_vector = self.tfidf_model.transform([claim])
        avg_tfidf_feature = np.sum(tfidf_vector.toarray())/len(claim_words)
        max_tfidf_feature = np.max(tfidf_vector.toarray())

        return avg_tfidf_feature, max_tfidf_feature

    def _claim_features(self, claim, claims_text):

        # Number of words
        num_of_words_feature = self._num_of_words_feature(claim['text'])
        
        # Avg. Max. tfidf
        avg_tfidf_feature, max_tfidf_feature = self._tfidf_features(claim['text'])
        
        # Number of postive/negative/neutral words
        num_of_positive_words, num_of_negative_words , num_of_neutral_words  = self._sentiment_features(claim['text'])
        
        # Number of POS tags and Number of Named Entities
        poss = set([p['type'] for p in claim['pos']])
        num_of_pos_tags = len(poss)
        num_of_ne  = len(claim['named_entities'])


        return [num_of_words_feature, 
                avg_tfidf_feature, 
                max_tfidf_feature,
                num_of_positive_words, 
                num_of_negative_words, 
                num_of_neutral_words,
                num_of_ne, num_of_pos_tags]

    def _instance_features(self, claims):
        claims_sents = [claim['text'] for claim in claims] 
        claims_text  = ' '.join(claims_sents)

        claims_centroidness_scores = self.lxrank.rank_sentences(claims_sents, threshold=None, fast_power_method=False)
        claims_features = [self._claim_features(claim, claims_text) + [claims_centroidness_scores[i]] for i, claim in enumerate(claims)]

        return np.atleast_2d(claims_features)

    def instance_scores(self, claims, summary):
        claims_labels = []
        for claim in claims:
            claim_tokens   = set(nltk.word_tokenize(claim['text']))
            summary_tokens = set(nltk.word_tokenize(summary))

            shared_tokens = claim_tokens.intersection(summary_tokens)

            #overlap_ratio = len(shared_tokens)/(len(claim_tokens) + len(summary_tokens))
            
            claims_labels.append(len(shared_tokens))
        
        return claims_labels

    def feature_representation(self, data):
        # 1. build a tf-idf model over the training data
        arguments = [' '.join([claim['text'] for claim in argument['claims']]) for argument in data]
        self.tfidf_model = self._build_tfidf_model(arguments)

        arguments = [[claim['text'] for claim in argument['claims']] for argument in data]
        self.lxrank_model = self._build_lexrank_model(arguments)

        # 2. Encode training data into features
        self.train_X = []
        self.train_Y = []

        for argument in data:
            claims     = argument['claims']
            conclusion = argument['conclusion']['text']

            claims_vectors = self._instance_features(claims)
            claims_scores  = self.instance_scores(claims, conclusion)

            for claim_vector, claim_label in zip(claims_vectors, claims_scores):
                self.train_X.append(claim_vector)
                self.train_Y.append(claim_label)


        self.train_X = np.array(self.train_X)
        self.train_Y = np.array(self.train_Y)

        #Normalize claims_scores into [0,1]
        labels_scaler = MinMaxScaler()
        labels_scaler.fit(self.train_Y.reshape(-1, 1))
        self.train_Y = labels_scaler.transform(self.train_Y.reshape(-1, 1)).reshape(-1)

        return self.train_X, self.train_Y

    def train_svr(self, train_X, train_Y):
        svr_params = {'C': [0.001, 0.1, 1.0, 10, 100]}
        svr = SVR()
        
        clf = GridSearchCV(svr, svr_params, cv=5, scoring='neg_mean_absolute_error', return_train_score=False)
        clf.fit(train_X, train_Y)

        best_ridge = clf.best_estimator_
        
        self.best_ridge = best_ridge

        return clf.best_score_


    def kendalltau_evaluation(self, test_data):
        from scipy import stats
        
        total_tau = 0
        for sample in test_data:
            claims     = sample[0]
            conclusion = sample[1]

            #Predict scores of each claim
            claims_vectors = self._instance_features(claims)
            ground_truth_scores  = self.instance_scores(claims, conclusion)
            ground_pred_scores  = self.best_ridge.predict(claims_vectors)

            tau, _ = stats.kendalltau(ground_truth_scores, ground_pred_scores)
            total_tau += tau

        return total_tau/len(test_data)

    def score_data(self, data):
        for sample in data:
            claims     = sample['claims']
            conclusion = sample['conclusion']['text']

            #Predict scores of each claim
            claims_vectors = self._instance_features(claims)
            claims_scores  = self.best_ridge.predict(claims_vectors)

            for claim, score in zip(claims, claims_scores):
                claim['importance_score'] = score

            sample['claims'] = claims

        return data

    def mrr_evaluation(self, test_data):
        mrr_value = 0
        for sample in test_data:
            claims     = sample['claims']
            conclusion = sample['conclusion']['text']

            #Predict scores of each claim
            claims_vectors = self._instance_features(claims)
            claims_labels  = self.instance_scores(claims, conclusion)
            claims_labels  = [c_score > 0 for c_score in claims_labels]

            claims_scores  = self.best_ridge.predict(claims_vectors)

            #Sort claims based on the score
            scores_labels_list = list(zip(claims_scores, claims_labels))
            sorted_claims = sorted(scores_labels_list, key= lambda x : -x[0])

            rank = 1
            for x in  sorted_claims:
                if x[1]:
                    break
                rank +=1

            mrr_value += 1/rank

        return mrr_value/len(test_data)
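A hedged end-to-end sketch of how ImportanceEstimationModel appears intended to be used. The JSON paths are placeholders, and it assumes the files follow the dict layout (arguments with 'claims' carrying text/pos/named_entities and a 'conclusion' with text) that feature_representation and mrr_evaluation consume.

# Sketch only; paths and data layout are assumptions, not part of the original code.
model = ImportanceEstimationModel()
train_data, dev_data, test_data = model.load_data(
    'train.json', 'dev.json', 'test.json')

# Builds the tf-idf and LexRank models internally, then encodes features.
train_X, train_Y = model.feature_representation(train_data)
print('best CV score (negated MAE):', model.train_svr(train_X, train_Y))

print('MRR on test:', model.mrr_evaluation(test_data))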
Example #28
from channels.generic.websocket import AsyncWebsocketConsumer
from lexrank import STOPWORDS, LexRank
from path import Path

from .search_resources import final_resources, wiki_summary
from .unfurling import OPG
from .keyword_ner_search_query import NER

# Run the following command in terminal to connect to redis channel
# docker run -p 6379:6379 -d redis:5

print('loading dataset and initializing...')
documents = []
documents_dir = Path('/home/pranshu/GAMR/gamr/meetingmode/total')

for file_path in documents_dir.files('*.txt'):
    with file_path.open(mode='rt', encoding='latin1') as fp:
        documents.append(fp.readlines())

lxr = LexRank(documents, stopwords=STOPWORDS['en'])
print('dataset load done!')
print('server is running!')


class StudyConsumer(AsyncWebsocketConsumer):
    async def connect(self):
        self.room_name = self.scope['url_route']['kwargs']['username']
        self.room_group_name = 'study_%s' % self.room_name

        # Join room group
        await self.channel_layer.group_add(self.room_group_name,
                                           self.channel_name)

        await self.accept()
Example #29
cliche_sentences = get_filler_sentences(
    '../gen_backend/backend/cliche_sentences.txt')
horror_sentences = get_filler_sentences(
    '../gen_backend/backend/horror_sentences.txt')
romance_sentences = get_filler_sentences(
    '../gen_backend/backend/romance_sentences.txt')
violence_sentences = get_filler_sentences(
    '../gen_backend/backend/violence_sentences.txt')

num_boring = (((int(boring_rating) / 5) * 50) / 100) * len(boring_sentences)
num_cliche = (((int(cliche_rating) / 5) * 50) / 100) * len(boring_sentences)
num_horror = (((int(horror_rating) / 5) * 50) / 100) * len(boring_sentences)
num_romance = (((int(romance_rating) / 5) * 50) / 100) * len(boring_sentences)
num_violence = ((
    (int(violence_rating) / 5) * 50) / 100) * len(boring_sentences)

lxr = LexRank(paraphrase_summary, stopwords=STOPWORDS['en'])
boring_scores_cont = lxr.rank_sentences(boring_sentences,
                                        threshold=None,
                                        fast_power_method=True)
cliche_scores_cont = lxr.rank_sentences(cliche_sentences,
                                        threshold=None,
                                        fast_power_method=True)
horror_scores_cont = lxr.rank_sentences(horror_sentences,
                                        threshold=None,
                                        fast_power_method=True)
romance_scores_cont = lxr.rank_sentences(romance_sentences,
                                         threshold=None,
                                         fast_power_method=True)
violence_scores_cont = lxr.rank_sentences(violence_sentences,
                                          threshold=None,
                                          fast_power_method=True)
Example #30
    def _build_lexrank_model(self, texts):
        self.lxrank = LexRank(texts, stopwords=STOPWORDS['en'])
Example #31
from lexrank import LexRank
from lexrank.mappings.stopwords import STOPWORDS
from path import Path

documents = []

paths = Path('./bbc-fulltext/bbc').glob('politics/')
# print(paths)
for category in paths:
    for file_path in category.files('*.txt'):
        with file_path.open(mode='r', encoding='utf-8') as file:
            documents.append(file.readlines())

# print(documents[:5])

lxr = LexRank(documents, stopwords=STOPWORDS['en'])

# url = 'https://www.nytimes.com/2021/03/18/opinion/anti-asian-american-violence.html'
# article = Article(url)
# article.download()
# article.parse()

# text = article.text

text = '''
The grim reality of modern American life is that each new mass killing leads to a fevered study of motives and meaning. Was the latest shooter motivated by racism, misogyny, religion, revenge or some combination thereof? Those are not questions that members of a healthy society should routinely be forced to ask or answer.

After eight people — including six people of Asian descent and seven women — were shot to death in Georgia this week, a deputy sheriff chalked the killings up to the suspect’s confessed “sex addiction,” adding that “yesterday was a really bad day” for the alleged shooter. That diagnosis was met with the skepticism it deserved: The same deputy promoted the sale of anti-Asian T-shirts that referred to the coronavirus as an import from “Chy-na.”

It’s difficult to disentangle the vile pathologies that lead a man to take so many innocent lives. It’s also impossible to ignore the context in which the murders were committed and the impact that the tragedy has had on communities across America. In an analysis of nearly 4,000 hate-related incidents targeting Asian-Americans documented this year and last, nearly 70 percent of the victims were women, according to a report by the group Stop AAPI Hate. New York was the second state behind California in the total number of incidents documented by the group.