Code example #1
    def __init__(self, name):

        self.stemmer = Stemmer('english')
        self.name = name

        if name == "TextRankSummarizer":
            self.summarizer = TextRankSummarizer(self.stemmer)
        elif name == "LsaSummarizer":
            self.summarizer = LsaSummarizer(self.stemmer)
        elif name == "LuhnSummarizer":
            self.summarizer = LuhnSummarizer(self.stemmer)
        elif name == "LexRankSummarizer":
            setattr(LexRankSummarizer, 'rate_sentences', rate_sentences)
            self.summarizer = LexRankSummarizer(self.stemmer)

        elif name == "SumBasicSummarizer":
            self.summarizer = SumBasicSummarizer(self.stemmer)
        elif name == "KLSummarizer":
            self.summarizer = KLSummarizer(self.stemmer)

        #summarizer = EdmundsonSummarizer(stemmer)
        self.summarizer.stop_words = get_stop_words('english')
Code example #2
def test_cosine_similarity_for_the_same_sentence_with_duplicate_words_should_be_one(
):
    """
    We compute the similarity of two identical sentences. They are exactly the same and
    should therefore have a similarity close to 1.0.
    see https://github.com/miso-belica/sumy/issues/58
    """
    sentence1 = ["this", "sentence", "is", "simple", "sentence"]
    tf1 = {"this": 1 / 2, "sentence": 1.0, "is": 1 / 2, "simple": 1 / 2}
    sentence2 = ["this", "sentence", "is", "simple", "sentence"]
    tf2 = {"this": 1 / 2, "sentence": 1.0, "is": 1 / 2, "simple": 1 / 2}
    idf = {
        "this": 2 / 2,
        "sentence": 2 / 2,
        "is": 2 / 2,
        "simple": 2 / 2,
    }

    summarizer = LexRankSummarizer()
    cosine = summarizer.cosine_similarity(sentence1, sentence2, tf1, tf2, idf)

    assert abs(1.0 - cosine) < 0.00001
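The test above relies on sumy's idf-modified cosine similarity. As a rough illustration of why two identical sentences score 1.0, here is a minimal sketch of that similarity computed from the same tf/idf maps used in the test; the function name and the exact formula are illustrative assumptions, not sumy's internal code.

import math

def idf_modified_cosine(words1, words2, tf1, tf2, idf):
    # Numerator: sum of tf1(w) * tf2(w) * idf(w)^2 over words shared by both sentences.
    common = set(words1) & set(words2)
    numerator = sum(tf1[w] * tf2[w] * idf[w] ** 2 for w in common)
    # Denominator: product of the tf*idf vector norms of the two sentences.
    norm1 = math.sqrt(sum((tf1[w] * idf[w]) ** 2 for w in set(words1)))
    norm2 = math.sqrt(sum((tf2[w] * idf[w]) ** 2 for w in set(words2)))
    if norm1 == 0.0 or norm2 == 0.0:
        return 0.0
    return numerator / (norm1 * norm2)

# With the identical sentences from the test, numerator and denominator coincide, so the result is 1.0.
words = ["this", "sentence", "is", "simple", "sentence"]
tf = {"this": 1 / 2, "sentence": 1.0, "is": 1 / 2, "simple": 1 / 2}
idf = {"this": 1.0, "sentence": 1.0, "is": 1.0, "simple": 1.0}
print(idf_modified_cosine(words, words, tf, tf, idf))  # -> 1.0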
Code example #3
def text():
    st.markdown(""" # Text Summarization

    ## What is Text Summarization?
    Text Summarization is the process of extracting a short summary from a large original text without losing vital information. A good summary should be fluent and capture the significant points. The main objective is to identify the most significant sentences of the text and add them to the summary. Note that the summary obtained here contains exact sentences from the original text.

    
    
    ## Why do we need Text Summarization?
    It is useful for condensing lengthy texts and articles. Informative news summaries are now everywhere, from magazines to news aggregator apps, and they rely on text summarization methods.

    ## Some common libraries for Text Summarization

    1. NLTK

    2. SpaCy

    3. pysummarization

    Type and click the button below to see the implementation of Text Summarization.
    
        """, True)

    val = st.text_input("Enter a paragraph")
    corp = []
    my_parser = PlaintextParser.from_string(val,Tokenizer('english'))
    lex_rank_summarizer = LexRankSummarizer()
    lexrank_summary = lex_rank_summarizer(my_parser.document, sentences_count=3)
    for sentence in lexrank_summary:
        corp.append(sentence)
  

    out = corp

    
   
    if st.button("Text Summarization"):

        st.success(out)
Code example #4
def main():
    parser = argparse.ArgumentParser(description='Make barchart from csv.')
    parser.add_argument('-d',
                        '--debug',
                        help='Debugging output',
                        action='store_true')
    parser.add_argument('csvfile',
                        type=argparse.FileType('r'),
                        help='Input csv file')
    args = parser.parse_args()

    print('main(): type(args.csvfile) = {}'.format(type(args.csvfile)))
    print('')

    ### This works
    df = pd.read_csv(args.csvfile)

    summary_holder = []

    try:
        summarizer = LexRankSummarizer()
        for url in df.valid_website:
            parser = HtmlParser.from_url(url, Tokenizer("english"))
            summary = summarizer(parser.document, 2)

            #saving the summary to a dataframe
            for sentence in summary:
                summary_holder.append(sentence)
                df['summary'] = pd.DataFrame(summary_holder)

        #save dataframe as CSV
        df.to_csv('summaried4', encoding='utf-8', index=False)

    except Exception as exc:
        print('Error while summarizing: {}'.format(exc))

    return df[['valid_website', 'summary']]
Code example #5
def get_summaries_from_list_of_abstracts(list_of_abstracts, summarizer_type):

    if summarizer_type == 'lsa':
        summarizer = LsaSummarizer(Stemmer("english"))
    elif summarizer_type == 'luhn':
        summarizer = LuhnSummarizer(Stemmer("english"))
    elif summarizer_type == 'lexrank':
        summarizer = LexRankSummarizer(Stemmer("english"))
    elif summarizer_type == 'textrank':
        summarizer = TextRankSummarizer(Stemmer("english"))

    summarizer.stop_words = get_stop_words("english")

    list_of_summaries = []

    for abstract in list_of_abstracts:
        parser = PlaintextParser(abstract, Tokenizer("english"))
        summary = summarizer(parser.document, 3)
        summary_string = " ".join(map(str, summary))
        list_of_summaries.append(summary_string)

    print(list_of_summaries)

    return list_of_summaries
Code example #6
def get_data(badtags):
    # i is a variable used to iterate over the list of titles
    i = -1
    #while i<len(q)-1:
    while i < 83:
        i += 1
        print "\n***********************************************************************************"
        print i
        if i == len(q) - 1:
            break
        pageno = (int)(q[i][2])
        endpageno = (int)(q[i + 1][2])
        title_tokenscopy = q[i][1].split()

        if q[i][1].lower() in badtags:

            print q[i][1], ' - skipped'
            i = remove_badtag(i)
            continue

        nxttitle_tokenscopy = q[i + 1][1].split()
        found = 0
        txt_tokens = []
        k = 0

        text = convert_page_to_txt(pdfname, pageno, endpageno)

        text = to_unicode(text).strip()

        text = re.sub(u'(\u2018|\u2019|\u201c|\u201d)', "", text)
        text = re.sub(u'cid:10|cid:9|cid:8|cid:7|cid:13|cid:14|cid:15', "",
                      text)
        text = re.sub(u'\ufb01', "fi", text)
        text = re.sub(u'\ufb02', "fl", text)
        text = re.sub(u'\xa0', ' ', text)

        text = text.replace('()', '')

        txt_tokens = text.split()

        for a in range(0, len(txt_tokens) - 1):
            r = ''
            #print "-------", txt_tokens[a], title_tokenscopy[0]
            if txt_tokens[a] == title_tokenscopy[0]:
                r += txt_tokens[a]
                for k in range(1, len(title_tokenscopy)):
                    a += 1
                    r += ' ' + txt_tokens[a]

                k += 1
                #print ''.join(r),q[i][1],'   ',fuzz.ratio(''.join(r),q[i][1])

                if (len(title_tokenscopy) > 3
                        and fuzz.ratio(''.join(r), q[i][1]) > 92) or (
                            len(title_tokenscopy) <= 3
                            and fuzz.ratio(''.join(r), q[i][1]) == 100):
                    #print txt_tokens[a:a+11]
                    if not txt_tokens[a + 1].isdigit() and (
                            txt_tokens[a + 1][0].isalpha()
                            or '.' in txt_tokens[a + 1]):
                        eflag = 0
                        add_node = 0
                        no_lines = 1
                        print q[i][1]
                        print 'Match Found'
                        string = ""
                        a += 1
                        for y in range(a, len(txt_tokens)):
                            if func(txt_tokens, txt_tokens[y], y,
                                    nxttitle_tokenscopy) == 1:
                                #print txt_tokens[y]
                                string += txt_tokens[y] + " "
                            else:
                                found = 1
                                break
                        # to remove exact repetition of the title in the content caused by problems with the pdf format
                        for z in net_graph.predecessors(
                                q[i][1]) + net_graph.successors(q[i][1]):
                            print z
                            string = ''.join(string.split(z))

                        string = ''.join(string.split(q[i][1]))

                        if not string.isspace() and string != '':
                            add_node = check(q[i][1], q[i + 1][1])
                        else:
                            eflag = 1

                        parser1 = PlaintextParser.from_file(
                            string, Tokenizer("english"))
                        parser2 = PlaintextParser.from_file(
                            q[i][1] + book_name + ' ' + ' '.join(
                                net_graph.predecessors(q[i][1]) +
                                net_graph.successors(q[i][1])),
                            Tokenizer("english"))

                        #print parser.document
                        summarizer = LexRankSummarizer()

                        lr_score = summarizer(parser1.document,
                                              parser2.document, 5)

                        #print ' '.join(net_graph.predecessors(q[i][1])+net_graph.successors(q[i][1]))

                        no_lines = len(string.split('.'))
                        #print no_lines

                        if add_node:
                            graph_list.append([(int)(q[i][0]), q[i][1], 0])
                            graph_list.append([(int)(q[i][0]) + 1,
                                               'Chapter Introduction',
                                               lr_score / no_lines])
                        else:
                            if eflag == 1:
                                lr_score = 0
                            graph_list.append([(int)(q[i][0]), q[i][1],
                                               lr_score / no_lines])

                        #print q[i][1]+book_name+' '+' '.join(net_graph.predecessors(q[i][1])+net_graph.successors(q[i][1]))
                        print string
                        #print lr_score
                        #print lr_score/no_lines

                if found == 1:
                    break

        #print [z[1] for z in graph_list]

        if q[i][0] == '1' and q[i][1] not in [z[1] for z in graph_list]:
            print 'added'
            graph_list.append([(int)(q[i][0]), q[i][1], 0])

    #print i ,"last"

    #for the last bookmark
    lastindex = len(q) - 1
    #print skip,' last'
    if q[i][1].lower() in badtags:
        print q[i][1], ' - skipped'
    if i == lastindex and q[i][1].lower() not in badtags:

        fp = open(pdfname, "rb")
        pdf = PdfFileReader(fp)
        totalPages = pdf.getNumPages()
        pageno = page_title[(str)(q[lastindex][1])]
        title_tokenscopy = q[i][1].split()
        text = convert_page_to_txt(pdfname, pageno, totalPages)
        txt_tokens = text.split()
        for a in range(0, len(txt_tokens)):
            #print"entry 1"
            found = 0
            if txt_tokens[a] == title_tokenscopy[0]:
                #print "entry entry"
                k = 0
                for k in range(1, len(title_tokenscopy)):
                    #print "entry 2"
                    a += 1
                    if txt_tokens[a] == title_tokenscopy[k]:
                        #print "entry 3"
                        continue
                    else:
                        #print "entry 4"
                        break
                    #if len(titlee_tokens)
                    #k+=1
                print k
                if k + 1 == len(title_tokenscopy) or (len(title_tokenscopy)
                                                      == 1 and k == 1):
                    if not txt_tokens[a + 1].isdigit() and txt_tokens[
                            a + 1][0].isalpha():
                        print 'Match Found'
                        found = 1
                        string = ""
                        a += 1
                        for y in range(a, len(txt_tokens)):
                            string += txt_tokens[y] + " "
                        print string
                        print(
                            "\t\t********************************************************************************************************\n"
                        )
                    if found:
                        print "the end"
                        break

    for i in range(0, len(graph_list)):
        print graph_list[i]
Code example #7
ra = " "
for sa in neg_ben:
    ra = ra + sa

ra = ra.replace("'", '"')
ra = ra.replace('"', "'")

from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

lexi = LexRankSummarizer(Stemmer("english"))
texi = TextRankSummarizer(Stemmer("english"))

parser = PlaintextParser.from_string(ra, Tokenizer("english"))


rentence = "dddd"
for sentence in texi(parser.document,
                     20):  # This does indeed summarise the document

    if (str(rentence).split()[len(str(rentence).split()) - 1][-1]
            == ".") and (len(rentence) > 2):
        rentence = rentence + " " + str(sentence)
    elif len(rentence) < 3:
        rentence = rentence + " " + str(sentence)
Code example #8
File: CodSandra.py Project: a23ls/Testar
from sumy.parsers.plaintext import PlaintextParser #We're choosing a plaintext parser here, other parsers available for HTML etc.
from sumy.nlp.tokenizers import Tokenizer 
from sumy.summarizers.lsa import LsaSummarizer #We're choosing LSA; other algorithms are also built in
from sumy.summarizers.lex_rank import LexRankSummarizer


for i in range(18,267):
    page = read_pdf.getPage(i)
    page_content = page.extractText().replace("\n","")
    parser = PlaintextParser.from_string(page_content, Tokenizer("portuguese"))
    summarizer_lsa = LsaSummarizer()
    summaries.append(list(summarizer_lsa(parser.document,3)))
    parser_rank = PlaintextParser.from_string(page_content, Tokenizer("portuguese"))
    summarizer_lex = LexRankSummarizer()
    summaries_Rank.append(list(summarizer_lex(parser_rank.document,3)))

print(pdf_file)
## second modification
##
Code example #9
File: run_sumy.py Project: abiraja2004/awesome_nlp
def run_LexRank(stemmer, document):
    lex = LexRankSummarizer(stemmer)
    lex.stop_words = get_stop_words(LANGUAGE)
    print("LexRank")
    return [x for x in lex(document, SENTENCES_COUNT)]
Code example #10
def main():
    st.sidebar.header("Settings")

    article = st.sidebar.selectbox('Select article', ['Health Data', 'Corona Virus', 'Huawei'])
    if article == 'Health Data':
        article_file = 'text_sample_1.txt'
    elif article == 'Corona Virus':
        article_file = 'text_sample_2.txt'
    else:
        article_file = 'text_sample_3.txt'
    
    # GET DATA
    text = load_data(article_file)
    
    # TABLE
    st.sidebar.subheader('Data view')
    if st.sidebar.checkbox('Show Full Text', False):
        '''
        ### Data
        '''
        text

    # TABLE
    st.sidebar.subheader('Summary view')
    if st.sidebar.checkbox('Gensim Summary', True):
        '''
        ### Gensim Summary
        '''
        sentences_ratio = st.sidebar.slider('Ratio of sentences in summary', 0.05, 1.0, 0.25, 0.05)
        gensim_summary_list = summarize(text, ratio=sentences_ratio, split=True)
        gensim_summary = ' '.join(gensim_summary_list)
        gensim_summary

    if st.sidebar.checkbox('Sumy Summary', True):
        '''
        ### Sumy Summary
        '''
        num_sentences = st.sidebar.slider('Number of sentences in summary', 1, 15, 9, 1)
        # https://www.aaai.org/Papers/JAIR/Vol22/JAIR-2214.pdf
        parser = PlaintextParser.from_string(text,Tokenizer("english"))
        lex_summarizer = LexRankSummarizer()
        sumy_lex_rank = lex_summarizer(parser.document,num_sentences)
        sumy_summary_list = [str(sentence) for sentence in sumy_lex_rank]
        sumy_summary = ' '.join(sumy_summary_list)
        sumy_summary

    if st.sidebar.checkbox('Tf-Idf Summary', True):
        '''
        ### Tf-Idf Summary
        '''
        threshold = st.sidebar.slider('Tf-Idf threshold factor', 0.0, 1.0, 0.75, 0.01)
        tf_idf_summary = tf_idf().summarize(text, threshold)
        tf_idf_summary
   
    # ABOUT
    st.sidebar.header('About')
    st.sidebar.info('**Text Summarization App**\n' + \
        'Examples using `Gensim`, `Sumy` and `NLTK + custom Tf-Idf` implementations.\n\n' + \
        '(c) 2020. Oxford Economics Ltd. All rights reserved.')
    st.sidebar.markdown('---')

    # Display Readme.md
    if st.sidebar.checkbox('Readme', False):
        st.markdown('---')
        '''
        ### Readme
        '''
        with open('./README.md', 'r', encoding='utf-8') as f:
            readme = f.read()
            st.markdown(readme)

    # Style
    st.sidebar.markdown('---')
    if st.sidebar.checkbox('Configure Style'):
        BlockContainerStyler().block_container_styler()
Code example #11
File: rank.py Project: rab248b/SoccerEvent
 def sumyTest(self):
     text = '"Substitution sub-out Wilfried Zaha sub-in James McArthur . Palace\'s second change sees the arrival of sub-in James McArthur, with Zaha surprisingly giving way."'
     parser = PlaintextParser.from_string(text, Tokenizer("english"))
     summarizer = LexRankSummarizer()
     print summarizer(parser.document, 1)
Code example #12
File: utils.py Project: aasimkhan30/fake-net
def txt_summary(doc, sentences_num):
    parser = PlaintextParser.from_string(doc, Tokenizer("english"))
    summarizer = LexRankSummarizer()
    summary = summarizer(parser.document, sentences_num)
    return summary
Code example #13
def summarize(request):
    """Responds to any HTTP request.
    Args:
        request (flask.Request): HTTP request object.
    Returns:
        The response text or any set of values that can be turned into a
        Response object using
        `make_response <http://flask.pocoo.org/docs/1.0/api/#flask.Flask.make_response>`.
    """
    # request_json = request.get_json()
    # if request.args and 'message' in request.args:
    #     return request.args.get('message')
    # elif request_json and 'message' in request_json:
    #     return request_json['message']
    # else:
    #     return f'Hello World!'
    try:
        if request.method == 'OPTIONS':
            # Allow GET requests from any origin with the Content-Type
            # header and cache the preflight response for 3600s
            headers = {
                'Access-Control-Allow-Origin': '*',
                'Access-Control-Allow-Methods':
                'GET, POST, PUT, PATCH, DELETE, OPTIONS',
                'Access-Control-Allow-Headers':
                'DNT,User-Agent,X-Requested-With,If-Modified-Since,Cache-Control,Content-Type,Range,Authorization',
                'Access-Control-Expose-Headers':
                'Content-Length,Content-Range',
                'Access-Control-Max-Age': '3600'
            }
            return ('', 204, headers)

        headers = {
            'Access-Control-Allow-Origin': '*',
        }
        request_json = request.get_json()
        document = request_json['value']
    except:  # for local runs (e.g. `py main.py`), request may be a plain dict
        headers = None
        document = request['value']
    finally:

        parser = PlaintextParser.from_string(document, Tokenizer("english"))

        summaries = {}
        number_pool = [0, 1, 2, 3]
        random.shuffle(number_pool)
        print(number_pool)

        for i in range(len(number_pool)):
            if number_pool[i] == 0:
                summarizer = LexRankSummarizer()
            if number_pool[i] == 1:
                summarizer = LuhnSummarizer()
            if number_pool[i] == 2:
                summarizer = LsaSummarizer(Stemmer("english"))
                summarizer.stop_words = get_stop_words("english")
            if number_pool[i] == 3:
                summarizer = PureNLTKSummarizer()

            summary = summarizer(parser.document, 3)
            sum_string = []
            for sentence in summary:
                sum_string.append(str(sentence))
            summaries[f'{i}'] = " ".join(sum_string)
        if headers is None:
            return summaries
        return (summaries, 200, headers)
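Because of the bare except above, the same function can also be exercised outside the Cloud Functions runtime by passing a plain dict, which is what the local-run comment hints at. A hypothetical local-run sketch (the __main__ guard and sample text are assumptions, not part of the original module):

if __name__ == "__main__":
    # A dict has no .method attribute, so the except branch runs:
    # headers stays None and the document is read from request['value'].
    fake_request = {
        "value": (
            "LexRank ranks sentences on a similarity graph. "
            "Luhn scores sentences by clusters of significant words. "
            "LSA applies singular value decomposition to a term-sentence matrix. "
            "Each summarizer here returns a three-sentence summary."
        )
    }
    print(summarize(fake_request))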
Code example #14
    def test(self, summaryAlgo="LexRank"):
        """
            Runs the algorithm on test set data
        """
        with open("objects//model" + '.pkl', 'rb') as fm:
            [
                self.model1, self.count_vectorizer1, self.model2,
                self.count_vectorizer2
            ] = pickle.load(fm)
        with open("objects//test_entities" + '.pkl', 'rb') as fe:
            lst = pickle.load(fe)
        removeDic = lst[0]
        self.directory = "demoData/testData"
        fileList = []
        label1 = array(['action', 'Not action'])
        label12 = array([
            u'action', u'yellow-card', u'substitution', u'assist', u'goal',
            u'penalty-goal', u'red-card', u'own-goal', u'missed-penalty',
            u'penalty-save', u'yellow-red'
        ])
        confusion1 = np.array([[0 for x in range(len(label1))]
                               for y in range(len(label1))])
        confusion12 = np.array([[0 for x in range(len(label12))]
                                for y in range(len(label12))])
        scores1 = []
        scores12 = []
        if (summaryAlgo == "TextRank"):
            summarizer = TextRankSummarizer()
        else:
            summarizer = LexRankSummarizer()

        for file in os.listdir(self.directory):
            if file.endswith(".csv") and not (os.stat(self.directory + "\\" +
                                                      file).st_size == 0):
                fileList.append(file)
        for file in fileList:
            removeList = removeDic[file]
            [data1, data2] = self.build_data_frame(file, removeList)
            testCount1 = self.count_vectorizer1.transform(
                data1['words'].values)
            predicted1 = self.model1.predict(testCount1)
            testy1 = data1['class1'].values
            data12 = data1[predicted1 == "Not action"]
            testy12 = data1['class2'].values
            testCount12 = self.count_vectorizer2.transform(
                data12['words'].values)
            predicted12 = self.model2.predict(testCount12)
            score1 = f1_score(testy1, predicted1, pos_label="Not action")
            confusion1 += confusion_matrix(testy1, predicted1, labels=label1)
            scores1.append(score1)
            predicted1[predicted1 == "Not action"] = predicted12
            score12 = f1_score(testy12, predicted1, average='weighted')
            print(score1, score12)
            confusion12 += confusion_matrix(testy12,
                                            predicted1,
                                            labels=label12)
            scores12.append(score12)
            for index in data12.index:
                commentary = data12['commentary'][index]
                commentary = commentary[1:-1]
                minute = data12['minute'][index]
                parser = PlaintextParser.from_string(commentary,
                                                     Tokenizer("english"))
                summary = summarizer(parser.document, 1)
                print(minute, [sentence for sentence in summary])
        print('Total commentary classified:', len(data1.commentary.values))
        print('Score1:', sum(scores1) / len(scores1))
        print('Confusion matrix1:')
        print(confusion1)
        print('Score2:', sum(scores12) / len(scores12))
        print('Confusion matrix2:')
        print(confusion12)
Code example #15
def lexrank(doc, refsum):
    stemmer = Stemmer("english")
    summarizer = LexRankSummarizer(stemmer)
    summarizer.stop_words = get_stop_words("english")
    summary = summarize(doc, summarizer)
    return evaluate(summary, refsum)
Code example #16
def analyze_text(text):
    ret = {}
    # language identification
    language = settings.LANG_ID.classify(text)[0]
    lang = settings.LANGUAGE_MODELS[language]
    ret = {}
    doc = lang(cleanMe(text))
    ret['language'] = settings.LANGUAGE_MAPPING[language]
    # analyzed text containing lemmas, pos and dep. Entities are coloured
    analyzed_text = ''
    for token in doc:
        if token.ent_type_:
            analyzed_text += '<span class="tooltip" data-content="POS: {0}<br> LEMMA: {1}<br> DEP: {2}" style="color: red;" >{3} </span>'.format(
                token.pos_, token.lemma_, token.dep_, token.text)
        else:
            analyzed_text += '<span class="tooltip" data-content="POS: {0}<br> LEMMA: {1}<br> DEP: {2}" >{3} </span>'.format(
                token.pos_, token.lemma_, token.dep_, token.text)

    ret['text'] = analyzed_text

    # Text category. Only valid for Greek text for now
    if language == 'en':
        ret.update(sentiment_analysis(doc))
        try:
            ret['category'] = predict_category(text, language)
        except Exception:
            pass
    try:

        parser = PlaintextParser.from_string(cleanMe(text),
                                             Tokenizer("english"))

        # Using LexRank
        summarizer = LexRankSummarizer()
        # Summarize the document with 30 sentences
        summary = summarizer(parser.document, 30)
        s = ''
        for sentence in summary:
            s += ' ' + str(sentence)
            ret['summary'] = s
    except ValueError:
        pass
    # top 10 most frequent keywords, based on tokens lemmatization
    frequency = defaultdict(int)
    lexical_attrs = {
        'urls': [],
        'emails': [],
        'nums': [],
    }
    for token in doc:
        if (token.like_url):
            lexical_attrs['urls'].append(token.text)
        if (token.like_email):
            lexical_attrs['emails'].append(token.text)
        if (token.like_num or token.is_digit):
            lexical_attrs['nums'].append(token.text)
        if not token.is_stop and token.pos_ in [
                'VERB', 'ADJ', 'NOUN', 'ADV', 'AUX', 'PROPN'
        ]:
            frequency[token.lemma_] += 1
    keywords = [
        keyword for keyword, frequency in sorted(
            frequency.items(), key=lambda k_v: k_v[1], reverse=True)
    ][:10]
    ret['keywords'] = ', '.join(keywords)

    # Named Entities
    entities = {label: [] for key, label in ENTITIES_MAPPING.items()}
    for ent in doc.ents:
        # noticed that these are sometimes found
        if ent.text.strip() not in ['\n', '', ' ', '.', ',', '-', '–', '_']:
            mapped_entity = ENTITIES_MAPPING.get(ent.label_)
            if mapped_entity and ent.text not in entities[mapped_entity]:
                entities[mapped_entity].append(ent.text)
    ret['named_entities'] = entities

    # Sentences splitting
    ret['sentences'] = [sentence.text for sentence in doc.sents]

    # Lemmatized sentences splitting
    ret['lemmatized_sentences'] = [sentence.lemma_ for sentence in doc.sents]

    # Text tokenization
    ret['text_tokenized'] = [token.text for token in doc]

    # Parts of Speech
    part_of_speech = {label: [] for key, label in POS_MAPPING.items()}

    for token in doc:
        mapped_token = POS_MAPPING.get(token.pos_)
        if mapped_token and token.text not in part_of_speech[mapped_token]:
            part_of_speech[mapped_token].append(token.text)
    ret['part_of_speech'] = part_of_speech
    ret['lexical_attrs'] = lexical_attrs
    ret['noun_chunks'] = [
        re.sub(r'[^\w\s]', '', x.text) for x in doc.noun_chunks
    ]
    return ret
Code example #17
File: summarise3.py Project: parthv21/Smart-Crawler
words = ("deep", "learning", "neural" )
summarizer.bonus_words = words
    
words = ("another", "and", "some", "next",)
summarizer.stigma_words = words
    
words = ("another", "and", "some", "next",)
summarizer.null_words = words

for sentence in summarizer(parser.document, SENTENCES_COUNT):
    summary3+=str(sentence)
    summary3+=" "

with open("summarised_text.txt", "a", encoding="utf8") as myfile:
    myfile.write("\n\nEdmundson:\n")
    myfile.write(summary3)

summary4 = ""
print("\n\n")
print ("--LexRankSummarizer--")
summarizer = LexRankSummarizer(Stemmer(LANGUAGE))
summarizer.stop_words = get_stop_words(LANGUAGE)

for sentence in summarizer(parser.document, SENTENCES_COUNT):
    summary4+=str(sentence)
    summary4+=" "

with open("summarised_text.txt", "a", encoding="utf8") as myfile:
    myfile.write("\n\nLexRank:\n")
    myfile.write(summary4)
Code example #18
def sumySummarize(filename, language="english", num_sents=1):
    """
    Luhn's algorithm is the most basic:
    1. Ignore Stopwords
    2. Determine Top Words: The most often occurring words in the document are counted up.
    3. Select Top Words: A small number of the top words are selected to be used for scoring.
    4. Select Top Sentences: Sentences are scored according to how many of the top words they 
    contain. The top N sentences are selected for the summary.
    
    SumBasic uses a simple concept:
    1. get word prob. p(wi) = ni/N (ni = no. of times word w exists, N is total no. of words)
    2. get sentence score sj = sum_{wi in sj} p(wi) / |sj| (|sj| = no. of words in sj, i.e. the average word probability)
    3. choose sj with highest score
    4. update pnew(wi) = pold(wi)^2 for words in the chosen sentence (we want the probability of picking the same words again to go down)
    5. repeat until you reach desired no. of sentences
    
    KL algorithm solves arg min_{S} KL(PD || PS) s.t. len(S) <= # sentences, where 
    	KL = Kullback-Leibler divergence = sum_{w} PD(w)log(PD(w)/PS(w))
    	PD = unigram word distribution of the entire document
    	PS = unigram word distribution of the summary (optimization variable)
    
    LexRank and TextRank use a PageRank kind of algorithm
    1. Treat each sentence as the node in the graph
    2. Connect all sentences to get a complete graph (a clique basically)
    3. Find the similarity between si and sj to get the weight Mij of the edge connecting i and j
    4. Solve the eigenvalue problem Mp = p for the similarity matrix M.
    5. L = 0.15 + 0.85*Mp.  L gives the final score for each sentence.  Pick the top sentences.
    LexRank uses a tf-idf modified cosine similarity for M.  TextRank uses a different similarity metric.
    
    LSA uses an SVD-based approach
    1. Get the term-sentence matrix A (rows are terms, columns are sentences).  Normalize with term-frequency (tf) only
    2. Do SVD; A = USV' (A=m x n, U=m x n, S=n x n, V=n x n)
    SVD derives the latent semantic structure of sentences.  The k-dimensional sub-space captures the key k topics
    of the entire text structure.  It is a mapping from n dimensions to k.
    If a word combination pattern is salient and recurring in document, this
    pattern will be captured and represented by one of the singular vectors. The magnitude of the
    corresponding singular value indicates the importance degree of this pattern within the
    document. Any sentences containing this word combination pattern will be projected along
    this singular vector, and the sentence that best represents this pattern will have the largest
    index value with this vector. As each particular word combination pattern describes a certain
    topic/concept in the document, the facts described above naturally lead to the hypothesis that
    each singular vector represents a salient topic/concept of the document, and the magnitude of
    its corresponding singular value represents the degree of importance of the salient
    topic/concept.
    Based on this, summarization can work from matrix V.  V describes the importance degree
    of each topic in each sentence: the k’th sentence we choose has the largest
    index value in the k’th right singular vector of matrix V.  An extension of this is using
    SV' as the score for each sentence.
    """
    from sumy.parsers.plaintext import PlaintextParser
    from sumy.nlp.tokenizers import Tokenizer
    from sumy.nlp.stemmers import Stemmer
    from sumy.utils import get_stop_words
    from sumy.summarizers.luhn import LuhnSummarizer
    from sumy.summarizers.lsa import LsaSummarizer
    from sumy.summarizers.text_rank import TextRankSummarizer
    from sumy.summarizers.lex_rank import LexRankSummarizer
    from sumy.summarizers.sum_basic import SumBasicSummarizer
    from sumy.summarizers.kl import KLSummarizer

    parser = PlaintextParser.from_file(filename, Tokenizer(language))

    def getSummary(sumyAlgorithm):
        sumyAlgorithm.stop_words = get_stop_words(language)
        summary = sumyAlgorithm(parser.document, num_sents)
        sents = " ".join([str(sentence) for sentence in summary])
        return sents

    stemmer = Stemmer(language)

    summaries = {}
    summaries['Luhn'] = getSummary(LuhnSummarizer(stemmer))
    summaries['LSA'] = getSummary(LsaSummarizer(stemmer))
    summaries['TextRank'] = getSummary(TextRankSummarizer(stemmer))
    summaries['LexRank'] = getSummary(LexRankSummarizer(stemmer))
    summaries['SumBasic'] = getSummary(SumBasicSummarizer(stemmer))
    summaries['KL'] = getSummary(KLSummarizer(stemmer))

    print("")
    print("####### From Sumy #######")
    print(summaries)
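The docstring above sums up LexRank/TextRank scoring as solving Mp = p with the damped update L = 0.15 + 0.85*Mp. A minimal, self-contained sketch of that power iteration over a toy similarity matrix follows; the damping value, convergence threshold and toy matrix are assumptions, and sumy's real LexRank adds details (idf-modified cosine, optional threshold mode) that are omitted here.

import numpy as np

def rank_sentences(similarity, damping=0.85, epsilon=1e-6, max_iter=100):
    # Row-normalise the similarity matrix so each row sums to 1 (a stochastic M).
    row_sums = similarity.sum(axis=1, keepdims=True)
    row_sums[row_sums == 0] = 1.0
    m = similarity / row_sums

    n = len(similarity)
    scores = np.ones(n) / n
    for _ in range(max_iter):
        # Damped update from the docstring: L = 0.15 + 0.85 * M^T p
        new_scores = (1.0 - damping) + damping * m.T.dot(scores)
        if np.abs(new_scores - scores).sum() < epsilon:
            scores = new_scores
            break
        scores = new_scores
    return scores

# Toy usage: three "sentences" where the first two are mutually similar.
sim = np.array([[1.0, 0.8, 0.1],
                [0.8, 1.0, 0.2],
                [0.1, 0.2, 1.0]])
print(rank_sentences(sim))  # the first two sentences outrank the third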
Code example #19
File: extractive.py Project: martinhartt/HGfGT
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.sum_basic import SumBasicSummarizer
from sumy.summarizers.kl import KLSummarizer
from sumy.nlp.tokenizers import Tokenizer
import sys


def leadSummariser(document, no_of_sents):
    for sent in document.sentences[:no_of_sents]:
        yield str(sent)


summarisers = {
    "lead": leadSummariser,
    "luhn": LuhnSummarizer(),
    "lsa": LsaSummarizer(),
    "lex_rank": LexRankSummarizer(),
    "text_rank": TextRankSummarizer(),
    "sum_basic": SumBasicSummarizer(),
    "kl": KLSummarizer()
}

tokenizer = Tokenizer("english")


def to_words(str):
    return str.split(" ")


def extractive(article, title=None):
    raw = article.replace(' <sb>', '').strip()
Code example #20
def lexranker(text, count):
	parser = PlaintextParser.from_string(text, Tokenizer("english"))
	summarizer = LexRankSummarizer()
	summary = summarizer(parser.document, count)

	return summary
Code example #21
 def __summarize(self, parser):
     summarizer = LexRankSummarizer()
     summarizer.stop_words = get_stop_words(self.__language)
     final_sentences = summarizer(parser.document, self.__sentences_count)
     return self.__join_sentences(final_sentences)
Code example #22
def summarization(id):
    summarizer = LexRankSummarizer()
    """Summarization and Factors influnce for POSITIVE feedbacks"""
    pos_query = Feedback.query.filter_by(sentiment='POSITIVE').filter_by(
        session=id).all()
    neg_query = Feedback.query.filter_by(sentiment='NEGATIVE').filter_by(
        session=id).all()
    if len(pos_query) == 0 and len(neg_query) == 0:
        return "0"
    else:
        pos_text = ""
        for i in range(len(pos_query)):
            pos_text = pos_text + str(pos_query[i].description)

        cleaned_pos_text = pos_text.lower().translate(
            str.maketrans('', '', string.punctuation))
        tokenized_pos_words = word_tokenize(cleaned_pos_text, "english")
        final_pos_words = []
        for word in tokenized_pos_words:
            if word not in stopwords.words('english'):
                final_pos_words.append(word)
        """Counting Factors for POSITIVE"""
        w = Counter(final_pos_words)
        a = {}
        for x in List_of_factor:
            if x in w.keys():
                a[x] = w[x]
        pos_fact = sorted(a.items(), key=lambda x: x[1], reverse=True)
        """Summary of POSITIVE"""
        parser = PlaintextParser.from_string(pos_text, Tokenizer("english"))
        summ_Pos = ""
        abstract_pos = summarizer(parser.document, 1)
        for sentence in abstract_pos:
            summ_Pos = summ_Pos + str(sentence)
        """Summarization and Factors influnce for NEGATIVE feedbacks"""
        neg_text = ""
        for i1 in range(len(neg_query)):
            neg_text = neg_text + str(neg_query[i1].description)

        cleaned_neg_text = neg_text.lower().translate(
            str.maketrans('', '', string.punctuation))
        tokenized_neg_words = word_tokenize(cleaned_neg_text, "english")
        final_neg_words = []
        for word in tokenized_neg_words:
            if word not in stopwords.words('english'):
                final_neg_words.append(word)
        """Counting Factors for NEGATIVE"""
        w = Counter(final_neg_words)
        b = {}
        for x in List_of_factor:
            if x in w.keys():
                b[x] = w[x]
        neg_fact = sorted(b.items(), key=lambda x: x[1], reverse=True)
        """Summary of NEGATIVE"""
        parser = PlaintextParser.from_string(neg_text, Tokenizer("english"))
        summ_Neg = " "
        abstract_neg = summarizer(parser.document, 1)
        for sentence in abstract_neg:
            summ_Neg = summ_Neg + str(sentence)

        return {
            'cnt_pos': pos_fact[0:5],
            'cnt_neg': neg_fact[0:5],
            'summ_pos': summ_Pos,
            'summ_neg': summ_Neg
        }
Code example #23
total = 0
for file in os.listdir('datafiles'):
    with codecs.open('datafiles/' + file,
                     'r',
                     encoding='utf-8',
                     errors='ignore') as f:
        text = f.read().replace('\n', ' ')
        corpus.append(customtokenize(text))
        parser = PlaintextParser.from_string(text, UrduTokenizer)
        objectDocModel = parser.document
        print(objectDocModel.sentences)
        print(objectDocModel.paragraphs)
        print(objectDocModel.words)
        print(objectDocModel.headings)
        stemmer = Stemmer(LANGUAGE)
        summarizer = LexRankSummarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        summ = summarizer(parser.document, SENTENCES_COUNT)
        with open('dataresults/' + file.split('.')[0] + '.txt', 'w') as fw:
            for sentence in summ:
                # print sentence
                evaluated_sentences.append(sentence)
                fw.writelines(str(sentence))
                length += len(str(sentence))
            total += length
            length = 0
        # list of rouge scores (bigrams)
        res = rouge_1(evaluated_sentences, objectDocModel.sentences)
        rouge_scores.append(res)
        evaluated_sentences.clear()
Code example #24
def textteaser_test():

    summary = open("summary_list.txt", "a", encoding='utf-8-sig')
    sys.stdout = summary

    # obtain the input article from url
    #url = "http://www.nytimes.com/2016/11/17/us/politics/donald-trump-administration-twitter.html?ref=politics"
    #parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))

    # obtain the input article from plain text files
    parser = PlaintextParser.from_file("input_sample.txt", Tokenizer(LANGUAGE))

    # define the language; by default it is English
    stemmer = Stemmer(LANGUAGE)

    # SumBasic algorithm
    summarizer = SumBasicSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    print("SumBasic:")
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
    print("\n")

    # LSA algorithm
    summarizer = LsaSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    print("Latent Semantic Analysis:")
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
    print("\n")

    # TextRank algorithm
    summarizer = TextRankSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    print("TextRank:")
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
    print("\n")

    # LexRank algorithm
    summarizer = LexRankSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    print("LexRank:")
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
    print("\n")

    #Featured-LexRank algorithm
    with open('input_sample.txt', 'r', encoding='utf-8-sig') as f:
        first_line = f.readline()
    title = first_line
    with open('input_sample.txt', 'r', encoding='utf-8-sig') as f:
        text = f.read()
    tt = TextTeaser()

    sentences = tt.summarize(title, text)
    file = open("tt.txt", "w", encoding='utf-8-sig')
    print("Featured-LexRank:")
    for sentence in sentences:
        file.write("%s\n" % sentence)
    file.close()

    parser = PlaintextParser.from_file("tt.txt", Tokenizer(LANGUAGE))
    summarizer = LexRankSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
    print("\n")

    summary.close()
Code example #25
def index():
    errors = []
    results = {}
    methodSelected = request.form.get('methodSelected')

    if request.method == "POST":
        try:
            output_string = StringIO()
            file = request.files['file']
            filename = secure_filename(file.filename)
            file.save(os.path.join(tempdirectory, filename))
            with open(os.path.join(tempdirectory, filename), 'rb') as in_file:
                parser = PDFParser(in_file)
                doc = PDFDocument(parser)
                rsrcmgr = PDFResourceManager()
                codec = 'utf-8'
                device = TextConverter(rsrcmgr,
                                       output_string,
                                       codec=codec,
                                       laparams=LAParams())
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.create_pages(doc):
                    interpreter.process_page(page)

            textImport = ''
            textImport = output_string.getvalue()
            # os.remove(filename)
            # file=open(os.path.join(tempdirectory, filename), encoding="utf-8")
            # textImport = file.read()
        except:
            errors.append("Only PDF supported")
            return render_template('index.html', errors=errors)
        if textImport:
            if methodSelected == 'textrank':
                nltk.data.path.append('./nltk_data/')  # set the path
                # tok = tokenizer.tokenize(textImport)    #word tokenization
                # result = [i for i in tok if not i in stop_words]    #stop word removal
                # final=[""]
                # for word in result:
                #     final.append(stemmer.stem(word))   #stemming

                # final2=[""]
                # for word in final:
                #     final2.append(lemmatizer.lemmatize(word))   #lemmatization
                # print(final2)
                # results = final2

                sentences = []
                sentences.append(sent_tokenize(textImport))
                sentences = [y for x in sentences for y in x]
                clean_sentences = pd.Series(sentences).str.replace(
                    "[^a-zA-Z]", " ")
                clean_sentences = [s.lower() for s in clean_sentences]

                def remove_stopwords(sen):
                    sen_new = " ".join([i for i in sen if i not in stop_words])
                    return sen_new

                # remove stopwords from the sentences
                clean_sentences = [
                    remove_stopwords(r.split()) for r in clean_sentences
                ]

                # Extract word vectors
                word_embeddings = {}
                f = open('glove.6B.100d.txt', encoding='utf-8')
                for line in f:
                    values = line.split()
                    word = values[0]
                    coefs = np.asarray(values[1:], dtype='float32')
                    word_embeddings[word] = coefs
                f.close()

                sentence_vectors = []
                for i in clean_sentences:
                    if len(i) != 0:
                        v = sum([
                            word_embeddings.get(w, np.zeros((100, )))
                            for w in i.split()
                        ]) / (len(i.split()) + 0.001)
                    else:
                        v = np.zeros((100, ))
                    sentence_vectors.append(v)

                len(sentence_vectors)

                # similarity matrix
                sim_mat = np.zeros([len(sentences), len(sentences)])

                for i in range(len(sentences)):
                    for j in range(len(sentences)):
                        if i != j:
                            sim_mat[i][j] = cosine_similarity(
                                sentence_vectors[i].reshape(1, 100),
                                sentence_vectors[j].reshape(1, 100))[0, 0]

                nx_graph = nx.from_numpy_array(sim_mat)
                scores = nx.pagerank(nx_graph)

                ranked_sentences = sorted(
                    ((scores[i], s) for i, s in enumerate(sentences)),
                    reverse=True)

                # Specify number of sentences to form the summary
                sn = 10
                temp = []
                # # Generate summary
                for i in range(sn):
                    temp.append(ranked_sentences[i][1])

                results = temp

            if methodSelected == 'lexrank':
                print('lexrank selected')
                ParsedOutputLexrank = PlaintextParser.from_string(
                    textImport, Tokenizer("english"))
                summarizer = LexRankSummarizer()
                summaryOutputLexrank = summarizer(ParsedOutputLexrank.document,
                                                  10)

                for sentence in summaryOutputLexrank:
                    print(sentence)

                results = ''.join(map(str, summaryOutputLexrank))

            if methodSelected == 'lsa':
                print('lsa selected')
                ParsedOutputLexrank = PlaintextParser.from_string(
                    textImport, Tokenizer("english"))
                summarizer_lsa = LsaSummarizer()
                summaryOutputLSA = summarizer_lsa(ParsedOutputLexrank.document,
                                                  10)

                # for sentence in summaryOutputLSA:
                #     print(sentence)

                results = ''.join(map(str, summaryOutputLSA))

        output_string.close()

    return render_template('index.html', errors=errors, results=results)
Code example #26
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

LANGUAGE = "english"
SENTENCES_COUNT = 5

if __name__ == "__main__":
    # url = "https://en.wikipedia.org/wiki/Tesla,_Inc."

    # parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    parser = PlaintextParser.from_file("text", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)

    print("\n\nLexRankSummarizer")
    summarizer_2 = LexRankSummarizer()
    summary_2 = summarizer_2(parser.document, 2)

    for line in summary_2:
        print(line)

    # print (parser.document)
Code example #27
from nltk.tokenize import sent_tokenize, word_tokenize
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer

file = "inputtext.txt"
parser = PlaintextParser.from_file(file, Tokenizer("english"))
summarizer = LexRankSummarizer()

summary = summarizer(parser.document, 10)

for sentence in summary:
    print(sentence)
Code example #28
def gaz(type_df, time, cut, many):
    nlp = spacy.load('en')

    if cut == "True":
        type_df = type_df[type_df["Review Date"] > time]
    else:
        type_df = type_df[type_df["Review Date"] < time]

    sample_review = ""
    for i in type_df["review"]:
        sample_review = sample_review + " " + str(i)

    # print(sample_review)

    len(sample_review)

    sample_review = sample_review.replace("\\", "")

    #### Summary:

    ### Summaries
    import sumy

    from sumy.summarizers.lex_rank import LexRankSummarizer
    from sumy.summarizers.text_rank import TextRankSummarizer

    from sumy.parsers.plaintext import PlaintextParser
    from sumy.nlp.tokenizers import Tokenizer
    from sumy.nlp.stemmers import Stemmer
    from sumy.utils import get_stop_words

    lexi = LexRankSummarizer(Stemmer("english"))
    texi = TextRankSummarizer(Stemmer("english"))

    parser = PlaintextParser.from_string(sample_review, Tokenizer("english"))


    rentence = "dddd"
    for sentence in texi(parser.document,
                         10):  # This does indeed summarise the document
        if (str(rentence).split()[len(str(rentence).split()) - 1][-1]
                == ".") and (len(rentence) > 2):
            rentence = rentence + " " + str(sentence)
        elif len(rentence) < 3:
            rentence = rentence + " " + str(sentence)
        else:
            rentence = rentence + ". " + str(sentence)

    stop_words = set(stopwords.words('english'))
    stop_words.update([
        '.', ',', '"', "'", '?', '!', '! !', ':', ';', '(', ')', '[', ']', '{',
        '}'
    ])  # remove it if you need punctuation

    list_of_words = [
        i.lower() for i in wordpunct_tokenize(sample_review)
        if i.lower() not in stop_words
    ]

    final = ' '.join(list_of_words)

    from nltk.tokenize import RegexpTokenizer

    tokenizer = RegexpTokenizer(r'\w+')
    list_of_words = tokenizer.tokenize(final)
    final = ' '.join(list_of_words)

    parsed_review = nlp(final)

    # print(parsed_review)

    token_text = [token.orth_ for token in parsed_review]
    token_pos = [token.pos_ for token in parsed_review]

    df = pd.DataFrame({'token_text': token_text, 'part_of_speech': token_pos})

    # Unigrams
    import nltk
    from nltk import word_tokenize
    from nltk.util import ngrams
    from collections import Counter

    token = nltk.word_tokenize(str(parsed_review))
    grams = ngrams(token, many)

    dra = Counter(grams)

    t = pd.DataFrame()

    f = pd.DataFrame(list(dra.keys()))

    if many == 2:
        f[0] = f[0] + " " + f[1]

    if many == 3:
        f[0] = f[0] + " " + f[1] + " " + f[2]

    f = f[0]

    t["name"] = f
    t["count"] = list(dra.values())

    df = df.drop_duplicates()
    r = pd.merge(t,
                 df,
                 left_on=["name"],
                 right_on=["token_text"],
                 how="left",
                 right_index=False)
    r = r.drop("token_text", axis=1)
    r.columns = ["name", "count", "pos"]

    scaler = MinMaxScaler()
    r["norm"] = scaler.fit_transform(r["count"].values.reshape(-1, 1))

    if many == 1:
        dfs = r[r["pos"] == "NOUN"].sort_values("count", ascending=False)
    else:
        dfs = r.sort_values("count", ascending=False)

    return dfs, rentence
Code example #29
File: utils.py Project: jeffreyzhang2001/PSumm
 def lex_rank_summarize(self):
     summarizer = LexRankSummarizer()
     summarizer.stop_words = self.stop_words
     summary_tuple = (summarizer(self.parser.document, 4))
     lex_rank_summary = " ".join(map(str, summary_tuple))
     return lex_rank_summary
Code example #30
 def __init__(self):
   self.lsa_summarizer = LsaSummarizer(stemmer)
   self.lex_rank_summarizer = LexRankSummarizer(stemmer)
   self.lsa_summarizer.stop_words = get_stop_words(LANGUAGE)
   self.lex_rank_summarizer.stop_words = get_stop_words(LANGUAGE)
   self.email_text_parser = SbEmailTextParser()