Example #1
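This snippet is the body of a Flask view function; its route decorator, module-level imports, the spaCy pipeline, and the TextRank constants (`d`, `steps`, `min_diff`) are not shown. A minimal sketch of the setup it appears to assume follows; the spaCy model name, the choice of the LSA summarizer, and the constant values are assumptions rather than part of the original.

# Assumed module-level setup (not part of the original snippet).
import re
from collections import OrderedDict

import numpy as np
import spacy
from flask import request, jsonify
from nltk import pos_tag
from nltk.tokenize import sent_tokenize, word_tokenize
from spacy.lang.en.stop_words import STOP_WORDS
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer  # assumption: LSA, as in the other examples
from sumy.utils import get_stop_words

nlp = spacy.load("en_core_web_sm")  # assumption: any English spaCy pipeline works

# TextRank hyper-parameters read by analyze(); typical values, assumed here.
d = 0.85          # damping factor
steps = 10        # maximum number of iterations
min_diff = 1e-5   # convergence threshold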
def text_summary():
    def set_stopwords(stopwords):
        """Set stop words"""
        for word in STOP_WORDS.union(set(stopwords)):
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True

    def sentence_segment(doc, candidate_pos, lower):
        """Store those words only in cadidate_pos"""
        sentences = []
        for sent in doc.sents:
            selected_words = []
            for token in sent:
                # Keep only words with a candidate POS tag
                if token.pos_ in candidate_pos and token.is_stop is False:
                    if lower is True:
                        selected_words.append(token.text.lower())
                    else:
                        selected_words.append(token.text)
            sentences.append(selected_words)
        return sentences

    def get_vocab(sentences):
        """Get all tokens"""
        vocab = OrderedDict()
        i = 0
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    vocab[word] = i
                    i += 1
        return vocab

    def get_token_pairs(window_size, sentences):
        """Build token_pairs from windows in sentences"""
        token_pairs = list()
        for sentence in sentences:
            for i, word in enumerate(sentence):
                for j in range(i + 1, i + window_size):
                    if j >= len(sentence):
                        break
                    pair = (word, sentence[j])
                    if pair not in token_pairs:
                        token_pairs.append(pair)
        return token_pairs

    def symmetrize(a):
        return a + a.T - np.diag(a.diagonal())

    def get_matrix(vocab, token_pairs):
        """Get normalized matrix"""
        # Build matrix
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            i, j = vocab[word1], vocab[word2]
            g[i][j] = 1

        # Get symmetric matrix
        g = symmetrize(g)
        # Normalize matrix by column
        norm = np.sum(g, axis=0)
        g_norm = np.divide(
            g, norm, out=np.zeros_like(g), where=norm != 0)  # skip columns whose sum is 0
        return g_norm

    def analyze(text,
                candidate_pos=['NOUN', 'PROPN', 'VERB'],
                window_size=4,
                lower=False,
                stopwords=list(),
                number=10):
        """Main function to analyze text"""

        # Set stop words
        set_stopwords(stopwords)
        # Parse the text with spaCy
        doc = nlp(text)
        # Filter sentences
        sentences = sentence_segment(doc, candidate_pos,
                                     lower)  # list of list of words
        # Build vocabulary
        vocab = get_vocab(sentences)
        # Get token_pairs from windows
        token_pairs = get_token_pairs(window_size, sentences)
        # Get normalized matrix
        g = get_matrix(vocab, token_pairs)
        # Initialize node weights (PageRank values)
        pr = np.array([1] * len(vocab))
        # Iteration
        previous_pr = 0
        for epoch in range(steps):
            pr = (1 - d) + d * np.dot(g, pr)
            if abs(previous_pr - sum(pr)) < min_diff:
                break
            else:
                previous_pr = sum(pr)

        # Get weight for each node
        node_weight = dict()
        for word, index in vocab.items():
            node_weight[word] = pr[index]

        node_weight = OrderedDict(
            sorted(node_weight.items(), key=lambda t: t[1], reverse=True))
        keyword = []
        for i, (key, value) in enumerate(node_weight.items()):
            keyword.append(key)
            #print(key + ' - ' + str(value))
            if len(keyword) >= number:
                break
        return keyword

    def command_detected(sentence):
        # Detects whether a given String sentence is a command or action-item
        tagged_sentence = pos_tag(word_tokenize(sentence))
        first_word = tagged_sentence[0]
        pos_first = first_word[1]
        first_word = first_word[0].lower()
        for word in prohibited_command_words:
            if word in sentence:
                return False
        for word in command_words:
            if word in sentence:
                return True
        # Check whether the first word is a verb (VB/VBZ/VBP) that is not a gerund
        if (pos_first == "VB" or pos_first == "VBZ"
                or pos_first == "VBP") and first_word[-3:] != "ing":
            return True
        return False

    def retrieve_action_items():
        # Returns a list of the sentences containing action items.
        action_items = []
        for sentence in tokenized_transcript:
            possible_command = command_detected(str(sentence))
            if possible_command is True:
                action_items += [(str(sentence))]
        return action_items

    text = request.json
    text = text['data'].replace('Speaker ', '')
    source = re.sub(r'\d\s+\d{1,2}\:\d{2}', '', text)
    source = re.sub(r'\s+', ' ', source)

    Keywords = analyze(source,
                       candidate_pos=['NOUN', 'PROPN', 'VERB'],
                       window_size=4,
                       lower=False)

    tokenized_transcript = sent_tokenize(source)
    LANGUAGE = "English"
    parser = PlaintextParser.from_string(source, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    summary = summarizer(parser.document, len(tokenized_transcript) * 0.05)
    transcript_summary = []
    for sentence in summary:
        transcript_summary.append(str(sentence))

    command_words = [
        "can you", "would you", "can we", "you should", "we should",
        "we need to", "you need to", "ensure", "make sure", "make it",
        "we want to", "we must", "you must", "you have to", "we have to"
        "homework"
    ]
    prohibited_command_words = ["Let me", "?"]
    Action_item = retrieve_action_items()

    result = {
        "keywords :": Keywords,
        'Summary :': transcript_summary,
        'Action Items :': Action_item
    }
    return jsonify(result)
Example #2
# -*- coding: utf-8 -*-

from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

LANGUAGE = "english"
SENTENCES_COUNT = 2
INPUTSTRING = "Text summarization is a difficult challenge that is faced by  NLP researchers. Currently I am experimenting with a few text-summarization algorithms in my projects. One of them is LexRank. It is a graph based algorithm that uses a similarity function(cosine similarity in the original paper) to compute similarities between different sentences. It uses a pre-defined threshold to build the graph of the documents, creating an edge between 2 sentences(nodes) every time the similarity is above the threshold. They also used a Pagerank-like scheme to rank the sentences(nodes)."

if __name__ == "__main__":
    #parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    parser = PlaintextParser.from_string(INPUTSTRING, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
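The sample text above actually describes LexRank, while the example runs the LSA summarizer. For comparison, here is a minimal sketch of the same pipeline using sumy's LexRank implementation (reusing INPUTSTRING from above); only the summarizer class changes.

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.utils import get_stop_words

LANGUAGE = "english"
SENTENCES_COUNT = 2
parser = PlaintextParser.from_string(INPUTSTRING, Tokenizer(LANGUAGE))
summarizer = LexRankSummarizer(Stemmer(LANGUAGE))
summarizer.stop_words = get_stop_words(LANGUAGE)
for sentence in summarizer(parser.document, SENTENCES_COUNT):
    print(sentence)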
Example #3
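This function is shown without its module-level imports. A hedged sketch of what it appears to rely on follows; the exact library behind `Readability` is an assumption (any readability-style extractor exposing a `.content` attribute would fit), and the LSA summarizer is assumed as in the other examples.

# Assumed imports (not shown in the original snippet).
from math import log
from html import unescape
from unicodedata import normalize

from newspaper import Article  # article download and parsing
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer  # assumption
from sumy.utils import get_stop_words
# `Readability` is assumed to be a readability-style HTML extractor taking
# (raw_html, url) and exposing the extracted text/markup as `.content`.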
def summary(url, length, LANGUAGE):
    language = LANGUAGE.lower()
    e = ''  # captures any download/parse error message
    raw_html = image = meta = text = ''  # defaults in case the download fails

    article = Article(url)
    try:
        article.download()
        print('  successfully d/l')
        article.parse()
        raw_html = article.html
        image = article.top_image
        meta = article.meta_description
        text = article.text
    except Exception as err:
        e = str(err)  # keep the message; the `as` name itself is cleared after the except block
        print(e)
 
    if not text:
        print ('  using Readability')
        raw_text = Readability(raw_html, url)
        text = raw_text.content
        article.download(html=text)
        article.parse()
        text = article.text
    if not meta:
        meta = article.title
    meta = unescape(unescape(meta))
    meta = normalize('NFKD', meta)
    meta = meta.strip()
    image = image.replace('(', '\\(')
    image = image.replace(')', '\\)')
    image_des = '\n\n> [{0}]({1})'.format("**^pic**", image) if image else None
   
    parser = PlaintextParser(text, Tokenizer(language)) 
    word_count = len(text.split())
    compression = 100
    extra_words = 0
            
    stemmer = Stemmer(language)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)        
    short = []
    line = str()
    
    if word_count >= 600:
        length = length + int(log(word_count/600))
    for sentence in summarizer(parser.document, length):
        if str(sentence).strip().lower() in meta.lower():
            extra_words = len(str(sentence).split())
            continue
        line = '>• {0}'.format(sentence)
        line = line.replace("`", "'")
        line = line.replace("#", "\\#")
        short.append(line)
       
    extract = '\n\n'.join(short)
    extract = extract + image_des if image_des else extract
    meta = meta.replace('#', '\\#')
    if len(meta) > 400:
        lpoint = meta.rfind('.', 0, 400)
        if lpoint == -1:
            meta = meta[:meta.rfind(' ', 0, 400)] + '...'
        else:
            meta = meta[:lpoint] + '...'
              
    try:
        compression = int(((extract.count(' ')+extra_words)/word_count)*100)
    except Exception as numerror:
        print(numerror)
    print('  from {0} words to {1} words ({2}%)'.format(word_count, len(extract.split()), compression))
    return (meta, extract, compression, e)
Example #4
def lsa_summarizer():
    print ("\n","*"*30, "LSA SUMMARIZER", "*"*30)
    summarizer_lsa = Summarizer(stemmer)
    summarizer_lsa.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer_lsa(parser.document, SENTENCES_COUNT):
        print (sentence)
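
# --- Assumed module-level setup for lsa_summarizer() and analyze(); not part of the
# --- original snippet. The sample text, sentence count, and LSA choice are placeholders.
from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.utils import get_stop_words

LANGUAGE = "english"
SENTENCES_COUNT = 2
parser = PlaintextParser.from_string("Text to summarize goes here.", Tokenizer(LANGUAGE))
stemmer = Stemmer(LANGUAGE)
# crawl() below additionally relies on project-specific objects (a MySQL connection `conn`,
# a simhash `index`, an NLP `client`, feedparser, and helper functions) not sketched here.
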
def analyze(parser):
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
def crawl():
    print(datetime.now())
    cursor = conn.cursor()
    cursor.execute("SET NAMES utf8mb4")
    cursor.execute('select id, name, feedUrl, lang, form from sources')
    sources = cursor.fetchall()
    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    for source in sources:
        # if source['id']%30 == datetime.now().minute%30:
        print(source[0])
        source = {
            'id': source[0],
            'name': source[1],
            'feedUrl': source[2].replace("39.105.127.55", "127.0.0.1"),
            'lang': source[3],
            'form': source[4]
        }
        print(source['name'])
        LANGUAGE = 'chinese'
        if source['lang'] == 2:
            LANGUAGE = 'english'
        items = feedparser.parse(source['feedUrl'])['items']
        for item in items:
            try:
                cursor.execute('select 1 from entries where link = %s limit 1',
                               (item['link'], ))
                results = cursor.fetchall()
                if (not results) or (len(results) == 0):
                    try:
                        entry = {
                            'title': item['title'],
                            'link': item['link'],
                            'source_id': source['id'],
                            'source_name': source['name'],
                            'time': '',
                            'crawl_time': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                            'photo': '',
                            'lang': 1,
                            'author': '',
                            'description': '',
                            'digest': '',
                            'content': '',
                            'cluster': 0,
                            'sim_count': 0,
                            'simhash': '0',
                            'cate11': '',
                            'cate12': '',
                            'cate13': '',
                            'cate21': '',
                            'cate22': '',
                            'cate23': '',
                            'tag1': '',
                            'tag2': '',
                            'tag3': '',
                            'tag4': '',
                            'tag5': '',
                            'video': '',
                            'video_frame': '',
                            'audio': '',
                            'audio_frame': ''
                        }
                        cate1 = ['', '', '']
                        cate2 = ['', '', '']
                        tag = ['', '', '', '', '']
                        ############ Additional settings for special sources ##############
                        if entry['source_name'] == 'Hacker News':
                            entry['link'] = item['comments']
                        ###########################

                        if is_en(entry['title']):
                            entry['lang'] = 2
                        if 'published_parsed' in item:
                            try:
                                entry['time'] = datetime.fromtimestamp(
                                    mktime(
                                        item['published_parsed'])) + timedelta(
                                            hours=TZ_DELTA)
                            except Exception as e:
                                entry['time'] = entry['crawl_time']
                                print('Exception when published_parsed: {}'.
                                      format(e))
                        else:
                            entry['time'] = entry['crawl_time']

                        if 'author' in item:
                            entry['author'] = item['author'][0:20]

                        if 'summary' in item:
                            entry['description'] = item['summary'][0:500]

                        if 'content' in item:
                            entry['content'] = item['content'][0]['value'][
                                0:15000]
                        if entry['content'] == '' and 'summary' in item and len(
                                item['summary']) > 0:
                            entry['content'] = item['summary'][0:15000]
                        for field in item['links']:
                            if field['type'] == 'audio/mpeg':
                                if field['href'].endswith('.mp3'):
                                    entry['audio'] = field['href']
                                if field['href'].endswith('.mp4'):
                                    entry['video'] = field['href']

                        # Only article-type entries get a digest, clustering, categorization, and tags
                        if source['form'] == 1:
                            try:
                                if entry['content'] != '':
                                    entry['photo'] = getImg(entry['content'])
                                    if len(entry['photo']) > 255:
                                        entry['photo'] = ''

                                    parser = HtmlParser.from_string(
                                        entry['content'], "",
                                        Tokenizer(LANGUAGE))
                                    stemmer = Stemmer(LANGUAGE)
                                    summarizer = Summarizer(stemmer)
                                    summarizer.stop_words = get_stop_words(
                                        LANGUAGE)
                                    for sentence in summarizer(
                                            parser.document, SENTENCES_COUNT):
                                        entry['digest'] += str(sentence)
                                        if len(entry['digest']) >= 500:
                                            break
                                else:
                                    parser = HtmlParser.from_url(
                                        entry['link'], Tokenizer(LANGUAGE))
                                    stemmer = Stemmer(LANGUAGE)
                                    summarizer = Summarizer(stemmer)
                                    summarizer.stop_words = get_stop_words(
                                        LANGUAGE)
                                    for sentence in summarizer(
                                            parser.document, SENTENCES_COUNT):
                                        entry['digest'] += str(sentence)
                                        if len(entry['digest']) >= 500:
                                            break
                                entry['digest'] = entry['digest'][0:500]
                            except Exception as e:
                                print(
                                    'Exception when getting digest: {}'.format(
                                        e))

                            features = get_features(entry['title'],
                                                    entry['content'])
                            try:
                                entry['simhash'] = str(Simhash(features).value)
                                nears = index.get_near_dups(Simhash(features))
                                if len(nears) > 0:
                                    entry['sim_count'] = len(nears)
                                    cursor.execute(
                                        'select cluster from entries where id = %s',
                                        (int(nears[0]), ))
                                    near_cluster = cursor.fetchone()[0]
                                    entry['cluster'] = near_cluster
                                else:
                                    global last_cluster_num
                                    entry['cluster'] = last_cluster_num
                                    last_cluster_num += 1
                            except Exception as e:
                                print(
                                    'Exception when clustering: {}'.format(e))

                            try:
                                content2 = BeautifulSoup(
                                    entry['content'], "lxml").text.encode(
                                        'gbk', 'ignore').decode(
                                            'gbk')[0:AIP_MAX_LEN_CONTENT]
                                if len(content2) == 0:
                                    if len(entry['digest']) > 0:
                                        content2 = entry['digest']
                                title2 = entry['title'][0:AIP_MAX_LEN_TITLE]
                                keywords = client.keyword(title2, content2)
                                topics = client.topic(title2, content2)
                                i = 0
                                for item in topics['item']['lv1_tag_list']:
                                    cate1[i] = item['tag']
                                    i += 1
                                    if i > 2:
                                        break
                                i = 0
                                for item in topics['item']['lv2_tag_list']:
                                    cate2[i] = item['tag']
                                    i += 1
                                    if i > 2:
                                        break
                                i = 0
                                for item in keywords['items']:
                                    tag[i] = item['tag']
                                    i += 1
                                    if i > 4:
                                        break
                                entry['cate11'] = cate1[0]
                                entry['cate12'] = cate1[1]
                                entry['cate13'] = cate1[2]
                                entry['cate21'] = cate2[0]
                                entry['cate22'] = cate2[1]
                                entry['cate23'] = cate2[2]
                                entry['tag1'] = tag[0]
                                entry['tag2'] = tag[1]
                                entry['tag3'] = tag[2]
                                entry['tag4'] = tag[3]
                                entry['tag5'] = tag[4]
                            except Exception as e:
                                print(
                                    'Exception when categorizing and tagging: {}'
                                    .format(e))

                        elif source['form'] == 2:
                            entry['photo'] = getWeiboImg(entry['content'])
                            entry['digest'] = filterWeiboTags(entry['content'])
                            if len(entry['digest']) > 500:
                                entry['digest'] = entry['digest'][0:500]

                        elif source['form'] == 4:
                            if entry['link'].startswith(
                                    'https://www.bilibili.com/video'):
                                entry['video_frame'] = 'http://player.bilibili.com/player.html?aid=' + \
                                    entry['link'][33:]

                        try:
                            cursor.execute(add_entry, entry)
                            conn.commit()
                            index.add(str(cursor.lastrowid), Simhash(features))
                        except Exception as e:
                            print('Exception when add entry: {}'.format(e))
                    except Exception as e:
                        print("Unexpected Error: {}".format(e))
            except Exception as e:
                print("Unexpected Error: {}".format(e))
        # print(d['feed']['title'])
    elapsed = time.perf_counter() - start
    print('time used: ' + str(elapsed))

    # Close the cursor (and connection):
    cursor.close()
Example #7
    def __init__(self, language):
        self.language = language
        self.stemmer = Stemmer(self.language)
        self.summarizer = Summarizer(self.stemmer)
        self.summarizer.stop_words = get_stop_words(self.language)
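
Only the constructor of a summarizer wrapper is shown here. Below is a minimal sketch of how such a class might be completed and used; the class name and the `summarize` method are illustrative additions, not taken from the original, and the LSA summarizer is an assumption.

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer  # assumption: LSA summarizer
from sumy.utils import get_stop_words


class TextSummarizer:  # illustrative class name
    def __init__(self, language):
        self.language = language
        self.stemmer = Stemmer(self.language)
        self.summarizer = Summarizer(self.stemmer)
        self.summarizer.stop_words = get_stop_words(self.language)

    def summarize(self, text, sentences_count=2):
        """Return the top `sentences_count` sentences of `text` as strings."""
        parser = PlaintextParser.from_string(text, Tokenizer(self.language))
        return [str(sentence) for sentence in self.summarizer(parser.document, sentences_count)]


# Usage:
#   TextSummarizer("english").summarize("Some long text ...", sentences_count=2)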
Example #8
    def apply_query(self):
        """
         Executes after the 'Generate Results' button is pressed.
        """
        LANGUAGE = "english"
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        token = Tokenizer(LANGUAGE)

        class Results(object):
            """Dummy object to pass to results functions"""
            def __init__(self):
                self.summary = None
                self.words = None

            def set_summary(self, summary):
                self.summary = summary

            def set_words(self, words):
                self.words = words

        if self.cobj:
            print("Collection object loaded.")

        else:
            from milnlp.collection.collection import Collection, get_items
            self.cobj = Collection(self.collection_path)
            self.cobj.flist, self.cobj.dlist = get_items(
                self.collection_path, set(), {self.collection_path: 0})

        # Build composite document
        if self.radio_full.isChecked():
            method = "full"
        else:
            method = "reduced"

        try:
            buffer_size = self.spin_sentence_buffer.value()

            if self.combo_query.currentIndex() > -1:  # execute for query
                query_to_apply = self.queries[self.combo_query.currentIndex()]
                purge_matches(query_to_apply)
                self.document, self.matches = self.cobj.create_composite_doc_from_query_object(
                    query_to_apply,
                    token,
                    method=method,
                    buffer_size=buffer_size)
                print(f"Using method '{method}' results in: ", self.document)

            else:  # execute for entire sub-collection
                from sumy.models.dom import ObjectDocumentModel
                from ..collection.collection import reparser

                files = [file[:-4] + '.txt' for file in self.cobj.flist]
                parsed_docs = reparser(files, token, method='full')
                composite_doc_paragraphs = []
                for d_parser in parsed_docs.values():
                    composite_doc_paragraphs.extend(
                        d_parser.document.paragraphs)
                self.document, self.matches = ObjectDocumentModel(
                    composite_doc_paragraphs), None

            self.results = Results()

            # Generate message box to confirm the user wants a summary to be generated
            msgBox = QMessageBox()  # todo add window name
            msgBox.setText(
                f"The composite document has been created. It contains {len(self.document.sentences)} "
                f"sentences. Do you want to summarize the document?")
            msgBox.setInformativeText("")
            msgBox.setStandardButtons(QMessageBox.No | QMessageBox.Yes)
            msgBox.setDefaultButton(QMessageBox.Yes)
            ret = msgBox.exec_()
            if ret == QMessageBox.No:
                self.write_matches_panel()
                return 0  # cancel the task

            # Generate summary  # todo # sentences not working
            reduced_summary = self.cobj.summarize_composite(
                self.document, summarizer, self.num_sentences)
            self.results.set_summary(reduced_summary)

            # Generate key words/phrases
            text = doc_to_text(self.document)
            words = dict(score_keyphrases_by_textrank(text, self.num_keywords))
            self.results.set_words(words)

            # Display results
            self.write_matches_panel()
            self.print_results(method='collection')
            self.write_results_panel(method='collection')
            self.tab_widget.setCurrentIndex(2)
        except TypeError:
            # No matches for query, therefore different return route
            msg_box = QMessageBox()
            msg_box.setText(
                'The query returned no matches for the specified collection.')
            msg_box.exec_()
Example #9
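The Django view below is shown without its imports. A hedged sketch of what it appears to need follows; `cleanhtml` is a project-local helper and only a plausible stub is given, and the LSA summarizer is again an assumption.

# Assumed imports (not shown in the original snippet).
import pickle
import re
import urllib.parse
import urllib.request
import warnings

from django.shortcuts import render
from sumy.parsers.html import HtmlParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer  # assumption
from sumy.utils import get_stop_words


def cleanhtml(raw_html):
    """Project-local helper (stub, assumed): strip HTML tags from a fragment."""
    return re.sub(r'<.*?>', '', raw_html)

# classifier() further below additionally references scikit-learn (Pipeline, LogisticRegression,
# KFold, confusion_matrix, f1_score), numpy, and the project modules DataPrep / FeatureSelection,
# which are not sketched here.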
def index(request):
    warnings.filterwarnings("ignore", category=DeprecationWarning) 
    warnings.filterwarnings("ignore", category=FutureWarning)
    template='app/index.html'
    if request.method=='POST':
        url=request.POST['url']
        
        #taking summary of the news from url
        values={'s':'basics','submit':'search'}
        data=urllib.parse.urlencode(values)
        data=data.encode('utf-8')
        req=urllib.request.Request(url,data)
        resp=urllib.request.urlopen(req)
        resp_data=resp.read()
        #print(resp_data)

        paragraphs=re.findall(r'<p>(.*?)</p>',str(resp_data))
        for p in paragraphs:
            clean=cleanhtml(p)
            #clean=remove_html_markup(p) 
            print(clean)
        print("--------------SUMMARY----------------")

        LANGUAGE = "english"
        SENTENCES_COUNT = 15
        parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)

        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        summary=""
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            summary+=str(sentence)
        
        # passing summary to the model for prediction
        with open('app/static/sav/final_model_6.sav', 'rb') as filename:
            load_model = pickle.load(filename)
        prediction = load_model.predict([summary])
        prob = load_model.predict_proba([summary])
        result=prediction[0]
        score=prob[0][1]

        context={
            'url_text':"Entered URL:",
            'url':url,
            'static_text0':"Summary of the news from URL",
            'static_text':"Result:",
            'static_text2':"The given summary is: ",
            #'static_text3':"The truth probability score is: ",
            'result':result,
            'summary':summary,
            #'score':score
        }
        return render(request,template,context)
    else:
        return render(request,template)
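    # Note: classifier() below is defined after both branches above have already returned,
    # so it is never reached from index(); it is kept here as written.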

    def classifier():
        
        #building classifier using naive bayes 
        #nb_pipeline = Pipeline([
        #        ('NBCV',FeatureSelection.countV),
        #        ('nb_clf',MultinomialNB())])
        #
        #nb_pipeline.fit(DataPrep.test_news['Statement'],DataPrep.test_news['Label'])
        #predicted_nb = nb_pipeline.predict(DataPrep.test_news['Statement'])
        #np.mean(predicted_nb == DataPrep.test_news['Label'])
        #
        #
        ##building classifier using logistic regression
        #logR_pipeline = Pipeline([
        #        ('LogRCV',FeatureSelection.countV),
        #        ('LogR_clf',LogisticRegression())
        #        ])
        #
        #logR_pipeline.fit(DataPrep.test_news['Statement'],DataPrep.test_news['Label'])
        #predicted_LogR = logR_pipeline.predict(DataPrep.test_news['Statement'])
        #np.mean(predicted_LogR == DataPrep.test_news['Label'])
        #
        #
        ##building Linear SVM classfier
        #svm_pipeline = Pipeline([
        #        ('svmCV',FeatureSelection.countV),
        #        ('svm_clf',svm.LinearSVC())
        #        ])
        #
        #svm_pipeline.fit(DataPrep.test_news['Statement'],DataPrep.test_news['Label'])
        #predicted_svm = svm_pipeline.predict(DataPrep.test_news['Statement'])
        #np.mean(predicted_svm == DataPrep.test_news['Label'])
        #
        #
        ##using SVM Stochastic Gradient Descent on hinge loss
        #sgd_pipeline = Pipeline([
        #        ('svm2CV',FeatureSelection.countV),
        #        ('svm2_clf',SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5))
        #        ])
        #
        #sgd_pipeline.fit(DataPrep.test_news['Statement'],DataPrep.test_news['Label'])
        #predicted_sgd = sgd_pipeline.predict(DataPrep.test_news['Statement'])
        #np.mean(predicted_sgd == DataPrep.test_news['Label'])
        #
        #
        ##random forest
        #random_forest = Pipeline([
        #        ('rfCV',FeatureSelection.countV),
        #        ('rf_clf',RandomForestClassifier(n_estimators=200,n_jobs=3))
        #        ])
        #    
        #random_forest.fit(DataPrep.test_news['Statement'],DataPrep.test_news['Label'])
        #predicted_rf = random_forest.predict(DataPrep.test_news['Statement'])
        #np.mean(predicted_rf == DataPrep.test_news['Label'])


        # User-defined function for K-fold cross-validation
        def build_confusion_matrix(classifier):
            
            k_fold = KFold(n_splits=5)  # modern scikit-learn API (n/n_folds were removed)
            scores = []
            confusion = np.array([[0, 0], [0, 0]])

            for train_ind, test_ind in k_fold.split(DataPrep.train_news):
                train_text = DataPrep.train_news.iloc[train_ind]['Statement'] 
                train_y = DataPrep.train_news.iloc[train_ind]['Label']
            
                test_text = DataPrep.train_news.iloc[test_ind]['Statement']
                test_y = DataPrep.train_news.iloc[test_ind]['Label']
                
                classifier.fit(train_text,train_y)
                predictions = classifier.predict(test_text)
                
                confusion += confusion_matrix(test_y,predictions)
                score = f1_score(test_y,predictions)
                scores.append(score)
            
            print('Total statements classified:', len(DataPrep.test_news))
            print('Score:', sum(scores) / len(scores))
            print('score length', len(scores))
            print('Confusion matrix:')
            print(confusion)
            
        #K-fold cross validation for all classifiers
        #build_confusion_matrix(nb_pipeline)
        #build_confusion_matrix(logR_pipeline)
        #build_confusion_matrix(svm_pipeline)
        #build_confusion_matrix(sgd_pipeline)
        #build_confusion_matrix(random_forest)

        #========================================================================================
        #Bag of words confusion matrix and F1 scores

        #Naive bayes
        # [2118 2370]
        # [1664 4088]
        # f1-Score: 0.669611539651

        #Logistic regression
        # [2252 2236]
        # [1933 3819]
        # f1-Score: 0.646909097798

        #svm
        # [2260 2228]
        # [2246 3506]
        #f1-score: 0.610468748792

        #sgdclassifier
        # [2414 2074]
        # [2042 3710]
        # f1-Score: 0.640874558778

        #random forest classifier
        # [1821 2667]
        # [1192 4560]
        # f1-Score: 0.702651511011
        #=========================================================================================


        """So far we have used bag of words technique to extract the features and passed those featuers into classifiers. We have also seen the
        f1 scores of these classifiers. now lets enhance these features using term frequency weights with various n-grams
        """

        #Now using n-grams
        #naive-bayes classifier
        # nb_pipeline_ngram = Pipeline([
        #         ('nb_tfidf',FeatureSelection.tfidf_ngram),
        #         ('nb_clf',MultinomialNB())])

        # nb_pipeline_ngram.fit(DataPrep.train_news['Statement'],DataPrep.train_news['Label'])
        # predicted_nb_ngram = nb_pipeline_ngram.predict(DataPrep.test_news['Statement'])
        # np.mean(predicted_nb_ngram == DataPrep.test_news['Label'])
        #
        #
        #
        ##logistic regression classifier
        logR_pipeline_ngram = Pipeline([
                ('LogR_tfidf',FeatureSelection.tfidf_ngram),
                ('LogR_clf',LogisticRegression(penalty="l2",C=1))
                ])
        logR_pipeline_ngram.fit(app.DataPrep.train_news['Statement'],app.DataPrep.train_news['Label'])
        predicted_LogR_ngram = logR_pipeline_ngram.predict(app.DataPrep.test_news['Statement'])
        np.mean(predicted_LogR_ngram == app.DataPrep.test_news['Label'])
        #build_confusion_matrix(logR_pipeline_ngram)
        print("reached end of logR n gram")
        #
        ##linear SVM classifier
        #svm_pipeline_ngram = Pipeline([
        #        ('svm_tfidf',FeatureSelection.tfidf_ngram),
        #        ('svm_clf',svm.LinearSVC())
        #        ])
        #svm_pipeline_ngram.fit(DataPrep.train_news['Statement'],DataPrep.train_news['Label'])
        #predicted_svm_ngram = svm_pipeline_ngram.predict(DataPrep.test_news['Statement'])
        #np.mean(predicted_svm_ngram == DataPrep.test_news['Label'])
        #
        #
        ##sgd classifier
        #sgd_pipeline_ngram = Pipeline([
        #         ('sgd_tfidf',FeatureSelection.tfidf_ngram),
        #         ('sgd_clf',SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5))
        #         ])
        #sgd_pipeline_ngram.fit(DataPrep.train_news['Statement'],DataPrep.train_news['Label'])
        #predicted_sgd_ngram = sgd_pipeline_ngram.predict(DataPrep.test_news['Statement'])
        #np.mean(predicted_sgd_ngram == DataPrep.test_news['Label'])

        #
        ##random forest classifier
        #random_forest_ngram = Pipeline([
        #       ('rf_tfidf',FeatureSelection.tfidf_ngram),
        #       ('rf_clf',RandomForestClassifier(n_estimators=300,n_jobs=3))
        #       ])
        #random_forest_ngram.fit(DataPrep.train_news['Statement'],DataPrep.train_news['Label'])
        #predicted_rf_ngram = random_forest_ngram.predict(DataPrep.test_news['Statement'])
        #np.mean(predicted_rf_ngram == DataPrep.test_news['Label'])
        #
        #
        ##K-fold cross validation for all classifiers
        # build_confusion_matrix(nb_pipeline_ngram)
        # build_confusion_matrix(logR_pipeline_ngram)
        #build_confusion_matrix(svm_pipeline_ngram)
        #build_confusion_matrix(sgd_pipeline_ngram)
        #build_confusion_matrix(random_forest_ngram)
        ##
        ##========================================================================================
        ##n-grams & tfidf confusion matrix and F1 scores

        ##Naive bayes
        ## [841 3647]
        ## [427 5325]
        ## f1-Score: 0.723262051071

        ##Logistic regression
        ## [1617 2871]
        ## [1097 4655]
        ## f1-Score: 0.70113000531

        ##svm
        ## [2016 2472]
        ## [1524 4228]
        ## f1-Score: 0.67909201429

        ##sgdclassifier
        ## [  10 4478]
        ## [  13 5739]
        ## f1-Score: 0.718731637053

        ##random forest
        ## [1979 2509]
        ## [1630 4122]
        ## f1-Score: 0.665720333284
        ##=========================================================================================

        #print(classification_report(DataPrep.test_news['Label'], predicted_nb_ngram))
        # print(classification_report(DataPrep.test_news['Label'], predicted_LogR_ngram))
        #print(classification_report(DataPrep.test_news['Label'], predicted_svm_ngram))
        #print(classification_report(DataPrep.test_news['Label'], predicted_sgd_ngram))
        #print(classification_report(DataPrep.test_news['Label'], predicted_rf_ngram))
        ##
        ##DataPrep.test_news['Label'].shape
        ##
        ##"""
        ##Out of all the models fitted, we would take 2 best performing model. we would call them candidate models
        ##from the confusion matrix, we can see that random forest and logistic regression are best performing 
        ##in terms of precision and recall (take a look into false positive and true negative counts which appeares
        ##to be low compared to rest of the models)
        ##"""
        ##
        ###grid-search parameter optimization
        ###random forest classifier parameters
        #parameters = {'rf_tfidf__ngram_range': [(1, 1), (1, 2),(1,3),(1,4),(1,5)],
        #               'rf_tfidf__use_idf': (True, False),
        #               'rf_clf__max_depth': (1,2,3,4,5,6,7,8,9,10,11,12,13,14,15)
        #}
        #
        #gs_clf = GridSearchCV(random_forest_ngram, parameters, n_jobs=-1)
        #gs_clf = gs_clf.fit(DataPrep.train_news['Statement'][:10000],DataPrep.train_news['Label'][:10000])
        #
        #gs_clf.best_score_
        #gs_clf.best_params_
        #gs_clf.cv_results_
        print("Start of Log parameters")
        ###logistic regression parameters
        #parameters = {'LogR_tfidf__ngram_range': [(1, 1), (1, 2),(1,3),(1,4),(1,5)],
        #               'LogR_tfidf__use_idf': (True, False),
        #               'LogR_tfidf__smooth_idf': (True, False)
        #}
        #
        #gs_clf = GridSearchCV(logR_pipeline_ngram, parameters, n_jobs=-1)
        #gs_clf = gs_clf.fit(DataPrep.test_news['Statement'][:10000],DataPrep.test_news['Label'][:10000])
        #
        #gs_clf.best_score_
        #gs_clf.best_params_
        #gs_clf.cv_results_
        #print("end of Log Parameters")
        #
        ###Linear SVM 
        #parameters = {'svm_tfidf__ngram_range': [(1, 1), (1, 2),(1,3),(1,4),(1,5)],
        #               'svm_tfidf__use_idf': (True, False),
        #               'svm_tfidf__smooth_idf': (True, False),
        #               'svm_clf__penalty': ('l1','l2'),
        #}
        #
        #gs_clf = GridSearchCV(svm_pipeline_ngram, parameters, n_jobs=-1)
        #gs_clf = gs_clf.fit(DataPrep.test_news['Statement'][:10000],DataPrep.test_news['Label'][:10000])
        #
        #gs_clf.best_score_
        #gs_clf.best_params_
        #gs_clf.cv_results_

        ###by running above commands we can find the model with best performing parameters
        ##
        ##
        ###running both random forest and logistic regression models again with best parameter found with GridSearch method
        #random_forest_final = Pipeline([
        #        ('rf_tfidf',TfidfVectorizer(stop_words='english',ngram_range=(1,3),use_idf=True,smooth_idf=True)),
        #        ('rf_clf',RandomForestClassifier(n_estimators=300,n_jobs=3,max_depth=10))
        #        ])
        #    
        #random_forest_final.fit(DataPrep.test_news['Statement'],DataPrep.test_news['Label'])
        #predicted_rf_final = random_forest_final.predict(DataPrep.test_news['Statement'])
        #np.mean(predicted_rf_final == DataPrep.test_news['Label'])
        #build_confusion_matrix(random_forest_final)
        #print("final done")
        ##print(metrics.classification_report(DataPrep.test_news['Label'], predicted_rf_final))
        ##
        #sgd_pipeline_final = Pipeline([
        #         ('sgd_tfidf',TfidfVectorizer(stop_words='english',ngram_range=(1,3),use_idf=True,smooth_idf=True)),
        #         ('sgd_clf',SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5))
        #         ])
        #sgd_pipeline_final.fit(DataPrep.train_news['Statement'],DataPrep.train_news['Label'])
        #predicted_sgd_final = sgd_pipeline_ngram.predict(DataPrep.test_news['Statement'])
        #np.mean(predicted_sgd_final == DataPrep.test_news['Label'])
        #build_confusion_matrix(sgd_pipeline_final)
        #print("start of Log best performing parameters")
        #logR_pipeline_final = Pipeline([
        #        #('LogRCV',countV_ngram),
        #        ('LogR_tfidf',TfidfVectorizer(stop_words='english',ngram_range=(1,5),use_idf=True,smooth_idf=False)),
        #        ('LogR_clf',LogisticRegression(penalty="l2",C=1))
        #        ])
        #
        #logR_pipeline_final.fit(DataPrep.train_news['Statement'],DataPrep.train_news['Label'])
        #predicted_LogR_final = logR_pipeline_final.predict(DataPrep.test_news['Statement'])
        #np.mean(predicted_LogR_final == DataPrep.test_news['Label'])
        #build_confusion_matrix(logR_pipeline_final)

        ##accuracy = 0.62
        #print(metrics.classification_report(DataPrep.test_news['Label'], predicted_LogR_final))

        #sgd_pipeline_final = Pipeline([
        #         ('sgd_tfidf',TfidfVectorizer(stop_words='english',ngram_range=(1,5),use_idf=True,smooth_idf=False)),
        #         ('sgd_clf',SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5))
        #         ])
        #
        #sgd_pipeline_final.fit(DataPrep.test_news['Statement'],DataPrep.test_news['Label'])
        #predicted_sgd_final = sgd_pipeline_final.predict(DataPrep.test_news['Statement'])
        #np.mean(predicted_sgd_final == DataPrep.test_news['Label'])
        #build_confusion_matrix(sgd_pipeline_final)
        #print("End of best performing parameters")
        print("End of log_R")

        
Example #10
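This function is shown without its imports, and several helpers (`segmentAudio`, `sort_alphanumeric`, `ds_process_audio`, `correct_output`) are project-local. A hedged sketch of the standard-library and third-party imports it appears to need:

# Assumed imports (not shown in the original snippet).
import ntpath
import os
import shutil
import subprocess

import deepspeech
from tqdm import tqdm

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer  # assumption: LSA as elsewhere
from sumy.utils import get_stop_words

# Project-local helpers, assumed to live alongside this module:
#   import segmentAudio                                   # silence-based audio splitting
#   from <local module> import sort_alphanumeric, ds_process_audio, correct_output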
def to_deepspeech(filepath):
    
    model_file_path = 'deepspeech-0.9.3-models.pbmm'
    scorer_path = 'deepspeech-0.9.3-models.scorer'
    ds = deepspeech.Model(model_file_path)
    ds.enableExternalScorer(scorer_path)
    
    base_directory = os.getcwd() #return current directory
    output_directory = os.path.join(base_directory,"output")
    audio_directory = os.path.join(base_directory,"temp")#save temp audio segment
    #output text file
    audio_file_name = filepath.split(os.sep)[-1].split(".")[0]
    filename = ntpath.basename(filepath).split(".")[0]
    txt_file_name = os.path.join(output_directory,filename+"_temp1.txt")
    file_handle = open(txt_file_name,"w+")
    
    #split silent parts in audio file
    segmentAudio.silenceRemoval(filepath,audio_directory)  
    
    print("Running interface:")
    for file in tqdm(sort_alphanumeric(os.listdir(audio_directory))):
        audio_segment_path = os.path.join(audio_directory,file)
        ds_process_audio(ds, audio_segment_path,file_handle)
    
    file_handle.close()
    
    print("\nSpeech-to-Text Conversion Complete, restoring Punctuation...")
    
    # Running commands to punctuate deepspeech output
    punct2_script = os.path.join(base_directory, "punctutator2_theano", "punctuator2-master", "punctuator.py")
    punct2_model = os.path.join(base_directory, "punctutator2_theano", "models", "INTERSPEECH-T-BRNN.pcl")
    punct_output = os.path.join(output_directory,filename+"_temp2.txt")
    final_output = os.path.join(output_directory,filename+".txt")
    
    subprocess.run(["python2",punct2_script,punct2_model,punct_output,txt_file_name]) #punct_output_txt
    correct_output(punct_output,final_output)
    
    
    print("\nFinished! Output is",final_output,"\n") #final_output_txt
    print("Summarization starts...\n")
    
    
    # delete the original, unpunctuated DeepSpeech output
    if os.path.exists(txt_file_name):
      os.remove(txt_file_name)
    else:
      print("The file does not exist")
    
    if os.path.exists(punct_output):
      os.remove(punct_output)
    else:
      print("The file does not exist")
        
    
    # Text Summarization
    LANGUAGE = "english"
    with open(final_output, 'r') as f:
        lines = f.read().count(".")  # rough sentence count (number of periods)
    # print(lines, "\n")

    SENTENCES_COUNT = int(lines / 4)  # summarize to roughly a quarter of the content
    # print(SENTENCES_COUNT,"\n")

    parser = PlaintextParser.from_file(final_output, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    Final = os.path.join(output_directory,filename+"_Final.txt")
    F = open(Final,"w")
    
    print("Your output will be shown below and saved in the output directory.\n")
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
        print(sentence, file=F)
    
    F.close()
    print("\nEverything is done!")
    
    ##clean directory and temp file
    tmp_wav = os.path.join(base_directory,filename + ".wav")
    if os.path.exists(tmp_wav):
        os.remove(tmp_wav)
        
    shutil.rmtree(audio_directory)
    os.mkdir(audio_directory)
Example #11
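This Django view is likewise shown without its imports. A hedged sketch of what it appears to rely on follows; `textDB`, `imageDB`, and `Scraper` are project-local models/classes and are only referenced, and the LSA summarizer is an assumption.

# Assumed imports (not shown in the original snippet).
import json
from urllib.request import Request, urlopen
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup
from django.http import HttpResponse
from django.utils import timezone
from selenium import webdriver

from sumy.parsers.plaintext import PlaintextParser
from sumy.parsers.html import HtmlParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer  # assumption
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.utils import get_stop_words

# Project-local (assumed importable from the app's own modules):
#   from .models import textDB, imageDB
#   from .scraper import Scraper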
def scrape(request):
	if request.method == 'POST':
		
		y = json.loads(request.body)
		url = y.get("url", None)
		print(url)

		driver = webdriver.PhantomJS(executable_path='../phantomjs/bin/phantomjs')
		driver.get(url)
		el=driver.find_element_by_tag_name("body")
		textContent=el.text
		driver.close()

		imageSourceUrls = imageDB.objects.values_list('sourceUrl', flat=True)
		imageSourceUrls = list(imageSourceUrls)

		textSourceUrls = textDB.objects.values_list('sourceUrl', flat=True)
		textSourceUrls = list(textSourceUrls)

		summary = textContent
		if url not in textSourceUrls or url not in imageSourceUrls:
			LANGUAGE = "english"
			SENTENCES_COUNT = 10

			# parser = PlaintextParser.from_string(textContent,Tokenizer("english"))
			# summarizer = LuhnSummarizer()
			# summary = ''

			# for sentence in summarizer(parser.document, SENTENCES_COUNT):
			# 	summary = summary + str(sentence)
			

			# print("Summary ",summary)

			
			parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
			stemmer = Stemmer(LANGUAGE)
			summarizer = Summarizer(stemmer)
			summarizer.stop_words = get_stop_words(LANGUAGE)

			summaryText = ""
			for sentence in summarizer(parser.document, SENTENCES_COUNT):
				summaryText = summaryText + str(sentence)
			
			r = Request(url,headers={'User-Agent': 'Mozilla/5.0'})
			x = urlopen(r)
			codebase = BeautifulSoup(x, 'html.parser')
			title = codebase.title.string
			if not title:
				domain = urlparse(url)
				title = domain.hostname
			print(title)
			iconLink = codebase.find("link", rel="shortcut icon")
			if not iconLink:
				iconLink = ' '
			else:
				iconLink = urljoin(url, iconLink.get('href'))

			textDB.objects.create(summaryText=summaryText, summary=summary, dateTime=timezone.now(), sourceUrl=url, title=title, icon=iconLink)

			scraper = Scraper()
			scraper.scrape(url)
		else:
			textDB.objects.filter(sourceUrl=url).delete()
			imageDB.objects.filter(sourceUrl=url).delete()
			print("DELETED")
			LANGUAGE = "english"
			SENTENCES_COUNT = 10

			parser = PlaintextParser.from_string(textContent,Tokenizer("english"))
			summarizer = LuhnSummarizer()

			summary = ''

			for sentence in summarizer(parser.document, SENTENCES_COUNT):
				summary = summary + str(sentence)

			parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
			stemmer = Stemmer(LANGUAGE)
			summarizer = Summarizer(stemmer)
			summarizer.stop_words = get_stop_words(LANGUAGE)

			summaryText = ""
			for sentence in summarizer(parser.document, SENTENCES_COUNT):
				summaryText = summaryText + str(sentence)

			r = Request(url,headers={'User-Agent': 'Mozilla/5.0'})
			x = urlopen(r)
			codebase = BeautifulSoup(x, 'html.parser')
			title = codebase.title.string
			iconLink = codebase.find("link", rel="shortcut icon")
			if not iconLink:
				iconLink = ' '
			else:
				iconLink = urljoin(url, iconLink.get('href'))

			textDB.objects.create(summaryText=summaryText, summary=summary, dateTime=timezone.now(), sourceUrl=url, title=title, icon=iconLink)

			scraper = Scraper()
			scraper.scrape(url)


		return HttpResponse("Successful")