Example 1
import re
import urllib.request

import bs4 as bs
from nltk.corpus import stopwords
from gensim.summarization import mz_keywords
# `su_gs` (the gensim summarizer) and `print_pretty` are helpers defined
# elsewhere in the original project.


def summarize(url_topull, num_of_words):
    # Obtain text
    scraped_data = urllib.request.urlopen(url_topull)
    article = scraped_data.read()
    
    parsed_article = bs.BeautifulSoup(article, 'lxml')
    paragraphs = parsed_article.find_all('p')
    article_text = ""
    for p in paragraphs:
        article_text += p.text

    # Extract keywords, dropping stopwords and very short tokens
    stop_words = set(stopwords.words('english'))
    keywords = mz_keywords(article_text, scores=True, threshold=0.003)
    keywords_names = []
    for tuples in keywords:
        if tuples[0] not in stop_words and len(tuples[0]) > 2:
            keywords_names.append(tuples[0])

    
    # Summarize with gensim, then strip any parenthesised or bracketed text
    pre_summary = su_gs(article_text, word_count=num_of_words)

    summary = re.sub(r"[\(\[].*?[\)\]]", "", pre_summary)
    
    print_pretty(summary, keywords_names)
Example 2
    def test_mz_keywords(self):
        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

        with utils.smart_open(os.path.join(pre_path, "head500.noblanks.cor")) as f:
            text = utils.to_unicode(f.read())
        text = u' '.join(text.split()[:10240])
        kwds = mz_keywords(text)
        self.assertTrue(kwds.startswith('autism'))
        self.assertTrue(kwds.endswith('uk'))
        self.assertTrue(len(kwds.splitlines()))

        kwds_lst = mz_keywords(text, split=True)
        self.assertTrue(len(kwds_lst))
        # Automatic thresholding selects words with more than n_blocks / (n_blocks + 1)
        # bits of entropy (10 / 11 ≈ 0.91 bits for this text, where n_blocks = 10).
        n_blocks = 10.
        kwds_auto = mz_keywords(text, scores=True, weighted=False, threshold='auto')
        self.assertTrue(kwds_auto[-1][1] > (n_blocks / (n_blocks + 1.)))
Example 3
import re

from nltk.corpus import stopwords
from gensim.summarization import mz_keywords
# `tr.textrank_summarise` and `print_pretty` are project-specific helpers
# defined elsewhere in the original code.


def summarize(article_text, num_of_sentences):
    # Extract keywords, dropping stopwords and very short tokens
    stop_words = set(stopwords.words('english'))
    keywords = mz_keywords(article_text, scores=True, threshold=0.003)
    keywords_names = []
    for tuples in keywords:
        if tuples[0] not in stop_words:
            if len(tuples[0]) > 2:
                keywords_names.append(tuples[0])

    pre_summary, rank_sum = tr.textrank_summarise(article_text,
                                                  num_of_sentences)

    summary = re.sub(r"[\(\[].*?[\)\]♪]", "", pre_summary)

    print_pretty(summary, keywords_names)

    return summary, rank_sum
Example 4
from gensim.matutils import hellinger

# `model`, `corpus`, `lda_bow_finance`, `texts` and `make_topics_bow` are
# defined earlier in the notebook this snippet is taken from.
topic_water, topic_finance = model.show_topics()
finance_distribution = make_topics_bow(topic_finance[1])
water_distribution = make_topics_bow(topic_water[1])

hellinger(water_distribution, finance_distribution)

from gensim import similarities

index = similarities.MatrixSimilarity(model[corpus])
sims = index[lda_bow_finance]
print(list(enumerate(sims)))

sims = sorted(enumerate(sims), key=lambda item: -item[1])

for doc_id, similarity in sims:
    print(texts[doc_id], similarity)

from gensim.summarization import summarize
print(summarize(text))

print(summarize(text, word_count=50))

from gensim.summarization import keywords

print(keywords(text))

from gensim.summarization import mz_keywords
mz_keywords(text, scores=True, weighted=False, threshold=1.0)

Example 5
    def run(self, query='', location='New York, NY'):
        """Search for job postings, scrape them, and summarize the results."""
        q = query  # target job title
        l = location  # job location
        numPage = 3  # number of result pages to scrape links from
        allLinks = []  # list to capture job posting links
        start = 0  # pagination variable: page 1 = 0, page 2 = 10, page 3 = 20, etc.

        # loop over the first numPage pages of search results
        for page_result in range(numPage):
            start = page_result * 10  # offset used to request the next page
            search_result_url = 'https://www.indeed.com/jobs?q=' + q + '&l=' + l + '&start=' + str(start)  # build query string
            if self.verbose:
                print(search_result_url, end="\r", flush=True)
            jobSearchResult = self.scrape_search_result_page(search_result_url, page_result, self.browser)  # call scraper function
            allLinks.extend(jobSearchResult)  # add to the link list
        # remove duplicates
        if self.verbose:
            print(len(allLinks))
        allLinks = list(set(allLinks))
        if self.verbose:
            print(len(allLinks))


        #print(allLinks)
        job_urls_file = 'jobSearchResult-' +q+'.txt'
        # write to file
        self.write_lst(allLinks,job_urls_file)
        homepage_found = False
        page_data = ''
        page_data_list = []
        print("Scraping " + str(len(allLinks)) + " job links.")
        incrementOp = math.ceil(len(allLinks) / 10)  # print a progress marker roughly every 10% of links
        counter = incrementOp
        for link_num, indeed_url in enumerate(allLinks):
            # if self.verbose==True:
            print("Accessing link", link_num + 1, "of", len(allLinks), '      ', end="\r", flush=True)
            if counter < 1:
                print("*", end='')
                counter = incrementOp
            counter -= 1
            try:
                page_soup = self.remove_script(self.get_js_soup(indeed_url, self.browser))
            except Exception:
                print('Could not access {}'.format(indeed_url))
                continue  # skip this link instead of reusing the previous page's soup

            page_data = self.process_bio(page_soup.get_text(separator=' '))  # helper function (from bio hw) to clean up text

            # remove header: the first 189 characters of an Indeed page are boilerplate
            page_data = page_data[189:]

            # remove footer: 'save job' marks the start of the footer
            footer_position = page_data.find('save job')
            trimStringBy = footer_position - len(page_data)  # negative offset to trim the string by
            page_data = page_data[:trimStringBy]  # drop footer
            page_data = remove_stopwords(page_data)
            page_data_list.append(page_data)
        print("*-> Scrape Complete.")
        print("Summarizing Findings.")
        # if self.verbose==True:
        #     print(page_data_list[1])
        document_set = page_data_list
        page_data_file = 'pageText' +q+'.txt'
        self.write_lst(page_data_list,page_data_file)

        # Create single document by concatenating all documents
        all_documents = ""
        for doc in page_data_list:
            all_documents += doc

        self.alldocs=all_documents

        # Keyword extraction and summarization with gensim
        keywords(all_documents).split('\n')  # result is not stored
        self.summary = summarize(all_documents, word_count=250, split=True)
        self.keywords = mz_keywords(all_documents, scores=True, threshold=0.001)
        if self.verbose:
            print(self.summary)
            print(self.keywords)

        # # Topic Modeling
        # ### tokenize the documents
        docs = page_data_list

        # Split the documents into tokens.
        tokenizer = RegexpTokenizer(r'\w+')
        for idx in range(len(docs)):
            docs[idx] = docs[idx].lower()  # Convert to lowercase.
            docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

        # Remove numbers, but not words that contain numbers.
        docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

        # Remove words that are 3 characters or shorter.
        docs = [[token for token in doc if len(token) > 3] for doc in docs]


        # must download wordnet!!!
        # nltk.download('wordnet')

        # ### lemmatize the documents
        lemmatizer = WordNetLemmatizer()
        docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]


        # ### compute bigrams
        # Add bigrams to docs (only ones that appear 10 times or more).
        bigram = Phrases(docs, min_count=10)
        for idx in range(len(docs)):
            for token in bigram[docs[idx]]:
                if '_' in token:
                    # Token is a bigram, add to document.
                    docs[idx].append(token)


        # ### remove rare and common tokens
        # Create a dictionary representation of the documents.
        dictionary = Dictionary(docs)
        # Filter out words that occur in fewer than 20 documents, or in more than 75% of the documents.
        dictionary.filter_extremes(no_below=20, no_above=0.75)
        # Bag-of-words representation of the documents.
        corpus = [dictionary.doc2bow(doc) for doc in docs]
        
        print('Number of unique tokens: %d' % len(dictionary))
        print('Number of documents: %d' % len(corpus))
        
        # ## Build LDA Model
        # Set training parameters.
        num_topics = self.num_topics
        chunksize = 2000
        passes = 20
        iterations = 400
        eval_every = None  # Don't evaluate model perplexity, takes too much time.

        # Make an index-to-word dictionary.
        temp = dictionary[0]  # This is only to "load" the dictionary.
        id2word = dictionary.id2token

        # store into object
        self.corpus = corpus
        self.dictionary = dictionary
        self.id2word = id2word

        lda_model = LdaModel(
            corpus=corpus,
            id2word=id2word,
            chunksize=chunksize,
            alpha='auto',
            eta='auto',
            iterations=iterations,
            num_topics=num_topics,
            passes=passes,
            eval_every=eval_every
        )

        # top_topics = lda_model.top_topics(self.corpus, topn=10) #, num_words=10)
        # # print(top_topics)
        # # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
        # avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
        # if self.verbose==True:
        #     print('Average topic coherence: %.4f.' % avg_topic_coherence)

        #     for idx, topic in lda_model.print_topics(-1):
        #         print('Topic: {} \nWords: {}'.format(idx, topic))
        
        # store into object
        self.lda=lda_model

        # ### PCA
        lda_topic_coverage = []
        if self.verbose:
            print(lda_model[corpus])
        for i, row_list in enumerate(lda_model[corpus]):
            # row_list contains (topic number, topic probability) tuples;
            # note that each doc can have multiple topics

            # initialize a zero vector with one entry per topic
            r = np.zeros(num_topics)

            topic_n, p_topic = zip(*row_list)

            # store each topic probability into r
            for j in range(len(row_list)):
                r[topic_n[j]] = p_topic[j]
            lda_topic_coverage.append(r)


        # Array of topic weights
        self.lda_topic_coverage = pd.DataFrame(lda_topic_coverage).fillna(0).values

        # only look at the first two components
        pca = PCA(n_components=2)
        result = pca.fit_transform(self.lda_topic_coverage)

        self.pca=result

        # Dominant topic number in each doc
        topic_num = np.argmax(self.lda_topic_coverage, axis=1)


        # # tSNE Dimension Reduction
        # tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
        # tsne_lda = tsne_model.fit_transform(self.lda_topic_coverage)


            
Example 6
# Montemurro and Zanette's entropy based keyword extraction algorithm
# -------------------------------------------------------------------
#
# `This paper <https://arxiv.org/abs/0907.1558>`__ describes a technique to
# identify words that play a significant role in the large-scale structure of a
# text. These typically correspond to the major themes of the text. The text is
# divided into blocks of ~1000 words, and the entropy of each word's
# distribution amongst the blocks is calculated and compared with the expected
# entropy if the word were distributed randomly.
#

import requests
from gensim.summarization import mz_keywords

text = requests.get("http://www.gutenberg.org/files/49679/49679-0.txt").text
print(mz_keywords(text, scores=True, threshold=0.001))
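
###############################################################################
# As a rough, self-contained sketch of the idea described above (this is not
# gensim's implementation; the 1024-word block size and the choice of log base
# 2 are assumptions made only for illustration), the entropy of a single
# word's distribution across blocks could be computed like this:
#
import math
from collections import Counter


def block_entropy(tokens, word, block_size=1024):
    """Entropy (in bits) of `word`'s distribution across equal-sized blocks."""
    blocks = [tokens[i:i + block_size] for i in range(0, len(tokens), block_size)]
    counts = [Counter(block)[word] for block in blocks]
    total = sum(counts)
    if total == 0:
        return 0.0
    probs = [c / total for c in counts if c]
    return -sum(p * math.log2(p) for p in probs)


print(block_entropy(text.lower().split(), "project"))  # "project" occurs in the Gutenberg boilerplate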

###############################################################################
# By default, the algorithm weights the entropy by the overall frequency of the
# word in the document. We can remove this weighting by setting weighted=False
#
print(mz_keywords(text, scores=True, weighted=False, threshold=1.0))

###############################################################################
# When this option is used, it is possible to calculate a threshold
# automatically from the number of blocks
#
print(mz_keywords(text, scores=True, weighted=False, threshold="auto"))

###############################################################################
# The complexity of the algorithm is **O**\ (\ *Nw*\ ), where *N* is the number