import re
import urllib.request

import bs4 as bs
from nltk.corpus import stopwords
from gensim.summarization import summarize as su_gs
from gensim.summarization import mz_keywords


def summarize(url_topull, num_of_words):
    # Obtain text
    scraped_data = urllib.request.urlopen(url_topull)
    article = scraped_data.read()
    parsed_article = bs.BeautifulSoup(article, 'lxml')
    paragraphs = parsed_article.find_all('p')
    article_text = ""
    for p in paragraphs:
        article_text += p.text

    # Extract keywords
    stop_words = set(stopwords.words('english'))
    keywords = mz_keywords(article_text, scores=True, threshold=0.003)
    keywords_names = []
    for tuples in keywords:
        if tuples[0] not in stop_words:
            if len(tuples[0]) > 2:
                keywords_names.append(tuples[0])

    # Summarise, then strip any bracketed asides from the result
    pre_summary = su_gs(article_text, word_count=num_of_words)
    summary = re.sub(r"[\(\[].*?[\)\]]", "", pre_summary)
    print_pretty(summary, keywords_names)
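# A minimal usage sketch for the function above, assuming the imports and the
# project's print_pretty helper are available; the URL and word count below are
# illustrative placeholders, not values taken from the original code.
summarize("https://en.wikipedia.org/wiki/Automatic_summarization", 100)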
def test_mz_keywords(self):
    pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

    with utils.smart_open(os.path.join(pre_path, "head500.noblanks.cor")) as f:
        text = utils.to_unicode(f.read())
    text = u' '.join(text.split()[:10240])
    kwds = mz_keywords(text)
    self.assertTrue(kwds.startswith('autism'))
    self.assertTrue(kwds.endswith('uk'))
    self.assertTrue(len(kwds.splitlines()))

    kwds_lst = mz_keywords(text, split=True)
    self.assertTrue(len(kwds_lst))
    # Automatic thresholding selects words with n_blocks / (n_blocks + 1)
    # bits of entropy. For this text, n_blocks = 10.
    n_blocks = 10.
    kwds_auto = mz_keywords(text, scores=True, weighted=False, threshold='auto')
    self.assertTrue(kwds_auto[-1][1] > (n_blocks / (n_blocks + 1.)))
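# A small worked check of the automatic threshold asserted in the test above,
# assuming the same block count: with n_blocks = 10 the cut-off is
# 10 / (10 + 1) ≈ 0.909 bits, so every score returned with threshold='auto'
# must exceed that value.
n_blocks = 10.
auto_threshold = n_blocks / (n_blocks + 1.)
print(auto_threshold)  # 0.9090909...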
def summarize(article_text, num_of_sentences):
    # Extract keywords
    stop_words = set(stopwords.words('english'))
    keywords = mz_keywords(article_text, scores=True, threshold=0.003)
    keywords_names = []
    for tuples in keywords:
        if tuples[0] not in stop_words:
            if len(tuples[0]) > 2:
                keywords_names.append(tuples[0])

    # Summarise with TextRank, then strip bracketed asides and stray symbols
    pre_summary, rank_sum = tr.textrank_summarise(article_text, num_of_sentences)
    summary = re.sub(r"[\(\[].*?[\)\]♪]", "", pre_summary)
    print_pretty(summary, keywords_names)
    return summary, rank_sum
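# A usage sketch for the TextRank-based variant above, assuming the project's
# tr module (providing textrank_summarise) is importable and that article_text
# already holds scraped article text; the sentence count is an illustrative value.
summary, rank_sum = summarize(article_text, num_of_sentences=5)
print(summary)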
# Compare two topics with the Hellinger distance, then query a similarity index
topic_water, topic_finance = model.show_topics()
finance_distribution = make_topics_bow(topic_finance[1])
water_distribution = make_topics_bow(topic_water[1])
hellinger(water_distribution, finance_distribution)

from gensim import similarities

index = similarities.MatrixSimilarity(model[corpus])
sims = index[lda_bow_finance]
print(list(enumerate(sims)))

sims = sorted(enumerate(sims), key=lambda item: -item[1])
for doc_id, similarity in sims:
    print(texts[doc_id], similarity)

from gensim.summarization import summarize
print(summarize(text))
print(summarize(text, word_count=50))

from gensim.summarization import keywords
print(keywords(text))

from gensim.summarization import mz_keywords
mz_keywords(text, scores=True, weighted=False, threshold=1.0)
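# The snippet above relies on a make_topics_bow helper that is not shown here.
# A plausible sketch, assuming it converts the topic string returned by
# model.show_topics() into (word_id, probability) pairs that hellinger accepts:
def make_topics_bow(topic):
    # topic arrives as a string such as '0.043*"water" + 0.021*"river" + ...'
    words = topic.split('+')
    topic_bow = []
    for word in words:
        # separate the probability from the word
        prob, word = word.split('*')
        # strip spaces and quote marks
        word = word.replace(" ", "").replace('"', '')
        # map the word to its id in the model's dictionary
        word_id = model.id2word.doc2bow([word])[0][0]
        topic_bow.append((word_id, float(prob)))
    return topic_bow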
def run(self, query='', location='New York, NY'):
    """Search Indeed for job postings, scrape each result, and summarise the findings."""
    q = query        # desired job title
    l = location     # location of job
    numPage = 3      # number of result pages to scrape links from
    allLinks = []    # list to capture the job posting links
    start = 0        # pagination variable: page 1 = 0, page 2 = 10, page 3 = 20, etc.

    # loop over n number of pages
    for page_result in range(numPage):
        start = page_result * 10  # increment the variable used to denote the next page
        # build query string
        search_result_url = 'https://www.indeed.com/jobs?q=' + q + '&l=' + l + '&start=' + str(start)
        if self.verbose:
            print(search_result_url, end="\r", flush=True)
        # call scraper function
        jobSearchResult = self.scrape_search_result_page(search_result_url, page_result, self.browser)
        allLinks.extend(jobSearchResult)  # add to link list

    # Remove duplicates
    if self.verbose:
        print(len(allLinks))
    allLinks = list(set(allLinks))
    if self.verbose:
        print(len(allLinks))
    # print(allLinks)

    # write the links to file
    job_urls_file = 'jobSearchResult-' + q + '.txt'
    self.write_lst(allLinks, job_urls_file)

    homepage_found = False
    page_data = ''
    page_data_list = []
    print("Scraping " + str(len(allLinks)) + " job links.")
    incrementOp = math.ceil(len(allLinks) / 10)
    counter = incrementOp
    for link_num, indeed_url in enumerate(allLinks):
        # if self.verbose==True:
        print("Accessing link", link_num + 1, "of", len(allLinks), ' ', end="\r", flush=True)
        if counter < 1:
            print("*", end='')
            counter = incrementOp
        counter -= 1
        try:
            page_soup = self.remove_script(self.get_js_soup(indeed_url, self.browser))
        except:
            print('Could not access {}'.format(indeed_url))
            continue  # skip links that cannot be fetched
        # helper function from bio hw to clean up text
        page_data = self.process_bio(page_soup.get_text(separator=' '))
        # remove header: the 189 slice removes the header of the Indeed pages
        page_data = page_data[189:]
        # remove footer: find the position of 'save job', which starts the footer
        footer_position = page_data.find('save job')
        trimStringBy = footer_position - len(page_data)  # negative number to trim the string by
        page_data = page_data[:trimStringBy]  # drop footer
        page_data = remove_stopwords(page_data)
        page_data_list.append(page_data)

    print("*-> Scrape Complete.")
    print("Summarizing Findings.")
    # if self.verbose==True:
    #     print(page_data_list[1])
    document_set = page_data_list
    page_data_file = 'pageText' + q + '.txt'
    self.write_lst(page_data_list, page_data_file)

    # Create a single document by concatenating all documents
    all_documents = ""
    for doc in page_data_list:
        all_documents += doc
    self.alldocs = all_documents

    # Creating keywords and summary
    keywords(all_documents).split('\n')
    self.summary = summarize(all_documents, word_count=250, split=True)
    self.keywords = mz_keywords(all_documents, scores=True, threshold=0.001)
    if self.verbose:
        print(self.summary)
        print(self.keywords)

    # # Topic Modeling
    # ### tokenize the documents
    docs = page_data_list
    # Split the documents into tokens.
    tokenizer = RegexpTokenizer(r'\w+')
    for idx in range(len(docs)):
        docs[idx] = docs[idx].lower()  # Convert to lowercase.
        docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

    # Remove numbers, but not words that contain numbers.
    docs = [[token for token in doc if not token.isnumeric()] for doc in docs]
    # Remove words that are 3 characters or fewer.
    docs = [[token for token in doc if len(token) > 3] for doc in docs]

    # must download wordnet first: nltk.download('wordnet')
    # ### lemmatize the documents
    lemmatizer = WordNetLemmatizer()
    docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

    # ### compute bigrams
    # Add bigrams and trigrams to docs (only ones that appear 10 times or more).
    bigram = Phrases(docs, min_count=10)
    for idx in range(len(docs)):
        for token in bigram[docs[idx]]:
            if '_' in token:
                # Token is a bigram, add to document.
                docs[idx].append(token)

    # ### remove rare and common tokens
    # Create a dictionary representation of the documents.
    dictionary = Dictionary(docs)
    # Filter out words that occur in fewer than 20 documents, or in more than 75% of the documents.
    dictionary.filter_extremes(no_below=20, no_above=0.75)

    # Bag-of-words representation of the documents.
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    print('Number of unique tokens: %d' % len(dictionary))
    print('Number of documents: %d' % len(corpus))

    # ## Build LDA Model
    # Set training parameters.
    num_topics = self.num_topics
    chunksize = 2000
    passes = 20
    iterations = 400
    eval_every = None  # Don't evaluate model perplexity, takes too much time.

    # Make an index-to-word dictionary.
    temp = dictionary[0]  # This is only to "load" the dictionary.
    id2word = dictionary.id2token

    # store into object
    self.corpus = corpus
    self.dictionary = dictionary
    self.id2word = id2word

    lda_model = LdaModel(
        corpus=corpus,
        id2word=id2word,
        chunksize=chunksize,
        alpha='auto',
        eta='auto',
        iterations=iterations,
        num_topics=num_topics,
        passes=passes,
        eval_every=eval_every
    )

    # top_topics = lda_model.top_topics(self.corpus, topn=10)  # , num_words=10)
    # print(top_topics)
    # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
    # avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
    # if self.verbose:
    #     print('Average topic coherence: %.4f.' % avg_topic_coherence)
    # for idx, topic in lda_model.print_topics(-1):
    #     print('Topic: {} \nWords: {}'.format(idx, topic))

    # store into object
    self.lda = lda_model

    # ### PCA
    lda_topic_coverage = []
    if self.verbose:
        print(lda_model[corpus])
    for i, row_list in enumerate(lda_model[corpus]):
        # row_list contains (topic number, probability of topic) tuples;
        # note that each doc can have multiple topics
        # initialize zero list
        r = np.zeros(num_topics)
        topic_n, p_topic = zip(*row_list)
        # store topic prob into r
        for j in range(len(row_list)):
            r[topic_n[j]] = p_topic[j]
        lda_topic_coverage.append(r)

    # Array of topic weights
    self.lda_topic_coverage = pd.DataFrame(lda_topic_coverage).fillna(0).values

    # only look at the first two components
    pca = PCA(n_components=2)
    result = pca.fit_transform(self.lda_topic_coverage)
    self.pca = result

    # Dominant topic number in each doc
    topic_num = np.argmax(self.lda_topic_coverage, axis=1)

    # # tSNE Dimension Reduction
    # tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
    # tsne_lda = tsne_model.fit_transform(self.lda_topic_coverage)
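# A sketch of how the stored PCA projection might be inspected once run() has
# completed; `scraper` is a hypothetical instance of the class above, and
# matplotlib is an assumed extra dependency not listed in the original code.
import matplotlib.pyplot as plt
import numpy as np

def plot_topic_coverage(scraper):
    # dominant topic per document, recomputed from the stored coverage matrix
    dominant_topic = np.argmax(scraper.lda_topic_coverage, axis=1)
    xs, ys = scraper.pca[:, 0], scraper.pca[:, 1]
    plt.scatter(xs, ys, c=dominant_topic, cmap='tab10')
    plt.xlabel('PC 1')
    plt.ylabel('PC 2')
    plt.title('Job postings projected onto the first two principal components')
    plt.colorbar(label='dominant topic')
    plt.show()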
# Montemurro and Zanette's entropy based keyword extraction algorithm
# -------------------------------------------------------------------
#
# `This paper <https://arxiv.org/abs/0907.1558>`__ describes a technique to
# identify words that play a significant role in the large-scale structure of a
# text. These typically correspond to the major themes of the text. The text is
# divided into blocks of ~1000 words, and the entropy of each word's
# distribution amongst the blocks is calculated and compared with the expected
# entropy if the word were distributed randomly.
#
import requests
from gensim.summarization import mz_keywords

text = requests.get("http://www.gutenberg.org/files/49679/49679-0.txt").text
print(mz_keywords(text, scores=True, threshold=0.001))

###############################################################################
# By default, the algorithm weights the entropy by the overall frequency of the
# word in the document. We can remove this weighting by setting weighted=False
#
print(mz_keywords(text, scores=True, weighted=False, threshold=1.0))

###############################################################################
# When this option is used, it is possible to calculate a threshold
# automatically from the number of blocks
#
print(mz_keywords(text, scores=True, weighted=False, threshold="auto"))

###############################################################################
# The complexity of the algorithm is **O**\ (\ *Nw*\ ), where *N* is the number
# of words in the document and *w* is the number of unique words.
#
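###############################################################################
# A rough illustration of the quantity described above, not the library's
# internal implementation: the sketch splits a text into fixed-size blocks and
# computes the entropy of one word's distribution across those blocks, which
# the algorithm then compares against the expected entropy for a randomly
# scattered word. The block size and the example word are assumptions chosen
# only for demonstration.
#
import numpy as np

def block_entropy(text, word, block_size=1024):
    tokens = text.lower().split()
    blocks = [tokens[i:i + block_size] for i in range(0, len(tokens), block_size)]
    counts = np.array([block.count(word) for block in blocks], dtype=float)
    if counts.sum() == 0:
        return 0.0
    p = counts / counts.sum()               # word's distribution over the blocks
    p = p[p > 0]
    return float(-(p * np.log2(p)).sum())   # entropy in bits

# A word spread evenly over all blocks has entropy close to log2(n_blocks);
# a word concentrated in a few blocks (a likely keyword) scores much lower.
print(block_entropy(text, "the"))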