from newspaper import Article, Config, nlp


def main(argv):
    if len(argv) > 1:
        htmlist = argv[1]
    else:
        htmlist = 'htmlist'

    # Our permanent config for html cleaning
    config = Config()
    config.language = 'id'
    config.MIN_SENT_COUNT = 20
    config.memoize = False
    config.fetch_images = False
    config.verbose = True

    cleaner = Article(url='', config=config)

    # htmlist holds one local HTML file path per line
    with open(htmlist, 'r') as f:
        htmfile = f.read().split('\n')

    for htm in htmfile:
        print(htm)
        if not htm.endswith("rss.html"):
            with open(htm, 'r') as f:
                h = f.read()
            cleaner.set_html(h)
            cleaner.parse()
            sentences = nlp.split_sentences(cleaner.text)
            # Append the cleaned sentences, one per line
            with open('htm-out', 'a') as f:
                for r in sentences:
                    f.write(r + '\n')
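A usage note: `main` expects the `htmlist` file to contain one local HTML path per line. The snippet does not show the entry-point wiring; a minimal sketch (the script name in the comment is hypothetical):

import sys

if __name__ == '__main__':
    # e.g. python clean_html.py htmlist
    main(sys.argv)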
def parse_article(self, response):
    # utilize newspaper for article parsing
    article = Article(url=response.url, config=self.config)
    article.set_html(response.body)
    article.parse()

    item = Art()
    item['title'] = article.title
    item['url'] = article.url
    # one sentence per line, with intra-sentence newlines collapsed
    item['text'] = '\n'.join(nlp.split_sentences(article.text.replace('\n', ' ')))
    yield item
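`Art` is used above but never defined. In a Scrapy project it would normally live in `items.py`; a minimal sketch assuming only the three fields assigned in `parse_article` (the field declarations are an assumption):

import scrapy


class Art(scrapy.Item):
    # Fields must be declared before a scrapy.Item accepts assignment
    title = scrapy.Field()
    url = scrapy.Field()
    text = scrapy.Field()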
def _new_summarize(text='', max_sents=5):
    summaries = []
    sentences = split_sentences(text)
    keys = keywords(text)

    # Score sentences, and use the top 5 or max_sents sentences
    ranks = nlp.score(sentences, keys).most_common(max_sents)
    for rank in ranks:
        summaries.append(rank[0])

    # rank[0] is an (index, sentence) pair; sort to restore document order
    summaries.sort(key=lambda summary: summary[0])
    return [summary[1] for summary in summaries]
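A hedged usage sketch: assuming `split_sentences` and `keywords` are newspaper's `nlp` helpers imported at module level. Newspaper's keyword extraction needs its stopword list loaded first, so `load_stopwords` is called before use:

from newspaper import nlp
from newspaper.nlp import split_sentences, keywords

nlp.load_stopwords('en')
text = open('article.txt').read()
# Top five sentences, returned in document order
for sentence in _new_summarize(text=text, max_sents=5):
    print(sentence)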
def parse_article(self, response):
    if len(response.body) > 0:
        # utilize newspaper for article parsing
        article = Article(url=response.url, config=self.config)
        article.set_html(response.body)
        article.parse()
        # self.sentences.append(nlp.split_sentences(article.text))
        item = Art()
        item['title'] = article.title
        item['url'] = article.url
        item['text'] = '\n'.join(nlp.split_sentences(article.text.replace('\n', ' ')))
        yield item
    else:
        print(response.url + ' DEAD LINK')
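For context, a minimal sketch of a spider that could host this callback. The spider name, seed URL, and link-following rule are assumptions; only `self.config` and `parse_article` come from the snippet:

import scrapy
from newspaper import Config


class ArticleSpider(scrapy.Spider):
    name = 'articles'                      # hypothetical
    start_urls = ['https://example.com/']  # hypothetical

    def __init__(self, *args, **kwargs):
        super(ArticleSpider, self).__init__(*args, **kwargs)
        self.config = Config()  # shared newspaper config used by parse_article
        self.config.fetch_images = False

    def parse(self, response):
        # Hand every linked page to parse_article
        for href in response.css('a::attr(href)').getall():
            yield response.follow(href, callback=self.parse_article)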
import logging
from collections import Counter

import networkx as nx
import numpy as np
import pandas as pd
from newspaper import nlp
from sklearn.metrics.pairwise import cosine_similarity

log = logging.getLogger(__name__)  # module-level logger assumed by log.debug below


def summarize(self, html, percent_sentences):
    if (percent_sentences is None or percent_sentences > 100
            or percent_sentences < 0):
        percent_sentences = 15

    article = self.process_html(html)

    # remove title from the text, if it appears in the text
    if article.text.startswith(article.title):
        article.set_text(article.text[len(article.title):])

    sentences = nlp.split_sentences(article.text)
    log.debug(article.text)

    # remove punctuation, numbers and special characters
    clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ", regex=True)
    clean_sentences = [s.lower() for s in clean_sentences]
    clean_sentences = [
        self._remove_stopwords(r.split()) for r in clean_sentences
    ]

    # create sentence vectors by averaging 300-d word embeddings
    sentence_vectors = []
    for i in clean_sentences:
        if len(i) != 0:
            v = sum([
                self.word_embeddings.get(w, np.zeros((300,)))
                for w in i.split()
            ]) / (len(i.split()) + 0.001)
        else:
            v = np.zeros((300,))
        sentence_vectors.append(v)

    # similarity matrix: initialize, then fill with pairwise cosine similarity
    sim_mat = np.zeros([len(sentences), len(sentences)])
    for i in range(len(sentences)):
        for j in range(len(sentences)):
            if i != j:
                sim_mat[i][j] = cosine_similarity(
                    sentence_vectors[i].reshape(1, 300),
                    sentence_vectors[j].reshape(1, 300),
                )[0, 0]

    # convert matrix into graph and run TextRank (PageRank over sentences)
    nx_graph = nx.from_numpy_array(sim_mat)
    textrank_scores = self.normalize_scores(nx.pagerank(nx_graph))

    # get newspaper's nlp scores
    # https://github.com/codelucas/newspaper/blob/master/newspaper/article.py#L372
    nlp.load_stopwords(article.config.get_language())
    # call to: nlp.summarize(title=article.title, text=article.text, max_sents=max_sents)
    # https://github.com/codelucas/newspaper/blob/master/newspaper/nlp.py#L40
    title_words = nlp.split_words(article.title)
    most_frequent = nlp.keywords(article.text)
    nlp_scores = self.normalize_scores(
        nlp.score(sentences, title_words, most_frequent))

    # combine the two normalized scores; nlp_scores is keyed by
    # (index, sentence) pairs, textrank_scores by sentence index
    totalled_scores = Counter()
    for key, value in nlp_scores.items():
        totalled_scores[key[0]] += value
    for key, value in textrank_scores.items():
        totalled_scores[key] += value

    num_sentences = int(len(clean_sentences) * percent_sentences / 100)
    sentence_indices = [
        index for index, _ in totalled_scores.most_common(num_sentences)
    ]
    return [sentences[index] for index in sentence_indices]
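`normalize_scores` and `_remove_stopwords` are called above but not shown. A minimal sketch of plausible implementations as methods on the same class, assuming min-max normalization onto [0, 1] (so the TextRank and newspaper scores are on a comparable scale before being summed) and an NLTK stopword list; both bodies are assumptions, not the original code:

from nltk.corpus import stopwords


def normalize_scores(self, scores):
    # Rescale a {key: score} mapping onto [0, 1]; keys may be sentence
    # indices (TextRank) or (index, sentence) pairs (newspaper's nlp.score)
    values = list(scores.values())
    lo, hi = min(values), max(values)
    spread = (hi - lo) or 1.0  # guard against uniform scores
    return {key: (value - lo) / spread for key, value in scores.items()}


def _remove_stopwords(self, words):
    # Drop stopwords and rejoin; callers split() the result again
    stop = set(stopwords.words('english'))
    return ' '.join(w for w in words if w not in stop)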