def get_data_list(URL, file_type=""):
    """Summarize a document with the Luhn algorithm.

    Parameters
    ----------
    URL : str
        Raw text when file_type is "txt", a PDF location when "pdf",
        otherwise a web URL to fetch and parse.
    file_type : str
        One of "txt", "pdf", or "" (live URL).

    Returns
    -------
    list[str]
        Up to SUMMARY_SENTENCES_COUNT summary sentences; empty list on error.
    """
    SUMMARY_SENTENCES_COUNT = 5
    sentences = []
    try:
        LANGUAGE = "english"
        if file_type == "txt":
            # In "txt" mode URL already holds the raw text itself.
            parser = HtmlParser.from_string(URL, None, Tokenizer(LANGUAGE))
        elif file_type == "pdf":
            content = read_pdf(URL)
            parser = HtmlParser.from_string(content, None, Tokenizer(LANGUAGE))
        else:
            parser = HtmlParser.from_url(URL, Tokenizer(LANGUAGE))

        document = parser.document
        stemmer = Stemmer(LANGUAGE)

        from sumy.summarizers.luhn import LuhnSummarizer
        LHS = LuhnSummarizer(stemmer)
        LHS.stop_words = get_stop_words(LANGUAGE)

        print("\nSummary using Luhn Summarizer")
        print("*******************************")
        for sentence in LHS(document, SUMMARY_SENTENCES_COUNT):
            sentences.append(str(sentence))
    except Exception as e:
        # Best-effort: report and fall through to return what we have.
        print(str(e))
    # Fix: a plain return instead of `finally: return`, which silently
    # swallowed ALL exceptions (including SystemExit/KeyboardInterrupt).
    return sentences
def get_doc_summary(html, url):
    """Extract a 3-sentence TextRank summary from an HTML document.

    Helpful fallback when the page has no meta-description tag.

    :param html: raw HTML string of the document
    :param url: source URL (given to the parser for link resolution)
    :return: the summary sentences concatenated into one string
    """
    from sumy.parsers.html import HtmlParser
    from sumy.nlp.tokenizers import Tokenizer
    from sumy.summarizers.text_rank import TextRankSummarizer as Summarizer
    from sumy.nlp.stemmers import Stemmer
    from sumy.utils import get_stop_words

    LANGUAGE = "english"
    SENTENCES_COUNT = 3

    parser = HtmlParser.from_string(html, url, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    # str.join instead of repeated `+=` (quadratic string building).
    return "".join(
        str(sentence)
        for sentence in summarizer(parser.document, SENTENCES_COUNT)
    )
def auto_summarize_comment(request):
    """Summarize each requested comment and return the sentences as JSON.

    POST d_ids[]: ids of Comment rows to summarize.
    GET num_sents: optional sentence count; defaults to one third of the
    comment's sentence count.
    """
    # Fix: the original used an undefined name `summarizer`, raising
    # NameError on every request. Build the LexRank summarizer locally,
    # matching the sibling implementation of this view.
    from sumy.parsers.html import HtmlParser
    from sumy.nlp.tokenizers import Tokenizer
    from sumy.nlp.stemmers import Stemmer
    from sumy.summarizers.lex_rank import LexRankSummarizer as Summarizer

    stemmer = Stemmer("english")
    summarizer = Summarizer(stemmer)

    comment_ids = request.POST.getlist('d_ids[]')
    sent_list = []
    for comment_id in comment_ids:
        comment = Comment.objects.get(id=comment_id)
        text = comment.text
        # Turn line-break tags into spaces so the tokenizer sees plain text.
        text = re.sub('<br>', ' ', text)
        text = re.sub('<BR>', ' ', text)
        parser = HtmlParser.from_string(text, '', Tokenizer("english"))
        num_sents = request.GET.get('num_sents', None)
        if not num_sents:
            all_sents = parser.tokenize_sentences(text)
            num_sents = floor(float(len(all_sents)) / 3.0)
        sents = summarizer(parser.document, num_sents)
        for sent in sents:
            # NOTE(review): relies on sumy's private Sentence._text attribute.
            sent_list.append(sent._text)
    return JsonResponse({"sents": sent_list})
def get_summary(self, summary_length: int = 10) -> Iterator[str]:
    """Yield up to *summary_length* summary sentences for self.content.

    Fix: ``HtmlParser.from_string`` takes ``(string, url, tokenizer)``.
    The original passed the tokenizer in the url position and ``self.link``
    in the tokenizer position, which breaks parsing.
    """
    parser = HtmlParser.from_string(self.content, self.link, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, summary_length):
        yield sentence
def __init__(self, text=""):
    """Summarize *text* with six sumy algorithms, one sentence each.

    Fixes: the original was missing ``self``, read an undefined global
    ``text``, and returned a value from ``__init__`` (a TypeError at
    runtime). The collected sentences are stored on
    ``self.allvariations`` instead of being returned.

    :param text: document to summarize (HTML or plain text)
    """
    LANGUAGE = "english"
    SENTENCES_COUNT = 1
    stemmer = Stemmer(LANGUAGE)

    # Build one configured instance of each algorithm, in the original
    # order: LSA, Luhn, LexRank, TextRank, SumBasic, KL-Sum.
    summarizers = []
    for summarizer_cls in (Lsa, Luhn, LexRank, TxtRank, SumBasic, KL):
        summarizer = summarizer_cls(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        summarizers.append(summarizer)

    parser = HtmlParser.from_string(text, 0, Tokenizer(LANGUAGE))

    allvariations = []
    for summarizer in summarizers:
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            print(str(sentence))
            allvariations.append(sentence)
    self.allvariations = allvariations
def get_summary(html):
    """Print a summary of the given HTML document, one sentence per line."""
    document = HtmlParser.from_string(
        html, tokenizer=Tokenizer(LANGUAGE), url=None).document
    summarizer = Summarizer(Stemmer(LANGUAGE))
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for line in summarizer(document, SENTENCES_COUNT):
        print(line)
def summarize(doc, SENTENCES_COUNT):
    """Summarize *doc*, skipping BBC "Image copyright" boilerplate lines.

    :param doc: HTML/text of the article
    :param SENTENCES_COUNT: number of sentences to request from the summarizer
    :return: summary string; each kept sentence is prefixed with a space
    """
    parser = HtmlParser.from_string(doc, None, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    summary = ""
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        # Idiomatic negation instead of the original `... is False`.
        if not str(sentence).strip().startswith("Image copyright"):
            summary += " " + str(sentence)
    return summary
def summarize(self, summarizer_type, max_sentences):
    """Summarize self.document into self.result_list.

    :param summarizer_type: "textrank" (gensim) or "lsa" (sumy)
    :param max_sentences: cap on the number of returned sentences
    :return: the list of summary sentences (also stored on self.result_list)
    """
    # TextRank
    if summarizer_type == "textrank":
        # Consistency fix: cap the gensim result so max_sentences is
        # honoured here too, matching the sibling implementation.
        self.result_list = summarize(
            self.document, ratio=0.3, word_count=None, split=True
        )[:max_sentences]
    # LSA
    elif summarizer_type == "lsa":
        parser = HtmlParser.from_string(
            self.document, None, tokenizer=Tokenizer("english"))
        stemmer = Stemmer("english")
        summarizer = summarizers.lsa.LsaSummarizer(stemmer)
        summarizer.stop_words = get_stop_words("english")
        summarized_sentence_list = summarizer(parser.document, max_sentences)
        self.result_list = [str(sentence) for sentence in summarized_sentence_list]
    # Consistency fix: return the result like the sibling implementation
    # (backward compatible — the original returned None).
    return self.result_list
def summarize(method, length, url):
    """Fetch *url*, detect its language, and summarize it with *method*.

    :param method: key into AVAILABLE_METHODS selecting the algorithm
    :param length: sentence count (wrapped in ItemsCount)
    :param url: page to fetch and summarize
    :return: (summary string, ISO language code) tuple
    """
    html_content = fetch_url(url)
    iso_lang = detect_language(html_content)
    language = SUMY_LANGUAGES[iso_lang]
    parser = HtmlParser.from_string(html_content, url, Tokenizer(language))
    summarizer = build_summarizer(
        AVAILABLE_METHODS[method], get_stop_words(language),
        Stemmer(language), parser)
    chosen = summarizer(parser.document, ItemsCount(length))
    summary = ' '.join(unicode(sentence) for sentence in chosen)
    return summary, iso_lang
def do():
    """Summarize every stored row with status 1 and advance it to status 2."""
    for row in store.get_row_by_status(1):
        parser = HtmlParser.from_string(
            row["content_origin"], row["url"], Tokenizer(LANGUAGE))
        summarizer = Summarizer(Stemmer(LANGUAGE))
        summarizer.stop_words = get_stop_words(LANGUAGE)
        summary = "\n".join(
            str(sentence)
            for sentence in summarizer(parser.document, SENTENCES_COUNT))
        store.update_row(row["id"], {"summary_origin": summary, "status": 2})
def get_summary(xhtml):
    """Return up to 20 unique summary sentences, excluding questions."""
    LANGUAGE = "english"
    REVIEW_COUNT = 20
    parser = HtmlParser.from_string(xhtml, None, Tokenizer(LANGUAGE))
    summarizer = TextSummarizer(Stemmer(LANGUAGE))
    summarizer.stop_words = get_stop_words(LANGUAGE)
    summaries = []
    for raw_sentence in summarizer(parser.document, REVIEW_COUNT):
        cleaned = str(raw_sentence).strip()
        # Keep only sentences we haven't seen that aren't questions.
        if cleaned not in summaries and '?' not in cleaned:
            summaries.append(cleaned)
    return summaries
def _get_summary(self):
    """Render a 10-sentence HTML summary of self.readable into self.summary.

    Headings become <h2> elements, ordinary sentences become <p>.
    Does nothing when there is no readable text.
    """
    if self.readable == '':
        return
    language = self.language.lower() or 'english'
    parser = HtmlParser.from_string(
        self.readable, self.url, Tokenizer(language))
    summarizer = Summarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)
    fragments = []
    for sentence in summarizer(parser.document, 10):
        tag = 'h2' if sentence.is_heading else 'p'
        fragments.append('<%s>%s</%s>' % (tag, unicode(sentence), tag))
    self.summary = ''.join(fragments)
def summerize_text(text):
    """Return the top summary sentence of *text*, or None when empty.

    Hash marks and newlines are flattened to spaces before parsing.
    """
    flattened = text.replace("#", " ").replace("\n", " ")
    parser = HtmlParser.from_string(
        flattened, "https://www.topsocial.com", Tokenizer(LANGUAGE))
    summarizer = Summarizer(Stemmer(LANGUAGE))
    summarizer.stop_words = get_stop_words(LANGUAGE)
    sentences = summarizer(parser.document, SENTENCES_COUNT)
    if len(sentences) > 0:
        return sentences[0]
    return None
def auto_summarize_comment(request): from sumy.nlp.stemmers import Stemmer #from sumy.utils import get_stop_words from sumy.parsers.html import HtmlParser from sumy.nlp.tokenizers import Tokenizer #from sumy.summarizers.lsa import LsaSummarizer as Summarizer #from sumy.summarizers.text_rank import TextRankSummarizer as Summarizer from sumy.summarizers.lex_rank import LexRankSummarizer as Summarizer stemmer = Stemmer("english") summarizer = Summarizer(stemmer) comment_ids = request.POST.getlist('d_ids[]') sent_list = [] for comment_id in comment_ids: comment = Comment.objects.get(id=comment_id) text = comment.text text = re.sub('<br>', ' ', text) text = re.sub('<BR>', ' ', text) parser = HtmlParser.from_string(text, '', Tokenizer("english")) num_sents = request.GET.get('num_sents', None) if not num_sents: all_sents = parser.tokenize_sentences(text) num_sents = floor(float(len(all_sents))/3.0) sents = summarizer(parser.document, num_sents) for sent in sents: sent_list.append(sent._text) return JsonResponse({"sents": sent_list})
def summarize(self, summarizer_type, max_sentences, document = ""):
    """Summarize a document into self.result_list and return it.

    Uses self.document when set, otherwise the *document* argument.

    :param summarizer_type: "textrank" (gensim) or "lsa" (sumy)
    :param max_sentences: cap on the number of returned sentences
    :param document: fallback document text
    :return: list of summary sentence strings
    """
    target_document = document if self.document == "" else self.document

    # Spacing: re-join the sentence-split text with single spaces.
    sentence_list = self.pro.sentence_splitter(target_document)
    _target_document = " ".join(sentence_list).strip()

    # TextRank (gensim)
    if summarizer_type == "textrank":
        self.result_list = summarize(
            _target_document, ratio=0.3, word_count=None, split=True
        )[:max_sentences]
    # LSA (sumy)
    elif summarizer_type == "lsa":
        parser = HtmlParser.from_string(
            _target_document, None, tokenizer=Tokenizer("english"))
        lsa = LsaSummarizer(Stemmer("english"))
        lsa.stop_words = get_stop_words("english")
        self.result_list = [
            str(sent) for sent in lsa(parser.document, max_sentences)
        ]
    return self.result_list
def get_summary(article, url=False, num_sentence=NUM_SUMMARY_SENTENCE):
    """
    get the summary of one article
    :param num_sentence: number of sentence left for summary
    :param article: html string of the article or the url of the article
    :param url: True is article is an url
    :return: the summary of the article as string
    """
    if url:
        parser = HtmlParser.from_url(article, tokenizer=Tokenizer(LANGUAGE))
    else:
        parser = HtmlParser.from_string(
            article, tokenizer=Tokenizer(LANGUAGE), url=None)
    summarizer = Summarizer(Stemmer(LANGUAGE))
    summarizer.stop_words = get_stop_words(LANGUAGE)
    chosen_sentences = summarizer(parser.document, num_sentence)
    return " ".join(str(sent).strip() for sent in chosen_sentences)
def do_work(self, worker_id, work):
    # Fetch the assigned URL and report its summary word list back to the
    # pool via work_done().
    url = work
    """Greenlet to fetch analyze URL content """
    print '[+] {0}: Starting crawl of {1}'.format(worker_id, url)
    """Using urllib2 via geventhttpclient. Selenium with PhantomJS or a real browser would be probably better but slower and more expensive. Could have also used scrapy, but thats way to heavy for this use-case."""
    body = urlopen(url).read()
    """Using Sumy (built on nltk) for page summaries since it supports a number of ranking algorithms. It's not perfect though, it was written for czech and so its missing some important English-specific things (e.g. bonus/significant words for Edmundson Summarizers) https://pypi.python.org/pypi/sumy TextBlob might be a better alternative, but it didn't seem to provide overall summary information. https://textblob.readthedocs.org/en/latest/ """
    parser = HtmlParser.from_string(body, None, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    words = []
    for sentence in summarizer(parser.document, 10):
        # NOTE(review): plain assignment overwrites on every iteration, so
        # only the LAST sentence's words survive the loop; extend() may
        # have been intended — confirm before changing.
        words = str(sentence).split()
    # Send the results
    self.work_done(worker_id, words)
from sumy.parsers.html import HtmlParser
from sumy.nlp.tokenizers import Tokenizer
import pkuseg
import math

LANGUAGE = "chinese"
SENTENCES_COUNT = 3

if __name__ == "__main__":
    # POS-tagging segmenter; used to score sentences by noun density.
    seg = pkuseg.pkuseg(postag=True)
    for i, line in enumerate(open("data/content_new_2.txt").readlines()):
        parser = HtmlParser.from_string(line, tokenizer=Tokenizer(LANGUAGE), url=None)
        sent_score = []
        for j, sent in enumerate(parser.document.sentences):
            # NOTE(review): _text is a private sumy attribute.
            text = sent._text
            segtext = seg.cut(text)
            # Noun count plus heavy boosts for the court-ruling cue words
            # "本院" ("this court") and "认为" ("holds/finds").
            nnum = len(list(filter(lambda x: x[1] == 'n', segtext)))
            rnum = len(list(filter(lambda x: x[0] == '本院', segtext)))
            r2num = len(list(filter(lambda x: x[0] == '认为', segtext)))
            # Length bonus: log10 of the token count.
            lnum = math.log(len(segtext), 10)
            sent_score.append(
                (text, (nnum + rnum * 7 + r2num * 5) * 1.0 / len(segtext) + lnum, j, segtext))
        num_sentences = int(max(min(5, len(parser.document.sentences) / 15), 3))
        sent_score.sort(key=lambda x: x[1], reverse=True)
        # NOTE(review): the visible source is truncated here — the
        # comprehension below is incomplete in the file as seen.
        sent_idx = [(text, idx) for text, radio, idx, segtext in sent_score
# Per-user config file holding DB path, Mastodon credentials, and the
# timestamp of the last tooted post.
CONFIG_FILE = environ["HOME"] + "/.cloudfeed"
CONFIG = {'last_pub': 0}
LANGUAGE = 'english'

# Merge persisted settings over the defaults when a config file exists.
if path.exists(CONFIG_FILE):
    with open(CONFIG_FILE, mode="r") as f:
        CONFIG.update(json.load(f))

feed = CloudFeed(db=CONFIG['database'])
mastodon = Mastodon(client_id=CONFIG['client_id'],
                    client_secret=CONFIG['client_secret'],
                    access_token=CONFIG['access_token'],
                    api_base_url=CONFIG['mastodon_url'])

# One-sentence LexRank summary per post.
summarizer = LexRankSummarizer(Stemmer(LANGUAGE))
summarizer.stop_words = get_stop_words(LANGUAGE)

new_posts = feed.get_posts(since=CONFIG['last_pub'])
if len(new_posts) > 0:
    # Remember the newest publication time so each post is tooted once.
    CONFIG["last_pub"] = max([post["pub_date"] for post in new_posts])
    for post in new_posts:
        summary = summarizer(
            HtmlParser.from_string(post['body'], post['url'],
                                   Tokenizer("english")).document, 1)
        post["summary"] = summary[0]
        message = "{feed}: {title}\n{url}\n\n{summary}".format(**post)
        mastodon.toot(message)

# Persist the (possibly updated) config.
with open(CONFIG_FILE, mode="w+") as f:
    json.dump(CONFIG, f)
def crawl():
    """Poll every RSS source, dedupe by link, enrich and store new entries.

    For article sources (form 1) a summary digest, a simhash cluster and
    Baidu AIP categories/tags are computed; form 2 is Weibo-style, form 4
    is video (bilibili embed frames).
    """
    print(datetime.now())
    cursor = conn.cursor()
    cursor.execute("SET NAMES utf8mb4")
    cursor.execute('select id, name, feedUrl, lang, form from sources')
    sources = cursor.fetchall()
    start = time.clock()
    for source in sources:
        # if source['id']%30 == datetime.now().minute%30:
        print(source[0])
        # Re-bind the row tuple as a dict for readability below.
        source = {
            'id': source[0],
            'name': source[1],
            'feedUrl': source[2].replace("39.105.127.55", "127.0.0.1"),
            'lang': source[3],
            'form': source[4]
        }
        print(source['name'])
        LANGUAGE = 'chinese'
        if source['lang'] == 2:
            LANGUAGE = 'english'
        items = feedparser.parse(source['feedUrl'])['items']
        for item in items:
            try:
                # Skip links already stored (dedupe).
                cursor.execute('select 1 from entries where link = %s limit 1',
                               (item['link'], ))
                results = cursor.fetchall()
                if (not results) or (len(results) == 0):
                    try:
                        entry = {
                            'title': item['title'],
                            'link': item['link'],
                            'source_id': source['id'],
                            'source_name': source['name'],
                            'time': '',
                            'crawl_time': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                            'photo': '',
                            'lang': 1,
                            'author': '',
                            'description': '',
                            'digest': '',
                            'content': '',
                            'cluster': 0,
                            'sim_count': 0,
                            'simhash': '0',
                            'cate11': '',
                            'cate12': '',
                            'cate13': '',
                            'cate21': '',
                            'cate22': '',
                            'cate23': '',
                            'tag1': '',
                            'tag2': '',
                            'tag3': '',
                            'tag4': '',
                            'tag5': '',
                            'video': '',
                            'video_frame': '',
                            'audio': '',
                            'audio_frame': ''
                        }
                        cate1 = ['', '', '']
                        cate2 = ['', '', '']
                        tag = ['', '', '', '', '']
                        ############ Additonal Settings for special sources ##############
                        # Hacker News items link to the comment thread, not the story.
                        if entry['source_name'] == 'Hacker News':
                            entry['link'] = item['comments']
                        ###########################
                        if is_en(entry['title']):
                            entry['lang'] = 2
                        # Prefer the feed's publish time; fall back to crawl time.
                        if 'published_parsed' in item:
                            try:
                                entry['time'] = datetime.fromtimestamp(
                                    mktime(
                                        item['published_parsed'])) + timedelta(
                                            hours=TZ_DELTA)
                            except Exception as e:
                                entry['time'] = entry['crawl_time']
                                print('Exception when published_parsed: {}'.format(e))
                        else:
                            entry['time'] = entry['crawl_time']
                        if 'author' in item:
                            entry['author'] = item['author'][0:20]
                        if 'summary' in item:
                            entry['description'] = item['summary'][0:500]
                        if 'content' in item:
                            entry['content'] = item['content'][0]['value'][0:15000]
                        if entry['content'] == '' and 'summary' in item and len(
                                item['summary']) > 0:
                            entry['content'] = item['summary'][0:15000]
                        # Pick up podcast/video enclosures.
                        for field in item['links']:
                            if field['type'] == 'audio/mpeg':
                                if field['href'].endswith('.mp3'):
                                    entry['audio'] = field['href']
                                if field['href'].endswith('.mp4'):
                                    entry['video'] = field['href']
                        # Only article-type entries get digest/cluster/category/tag processing.
                        if source['form'] == 1:
                            try:
                                if entry['content'] != '':
                                    entry['photo'] = getImg(entry['content'])
                                    if len(entry['photo']) > 255:
                                        entry['photo'] = ''
                                    parser = HtmlParser.from_string(
                                        entry['content'], "", Tokenizer(LANGUAGE))
                                    stemmer = Stemmer(LANGUAGE)
                                    summarizer = Summarizer(stemmer)
                                    summarizer.stop_words = get_stop_words(LANGUAGE)
                                    for sentence in summarizer(
                                            parser.document, SENTENCES_COUNT):
                                        entry['digest'] += str(sentence)
                                        if len(entry['digest']) >= 500:
                                            break
                                else:
                                    # No RSS content: fetch and summarize the page itself.
                                    parser = HtmlParser.from_url(
                                        entry['link'], Tokenizer(LANGUAGE))
                                    stemmer = Stemmer(LANGUAGE)
                                    summarizer = Summarizer(stemmer)
                                    summarizer.stop_words = get_stop_words(LANGUAGE)
                                    for sentence in summarizer(
                                            parser.document, SENTENCES_COUNT):
                                        entry['digest'] += str(sentence)
                                        if len(entry['digest']) >= 500:
                                            break
                                entry['digest'] = entry['digest'][0:500]
                            except Exception as e:
                                print('Exception when getting digest: {}'.format(e))
                            # Near-duplicate clustering via simhash.
                            features = get_features(entry['title'], entry['content'])
                            try:
                                entry['simhash'] = str(Simhash(features).value)
                                nears = index.get_near_dups(Simhash(features))
                                if len(nears) > 0:
                                    entry['sim_count'] = len(nears)
                                    cursor.execute(
                                        'select cluster from entries where id = %s',
                                        (int(nears[0]), ))
                                    near_cluster = cursor.fetchone()[0]
                                    entry['cluster'] = near_cluster
                                else:
                                    global last_cluster_num
                                    entry['cluster'] = last_cluster_num
                                    last_cluster_num += 1
                            except Exception as e:
                                print('Exception when clustering: {}'.format(e))
                            # Categorize and tag via the Baidu AIP client.
                            try:
                                content2 = BeautifulSoup(
                                    entry['content'], "lxml").text.encode(
                                        'gbk', 'ignore').decode(
                                            'gbk')[0:AIP_MAX_LEN_CONTENT]
                                if len(content2) == 0:
                                    if len(entry['digest']) > 0:
                                        content2 = entry['digest']
                                title2 = entry['title'][0:AIP_MAX_LEN_TITLE]
                                keywords = client.keyword(title2, content2)
                                topics = client.topic(title2, content2)
                                i = 0
                                for item in topics['item']['lv1_tag_list']:
                                    cate1[i] = item['tag']
                                    i += 1
                                    if i > 2:
                                        break
                                i = 0
                                for item in topics['item']['lv2_tag_list']:
                                    cate2[i] = item['tag']
                                    i += 1
                                    if i > 2:
                                        break
                                i = 0
                                for item in keywords['items']:
                                    tag[i] = item['tag']
                                    i += 1
                                    if i > 4:
                                        break
                                entry['cate11'] = cate1[0]
                                entry['cate12'] = cate1[1]
                                entry['cate13'] = cate1[2]
                                entry['cate21'] = cate2[0]
                                entry['cate22'] = cate2[1]
                                entry['cate23'] = cate2[2]
                                entry['tag1'] = tag[0]
                                entry['tag2'] = tag[1]
                                entry['tag3'] = tag[2]
                                entry['tag4'] = tag[3]
                                entry['tag5'] = tag[4]
                            except Exception as e:
                                print('Exception when categorizing and tagging: {}'.format(e))
                        elif source['form'] == 2:
                            # Weibo-style entries: image + tag-stripped digest.
                            entry['photo'] = getWeiboImg(entry['content'])
                            entry['digest'] = filterWeiboTags(entry['content'])
                            if len(entry['digest']) > 500:
                                entry['digest'] = entry['digest'][0:500]
                        elif source['form'] == 4:
                            # Video entries: build a bilibili embed frame.
                            if entry['link'].startswith(
                                    'https://www.bilibili.com/video'):
                                entry['video_frame'] = 'http://player.bilibili.com/player.html?aid=' + \
                                    entry['link'][33:]
                        try:
                            cursor.execute(add_entry, entry)
                            conn.commit()
                            # NOTE(review): `features` is only defined in the
                            # form == 1 branch; for other forms this raises and
                            # is swallowed by the except below — confirm intended.
                            index.add(str(cursor.lastrowid), Simhash(features))
                        except Exception as e:
                            print('Exception when add entry: {}'.format(e))
                    except Exception as e:
                        print("Unexpected Error: {}".format(e))
            except Exception as e:
                print("Unexpected Error: {}".format(e))
    # print(d['feed']['title'])
    elapsed = time.clock() - start
    print('time used: ' + str(elapsed))
    # Close cursor and connection:
    cursor.close()
def download_sources(summarize=True, sources=currentFeeds):
    # Download every story from the configured RSS feeds, optionally
    # replacing the full text with a 20-sentence LSA summary, and return
    # a list of (feed_name, [story tuples]) pairs.
    # NOTE(review): the `sources` parameter is accepted but never used —
    # the loop below iterates the global `currentFeeds`; confirm intent.
    raw_documents = []
    complete_urls = []
    # Download News Stories
    converter = html2text.HTML2Text()
    converter.ignore_links = True
    converter.ignore_images = True
    converter.bypass_tables = True
    count_error = 0
    document_count = 0
    feed_count = -1
    for url in currentFeeds:
        feed_count += 1
        current_feed_document = 0
        currentStories = []
        feed = feedparser.parse(url[1])
        for story in feed.entries:
            current_feed_document += 1
            # Skip multimedia-only items.
            if story.title.startswith(u'VIDEO:') or story.title.startswith(
                    u'AUDIO'):
                continue
            # Skip URLs already seen (dedupe across feeds).
            if story.link in complete_urls:
                continue
            try:
                res = requests.get(story.link)
                html = res.text
                title = story.title.encode('utf-8')
                # Progress as a percentage across feeds and items.
                completion = (
                    (feed_count +
                     (current_feed_document / float(len(feed.entries)))) /
                    (float(len(currentFeeds)))) * 100
                print "[" + ("%.2f" % completion) + "%] \t " + feed.feed.title.encode(
                    'utf-8') + " - " + title
                raw_text = converter.handle(html)
                if summarize:
                    parser = HtmlParser.from_string(html, None,
                                                    Tokenizer("english"))
                    summarizer = LsaSummarizer(stem_word)
                    summarizer.stop_words = get_stop_words("english")
                    sum_text = [
                        sentence
                        for sentence in summarizer(parser.document, 20)
                    ]
                    raw_text = (" ".join([
                        str(sentence) for sentence in sum_text
                    ])).decode('utf-8')
                # print raw_text
                stats = TextBlob(raw_text)
                currentStories.append(
                    (title, raw_text, story.link, stats.sentiment,
                     story.published_parsed))
                complete_urls.append(story.link)
                document_count += 1
            except KeyboardInterrupt:
                print "Quitting from Keyboard Interrupt."
                sys.exit(0)
            except:
                # Best-effort: count, report and continue with the next story.
                count_error += 1
                print "\t Error occurred while processing that story:", sys.exc_info(
                )[0]
                traceback.print_exc()
        raw_documents.append((url[0], currentStories))
    print "Received", document_count, "documents with", count_error, "errors"
    return raw_documents
def summarize(entry, count):
    """Render *count* summary sentences of *entry* as an HTML <ul> list.

    Punctuation preceded by a stray space is stripped from each sentence.
    """
    def clean(sentence):
        return re.sub(r' (?:[;,:.!?])', '', unicode(sentence))

    parser = HtmlParser.from_string(entry.content, entry.url, tokenizer)
    cleaned = [clean(s) for s in summarizer(parser.document, count)]
    bullets = ''.join('<li>{}</li>'.format(s) for s in cleaned)
    return '<ul>{}</ul>'.format(bullets)
def crawl():
    """Poll this minute's shard of RSS sources and store new entries.

    Sources are sharded by ``id mod 30`` against the current minute; for
    article sources (form 1) a digest is built either from the RSS content
    (content_rss == 1) or by fetching the article URL.
    """
    print(datetime.now())
    cursor = conn.cursor()
    cursor.execute("SET NAMES utf8mb4")
    cursor.execute(
        'select id, name, feedUrl, lang, form, content_rss from sources where mod(id, 30)=mod(%s, 30)',
        (datetime.now().minute, ))
    sources = cursor.fetchall()
    start = time.clock()
    for source in sources:
        # if source['id']%30 == datetime.now().minute%30:
        print(source[0])
        # Re-bind the row tuple as a dict for readability below.
        source = {
            'id': source[0],
            'name': source[1],
            'feedUrl': source[2].replace("188.131.178.76", "127.0.0.1"),
            'lang': source[3],
            'form': source[4],
            'content_rss': source[5]
        }
        print(source['name'])
        LANGUAGE = 'chinese'
        if source['lang'] == 2:
            LANGUAGE = 'english'
        items = feedparser.parse(source['feedUrl'])['items']
        for item in items:
            # Skip links already stored (dedupe).
            cursor.execute('select 1 from entries where link = %s limit 1',
                           (item['link'], ))
            results = cursor.fetchall()
            if (not results) or (len(results) == 0):
                entry = {
                    'title': item['title'],
                    'link': item['link'],
                    'source_id': source['id'],
                    'source_name': source['name'],
                    'time':
                    datetime.fromtimestamp(mktime(item['published_parsed'])) +
                    timedelta(hours=TZ_DELTA),
                    'crawl_time': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    'photo': '',
                    'lang': source['lang'],
                    'author': item['author'],
                    'description': '',
                    'digest': '',
                    'content': ''
                }
                if 'content' in item:
                    entry['content'] = item['content'][0]['value']
                if entry['content'] == '':
                    entry['content'] = item['summary']
                if entry['content'] != '':
                    entry['photo'] = getImg(entry['content'])
                # Only article-type sources get a digest.
                if source['form'] == 1:
                    if source['content_rss'] == 1 and entry['content'] != '':
                        # Full content is in the feed: summarize it directly.
                        parser = HtmlParser.from_string(
                            entry['content'], "", Tokenizer(LANGUAGE))
                        stemmer = Stemmer(LANGUAGE)
                        summarizer = Summarizer(stemmer)
                        summarizer.stop_words = get_stop_words(LANGUAGE)
                        for sentence in summarizer(parser.document,
                                                   SENTENCES_COUNT):
                            entry['digest'] += str(sentence)
                            if len(entry['digest']) >= 500:
                                break
                    else:
                        # Otherwise fetch and summarize the article page.
                        parser = HtmlParser.from_url(entry['link'],
                                                     Tokenizer(LANGUAGE))
                        stemmer = Stemmer(LANGUAGE)
                        summarizer = Summarizer(stemmer)
                        summarizer.stop_words = get_stop_words(LANGUAGE)
                        for sentence in summarizer(parser.document,
                                                   SENTENCES_COUNT):
                            entry['digest'] += str(sentence)
                            if len(entry['digest']) >= 500:
                                break
                    entry['digest'] = entry['digest'][0:500]
                cursor.execute(add_entry, entry)
                conn.commit()
    # print(d['feed']['title'])
    elapsed = time.clock() - start
    print('time used: ' + str(elapsed))
    # Close cursor and connection:
    cursor.close()
def download_sources(summarize=True, sources=currentFeeds): raw_documents = [] complete_urls = [] # Download News Stories converter = html2text.HTML2Text() converter.ignore_links = True converter.ignore_images = True converter.bypass_tables = True count_error = 0 document_count = 0 feed_count = -1 for url in currentFeeds: feed_count += 1 current_feed_document = 0 currentStories = [] feed = feedparser.parse(url[1]) for story in feed.entries: current_feed_document += 1 if story.title.startswith(u'VIDEO:') or story.title.startswith(u'AUDIO'): continue if story.link in complete_urls: continue try: res = requests.get(story.link) html = res.text title = story.title.encode('utf-8') completion = ((feed_count + (current_feed_document / float(len(feed.entries)))) / (float(len(currentFeeds))))* 100 print "[" + ("%.2f" % completion) + "%] \t " + feed.feed.title.encode('utf-8') + " - " + title raw_text = converter.handle(html) if summarize: parser = HtmlParser.from_string(html, None, Tokenizer("english")) summarizer = LsaSummarizer(stem_word) summarizer.stop_words = get_stop_words("english") sum_text = [sentence for sentence in summarizer(parser.document, 20)] raw_text = (" ".join([str(sentence) for sentence in sum_text])).decode('utf-8') # print raw_text stats = TextBlob(raw_text) currentStories.append((title, raw_text, story.link, stats.sentiment, story.published_parsed)) complete_urls.append(story.link) document_count += 1 except KeyboardInterrupt: print "Quitting from Keyboard Interrupt." sys.exit(0) except: count_error += 1 print "\t Error occurred while processing that story:", sys.exc_info()[0] traceback.print_exc() raw_documents.append((url[0], currentStories)) print "Received", document_count, "documents with", count_error, "errors" return raw_documents
def summarize_html(self, content):
    """Summarize raw HTML *content* using the instance's language setting."""
    html_parser = HtmlParser.from_string(content, "", Tokenizer(self.lang))
    return self.__summarize(content, html_parser)