# Summarize a BBC News article and write it out as a markdown report with
# Pelican-style metadata. CONTENT_DIR and slugify() are expected at module
# level (see the sketch below).
import os
import sys
import time

import summarize


def run(url):
    data = summarize.summarize_page(url)
    title = data.title.split(' - BBC News')[0]
    dt = time.strftime('%Y-%m-%d %H:%M')
    category = "news"
    tags = "bbc"
    url = data.url
    content = "\n\n".join(data.summaries)
    slug = slugify(title)
    # Build the dated output directory under CONTENT_DIR. (The original
    # joined CONTENT_DIR into the file path but created the date directory
    # relative to the working directory, so the open() below could fail.)
    dr = os.path.join(CONTENT_DIR, time.strftime('%Y/%m/%d'))
    full_path = os.path.join(dr, "%s.md" % slug)
    if os.path.exists(full_path):
        print('Report already exists.')
        sys.exit(1)
    if not os.path.exists(dr):
        os.makedirs(dr)
    with open(full_path, "a") as fx:
        fx.write("Title: %s\n" % title)
        fx.write("Date: %s\n" % dt)
        fx.write("Category: %s\n" % category)
        fx.write("Tags: %s\n" % tags)
        fx.write("Slug: %s\n" % slug)
        fx.write("Src: %s\n" % url)
        fx.write("Author: %s\n" % "jarvis")
        fx.write("\n\n")
        fx.write(content)
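# Both run() scripts in this collection assume a slugify() helper and a
# CONTENT_DIR constant that are not shown. A minimal, hypothetical sketch of
# what they might look like (not the original implementation):
import re

CONTENT_DIR = "content"  # assumed output root for the generated reports


def slugify(title):
    # Lowercase, collapse runs of non-alphanumerics into hyphens, trim ends.
    return re.sub(r'[^a-z0-9]+', '-', title.lower()).strip('-')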
import nltk

import summarize


def generate_summary_from_url_without_json(url):
    url = str(url)
    first_para, last_para = summarize.get_first_and_last_para(url)
    # Join the opening paragraph, the generated summary, and the closing
    # paragraph. (The original used Python 2's unicode(); str() is the
    # Python 3 equivalent.)
    summary = (str(first_para) + '\n ' +
               str(summarize.summarize_page(url)) + '\n ' +
               str(last_para))
    final_summary = replace_unwanted_chars(summary)
    return nltk.sent_tokenize(final_summary)
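# replace_unwanted_chars() is referenced above but not defined in this
# excerpt. A minimal, hypothetical sketch of what it might do:
import re


def replace_unwanted_chars(text):
    # Drop bracketed citation markers like "[1]" and collapse whitespace.
    text = re.sub(r'\[.*?\]', '', text)
    return re.sub(r'\s+', ' ', text).strip()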
import latexslides

import summarize


def genTextSlide(ID, title, text):
    """Generates individual text slide."""
    secID = ID
    # Skip slides whose body is empty or whitespace-only. (The original
    # called rawText.replace(' ', '') without assigning the result, so
    # whitespace-only text slipped past the emptiness check.)
    if not text.strip():
        return None
    bullets = summarize.summarize_page(text)
    slide = latexslides.BulletSlide(secID, bullets, block_heading=title)
    return slide
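# A hypothetical call, assuming section_text holds the raw body of one
# section; None results can then be filtered out by the caller:
# slide = genTextSlide("sec-intro", "Introduction", section_text)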
import re
from email.utils import parseaddr

from flask import render_template, request


def incoming():
    # Inbound-email webhook (Mailgun-style form fields): find a URL in the
    # message body, summarize it, and email the result back. app, sentry,
    # send_email(), and summarize_page() come from the surrounding module.
    from_email = request.form.get('from', '')
    print(' * INCOMING EMAIL from', repr(from_email))
    name, email = parseaddr(from_email)
    subject = request.form.get('subject')
    text = request.form.get('stripped-text')

    # Prevent infinite sends
    if email.lower() == parseaddr(app.config['EMAIL_SENDER'])[1].lower():
        return ''

    # Validation
    if not email or not subject or not text:
        print(' * SKIPPING: Missing "from", "subject", or "text" field.')
        return ''

    # Find the URL (raw string avoids the invalid-escape warning on \s)
    matches = re.search(r'(?P<url>https?://[^\s]+)', text)
    url = matches.group('url') if matches else None
    if not url:
        print(' * SKIPPING: No URL found in the provided text.')
        return ''

    print('Summarizing:', url)
    try:
        summary = summarize_page(url)
    except Exception as ex:
        if sentry:
            sentry.captureException()
        print(' * ERROR:', type(ex), ex)
        subject = '[ERROR] ' + subject
        html = render_template('error_email.html', url=url)
    else:
        html = render_template('summary_email.html', title=summary.title,
                               url=summary.url, summaries=summary.summaries)

    print('Replying to:', email)
    email_id = send_email(email, subject, html)
    print('Reply ID:', email_id)
    return ''
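# The view above carries no route decorator in this excerpt. A hypothetical
# registration (path and method are assumptions, not from the original):
#
#     @app.route('/incoming', methods=['POST'])
#     def incoming():
#         ...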
import re

import summarize as sz


def mp_sz(wiki_url):
    def strip_wiki_extras(sent):
        # Drop "(born ... 1984)"-style asides, bracketed citations such as
        # "[1]", and a leading space.
        return re.sub(r"(\s\(born.*?\d{4}\))|(\[.*?\])|(^\s)", "", sent)

    def abridge(sent):
        # Cut at the first space at or after index MAX_LEN and append an
        # ellipsis; return the sentence unchanged if no such space exists.
        i = MAX_LEN
        while i < len(sent):
            if sent[i] == " ":
                return sent[:(i + 1)] + "..."
            i += 1
        return sent

    if wiki_url:
        sum_arr = sz.summarize_page(wiki_url).summaries
        sum_clean_arr = [strip_wiki_extras(s) for s in sum_arr]
        summary = " ".join(sum_clean_arr)
        if len(summary) > MAX_LEN:
            summary = abridge(summary)
        return summary
    else:
        return None
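# A hypothetical call, assuming MAX_LEN is a module-level character budget:
# MAX_LEN = 300
# blurb = mp_sz("https://en.wikipedia.org/wiki/Alan_Turing")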
# Al Jazeera variant of the BBC script above; same Pelican-style report
# format, but it returns False instead of exiting on duplicates.
import os
import time

import summarize


def run(url):
    data = summarize.summarize_page(url)
    title = data.title.split(' - Al Jazeera')[0]
    dt = time.strftime('%Y-%m-%d %H:%M')
    category = "news"
    tags = "aljazeera"
    url = data.url
    content = "\n\n".join(data.summaries)
    slug = slugify(title)
    if "error" in slug:
        return False
    print("Creating: %s" % slug)
    dr = os.path.join(CONTENT_DIR, time.strftime('%Y/%m/%d'))
    full_path = os.path.join(dr, "%s.md" % slug)
    if os.path.exists(full_path):
        print('Report already exists.')
        return False
    if not os.path.exists(dr):
        print('Date DIRs created.')
        os.makedirs(dr)
    with open(full_path, "a") as fx:
        fx.write("Title: %s\n" % title)
        fx.write("Date: %s\n" % dt)
        fx.write("Category: %s\n" % category)
        fx.write("Tags: %s\n" % tags)
        fx.write("Slug: %s\n" % slug)
        fx.write("Src: %s\n" % url)
        fx.write("Author: %s\n" % "jarvis")
        fx.write("\n\n")
        fx.write(content)
    print('Created: %s' % slug)
import requests
from rake_nltk import Rake

from summarize import summarize_page  # assumed import, matching the snippets above

if __name__ == "__main__":
    subreddit = 'worldnews'
    r = requests.get(
        'http://www.reddit.com/r/{}.json?sort=hot'.format(subreddit),
        headers={'user-agent': ''}
    )
    rake = Rake(min_length=1, max_length=3)
    keywords = []
    for post in r.json()['data']['children']:
        try:
            data = post['data']
            # Skip posts with a missing author or URL. (The original compared
            # against the string "null", which never matches: JSON null
            # decodes to Python's None.)
            if data.get('author') is not None and data.get('url') is not None:
                # Extract ranked keyword phrases from the post title.
                # (Python 3 strings need no .encode("utf-8") here; the
                # original's bytes would break rake and summarize_page.)
                rake.extract_keywords_from_text(data['title'])
                keywords = rake.get_ranked_phrases()
                print(summarize_page(data['url']))
                print("----------------------------------")
        except ValueError:
            print("parse error")
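# Note: reddit's JSON endpoint tends to throttle or reject requests with a
# blank User-Agent. A hypothetical descriptive header (any unique string
# identifying the script) is safer:
# headers={'user-agent': 'summarize-demo/0.1 (contact: you@example.com)'}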