def run(url):
    data = summarize.summarize_page(url)

    title = data.title.split(' - BBC News')[0]
    dt = time.strftime('%Y-%m-%d %H:%M')
    category = "news"
    tags = "bbc"
    url = data.url
    content = "\n\n".join(data.summaries)
    slug = slugify(title)

    # Join CONTENT_DIR into the date directory up front, so the existence
    # check and makedirs below operate on the same path as the file write
    # (the original created a bare relative date directory instead).
    dr = os.path.join(CONTENT_DIR, time.strftime('%Y/%m/%d'))
    file_path = os.path.join(dr, "%s.md" % slug)

    if os.path.exists(file_path):
        print('Report already exists.')
        sys.exit(1)

    if not os.path.exists(dr):
        os.makedirs(dr)

    with open(file_path, "w") as fx:
        fx.write("Title: %s\n" % title)
        fx.write("Date: %s\n" % dt)
        fx.write("Category: %s\n" % category)
        fx.write("Tags: %s\n" % tags)
        fx.write("Slug: %s\n" % slug)
        fx.write("Src: %s\n" % url)
        fx.write("Author: %s\n" % "jarvis")
        fx.write("\n")
        fx.write("\n")
        fx.write("%s" % content)
def generate_summary_from_url_without_json(url):
    url = str(url)
    first_para, last_para = summarize.get_first_and_last_para(url)
    # unicode() is Python 2 only; str() is the Python 3 equivalent.
    summary = str(first_para) + '\n ' + str(
        summarize.summarize_page(url)) + '\n ' + str(last_para)
    final_summary = replace_unwanted_chars(summary)
    return nltk.sent_tokenize(final_summary)
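nltk.sent_tokenize depends on NLTK's punkt sentence tokenizer, which is not bundled with the library. A one-time setup sketch:

import nltk
nltk.download('punkt')  # fetch the sentence tokenizer model once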
def genTextSlide(ID, title, text):
    """Generates an individual text slide."""
    # str.replace returns a new string, so the original call discarded
    # its result; apply the check inline so empty slides are skipped.
    if text.replace(' ', '') == "":
        return None
    bullets = summarize.summarize_page(text)
    slide = latexslides.BulletSlide(ID, bullets, block_heading=title)
    return slide
Example #4
def incoming():
    from_email = request.form.get('from', '')
    print(' * INCOMING EMAIL from', repr(from_email))

    name, email = parseaddr(from_email)
    subject = request.form.get('subject')
    text = request.form.get('stripped-text')

    # Prevent infinite sends
    if email.lower() == parseaddr(app.config['EMAIL_SENDER'])[1].lower():
        return ''

    # Validation
    if not email or not subject or not text:
        print(' * SKIPPING: Missing "from", "subject", or "text" field.')
        return ''

    # Find the URL
    matches = re.search(r'(?P<url>https?://[^\s]+)', text)
    url = matches.group('url') if matches else None
    if not url:
        print(' * SKIPPING: No URL found in the provided text.')
        return ''
    print('Summarizing:', url)

    try:
        summary = summarize_page(url)
    except Exception as ex:
        if sentry:
            sentry.captureException()
        print(' * ERROR:', type(ex), ex)
        subject = '[ERROR] ' + subject
        html = render_template('error_email.html', url=url)
    else:
        html = render_template('summary_email.html',
                               title=summary.title,
                               url=summary.url,
                               summaries=summary.summaries)

    print('Replying to:', email)
    email_id = send_email(email, subject, html)

    print('Reply ID:', email_id)
    return ''
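The form fields read here ('from', 'subject', 'stripped-text') match an inbound-email webhook payload in the Mailgun style. A hedged sketch for exercising the handler locally with Flask's test client, assuming the view is registered at a /incoming route (hypothetical path):

with app.test_client() as client:
    client.post('/incoming', data={
        'from': 'Alice <alice@example.com>',
        'subject': 'Summarize this',
        'stripped-text': 'Please read https://example.com/article',
    })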
Example #5
def mp_sz(wiki_url):
    def strip_wiki_extras(sent):
        # Drop "(born ... 1815)" parentheticals, [citation] brackets,
        # and leading whitespace that Wikipedia extracts carry.
        return re.sub(r"(\s\(born.*?\d{4}\))|(\[.*?\])|(^\s)", "", sent)

    def abridge(sent):
        # Truncate at the first space at or beyond MAX_LEN so words
        # are never cut mid-way.
        i = MAX_LEN
        while i < len(sent):
            if sent[i] == " ":
                return sent[:(i + 1)] + "..."
            i += 1
        return sent
    if wiki_url:
        sum_arr = sz.summarize_page(wiki_url).summaries
        sum_clean_arr = [strip_wiki_extras(s) for s in sum_arr]
        summary = " ".join(sum_clean_arr)
        if len(summary) > MAX_LEN:
            summary = abridge(summary)
        return summary
    else:
        return None
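A usage sketch for mp_sz; the sz alias for the summarize module and the MAX_LEN cap are module-level assumptions here:

import summarize as sz  # assumed origin of the sz alias

MAX_LEN = 250  # assumed cap; the original constant is defined elsewhere
print(mp_sz("https://en.wikipedia.org/wiki/Ada_Lovelace"))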
Example #7
def run(url):
    data = summarize.summarize_page(url)

    title = data.title.split(' - Al Jazeera')[0]
    dt = time.strftime('%Y-%m-%d %H:%M')
    category = "news"
    tags = "aljazeera"
    url = data.url
    content = "\n\n".join(data.summaries)
    slug = slugify(title)

    if "error" in slug:
        return False

    print("Creating: %s" % slug)

    dr = os.path.join(CONTENT_DIR, time.strftime('%Y/%m/%d'))
    file_path = os.path.join(dr, "%s.md" % slug)

    if os.path.exists(file_path):
        print('Report already exists.')
        return False

    if not os.path.exists(dr):
        print('Creating date directories.')
        os.makedirs(dr)

    with open(file_path, "w") as fx:
        fx.write("Title: %s\n" % title)
        fx.write("Date: %s\n" % dt)
        fx.write("Category: %s\n" % category)
        fx.write("Tags: %s\n" % tags)
        fx.write("Slug: %s\n" % slug)
        fx.write("Src: %s\n" % url)
        fx.write("Author: %s\n" % "jarvis")
        fx.write("\n")
        fx.write("\n")
        fx.write("%s" % content)

    print('Created: %s' % slug)
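The metadata header written by both run() variants follows Pelican's Markdown front-matter convention (Title:, Date:, Category:, Tags:, Slug:, Author:), so the output drops straight into a Pelican content tree. A hypothetical generated file:

Title: Example headline
Date: 2024-01-02 09:30
Category: news
Tags: aljazeera
Slug: example-headline
Src: https://www.aljazeera.com/news/example
Author: jarvis

First summary paragraph.

Second summary paragraph.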
Example #8

if __name__ == "__main__":
    subreddit = 'worldnews'

    # Reddit rejects requests with a blank User-Agent; its API rules ask
    # for a short descriptive one (placeholder value below).
    r = requests.get(
        'http://www.reddit.com/r/{}.json?sort=hot'.format(subreddit),
        headers={'user-agent': 'summarize-demo/0.1'}
    )

    rake = Rake(min_length=1, max_length=3)
    keywords = []

    for post in r.json()['data']['children']:
        try:
            # JSON null decodes to Python None, not the string "null".
            if post['data'].get('author') is not None and post['data'].get('url') is not None:
                # Pass str, not bytes: .encode("utf-8") would hand bytes
                # to rake and the summarizer under Python 3.
                rake.extract_keywords_from_text(post['data']['title'])
                keywords = rake.get_ranked_phrases()
                print(summarize_page(post['data']['url']))
                print("----------------------------------")
        except ValueError:
            print("parse error")
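Rake with min_length/max_length keywords matches rake_nltk's constructor, so that package is presumably the elided import, alongside requests and the project's summarizer. A standalone sketch of the keyword step (rake_nltk also needs NLTK's stopwords and punkt data):

from rake_nltk import Rake  # assumed source of Rake

rake = Rake(min_length=1, max_length=3)
rake.extract_keywords_from_text("Global markets rally as inflation cools")
print(rake.get_ranked_phrases())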