Ejemplo n.º 1
0
from bs4 import BeautifulSoup
import requests
from pprint import pprint
from unscroll import UnscrollClient
import datetime
import re
from random import random

# Template for the paginated press-release listing; '{}' receives the page
# number via str.format().
URL = 'https://news.microsoft.com/category/press-releases/page/{}/'

# Client for the Unscroll API served on localhost (credentials are masked
# in this listing).
c = UnscrollClient(
    api='http://127.0.0.1',
    username='******',
    password='******',
)
c.login()

# Look up the favicon for microsoft.com and hand its URL to the client's
# thumbnail cache; the cached entry's 'url' becomes the scroll thumbnail.
favicon_url = c.fetch_favicon_url('https://www.microsoft.com')
favthumb = c.cache_thumbnail(favicon_url['url'])
print(favthumb)

# Create the 'Microsoft PR' scroll, or fetch it if it already exists
# (per the method name -- confirm against UnscrollClient).
c.create_or_retrieve_scroll(
    'Microsoft PR',
    thumbnail=favthumb['url'],
)

# Walk the paginated press-release listing, pages 1 through 957 inclusive.
for i in range(1, 958):
    pr_url = URL.format(i, )
    print(pr_url)
    r = requests.get(pr_url)
    parsed = BeautifulSoup(r.content, 'html.parser')
    # Every <a class="f-post-link"> on the page is collected for processing.
    els = parsed.find_all('a', class_='f-post-link')

    # Fresh list for each page; presumably filled by the loop below.
    events = []

    # NOTE(review): the body of this loop is not visible in this excerpt.
    for el in els:
Ejemplo n.º 2
0
from bs4 import BeautifulSoup
import requests
from pprint import pprint
from unscroll import UnscrollClient
import datefinder
from random import random

# AJAX endpoint of the Adobe newsroom press-release view; the '{}' in the
# 'page' query parameter receives the page number via str.format().
ADOBE_URL = "http://news.adobe.com/views/ajax?js=1&page={}&view_name=bw_press_release&view_display_id=panel_pane_7&view_args=all%2Fall&view_path=news&view_base_path=null&view_dom_id=1&pager_element=0"

# Client for the Unscroll API served on localhost (credentials are masked
# in this listing).
c = UnscrollClient(
    api='http://127.0.0.1',
    username='******',
    password='******',
)
c.login()

# Look up the favicon for adobe.com and hand its URL to the client's
# thumbnail cache; the cached entry's 'url' becomes the scroll thumbnail.
favicon_url = c.fetch_favicon_url('https://www.adobe.com')
favthumb = c.cache_thumbnail(favicon_url['url'])

# Create the 'Adobe PR' scroll, or fetch it if it already exists
# (per the method name -- confirm against UnscrollClient).
c.create_or_retrieve_scroll(
    'Adobe PR',
    thumbnail=favthumb['url'],
)

# Request result pages 1 through 91 inclusive from the AJAX endpoint.
for i in range(1, 92):
    pr_url = ADOBE_URL.format(i, )
    r = requests.get(pr_url)
    # The response is decoded as JSON; the value under 'display' is the
    # markup that gets parsed as HTML below.
    r_as_data = r.json()
    r_html = r_as_data['display']
    parsed = BeautifulSoup(r_html, 'html.parser')
    # Every <div class="view-inner-wrapper"> is collected for processing.
    els = parsed.find_all('div', class_='view-inner-wrapper')

    # Fresh list for each page; presumably filled by the loop below.
    events = []

    # NOTE(review): this loop body is cut off at the end of the excerpt.
    for el in els:
        # Text of the 'views-field-created' div; presumably the publication
        # date, to be parsed later -- confirm against the full script.
        date_source = el.find('div', class_='views-field-created')
        date_source_txt = date_source.text
Ejemplo n.º 3
0
import re
from pprint import pprint
from random import random

import datefinder
import requests
from bs4 import BeautifulSoup
from dateparser import parse

from unscroll import UnscrollClient

# Site root, used for the favicon lookup below.
APPLE_URL = 'https://www.apple.com'
# Press-release library URL (not referenced in the visible part of the script).
APPLE_PR_URL = 'https://www.apple.com/pr/library'

# Client for the Unscroll API served on localhost (credentials are masked
# in this listing).
c = UnscrollClient(
    api='http://127.0.0.1',
    username='******',
    password='******',
)
c.login()

# Look up the favicon for apple.com and hand its URL to the client's
# thumbnail cache; the cached entry's 'url' becomes the scroll thumbnail.
favicon_url = c.fetch_favicon_url(APPLE_URL)
favthumb = c.cache_thumbnail(favicon_url['url'])
print(favthumb)

# Create the Apple press-release scroll, or fetch it if it already exists
# (per the method name -- confirm against UnscrollClient).
c.create_or_retrieve_scroll(
    'Apple Press Releases, 2000-2017',
    thumbnail=favthumb['url'],
)

# Walk the newsroom archive, pages 1 through 65 inclusive.
for i in range(1, 66):
    pr_url = 'https://www.apple.com/newsroom/archive/?page={}'.format(i, )
    print(pr_url)
    r = requests.get(pr_url)
    parsed = BeautifulSoup(r.content, 'html.parser')
    # Every <a class="result__item"> is collected for processing.
    dts = parsed.find_all('a', class_='result__item')
    # Fresh list for each page; presumably filled by the loop below.
    events = []
    # NOTE(review): this loop body is cut off at the end of the excerpt.
    for dt in dts:
        # The title is the text of the first <h3> inside the link element.
        title = dt.find('h3').text