Ejemplo n.º 1
0
def __main__():
    """Publish the W3C listing page as the 'W3C Web Standards' scroll.

    The page is read from the local cache file ``FILE`` (downloaded from
    ``URL`` and cached on a miss), every ``ul#container > li`` element is
    converted with ``li_to_event``, any existing scroll with the same title
    is deleted, and the events are submitted in one ``__batch__`` call.
    """
    def get_content():
        """Return the page as bytes, downloading and caching it on a miss."""
        try:
            # Read inside a context manager so the handle is always closed;
            # the previous version returned the open file object.
            with open(FILE, 'rb') as cached:
                return cached.read()
        except FileNotFoundError:
            resp = requests.get(URL)
            with open(FILE, 'wb') as cached:
                cached.write(resp.content)
            print(resp.content)
            return resp.content

    # Fetch once: the earlier extra call only discarded its result.
    soup = bs4.BeautifulSoup(get_content(), "lxml")
    lis = soup.select('ul#container > li')
    events = [li_to_event(li) for li in lis]
    c = UnscrollClient()

    # W3C Web Standards
    TITLE = 'W3C Web Standards'
    favthumb = c.cache_thumbnail(
        'https://2.bp.blogspot.com/-70GFD8HsG3I/VMKLC7IoiBI/AAAAAAAAIck/GCu0LIY3PCU/s1600/Logo%2BW3C.png'
    )
    c.delete_scroll_with_title(TITLE)
    c.__batch__(scroll_title=TITLE, thumbnail=favthumb['url'], events=events)
    print(len(events))
Ejemplo n.º 2
0
    def __init__(self, year=None, subject=None):
        """Download the Wikipedia page for ``year`` and prepare the scroll.

        Without a subject the page is ``/wiki/<year>`` and the scroll is
        titled 'Wiki Years in Review'; with one the page is
        ``/wiki/<year>_in_<subject>`` and the subject is used in the title.
        The constructor also logs in to Unscroll and caches the thumbnail
        at ``THUMBNAIL_URL`` for the scroll.
        """
        self.year = year
        self.subject = subject

        if subject is None:
            page_url = 'https://en.wikipedia.org/wiki/{}'.format(year)
            scroll_topic = 'Review'
        else:
            page_url = 'https://en.wikipedia.org/wiki/{}_in_{}'.format(
                year, subject)
            scroll_topic = subject
        self.wiki_url = page_url

        response = requests.get(self.wiki_url)
        self.parsed = BeautifulSoup(response.content, 'html.parser')

        self.unscroll_client = UnscrollClient()
        client = self.unscroll_client
        client.login()

        cached_thumb = client.cache_thumbnail(THUMBNAIL_URL)
        self.scroll = client.create_or_retrieve_scroll(
            'Wiki Years in {}'.format(scroll_topic),
            description='Events spidered from the English Wikipedia pages.',
            link='https://en.wikipedia.org/wiki/List_of_years',
            with_thumbnail=cached_thumb.get('url'))
Ejemplo n.º 3
0
def __main__():
    """Log in to a local Unscroll API and run the Google PR import.

    Makes sure a scroll titled 'Google PR' exists, then passes the
    logged-in client to ``get_blogspot_releases``.
    """
    connection = {
        'api': 'http://127.0.0.1:8000',
        'username': '******',
        'password': '******',
    }
    client = UnscrollClient(**connection)
    client.login()
    client.create_or_retrieve_scroll('Google PR')
    get_blogspot_releases(client)
Ejemplo n.º 4
0
def __main__():
    """Recreate the 'Amazon PR' scroll and hand it to ``get_releases``.

    Any existing scroll with that title is deleted first; the Amazon logo
    is cached as the scroll thumbnail.
    """
    scroll_title = 'Amazon PR'
    logo_url = 'http://media.corporate-ir.net/media_files/IROL/17/176060/img/logos/amazon_logo_RGB.jpg'

    client = UnscrollClient()
    client.login()
    client.delete_scroll_with_title(scroll_title)

    cached_logo = client.cache_thumbnail(logo_url)

    scroll_fields = {
        'description': 'A set of press releases from the Amazon Press Room.',
        'link': 'http://phx.corporate-ir.net/phoenix.zhtml?c=176060&p=irol-news&nyo=0',
        'citation': 'Amazon Press Room',
        'with_thumbnail': cached_logo.get('url'),
    }
    scroll = client.create_or_retrieve_scroll(scroll_title, **scroll_fields)
    print(scroll)

    get_releases(client, scroll)
Ejemplo n.º 5
0
def load_data(begin=None,
              end=None,
              title=None,
              slug=None,
              thumbnail_url=None,
              delete=False):
    """Import an archive.org item as an Unscroll scroll.

    Args:
        begin: Passed through to ``extract_events``.
        end: Passed through to ``extract_events``.
        title: Scroll title; defaults to the item's metadata title.
        slug: archive.org item identifier used to build the URLs.
        thumbnail_url: Scroll thumbnail; defaults to the result of
            ``extract_thumbnail`` on the item data.
        delete: When exactly ``True``, delete any scroll with the same
            title before creating it.
    """
    details_url = 'https://archive.org/details/{}'.format(slug)
    item = requests.get('https://archive.org/metadata/{}'.format(slug)).json()

    metadata = item.get('metadata')
    scroll_title = metadata.get('title') if title is None else title
    summary = metadata.get('description')

    item_events = extract_events(item, 'MP3', details_url, begin, end)

    image_url = (extract_thumbnail(item)
                 if thumbnail_url is None else thumbnail_url)

    api = UnscrollClient()
    cached_image = api.cache_thumbnail(image_url)
    cached_image_url = cached_image.get('url')

    if delete is True:
        api.delete_scroll_with_title(scroll_title)

    print('XXXXXXX{}'.format(scroll_title))
    scroll = api.create_or_retrieve_scroll(
        scroll_title,
        subtitle='via Archive.org',
        public=True,
        description=summary,
        link=details_url,
        citation='',
        with_thumbnail=cached_image_url,
    )

    # Post each event and echo both the payload and the API's JSON reply.
    for payload in item_events:
        pprint.pprint(payload)
        reply = api.create_event(payload, scroll)
        pprint.pprint(reply.json())
Ejemplo n.º 6
0
def create(newsgroup, dir, maxyear):
    """Recreate the scroll for a newsgroup and load its events.

    Any existing scroll titled after the newsgroup is deleted, a new one is
    created with the cached ``THUMBNAIL_URL`` image, and the work is then
    delegated to ``newsgroup_to_events``.

    Args:
        newsgroup: Newsgroup name; also used as the scroll title.
        dir: Passed through to ``newsgroup_to_events``.
        maxyear: Passed through to ``newsgroup_to_events``.
    """
    _title = '{}'.format(newsgroup)
    api = UnscrollClient()
    api.delete_scroll_with_title(_title)
    favthumb = api.cache_thumbnail(THUMBNAIL_URL)
    # Read the URL once with .get() (the previous code computed this value,
    # ignored it, and indexed the dict again in the call below).
    with_thumbnail = favthumb.get('url')
    scroll = api.create_or_retrieve_scroll(
        _title,
        description='Usenet message board archives',
        link='https://archive.org/details/usenethistorical',
        with_thumbnail=with_thumbnail,
        subtitle='Collection via Usenet Historical Collection',
    )
    newsgroup_to_events(newsgroup, scroll, api, dir, maxyear)
Ejemplo n.º 7
0
def __main__():
    """Rebuild the 'IETF RFCs' scroll from the cached RFC index.

    Deletes any existing scroll with that title, parses
    ``cache/rfc/rfc-index.xml`` with ``xmltodict``, converts every
    ``rfc-entry`` with ``rfc_to_event`` and submits the events in a single
    ``__batch__`` call, then prints how many events were built.
    """
    title = 'IETF RFCs'
    c = UnscrollClient()
    # Use the same constant for delete and create so they cannot drift apart.
    c.delete_scroll_with_title(title)
    favthumb = c.cache_thumbnail(
        'https://ietf.org/media/images/ietf-logo.original.png')

    # Load RFCs
    with open('cache/rfc/rfc-index.xml', 'r') as f:
        parsed = xmltodict.parse(f.read())
    docs = parsed['rfc-index']['rfc-entry']
    events = [rfc_to_event(x) for x in docs]

    # Do it
    c.__batch__(scroll_title=title,
                thumbnail=favthumb['url'],
                events=events)
    print(len(events))
Ejemplo n.º 8
0
def __main__():
    """Recreate the 'Studs Terkel Interviews' scroll and post its shows.

    Caches the portrait thumbnail, deletes any scroll with the same title,
    creates the scroll and passes it to ``post_shows``.
    """
    title = "Studs Terkel Interviews"
    portrait_url = "https://upload.wikimedia.org/wikipedia/commons/0/0b/Studs_Terkel_-_1979-1.jpg"
    blurb = ('<b>Via the Studs Terkel Radio Archive at WFMT</b>: '
             'In his 45 years on WFMT radio, Studs Terkel talked to the 20th '
             'century’s most interesting people.')

    api = UnscrollClient()
    cached_portrait = api.cache_thumbnail(portrait_url)
    cached_portrait_url = cached_portrait.get('url')

    api.delete_scroll_with_title(title)

    scroll = api.create_or_retrieve_scroll(
        title,
        description=blurb,
        link='https://studsterkel.wfmt.com/',
        with_thumbnail=cached_portrait_url,
        subtitle='Collection via WFMT',
    )

    post_shows(api, scroll)
Ejemplo n.º 9
0
from bs4 import BeautifulSoup
import requests
from pprint import pprint
from unscroll import UnscrollClient
import datefinder
from random import random

# AJAX endpoint for the Adobe press-release listing. The single `{}`
# placeholder (the `page` query parameter) is filled in per request below.
ADOBE_URL = "http://news.adobe.com/views/ajax?js=1&page={}&view_name=bw_press_release&view_display_id=panel_pane_7&view_args=all%2Fall&view_path=news&view_base_path=null&view_dom_id=1&pager_element=0"

# Client for an Unscroll API on localhost.
# NOTE(review): the credential literals look redacted in this snippet.
c = UnscrollClient(api='http://127.0.0.1',
                   username='******',
                   password='******')
c.login()

# Look up the adobe.com favicon, cache it, and use the cached URL as the
# thumbnail of the 'Adobe PR' scroll.
favicon_url = c.fetch_favicon_url('https://www.adobe.com')
favthumb = c.cache_thumbnail(favicon_url['url'])
c.create_or_retrieve_scroll('Adobe PR', thumbnail=favthumb['url'])

# Walk listing pages 1 through 91.
for i in range(1, 92):
    pr_url = ADOBE_URL.format(i, )
    r = requests.get(pr_url)
    # The response is JSON; the HTML fragment is stored under 'display'.
    r_as_data = r.json()
    r_html = r_as_data['display']
    parsed = BeautifulSoup(r_html, 'html.parser')
    # Each listing entry is a <div class="view-inner-wrapper">.
    els = parsed.find_all('div', class_='view-inner-wrapper')

    events = []

    for el in els:
        # The entry's date text lives in the 'views-field-created' div.
        # NOTE(review): the snippet is cut off here; the rest of the loop
        # body is not visible.
        date_source = el.find('div', class_='views-field-created')
        date_source_txt = date_source.text
Ejemplo n.º 10
0
from bs4 import BeautifulSoup
import requests
import favicon
from pprint import pprint
from unscroll import UnscrollClient
from dateparser import parse
import datefinder
from random import random
import re

# Site root (used for the favicon lookup) and the press-release library URL.
APPLE_URL = 'https://www.apple.com'
APPLE_PR_URL = 'https://www.apple.com/pr/library'

# Client for an Unscroll API on localhost.
# NOTE(review): the credential literals look redacted in this snippet.
c = UnscrollClient(api='http://127.0.0.1',
                   username='******',
                   password='******')

c.login()
# Look up the apple.com favicon and cache it; the cached URL becomes the
# scroll thumbnail below.
favicon_url = c.fetch_favicon_url(APPLE_URL)
favthumb = c.cache_thumbnail(favicon_url['url'])
print(favthumb)

c.create_or_retrieve_scroll('Apple Press Releases, 2000-2017',
                            thumbnail=favthumb['url'])

# Walk newsroom archive pages 1 through 65.
for i in range(1, 66):
    pr_url = 'https://www.apple.com/newsroom/archive/?page={}'.format(i, )
    print(pr_url)
    r = requests.get(pr_url)
    parsed = BeautifulSoup(r.content, 'html.parser')
    # Collect the <a class="result__item"> elements on the page.
    # NOTE(review): the snippet is cut off here; how `dts` is consumed is
    # not visible.
    dts = parsed.find_all('a', class_='result__item')
Ejemplo n.º 11
0
class WikipediaText():
    """Scrape an English Wikipedia year page and post its entries to Unscroll.

    Constructing an instance downloads ``https://en.wikipedia.org/wiki/<year>``
    (or ``<year>_in_<subject>`` when a subject is given), logs in to Unscroll
    and creates or retrieves the scroll that events are posted to.
    ``get_events`` then walks the ``#Events``, ``#Births`` and ``#Deaths``
    sections and posts one event per dated list item.
    """
    # Class-level defaults; ``__init__`` assigns the per-instance values.
    year = None
    events = []
    subject = None
    parsed = None
    unscroll_client = None
    scroll = None

    # Ranking sent with each event, keyed by ``kind``; other kinds rank 0.
    _RANKING_BY_KIND = {'world event': 0.9, 'birth': 0.1, 'death': 0.5}

    def __init__(self, year=None, subject=None):
        """Fetch the wiki page for ``year`` and prepare the target scroll.

        Args:
            year: Year used in the page URL and in ``realday``.
            subject: Optional topic; selects the ``<year>_in_<subject>``
                page and is used in the scroll title ('Review' otherwise).
        """
        self.year = year
        self.subject = subject
        # Give every instance its own list instead of sharing the mutable
        # class-level default.
        self.events = []
        self.wiki_url = 'https://en.wikipedia.org/wiki/{}'.format(year)
        if subject is not None:
            self.wiki_url = 'https://en.wikipedia.org/wiki/{}_in_{}'.format(
                year, subject)
        r = requests.get(self.wiki_url)
        self.parsed = BeautifulSoup(r.content, 'html.parser')
        self.unscroll_client = UnscrollClient()
        self.unscroll_client.login()

        favthumb = self.unscroll_client.cache_thumbnail(THUMBNAIL_URL)
        subject_title = 'Review' if subject is None else subject
        self.scroll = self.unscroll_client.create_or_retrieve_scroll(
            'Wiki Years in {}'.format(subject_title),
            description='Events spidered from the English Wikipedia pages.',
            link='https://en.wikipedia.org/wiki/List_of_years',
            with_thumbnail=favthumb.get('url'))

    def tidy(self, txt=None):
        """Return ``txt`` with every '[edit]' marker and trailing whitespace removed."""
        return re.sub(r'\[edit\]\s*', '', txt)

    def realday(self, monthname=None, day=None):
        """Build a ``date`` in ``self.year`` from a month name and a day.

        ``monthname`` is looked up in ``MONTHS_HASH``; ``day`` is converted
        with ``int``.
        """
        month = MONTHS_HASH[monthname]
        day = int(day)
        return date(self.year, month, day)

    def wikihtml_to_event(self, date=None, wikihtml=None, kind=None):
        """Convert one list-item element into an event and post it.

        Args:
            date: Date of the entry (combined with the latest time of day).
            wikihtml: Parsed element holding the entry's HTML.
            kind: Entry label; stored as ``content_type`` and used to pick
                the ranking.

        Returns:
            The event dict that was sent to ``create_event``.
        """
        # Drop the first <sup> element before serialising the content.
        sup = wikihtml.find('sup')
        if sup is not None:
            sup.extract()
        joined = "".join(str(x) for x in wikihtml.children)
        linked = re.sub(r'/wiki/', 'http://en.wikipedia.org/wiki/', joined)
        targeted = re.sub(r'href=', 'target="_blank" href=', linked)

        bleached = bleach.clean(targeted,
                                tags=['b', 'i', 'strong', 'em'],
                                strip=True)
        # MONTHS_PREFIX is stripped twice, then a leading "<digits> - " run.
        pass1 = re.sub(MONTHS_PREFIX, '', bleached)
        pass2 = re.sub(MONTHS_PREFIX, '', pass1)
        lastpass = re.sub(r'^\s*\d+\s*[-–—]\s*', '', pass2)

        # The first link title that does not match MONTH_REGEX becomes the
        # subject of the entry.
        titles = [
            x['title'] for x in wikihtml.find_all('a') if x.has_attr('title')
        ]
        filtered = [x for x in titles if not MONTH_REGEX.match(x)]
        subject = filtered[0] if filtered else None

        thumbnail = None
        if subject is not None:
            image_d = self.unscroll_client.fetch_wiki_thumbnail_data(
                title=subject)
            image_url = image_d.get('url') if image_d is not None else None
            if image_url is not None:
                thumbnail_local = self.unscroll_client.cache_local(image_url)
                thumbnail_d = self.unscroll_client.post_thumbnail(
                    thumbnail_local)
                if thumbnail_d is not None:
                    thumbnail = thumbnail_d['url']

        # NOTE(review): these literals contain no '{}' placeholder, so
        # format() returns them unchanged and the cleaned text is discarded
        # for births and deaths. They look redacted; confirm the intended
        # template.
        if kind == 'birth':
            lastpass = '******'.format(lastpass)
        elif kind == 'death':
            lastpass = '******'.format(lastpass)

        ranking = self._RANKING_BY_KIND.get(kind, 0)

        dt = datetime.combine(date, datetime.max.time()).isoformat(' ')
        wiki_subject = None
        if subject is not None:
            subject = re.sub(r'\s', '_', subject)
            wiki_subject = 'https://en.wikipedia.org/wiki/{}'.format(subject, )
        event = {
            'title': lastpass,
            'text': None,
            'resolution': 10,
            'ranking': ranking,
            'when_happened': dt,
            'when_original': None,
            'with_thumbnail': thumbnail,
            'content_url': wiki_subject,
            'source_url': self.wiki_url,
            'source_name': 'Wikipedia Event Pages',
            'content_type': kind
        }
        e = self.unscroll_client.create_event(event, self.scroll)
        pprint.pprint(e.json())
        return event

    def descend(self, ul=None, kind=None):
        """Post an event for each dated ``<li>`` among the given nodes.

        A list item whose text matches ``MONTHS_DAYS`` sets the current
        date; later items without a date of their own reuse it.

        Returns:
            The list of created events, or ``None`` when there are none.
        """
        last_date = None
        events = []
        for node in ul:
            if node.name != 'li':
                continue
            matches = re.findall(MONTHS_DAYS, node.text)
            if matches:
                last_date = matches[0]
                if node.find('ul'):
                    # This item only wraps a nested list; the nested <li>
                    # nodes are visited on their own because the caller
                    # passes all descendants.
                    continue
            elif last_date is None:
                # No date seen yet, so the item cannot be placed.
                continue
            when = self.realday(monthname=last_date[0], day=last_date[1])
            events.append(
                self.wikihtml_to_event(date=when, wikihtml=node, kind=kind))
        return events or None

    def get_events(self):
        """Collect and post the events, births and deaths on the page.

        For each section anchor, the siblings after the anchor's parent are
        scanned up to the next ``<h2>`` and every ``<ul>`` is handed to
        ``descend``. A missing section is reported with a 'No <anchor>'
        line.

        Returns:
            A flat list of all event dicts created.
        """
        event_types = {
            '#Events': 'world event',
            '#Births': 'birth',
            '#Deaths': 'death'
        }
        events = []
        for keytype, kind in event_types.items():
            # The try deliberately spans the whole section, as before, so
            # an IndexError anywhere in it is reported rather than raised.
            try:
                events_h2 = self.parsed.select(keytype)[0].parent
                for sibling in events_h2.next_siblings:
                    if sibling.name == "h2":
                        break
                    if sibling.name == 'ul':
                        es = self.descend(ul=sibling.descendants, kind=kind)
                        if es is not None:
                            events += es
            except IndexError:
                print('No {}'.format(keytype, ))
        return events
Ejemplo n.º 12
0

# Wikipedia timeline pages that are passed to extract_list() below.
pages = [
    'https://en.wikipedia.org/wiki/Timeline_of_events_preceding_World_War_II',
    'https://en.wikipedia.org/wiki/Timeline_of_World_War_II_(1939)',
    'https://en.wikipedia.org/wiki/Timeline_of_World_War_II_(1940)',
    'https://en.wikipedia.org/wiki/Timeline_of_World_War_II_(1941)',
    'https://en.wikipedia.org/wiki/Timeline_of_World_War_II_(1942)',
    'https://en.wikipedia.org/wiki/Timeline_of_World_War_II_(1943)',
    'https://en.wikipedia.org/wiki/Timeline_of_World_War_II_(1944)',
    'https://en.wikipedia.org/wiki/Timeline_of_World_War_II_(1945)',
    'https://en.wikipedia.org/wiki/Timeline_of_World_War_II_(1945%E2%80%931991)',
    'https://en.wikipedia.org/wiki/Timeline_of_the_Manhattan_Project'
]

api = UnscrollClient()
scroll = api.create_or_retrieve_scroll('WWII audio')
for page in pages:
    items = extract_list('', page)

    for item in items:
        # extract_list() may yield None entries; those are skipped.
        if item is not None:
            # Try to find a Wikipedia thumbnail for the item and cache it;
            # thumb_url stays None if either step returns None.
            thumb_url = None
            wiki_thumb = api.fetch_wiki_thumbnail_data(item.get('item'))
            if wiki_thumb is not None:
                thumb = api.cache_thumbnail(wiki_thumb.get('url'))
                if thumb is not None:
                    thumb_url = thumb.get('url')

            content_url = item.get('content_url')
            # NOTE(review): the snippet is cut off after this condition; its
            # body is not visible.
            if content_url is None:
Ejemplo n.º 13
0
from unscroll import UnscrollClient

# Manual check of two thumbnail helpers: cache a remote image, then look up
# the Wikipedia thumbnail for an article title, printing both results.
SEAL_IMAGE_URL = 'https://upload.wikimedia.org/wikipedia/commons/b/b2/Donnchadh_mac_Gille-Brighdhe_Seal.jpg'
ARTICLE_TITLE = 'George_Orwell'

c = UnscrollClient()

p = c.cache_thumbnail(SEAL_IMAGE_URL)
print(p)

p2 = c.fetch_wiki_thumbnail(ARTICLE_TITLE)
print(p2)
Ejemplo n.º 14
0
def save_met():
    """Rebuild the 'The Met' scroll from the local Met collection database.

    Every row of the ``collection`` table whose date parses (per
    ``UnscrollDate.is_okay``) and whose image already exists in the local
    image cache is posted as an event. As before, a row is only posted when
    its thumbnail upload succeeds.
    """
    c = UnscrollClient()
    c.login()
    c.delete_scroll_with_title('The Met')
    scroll = c.create_or_retrieve_scroll('The Met')

    conn = sqlite3.connect('/home/unscroll/cache/met.db')
    try:
        conn.row_factory = sqlite3.Row
        sqlc = conn.cursor()
        sqlc.execute("SELECT * FROM collection LIMIT -1 OFFSET 0")

        for row in sqlc.fetchall():
            ud = UnscrollDate(row['date'], begin=-2000, end=2018)
            if not ud.is_okay():
                continue

            # Check for missing values before using them: re.sub() raises
            # TypeError when handed None.
            img = row['image']
            if img is None or row['date'] is None:
                continue

            local_img = re.sub(r'https?://images.metmuseum.org/', '', img)
            local = '/home/unscroll/cache/met-images/{}'.format(local_img, )
            if not file_exists(local):
                continue

            thumb = c.post_thumbnail(local)
            if thumb is None:
                continue
            with_thumbnail = thumb.get('url')

            # `in` on a sqlite3.Row iterates its values, so test the column
            # names explicitly; skip empty values so the title never gets a
            # " (None)" suffix.
            medium = ''
            if 'medium' in row.keys() and row['medium']:
                medium = ' ({})'.format(row['medium'])

            d = {
                'title': row['title'] + medium,
                'text': row['description'],
                'resolution': ud.resolution,
                'ranking': 0,
                'content_url':
                'https://www.metmuseum.org{}'.format(row['url'], ),
                'with_thumbnail': with_thumbnail,
                'source_name': 'The Met',
                'source_url': 'https://www.metmuseum.org/',
                'when_happened': ud.when_happened,
                'when_original': ud.when_original
            }
            e = c.create_event(d, scroll)
            print(e)
    finally:
        # Release the database handle even if an upload fails midway.
        conn.close()
Ejemplo n.º 15
0
def __main__():
    """Post Cooper Hewitt collection objects to the 'Cooper-Hewitt' scroll.

    Reads rows from the local ``objects`` SQLite table, makes sure each
    row's square thumbnail exists in the local cache (downloading it on a
    miss), and creates one event per row whose date passes
    ``UnscrollDate.is_okay``.
    """

    c = UnscrollClient()
    c.login()
    favthumb = c.cache_thumbnail(THUMBNAIL_IMAGE)
    scroll = c.create_or_retrieve_scroll(
        'Cooper-Hewitt',
        description='Items from the Cooper Hewitt',
        link='https://github.com/cooperhewitt/collection',
        citation='Cooper-Hewitt Museum Collection',
        with_thumbnail=favthumb.get('url'))

    conn = sqlite3.connect('/home/unscroll/cache/cooper/objects.db')
    # sqlite3.Row lets columns be read by name (row['title'], ...).
    conn.row_factory = sqlite3.Row

    sqlc = conn.cursor()

    # `i` is the query's starting offset (0 here) and is then reused as a
    # counter of processed rows for the progress line below.
    i = 0
    sqlc.execute("SELECT * FROM objects LIMIT -1 OFFSET {}".format(i))
    for row in sqlc.fetchall():

        # Rows without an image or a date are skipped entirely.
        if row['primary_image'] is not None and row['date'] is not None:
            # switch to the 300x300 thumbnail
            # NOTE(review): 'z\.jpg' is a non-raw string containing a regex
            # escape; a raw string would avoid invalid-escape warnings.
            sq = re.sub('z\.jpg', 'sq.jpg', row['primary_image'])
            # The cache path mirrors the image URL minus its scheme.
            local_sq = re.sub(r'https?://', '', sq)
            local = '/home/unscroll/cache/cooper/{}'.format(local_sq, )

            i = i + 1
            found = False
            try:
                # Opening the file is only an existence check.
                f = open(local, 'r')
                f.close()
                found = True
            except FileNotFoundError as e:
                # Cache miss: download the thumbnail and store it, creating
                # parent directories as needed.
                try:
                    r = requests.get(sq)
                    p = pathlib.Path(local)
                    p.parent.mkdir(parents=True, exist_ok=True)
                    f = open(local, 'wb')
                    f.write(r.content)
                    f.close()
                    found = True
                # NOTE(review): which ConnectionError this names depends on
                # imports not visible here; the builtin one would not catch
                # requests.exceptions.ConnectionError. Confirm.
                except ConnectionError as e:
                    print('[cooperhewitt2.py] ConnectionError: {}'.format(e, ))

            # Progress line: counter, local path, whether the image is cached.
            print('{}: {}/{}'.format(i, local, found))

            ud = UnscrollDate(row['date'], begin=-4000, end=2018)
            if ud.is_okay():

                # The event is still created when no thumbnail is available;
                # with_thumbnail is None in that case.
                with_thumbnail = None
                if found:
                    thumb = c.post_thumbnail(local)
                    if thumb is not None:
                        with_thumbnail = thumb.get('url')

                d = {
                    'title':
                    row['title'],
                    'text':
                    row['description'],
                    'resolution':
                    ud.resolution,
                    'ranking':
                    0,
                    'content_url':
                    'https://collection.cooperhewitt.org/objects/{}/'.format(
                        row['id'], ),
                    'with_thumbnail':
                    with_thumbnail,
                    'source_name':
                    'Collection Data for Cooper Hewitt, Smithsonian Design Museum',
                    'source_url':
                    'https://github.com/cooperhewitt/collection',
                    'when_happened':
                    ud.when_happened,
                    'when_original':
                    ud.when_original
                }
                e = c.create_event(d, scroll)