Beispiel #1
0
def __main__():
    """Scrape W3C standards events from the cached page and publish them
    as the 'W3C Web Standards' scroll."""

    def get_content():
        """Return cached page content, downloading and caching on a miss."""
        try:
            return open(FILE)
        except FileNotFoundError:
            resp = requests.get(URL)
            # Cache the payload locally so later runs skip the network hit.
            with open(FILE, 'wb') as f:
                f.write(resp.content)
            print(resp.content)
            return resp.content
        # Removed: an unreachable `return None` followed the try/except —
        # both branches already return.

    # Call get_content() exactly once: the original fetched it twice and
    # discarded the first result.
    soup = bs4.BeautifulSoup(get_content(), "lxml")
    lis = soup.select('ul#container > li')
    events = [li_to_event(li) for li in lis]
    c = UnscrollClient()

    #    W3C Web Standards
    TITLE = 'W3C Web Standards'
    favthumb = c.cache_thumbnail(
        'https://2.bp.blogspot.com/-70GFD8HsG3I/VMKLC7IoiBI/AAAAAAAAIck/GCu0LIY3PCU/s1600/Logo%2BW3C.png'
    )
    c.delete_scroll_with_title(TITLE)
    c.__batch__(scroll_title=TITLE, thumbnail=favthumb['url'], events=events)
    print(len(events))
Beispiel #2
0
    def __init__(self, year=None, subject=None):
        """Fetch the Wikipedia page for *year* (or '<year> in <subject>')
        and create/retrieve the destination scroll.

        year: the year whose page is fetched.
        subject: optional topic narrowing the page choice.
        """
        self.year = year
        self.subject = subject
        if subject is None:
            self.wiki_url = 'https://en.wikipedia.org/wiki/{}'.format(year)
        else:
            self.wiki_url = 'https://en.wikipedia.org/wiki/{}_in_{}'.format(
                year, subject)
        page = requests.get(self.wiki_url)
        self.parsed = BeautifulSoup(page.content, 'html.parser')
        self.unscroll_client = UnscrollClient()
        self.unscroll_client.login()

        thumb = self.unscroll_client.cache_thumbnail(THUMBNAIL_URL)
        # Scrolls without a subject are titled as a generic 'Review'.
        subject_title = 'Review' if subject is None else subject
        self.scroll = self.unscroll_client.create_or_retrieve_scroll(
            'Wiki Years in {}'.format(subject_title),
            description='Events spidered from the English Wikipedia pages.',
            link='https://en.wikipedia.org/wiki/List_of_years',
            with_thumbnail=thumb.get('url'))
Beispiel #3
0
def __main__():
    """Log in, ensure the 'Google PR' scroll exists, then spider releases."""
    client = UnscrollClient(api='http://127.0.0.1:8000',
                            username='******',
                            password='******')
    client.login()
    client.create_or_retrieve_scroll('Google PR')
    get_blogspot_releases(client)
Beispiel #4
0
def __main__():
    """Rebuild the 'Amazon PR' scroll and populate it with press releases."""
    client = UnscrollClient()
    client.login()
    client.delete_scroll_with_title('Amazon PR')

    # The Amazon logo serves as the scroll thumbnail.
    logo = 'http://media.corporate-ir.net/media_files/IROL/17/176060/img/logos/amazon_logo_RGB.jpg'
    cached = client.cache_thumbnail(logo)

    scroll = client.create_or_retrieve_scroll(
        'Amazon PR',
        description='A set of press releases from the Amazon Press Room.',
        link='http://phx.corporate-ir.net/phoenix.zhtml?c=176060&p=irol-news&nyo=0',
        citation='Amazon Press Room',
        with_thumbnail=cached.get('url'))
    print(scroll)

    get_releases(client, scroll)
Beispiel #5
0
def load_data(begin=None,
              end=None,
              title=None,
              slug=None,
              thumbnail_url=None,
              delete=False):
    """Import an Archive.org collection into an Unscroll scroll.

    begin/end: bounds passed through to extract_events.
    title: scroll title; defaults to the item's own metadata title.
    slug: Archive.org item identifier.
    thumbnail_url: explicit thumbnail; derived from the item when None.
    delete: when True, delete any existing scroll with this title first.
    """
    # Fetch the item's metadata/file listing.
    _link = 'https://archive.org/details/{}'.format(slug)
    _r = requests.get('https://archive.org/metadata/{}'.format(slug))
    _data = _r.json()

    # Item-level metadata block.
    _md = _data.get('metadata')

    _title = title if title is not None else _md.get('title')

    _description = _md.get('description')

    _events = extract_events(_data, 'MP3', _link, begin, end)

    _thumbnail_url = thumbnail_url
    if thumbnail_url is None:
        _thumbnail_url = extract_thumbnail(_data)

    api = UnscrollClient()
    _thumb = api.cache_thumbnail(_thumbnail_url)
    _with_thumbnail = _thumb.get('url')

    if delete:
        api.delete_scroll_with_title(_title)

    # Replaced a leftover debug line ('XXXXXXX{title}') with a readable note.
    print('Creating scroll: {}'.format(_title))
    scroll = api.create_or_retrieve_scroll(
        _title,
        subtitle='via Archive.org',
        public=True,
        description=_description,
        link=_link,
        citation='',
        with_thumbnail=_with_thumbnail,
    )

    for event in _events:
        pprint.pprint(event)
        j = api.create_event(event, scroll)
        pprint.pprint(j.json())
Beispiel #6
0
def create(newsgroup, dir, maxyear):
    """Rebuild a scroll for *newsgroup* and load its archived messages.

    newsgroup: group name, also used as the scroll title.
    dir: local directory of cached archives (forwarded to
        newsgroup_to_events).
    maxyear: upper bound year (forwarded to newsgroup_to_events).
    """
    _title = '{}'.format(newsgroup)
    api = UnscrollClient()
    api.delete_scroll_with_title(_title)
    favthumb = api.cache_thumbnail(THUMBNAIL_URL)
    # Use the .get() result computed here: the original computed it and then
    # ignored it, indexing favthumb['url'] (a KeyError risk) instead.
    with_thumbnail = favthumb.get('url')
    scroll = api.create_or_retrieve_scroll(
        _title,
        description='Usenet message board archives',
        link='https://archive.org/details/usenethistorical',
        with_thumbnail=with_thumbnail,
        subtitle='Collection via Usenet Historical Collection',
    )
    newsgroup_to_events(newsgroup, scroll, api, dir, maxyear)
Beispiel #7
0
def __main__():
    """Parse the cached RFC index and batch-publish it as 'IETF RFCs'."""
    title = 'IETF RFCs'
    c = UnscrollClient()
    # Use the shared `title` so a rename can't desynchronize delete vs.
    # create (the original repeated the literal here).
    c.delete_scroll_with_title(title)
    favthumb = c.cache_thumbnail(
        'https://ietf.org/media/images/ietf-logo.original.png')

    # Load RFCs from the locally cached XML index.
    with open('cache/rfc/rfc-index.xml', 'r') as f:
        read = f.read()
    parsed = xmltodict.parse(read)
    docs = parsed['rfc-index']['rfc-entry']
    events = [rfc_to_event(x) for x in docs]

    # Do it
    scroll = c.__batch__(scroll_title=title,
                         thumbnail=favthumb['url'],
                         events=events)
    print(len(events))
Beispiel #8
0
def __main__():
    """Rebuild the Studs Terkel interview scroll and post its shows."""
    scroll_thumb = "https://upload.wikimedia.org/wikipedia/commons/0/0b/Studs_Terkel_-_1979-1.jpg"
    api = UnscrollClient()
    title = "Studs Terkel Interviews"
    cached = api.cache_thumbnail(scroll_thumb)

    api.delete_scroll_with_title(title)

    scroll = api.create_or_retrieve_scroll(
        title,
        description='<b>Via the Studs Terkel Radio Archive at WFMT</b>: '
        'In his 45 years on WFMT radio, Studs Terkel talked to the 20th '
        'century’s most interesting people.',
        link='https://studsterkel.wfmt.com/',
        with_thumbnail=cached.get('url'),
        subtitle='Collection via WFMT',
    )

    post_shows(api, scroll)
Beispiel #9
0
from bs4 import BeautifulSoup
import requests
from pprint import pprint
from unscroll import UnscrollClient
import datefinder
from random import random

# Paginated AJAX endpoint for Adobe press releases; {} takes the page number.
ADOBE_URL = "http://news.adobe.com/views/ajax?js=1&page={}&view_name=bw_press_release&view_display_id=panel_pane_7&view_args=all%2Fall&view_path=news&view_base_path=null&view_dom_id=1&pager_element=0"

c = UnscrollClient(api='http://127.0.0.1',
                   username='******',
                   password='******')
c.login()

# Use the Adobe site's favicon as the scroll thumbnail.
favicon_url = c.fetch_favicon_url('https://www.adobe.com')
favthumb = c.cache_thumbnail(favicon_url['url'])
c.create_or_retrieve_scroll('Adobe PR', thumbnail=favthumb['url'])

# Walk every results page of the press-release listing.
for i in range(1, 92):
    pr_url = ADOBE_URL.format(i, )
    r = requests.get(pr_url)
    r_as_data = r.json()
    # The endpoint returns JSON whose 'display' field holds rendered HTML.
    r_html = r_as_data['display']
    parsed = BeautifulSoup(r_html, 'html.parser')
    els = parsed.find_all('div', class_='view-inner-wrapper')

    events = []

    for el in els:
        # NOTE(review): the rest of this loop body appears truncated in
        # this chunk — only the date extraction is visible.
        date_source = el.find('div', class_='views-field-created')
        date_source_txt = date_source.text
Beispiel #10
0
from bs4 import BeautifulSoup
import requests
import favicon
from pprint import pprint
from unscroll import UnscrollClient
from dateparser import parse
import datefinder
from random import random
import re

APPLE_URL = 'https://www.apple.com'
APPLE_PR_URL = 'https://www.apple.com/pr/library'

c = UnscrollClient(api='http://127.0.0.1',
                   username='******',
                   password='******')

c.login()
# Use Apple's favicon as the scroll thumbnail.
favicon_url = c.fetch_favicon_url(APPLE_URL)
favthumb = c.cache_thumbnail(favicon_url['url'])
print(favthumb)

c.create_or_retrieve_scroll('Apple Press Releases, 2000-2017',
                            thumbnail=favthumb['url'])

# Walk each page of the newsroom archive listing.
for i in range(1, 66):
    pr_url = 'https://www.apple.com/newsroom/archive/?page={}'.format(i, )
    print(pr_url)
    r = requests.get(pr_url)
    parsed = BeautifulSoup(r.content, 'html.parser')
    # NOTE(review): the loop body appears truncated in this chunk — the
    # handling of `dts` is not visible here.
    dts = parsed.find_all('a', class_='result__item')
Beispiel #11
0
class WikipediaText():
    """Scrape a Wikipedia 'year' page and publish its events to a scroll."""

    # Class-level defaults; instances assign their own values in __init__.
    # NOTE(review): `events = []` is a shared mutable class attribute —
    # harmless as long as methods keep using local lists, but assign it
    # per-instance if it ever gets appended to.
    year = None
    events = []
    subject = None
    parsed = None
    unscroll_client = None
    scroll = None

    def __init__(self, year=None, subject=None):
        """Fetch the page for *year* (optionally '<year> in <subject>')
        and create/retrieve the destination scroll."""
        self.year = year
        self.subject = subject
        self.wiki_url = 'https://en.wikipedia.org/wiki/{}'.format(year)
        if subject is not None:
            self.wiki_url = 'https://en.wikipedia.org/wiki/{}_in_{}'.format(
                year, subject)
        r = requests.get(self.wiki_url)
        self.parsed = BeautifulSoup(r.content, 'html.parser')
        self.unscroll_client = UnscrollClient()
        self.unscroll_client.login()

        favthumb = self.unscroll_client.cache_thumbnail(THUMBNAIL_URL)
        subject_title = subject
        if subject is None:
            subject_title = 'Review'
        self.scroll = self.unscroll_client.create_or_retrieve_scroll(
            'Wiki Years in {}'.format(subject_title),
            description='Events spidered from the English Wikipedia pages.',
            link='https://en.wikipedia.org/wiki/List_of_years',
            with_thumbnail=favthumb.get('url'))

    def tidy(self, txt=None):
        """Strip Wikipedia '[edit]' markers from *txt*."""
        # Raw string: '\[' / '\s' are invalid str escapes on modern Python.
        return re.sub(r'\[edit\]\s*', '', txt)

    def realday(self, monthname=None, day=None):
        """Return a datetime.date in self.year from a month name and day."""
        month = MONTHS_HASH[monthname]
        day = int(day)
        return date(self.year, month, day)

    def wikihtml_to_event(self, date=None, wikihtml=None, kind=None):
        """Convert one list item of event HTML into an event dict, post it
        to the scroll, and return the dict.

        date: the datetime.date the event happened.
        wikihtml: the BeautifulSoup node for the list item.
        kind: 'world event', 'birth', or 'death'; sets ranking and title.
        """
        # Drop the footnote marker (<sup>) before serializing the HTML.
        sup = wikihtml.find('sup')
        if sup is not None:
            _ = sup.extract()
        contents = [str(x) for x in wikihtml.children]
        joined = "".join(contents)
        # Absolutize wiki links and make them open in a new tab.
        linked = re.sub(r'/wiki/', 'http://en.wikipedia.org/wiki/', joined)
        targeted = re.sub(r'href=', 'target="_blank" href=', linked)

        bleached = bleach.clean(targeted,
                                tags=['b', 'i', 'strong', 'em'],
                                strip=True)
        # Two passes peel off stacked 'Month day -' style prefixes.
        pass1 = re.sub(MONTHS_PREFIX, '', bleached)
        pass2 = re.sub(MONTHS_PREFIX, '', pass1)
        # Raw string fixes invalid '\s'/'\d' escapes; strips 'NN -' prefixes.
        lastpass = re.sub(r'^\s*\d+\s*[-–—]\s*', '', pass2)

        titles = [
            x['title'] for x in wikihtml.find_all('a') if x.has_attr('title')
        ]
        # Month-name links are navigation, not event subjects.
        filtered = [x for x in titles if not MONTH_REGEX.match(x)]

        title = None
        subject = None

        if len(filtered) == 0:
            # No linked subject: synthesize a title from the first words.
            title = " ".join(bleached.split(" ")[0:4]) + '...'
        else:
            title = filtered[0]
            subject = title

        thumbnail = None

        if subject is not None:
            # Try to attach the subject's Wikipedia thumbnail.
            image_d = self.unscroll_client.fetch_wiki_thumbnail_data(
                title=subject)
            image_url = image_d.get('url') if image_d is not None else None
            if image_url is not None:
                thumbnail_local = self.unscroll_client.cache_local(image_url)
                thumbnail_d = self.unscroll_client.post_thumbnail(
                    thumbnail_local)
                if thumbnail_d is not None:
                    thumbnail = thumbnail_d['url']

        if kind == 'birth':
            lastpass = '******'.format(lastpass)

        elif kind == 'death':
            lastpass = '******'.format(lastpass)

        # World events outrank deaths, which outrank births.
        ranking = 0
        if kind == 'world event':
            ranking = 0.9
        if kind == 'birth':
            ranking = 0.1
        if kind == 'death':
            ranking = 0.5

        # End-of-day timestamp for the event date.
        dt = datetime.combine(date, datetime.max.time()).isoformat(' ')
        wiki_subject = None
        if subject is not None:
            subject = re.sub(r'\s', '_', subject)
            wiki_subject = 'https://en.wikipedia.org/wiki/{}'.format(subject, )
        event = {
            'title': lastpass,
            'text': None,
            'resolution': 10,
            'ranking': ranking,
            'when_happened': dt,
            'when_original': None,
            'with_thumbnail': thumbnail,
            'content_url': wiki_subject,
            'source_url': self.wiki_url,
            'source_name': 'Wikipedia Event Pages',
            'content_type': kind
        }
        e = self.unscroll_client.create_event(event, self.scroll)
        pprint.pprint(e.json())
        return event

    def descend(self, ul=None, kind=None):
        """Walk a <ul>'s descendants, posting one event per dated <li>.

        Returns the list of posted events, or None when nothing matched.
        """
        last_date = None
        events = []
        for d in ul:
            if d.name == 'ul':
                pass
            elif d.name == 'li':
                t = re.findall(MONTHS_DAYS, d.text)
                if len(t) > 0:
                    last_date = t[0]
                    # Items containing a nested <ul> are date containers,
                    # not events themselves.
                    if not (d.find('ul')):
                        date = self.realday(monthname=last_date[0],
                                            day=last_date[1])
                        e = self.wikihtml_to_event(date=date,
                                                   wikihtml=d,
                                                   kind=kind)
                        events.append(e)
                elif last_date is not None:
                    # Undated items inherit the most recent date seen.
                    date = self.realday(last_date[0], last_date[1])
                    e = self.wikihtml_to_event(date=date,
                                               wikihtml=d,
                                               kind=kind)
                    events.append(e)
        if len(events) > 0:
            return events

    def get_events(self):
        """Collect events from the page's Events/Births/Deaths sections."""
        event_types = {
            '#Events': 'world event',
            '#Births': 'birth',
            '#Deaths': 'death'
        }
        events = []
        for keytype in event_types:
            try:
                events_h2 = self.parsed.select(keytype)[0].parent
                for event in events_h2.next_siblings:
                    if event.name == "h2":
                        # Next top-level section: this one is done.
                        break
                    else:
                        if event.name == "h3":
                            pass
                        if event.name == 'ul':
                            es = self.descend(ul=event.descendants,
                                              kind=event_types[keytype])
                            if es is not None:
                                events += es
            except IndexError:
                # The page simply lacks this section.
                print('No {}'.format(keytype, ))
        return events
Beispiel #12
0

# Wikipedia timeline pages to spider into the 'WWII audio' scroll.
pages = [
    'https://en.wikipedia.org/wiki/Timeline_of_events_preceding_World_War_II',
    'https://en.wikipedia.org/wiki/Timeline_of_World_War_II_(1939)',
    'https://en.wikipedia.org/wiki/Timeline_of_World_War_II_(1940)',
    'https://en.wikipedia.org/wiki/Timeline_of_World_War_II_(1941)',
    'https://en.wikipedia.org/wiki/Timeline_of_World_War_II_(1942)',
    'https://en.wikipedia.org/wiki/Timeline_of_World_War_II_(1943)',
    'https://en.wikipedia.org/wiki/Timeline_of_World_War_II_(1944)',
    'https://en.wikipedia.org/wiki/Timeline_of_World_War_II_(1945)',
    'https://en.wikipedia.org/wiki/Timeline_of_World_War_II_(1945%E2%80%931991)',
    'https://en.wikipedia.org/wiki/Timeline_of_the_Manhattan_Project'
]

api = UnscrollClient()
scroll = api.create_or_retrieve_scroll('WWII audio')
for page in pages:
    items = extract_list('', page)

    for item in items:
        if item is not None:
            thumb_url = None
            # Try to resolve a Wikipedia thumbnail for the item's subject.
            wiki_thumb = api.fetch_wiki_thumbnail_data(item.get('item'))
            if wiki_thumb is not None:
                thumb = api.cache_thumbnail(wiki_thumb.get('url'))
                if thumb is not None:
                    thumb_url = thumb.get('url')

            content_url = item.get('content_url')
            # NOTE(review): chunk ends mid-statement — the body of this
            # conditional is outside the visible range.
            if content_url is None:
Beispiel #13
0
from unscroll import UnscrollClient

# Smoke-test the two thumbnail helpers on the client.
client = UnscrollClient()

cached = client.cache_thumbnail(
    'https://upload.wikimedia.org/wikipedia/commons/b/b2/Donnchadh_mac_Gille-Brighdhe_Seal.jpg'
)
print(cached)

wiki_thumb = client.fetch_wiki_thumbnail('George_Orwell')
print(wiki_thumb)
Beispiel #14
0
def save_met():
    """Rebuild 'The Met' scroll from the local SQLite dump of the collection.

    Reads every row from the cached met.db; for rows with a parseable date
    and a locally cached image, posts a thumbnail and creates an event.
    """
    c = UnscrollClient()
    c.login()
    c.delete_scroll_with_title('The Met')
    scroll = c.create_or_retrieve_scroll('The Met')
    # Removed: an unused `requests.Session()` and an unused `found` flag.

    conn = sqlite3.connect('/home/unscroll/cache/met.db')
    conn.row_factory = sqlite3.Row

    sqlc = conn.cursor()

    # LIMIT -1 means "no limit" in SQLite: iterate the whole table.
    sqlc.execute("SELECT * FROM collection LIMIT -1 OFFSET 0")

    for row in sqlc.fetchall():

        ud = UnscrollDate(row['date'], begin=-2000, end=2018)

        if ud.is_okay():
            with_thumbnail = None
            img = row['image']
            # Map the remote image URL onto the local cache filename.
            local_img = re.sub(r'https?://images.metmuseum.org/', '', img)

            medium = ''
            if 'medium' in row:
                medium = ' ({})'.format(row['medium'])

            if img is not None and row['date'] is not None:
                local = '/home/unscroll/cache/met-images/{}'.format(
                    local_img, )
                if file_exists(local):
                    thumb = c.post_thumbnail(local)
                    if thumb is not None:
                        with_thumbnail = thumb.get('url')

                        d = {
                            'title': row['title'] + medium,
                            'text': row['description'],
                            'resolution': ud.resolution,
                            'ranking': 0,
                            'content_url':
                            'https://www.metmuseum.org{}'.format(row['url'], ),
                            'with_thumbnail': with_thumbnail,
                            'source_name': 'The Met',
                            'source_url': 'https://www.metmuseum.org/',
                            'when_happened': ud.when_happened,
                            'when_original': ud.when_original
                        }
                        e = c.create_event(d, scroll)
                        print(e)

    # Release the SQLite handle (the original leaked it).
    conn.close()
Beispiel #15
0
def __main__():
    """Import Cooper-Hewitt objects from a local SQLite dump into a scroll.

    For each object row with an image and a date, download (or reuse) the
    cached 300x300 thumbnail, post it, and create an event on the scroll.
    """
    c = UnscrollClient()
    c.login()
    favthumb = c.cache_thumbnail(THUMBNAIL_IMAGE)
    scroll = c.create_or_retrieve_scroll(
        'Cooper-Hewitt',
        description='Items from the Cooper Hewitt',
        link='https://github.com/cooperhewitt/collection',
        citation='Cooper-Hewitt Museum Collection',
        with_thumbnail=favthumb.get('url'))

    conn = sqlite3.connect('/home/unscroll/cache/cooper/objects.db')
    conn.row_factory = sqlite3.Row

    sqlc = conn.cursor()

    i = 0
    # LIMIT -1 is SQLite's "no limit": scan the whole objects table.
    sqlc.execute("SELECT * FROM objects LIMIT -1 OFFSET {}".format(i))
    for row in sqlc.fetchall():

        if row['primary_image'] is not None and row['date'] is not None:
            # switch to the 300x300 thumbnail
            sq = re.sub('z\.jpg', 'sq.jpg', row['primary_image'])
            local_sq = re.sub(r'https?://', '', sq)
            local = '/home/unscroll/cache/cooper/{}'.format(local_sq, )

            i = i + 1
            found = False
            # EAFP: a missing cached file is the signal to download it.
            try:
                f = open(local, 'r')
                f.close()
                found = True
            except FileNotFoundError as e:
                try:
                    r = requests.get(sq)
                    p = pathlib.Path(local)
                    p.parent.mkdir(parents=True, exist_ok=True)
                    f = open(local, 'wb')
                    f.write(r.content)
                    f.close()
                    found = True
                except ConnectionError as e:
                    print('[cooperhewitt2.py] ConnectionError: {}'.format(e, ))

            print('{}: {}/{}'.format(i, local, found))

            ud = UnscrollDate(row['date'], begin=-4000, end=2018)
            if ud.is_okay():

                with_thumbnail = None
                if found:
                    thumb = c.post_thumbnail(local)
                    if thumb is not None:
                        with_thumbnail = thumb.get('url')

                d = {
                    'title':
                    row['title'],
                    'text':
                    row['description'],
                    'resolution':
                    ud.resolution,
                    'ranking':
                    0,
                    'content_url':
                    'https://collection.cooperhewitt.org/objects/{}/'.format(
                        row['id'], ),
                    'with_thumbnail':
                    with_thumbnail,
                    'source_name':
                    'Collection Data for Cooper Hewitt, Smithsonian Design Museum',
                    'source_url':
                    'https://github.com/cooperhewitt/collection',
                    'when_happened':
                    ud.when_happened,
                    'when_original':
                    ud.when_original
                }
                # NOTE(review): chunk ends here; any code after create_event
                # is outside this view.
                e = c.create_event(d, scroll)