Beispiel #1
0
def load_data(begin=None,
              end=None,
              title=None,
              slug=None,
              thumbnail_url=None,
              delete=False):

    # Get the file listing
    _link = 'https://archive.org/details/{}'.format(slug)
    _r = requests.get('https://archive.org/metadata/{}'.format(slug))
    _data = _r.json()

    # Get metadata
    _md = _data.get('metadata')

    _title = title
    if (title is None):
        _title = _md.get('title')

    _description = _md.get('description')

    _events = extract_events(_data, 'MP3', _link, begin, end)

    _thumbnail_url = thumbnail_url
    if (thumbnail_url is None):
        _thumbnail_url = extract_thumbnail(_data)

    api = UnscrollClient()
    _thumb = api.cache_thumbnail(_thumbnail_url)
    _with_thumbnail = _thumb.get('url')

    if delete is True:
        api.delete_scroll_with_title(_title)

    print('XXXXXXX{}'.format(_title))
    scroll = api.create_or_retrieve_scroll(
        _title,
        subtitle='via Archive.org',
        public=True,
        description=_description,
        link=_link,
        citation='',
        with_thumbnail=_with_thumbnail,
    )

    for event in _events:
        pprint.pprint(event)
        j = api.create_event(event, scroll)
        pprint.pprint(j.json())
Beispiel #2
0
class WikipediaText():
    year = None
    events = []
    subject = None
    parsed = None
    unscroll_client = None
    scroll = None

    def __init__(self, year=None, subject=None):
        self.year = year
        self.subject = subject
        self.wiki_url = 'https://en.wikipedia.org/wiki/{}'.format(year)
        if subject is not None:
            self.wiki_url = 'https://en.wikipedia.org/wiki/{}_in_{}'.format(
                year, subject)
        r = requests.get(self.wiki_url)
        self.parsed = BeautifulSoup(r.content, 'html.parser')
        self.unscroll_client = UnscrollClient()
        self.unscroll_client.login()

        favthumb = self.unscroll_client.cache_thumbnail(THUMBNAIL_URL)
        subject_title = subject
        if subject is None:
            subject_title = 'Review'
        self.scroll = self.unscroll_client.create_or_retrieve_scroll(
            'Wiki Years in {}'.format(subject_title),
            description='Events spidered from the English Wikipedia pages.',
            link='https://en.wikipedia.org/wiki/List_of_years',
            with_thumbnail=favthumb.get('url'))

    def tidy(self, txt=None):
        return re.sub('\[edit\]\s*', '', txt)

    def realday(self, monthname=None, day=None):
        month = MONTHS_HASH[monthname]
        day = int(day)
        return date(self.year, month, day)

    def wikihtml_to_event(self, date=None, wikihtml=None, kind=None):
        sup = wikihtml.find('sup')
        if sup is not None:
            _ = sup.extract()
        contents = [str(x) for x in wikihtml.children]
        joined = "".join(contents)
        linked = re.sub(r'/wiki/', 'http://en.wikipedia.org/wiki/', joined)
        targeted = re.sub(r'href=', 'target="_blank" href=', linked)

        bleached = bleach.clean(targeted,
                                tags=['b', 'i', 'strong', 'em'],
                                strip=True)
        pass1 = re.sub(MONTHS_PREFIX, '', bleached)
        pass2 = re.sub(MONTHS_PREFIX, '', pass1)
        lastpass = re.sub('^\s*\d+\s*[-–—]\s*', '', pass2)

        titles = [
            x['title'] for x in wikihtml.find_all('a') if x.has_attr('title')
        ]
        filtered = [x for x in titles if not MONTH_REGEX.match(x)]

        title = None
        subject = None

        if len(filtered) == 0:
            title = " ".join(bleached.split(" ")[0:4]) + '...'
        else:
            title = filtered[0]
            subject = title

        thumbnail = None

        if subject is not None:
            image_d = self.unscroll_client.fetch_wiki_thumbnail_data(
                title=subject)
            image_url = image_d.get('url') if image_d is not None else None
            if image_url is not None:
                thumbnail_local = self.unscroll_client.cache_local(image_url)
                thumbnail_d = self.unscroll_client.post_thumbnail(
                    thumbnail_local)
                if thumbnail_d is not None:
                    thumbnail = thumbnail_d['url']

        if kind == 'birth':
            lastpass = '******'.format(lastpass)

        elif kind == 'death':
            lastpass = '******'.format(lastpass)

        ranking = 0
        if kind == 'world event':
            ranking = 0.9
        if kind == 'birth':
            ranking = 0.1
        if kind == 'death':
            ranking = 0.5

        dt = datetime.combine(date, datetime.max.time()).isoformat(' ')
        wiki_subject = None
        if subject is not None:
            subject = re.sub(r'\s', '_', subject)
            wiki_subject = 'https://en.wikipedia.org/wiki/{}'.format(subject, )
        event = {
            'title': lastpass,
            'text': None,
            'resolution': 10,
            'ranking': ranking,
            'when_happened': dt,
            'when_original': None,
            'with_thumbnail': thumbnail,
            'content_url': wiki_subject,
            'source_url': self.wiki_url,
            'source_name': 'Wikipedia Event Pages',
            'content_type': kind
        }
        e = self.unscroll_client.create_event(event, self.scroll)
        pprint.pprint(e.json())
        return event

    def descend(self, ul=None, kind=None):
        last_date = None
        events = []
        for d in ul:
            if d.name == 'ul':
                pass
            elif d.name == 'li':
                t = re.findall(MONTHS_DAYS, d.text)
                if len(t) > 0:
                    last_date = t[0]
                    if not (d.find('ul')):
                        date = self.realday(monthname=last_date[0],
                                            day=last_date[1])
                        e = self.wikihtml_to_event(date=date,
                                                   wikihtml=d,
                                                   kind=kind)
                        # print("A: {}\n".format(e.get('title')))
                        events.append(e)
                elif last_date is not None:
                    date = self.realday(last_date[0], last_date[1])
                    e = self.wikihtml_to_event(date=date,
                                               wikihtml=d,
                                               kind=kind)
                    # print("B: {}\n".format(e.get('title')))
                    events.append(e)
        if len(events) > 0:
            return events

    def get_events(self):
        event_types = {
            '#Events': 'world event',
            '#Births': 'birth',
            '#Deaths': 'death'
        }
        events = []
        for keytype in event_types:
            try:
                events_h2 = self.parsed.select(keytype)[0].parent
                for event in events_h2.next_siblings:
                    if event.name == "h2":
                        break
                    else:
                        if event.name == "h3":
                            pass
                        if event.name == 'ul':
                            es = self.descend(ul=event.descendants,
                                              kind=event_types[keytype])
                            if es is not None:
                                events += es
            except IndexError:
                print('No {}'.format(keytype, ))
        return events
Beispiel #3
0
def save_met():

    c = UnscrollClient()
    c.login()
    c.delete_scroll_with_title('The Met')
    scroll = c.create_or_retrieve_scroll('The Met')
    s = requests.Session()

    conn = sqlite3.connect('/home/unscroll/cache/met.db')
    conn.row_factory = sqlite3.Row

    sqlc = conn.cursor()

    sqlc.execute("SELECT * FROM collection LIMIT -1 OFFSET 0")

    for row in sqlc.fetchall():

        ud = UnscrollDate(row['date'], begin=-2000, end=2018)

        if ud.is_okay():
            with_thumbnail = None
            found = False
            img = row['image']
            local_img = re.sub(r'https?://images.metmuseum.org/', '', img)

            medium = ''
            if 'medium' in row:
                medium = ' ({})'.format(row['medium'])

            if img is not None and row['date'] is not None:
                local = '/home/unscroll/cache/met-images/{}'.format(
                    local_img, )
                if file_exists(local):
                    thumb = c.post_thumbnail(local)
                    if thumb is not None:
                        with_thumbnail = thumb.get('url')

                        d = {
                            'title':
                            row['title'] + medium,
                            'text':
                            row['description'],
                            'resolution':
                            ud.resolution,
                            'ranking':
                            0,
                            'content_url':
                            'https://www.metmuseum.org{}'.format(row['url'], ),
                            'with_thumbnail':
                            with_thumbnail,
                            'source_name':
                            'The Met',
                            'source_url':
                            'https://www.metmuseum.org/',
                            'when_happened':
                            ud.when_happened,
                            'when_original':
                            ud.when_original
                        }
                        e = c.create_event(d, scroll)
                        print(e)
Beispiel #4
0
scroll = api.create_or_retrieve_scroll('WWII audio')
for page in pages:
    items = extract_list('', page)

    for item in items:
        if item is not None:
            thumb_url = None
            wiki_thumb = api.fetch_wiki_thumbnail_data(item.get('item'))
            if wiki_thumb is not None:
                thumb = api.cache_thumbnail(wiki_thumb.get('url'))
                if thumb is not None:
                    thumb_url = thumb.get('url')

            content_url = item.get('content_url')
            if content_url is None:
                content_url = page

            event = {
                'title': item.get('title'),
                'text': item.get('text'),
                'content_url': content_url,
                'source_url': page,
                'with_thumbnail': thumb_url,
                'when_happened': item.get('when_happened'),
                'when_original': item.get('when_original'),
                'resolution': item.get('resolution'),
            }
            res = api.create_event(event, scroll)
            if res.status_code != 200:
                print(res.json())
Beispiel #5
0
def __main__():

    c = UnscrollClient()
    c.login()
    favthumb = c.cache_thumbnail(THUMBNAIL_IMAGE)
    scroll = c.create_or_retrieve_scroll(
        'Cooper-Hewitt',
        description='Items from the Cooper Hewitt',
        link='https://github.com/cooperhewitt/collection',
        citation='Cooper-Hewitt Museum Collection',
        with_thumbnail=favthumb.get('url'))

    conn = sqlite3.connect('/home/unscroll/cache/cooper/objects.db')
    conn.row_factory = sqlite3.Row

    sqlc = conn.cursor()

    i = 0
    sqlc.execute("SELECT * FROM objects LIMIT -1 OFFSET {}".format(i))
    for row in sqlc.fetchall():

        if row['primary_image'] is not None and row['date'] is not None:
            # switch to the 300x300 thumbnail
            sq = re.sub('z\.jpg', 'sq.jpg', row['primary_image'])
            local_sq = re.sub(r'https?://', '', sq)
            local = '/home/unscroll/cache/cooper/{}'.format(local_sq, )

            i = i + 1
            found = False
            try:
                f = open(local, 'r')
                f.close()
                found = True
            except FileNotFoundError as e:
                try:
                    r = requests.get(sq)
                    p = pathlib.Path(local)
                    p.parent.mkdir(parents=True, exist_ok=True)
                    f = open(local, 'wb')
                    f.write(r.content)
                    f.close()
                    found = True
                except ConnectionError as e:
                    print('[cooperhewitt2.py] ConnectionError: {}'.format(e, ))

            print('{}: {}/{}'.format(i, local, found))

            ud = UnscrollDate(row['date'], begin=-4000, end=2018)
            if ud.is_okay():

                with_thumbnail = None
                if found:
                    thumb = c.post_thumbnail(local)
                    if thumb is not None:
                        with_thumbnail = thumb.get('url')

                d = {
                    'title':
                    row['title'],
                    'text':
                    row['description'],
                    'resolution':
                    ud.resolution,
                    'ranking':
                    0,
                    'content_url':
                    'https://collection.cooperhewitt.org/objects/{}/'.format(
                        row['id'], ),
                    'with_thumbnail':
                    with_thumbnail,
                    'source_name':
                    'Collection Data for Cooper Hewitt, Smithsonian Design Museum',
                    'source_url':
                    'https://github.com/cooperhewitt/collection',
                    'when_happened':
                    ud.when_happened,
                    'when_original':
                    ud.when_original
                }
                e = c.create_event(d, scroll)