Ejemplo n.º 1
0
class WikipediaText():
    year = None
    events = []
    subject = None
    parsed = None
    unscroll_client = None
    scroll = None

    def __init__(self, year=None, subject=None):
        self.year = year
        self.subject = subject
        self.wiki_url = 'https://en.wikipedia.org/wiki/{}'.format(year)
        if subject is not None:
            self.wiki_url = 'https://en.wikipedia.org/wiki/{}_in_{}'.format(
                year, subject)
        r = requests.get(self.wiki_url)
        self.parsed = BeautifulSoup(r.content, 'html.parser')
        self.unscroll_client = UnscrollClient()
        self.unscroll_client.login()

        favthumb = self.unscroll_client.cache_thumbnail(THUMBNAIL_URL)
        subject_title = subject
        if subject is None:
            subject_title = 'Review'
        self.scroll = self.unscroll_client.create_or_retrieve_scroll(
            'Wiki Years in {}'.format(subject_title),
            description='Events spidered from the English Wikipedia pages.',
            link='https://en.wikipedia.org/wiki/List_of_years',
            with_thumbnail=favthumb.get('url'))

    def tidy(self, txt=None):
        return re.sub('\[edit\]\s*', '', txt)

    def realday(self, monthname=None, day=None):
        month = MONTHS_HASH[monthname]
        day = int(day)
        return date(self.year, month, day)

    def wikihtml_to_event(self, date=None, wikihtml=None, kind=None):
        sup = wikihtml.find('sup')
        if sup is not None:
            _ = sup.extract()
        contents = [str(x) for x in wikihtml.children]
        joined = "".join(contents)
        linked = re.sub(r'/wiki/', 'http://en.wikipedia.org/wiki/', joined)
        targeted = re.sub(r'href=', 'target="_blank" href=', linked)

        bleached = bleach.clean(targeted,
                                tags=['b', 'i', 'strong', 'em'],
                                strip=True)
        pass1 = re.sub(MONTHS_PREFIX, '', bleached)
        pass2 = re.sub(MONTHS_PREFIX, '', pass1)
        lastpass = re.sub('^\s*\d+\s*[-–—]\s*', '', pass2)

        titles = [
            x['title'] for x in wikihtml.find_all('a') if x.has_attr('title')
        ]
        filtered = [x for x in titles if not MONTH_REGEX.match(x)]

        title = None
        subject = None

        if len(filtered) == 0:
            title = " ".join(bleached.split(" ")[0:4]) + '...'
        else:
            title = filtered[0]
            subject = title

        thumbnail = None

        if subject is not None:
            image_d = self.unscroll_client.fetch_wiki_thumbnail_data(
                title=subject)
            image_url = image_d.get('url') if image_d is not None else None
            if image_url is not None:
                thumbnail_local = self.unscroll_client.cache_local(image_url)
                thumbnail_d = self.unscroll_client.post_thumbnail(
                    thumbnail_local)
                if thumbnail_d is not None:
                    thumbnail = thumbnail_d['url']

        if kind == 'birth':
            lastpass = '******'.format(lastpass)

        elif kind == 'death':
            lastpass = '******'.format(lastpass)

        ranking = 0
        if kind == 'world event':
            ranking = 0.9
        if kind == 'birth':
            ranking = 0.1
        if kind == 'death':
            ranking = 0.5

        dt = datetime.combine(date, datetime.max.time()).isoformat(' ')
        wiki_subject = None
        if subject is not None:
            subject = re.sub(r'\s', '_', subject)
            wiki_subject = 'https://en.wikipedia.org/wiki/{}'.format(subject, )
        event = {
            'title': lastpass,
            'text': None,
            'resolution': 10,
            'ranking': ranking,
            'when_happened': dt,
            'when_original': None,
            'with_thumbnail': thumbnail,
            'content_url': wiki_subject,
            'source_url': self.wiki_url,
            'source_name': 'Wikipedia Event Pages',
            'content_type': kind
        }
        e = self.unscroll_client.create_event(event, self.scroll)
        pprint.pprint(e.json())
        return event

    def descend(self, ul=None, kind=None):
        last_date = None
        events = []
        for d in ul:
            if d.name == 'ul':
                pass
            elif d.name == 'li':
                t = re.findall(MONTHS_DAYS, d.text)
                if len(t) > 0:
                    last_date = t[0]
                    if not (d.find('ul')):
                        date = self.realday(monthname=last_date[0],
                                            day=last_date[1])
                        e = self.wikihtml_to_event(date=date,
                                                   wikihtml=d,
                                                   kind=kind)
                        # print("A: {}\n".format(e.get('title')))
                        events.append(e)
                elif last_date is not None:
                    date = self.realday(last_date[0], last_date[1])
                    e = self.wikihtml_to_event(date=date,
                                               wikihtml=d,
                                               kind=kind)
                    # print("B: {}\n".format(e.get('title')))
                    events.append(e)
        if len(events) > 0:
            return events

    def get_events(self):
        event_types = {
            '#Events': 'world event',
            '#Births': 'birth',
            '#Deaths': 'death'
        }
        events = []
        for keytype in event_types:
            try:
                events_h2 = self.parsed.select(keytype)[0].parent
                for event in events_h2.next_siblings:
                    if event.name == "h2":
                        break
                    else:
                        if event.name == "h3":
                            pass
                        if event.name == 'ul':
                            es = self.descend(ul=event.descendants,
                                              kind=event_types[keytype])
                            if es is not None:
                                events += es
            except IndexError:
                print('No {}'.format(keytype, ))
        return events
Ejemplo n.º 2
0
    'https://en.wikipedia.org/wiki/Timeline_of_World_War_II_(1943)',
    'https://en.wikipedia.org/wiki/Timeline_of_World_War_II_(1944)',
    'https://en.wikipedia.org/wiki/Timeline_of_World_War_II_(1945)',
    'https://en.wikipedia.org/wiki/Timeline_of_World_War_II_(1945%E2%80%931991)',
    'https://en.wikipedia.org/wiki/Timeline_of_the_Manhattan_Project'
]

api = UnscrollClient()
scroll = api.create_or_retrieve_scroll('WWII audio')
for page in pages:
    items = extract_list('', page)

    for item in items:
        if item is not None:
            thumb_url = None
            wiki_thumb = api.fetch_wiki_thumbnail_data(item.get('item'))
            if wiki_thumb is not None:
                thumb = api.cache_thumbnail(wiki_thumb.get('url'))
                if thumb is not None:
                    thumb_url = thumb.get('url')

            content_url = item.get('content_url')
            if content_url is None:
                content_url = page

            event = {
                'title': item.get('title'),
                'text': item.get('text'),
                'content_url': content_url,
                'source_url': page,
                'with_thumbnail': thumb_url,