Python soup_me Exemples, CLIppy.soup_me Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : scrapers.py Projet : meereeum/cinematic

def get_movies_momi(theater, date):
    """Get movie names and times from Museum of the Moving Image's website

    :theater: str
    :date: str (yyyy-mm-dd) (default: today)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'http://www.movingimage.us/visit/calendar/{}/day/type/1'

    soup = soup_me(BASE_URL.format(date.replace('-', '/')))

    PATTERN = re.compile('calendar/{}'.format(date.replace('-', '/')))
    movies = soup('a', href=PATTERN)

    movie_names = [
        m.find('span', class_=re.compile("^color")).text for m in movies
    ]

    movie_datetimes = [[
        DATETIME_SEP.join((date, (m.em.text.split(' | ')[0].replace('.', ''))))
    ] for m in movies]
    movie_times = filter_past(movie_datetimes)
    movie_names, movie_times = combine_times(
        *filter_movies(movie_names, movie_times))

    return movie_names, movie_times

Exemple #2

0

Afficher le fichier

def get_movies_quad(theater, date):
    """Get movie names and times from Quad's website

    :theater: str
    :date: str (yyyy-mm-dd) (default: today)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'https://quadcinema.com/all/'

    soup = soup_me(BASE_URL)

    day, = [
        d for d in soup('div', class_='now-single-day')
        if convert_date(d.h1.text) == date
    ]

    movie_names = [movie.text for movie in day('h4')]
    movie_datetimes = [[
        '{} @ {}'.format(date, time.text.replace('.', ':'))
        for time in movie('li')
    ] for movie in day('div', class_='single-listing')]

    movie_times = filter_past(movie_datetimes)
    movie_names, movie_times = combine_times(
        *filter_movies(movie_names, movie_times))

    return movie_names, movie_times

Exemple #3

0

Afficher le fichier

def get_beerpages_en_masse(query):
    """Get beer pages via google search

    :returns: {site: beer_url} for subset of sites found
    """

    D_SITES = dict(untappd='https://untappd.com/b/',
                   ratebeer='https://www.ratebeer.com/beer/',
                   beeradvocate='https://www.beeradvocate.com/beer/')

    BASE_URL = 'https://www.google.com/search'
    PARAMS = {'q': safe_encode(query)}

    soup = soup_me(BASE_URL, PARAMS)

    extract_url = lambda site: re.sub(
        '\/url\?q=([^&]*)&.*', '\\1',
        soup.find('a', href=re.compile('^/url\?q={}'.format(site)))['href'])

    D_URLS = {}
    for k, v in D_SITES.items():
        try:
            D_URLS[k] = unquote(extract_url(v))  # fix % encoding
        except (TypeError):  # not found
            pass

    # return {k: extract_url(v) for k,v in D_SITES.items()}
    return D_URLS

Exemple #4

0

Afficher le fichier

def get_movies_cinema_village(theater, date):
    """Get movie names and times from Cinema Village's website

    :theater: str
    :date: str (yyyy-mm-dd) (default: today)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'https://www.cinemavillage.com/showtimes/'

    soup = soup_me(BASE_URL)

    days = [
        day.contents[-1].strip().replace('.', '-')
        for day in soup('a', {'data-toggle': 'tab'})
    ]
    iday = index_into_days(days, date=date)

    day = soup.find('div', id='tab_default_{}'.format(iday))

    movie_names = [movie.text for movie in day('a')]
    movie_datetimes = [[
        '{} @ {}'.format(date, time.text) for time in times('span')
    ] for times in day('div', class_='sel-time')]

    movie_times = filter_past(movie_datetimes)
    movie_names, movie_times = combine_times(
        *filter_movies(movie_names, movie_times))

    return movie_names, movie_times

Exemple #5

0

Afficher le fichier

def get_movies_videology(theater, date):
    """Get movie names and times from Videology website

    :theater: str
    :date: str (yyyy-mm-dd) (default: today)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'https://videologybarandcinema.com/events/{}'

    soup = soup_me(BASE_URL.format(date))

    movie_names = [
        movie_div.a['title']
        for movie_div in soup('h2',
                              class_='tribe-events-list-event-title summary')
    ]

    # get times filtered by past
    movie_datetimes = [  # date @ time
        time_div.span.contents[0]
        for time_div in soup('div',
                             class_='tribe-updated published time-details')
    ]
    movie_times = filter_past(movie_datetimes)

    # filter movies with no future times
    movie_names, movie_times = filter_movies(movie_names, movie_times)

    return movie_names, movie_times

Exemple #6

0

Afficher le fichier

def get_movies_coolidge(theater, date):
    """Get movie names and times from Coolidge Corner's website

    :theater: str
    :date: str (yyyy-mm-dd) (default: today)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'https://coolidge.org/showtimes'
    PARAMS = {'date': date}

    soup = soup_me(BASE_URL, PARAMS)

    movies = soup('div', class_='film-card')

    movie_names = [m.h2.text for m in movies]

    movie_datetimes = [[
        '{} @ {}'.format(date, time.text)
        for time in m('span', class_='showtime-ticket__time')
    ] for m in movies]

    movie_times = filter_past(movie_datetimes)
    movie_names, movie_times = combine_times(
        *filter_movies(movie_names, movie_times))

    return movie_names, movie_times

Exemple #7

0

Afficher le fichier

def get_movies_film_noir(theater, date):
    """Get movie names and times from Film Noir website

    :theater: str
    :date: str (yyyy-mm-dd) (default: today)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'https://www.filmnoircinema.com/program'

    soup = soup_me(BASE_URL)

    date = dparser.parse(date)
    movie_divs = soup('a',
                      class_='eventlist-title-link',
                      href=re.compile('/program/{}/{}/{}/'.format(
                          date.year, date.month, date.day)))  # no zero-padding
    movie_names = [movie_div.text for movie_div in movie_divs]

    # get times filtered by past
    movie_datetimes = list(
        chain.from_iterable(([
            ' @ '.join((time_div['datetime'], time_div.text)) for time_div in
            movie_div.next.next.next('time', class_='event-time-12hr-start')
        ] for movie_div in movie_divs)))

    movie_times = filter_past(movie_datetimes)

    # filter movies with no future times
    # & combine times for same movie
    movie_names, movie_times = combine_times(
        *filter_movies(movie_names, movie_times))

    return movie_names, movie_times

Exemple #8

0

Afficher le fichier

def get_movies_ifc(theater, date):
    """Get movie names and times from IFC's website

    :theater: str
    :date: str (yyyy-mm-dd) (default: today)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'http://www.ifccenter.com/'

    soup = soup_me(BASE_URL)

    day, = [
        day for day in soup('div', class_=re.compile('^daily-schedule'))
        if day.h3.text != 'Coming Soon' and convert_date(day.h3.text) == date
    ]

    movie_divs = day('div')

    movie_names = [mdiv.h3.text for mdiv in movie_divs]
    movie_datetimes = [[
        '{} @ {}'.format(date, time.text) for time in mdiv('li')
    ] for mdiv in movie_divs]

    movie_times = filter_past(movie_datetimes)
    movie_names, movie_times = combine_times(
        *filter_movies(movie_names, movie_times))

    return movie_names, movie_times

Exemple #9

0

Afficher le fichier

def get_movies_somerville(theater, date):
    """Get movie names and times from Somerville Theater's website

    :theater: str
    :date: str (yyyy-mm-dd) (default: today)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'https://somervilletheatre.com/wp-content/themes/somerville/showtimes.xml'

    soup = soup_me(BASE_URL)

    movies = soup('filmtitle')

    movie_names = [m.shortname.text
                   for m in movies]  # /or/ m.find('name').text

    convert = lambda date: date[-4:] + date[:-4]  # mmddyyyy -> yyyymmdd

    movie_datetimes = [
        [
            (
                dparser.parse(' '.join(
                    (convert(d.text), t.text)))  # yyyymmdd hhmm ->
                .strftime('%Y-%m-%d @ %l:%M%P'))  # yyyy-mm-dd @ hh:mm {a,p}m
            for d, t in zip(m('date'), m('time'))
            if d.text == convert_date(date, fmt_out='%m%d%Y')
        ] for m in movies
    ]

    movie_times = filter_past(movie_datetimes)
    movie_names, movie_times = combine_times(
        *filter_movies(movie_names, movie_times))

    return movie_names, movie_times

Exemple #10

0

Afficher le fichier

def get_movies_hfa(theater, date):
    """Get movie names and times from Harvard Film Archive's website

    :theater: str
    :date: str (yyyy-mm-dd) (default: today)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'https://harvardfilmarchive.org'

    soup = soup_me(BASE_URL)

    try:
        day, = [
            d for d in soup('div', class_='grid m-calendar__row')
            if d.time.attrs['datetime'] == date
        ]
    except (ValueError):  # no matching days
        return [], []

    movie_names = [m.text.strip() for m in day('h5')]

    movie_datetimes = [
        '{} @ {}'.format(date, time.text)
        for time in day('div', class_='event__time')
    ]

    movie_times = filter_past(movie_datetimes)
    movie_names, movie_times = combine_times(
        *filter_movies(movie_names, movie_times))

    return movie_names, movie_times

Exemple #11

0

Afficher le fichier

def get_movies_nitehawk(theater, date):
    """Get movie names and times from Nitehawk's website

    :theater: str
    :date: str (yyyy-mm-dd) (default: today)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'https://nitehawkcinema.com/{}/{}'

    D_THEATERS = {
        'nitehawk': 'williamsburg',
        'nitehawk prospect park': 'prospectpark'
    }

    soup = soup_me(BASE_URL.format(D_THEATERS[theater.lower()], date))

    movie_names = [movie.text for movie in soup('div', class_='show-title')]

    movie_datetimes = [
        [
            '{} @ {}'.format(date, clean_datetime(
                t.text.strip()))  # ignore any junk after {a,p}m
            for t in times('a', class_='showtime')
        ] for times in soup('div', class_='showtimes-container clearfix')
    ]

    movie_times = filter_past(movie_datetimes)
    movie_names, movie_times = combine_times(
        *filter_movies(movie_names, movie_times))

    return movie_names, movie_times

Exemple #12

0

Afficher le fichier

Fichier : scrapers.py Projet : meereeum/cinematic

def get_movies_metrograph(theater, date):
    """Get movie names and times from Metrograph website

    :theater: str
    :date: str (yyyy-mm-dd) (default: today)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'http://metrograph.com/film'
    PARAMS = {'d': date}

    soup = soup_me(BASE_URL, PARAMS)

    movie_names = [
        movie_div.a.contents[0] for movie_div in soup('h4', class_='title')
    ]
    movie_times = [[time.contents[0] for time in time_div('a')]
                   for time_div in soup('div', class_='showtimes')]
    movie_formats = [
        specs.text.split(' / ')[-1] for specs in soup('span', class_='specs')
    ]

    # annotate with format
    movie_times = [(times if fmt == 'DCP' or not times or not fmt else times +
                    [f'[ {fmt} ]'])
                   for times, fmt in zip(movie_times, movie_formats)]

    # filter movies with no future times
    movie_names, movie_times = filter_movies(movie_names, movie_times)

    return movie_names, movie_times

Exemple #13

0

Afficher le fichier

def get_movies_village_east_or_angelika(theater, date):
    """Get movie names and times from Village East Cinema or Angelika Film Center's website

    :theater: str
    :date: str (yyyy-mm-dd) (default: today)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'https://www.{}/showtimes-and-tickets/now-playing/{}'

    D_THEATERS = {
        'village east cinema': 'citycinemas.com/villageeast',
        'angelika film center': 'angelikafilmcenter.com/nyc'
    }

    soup = soup_me(BASE_URL.format(D_THEATERS[theater.lower()], date))

    movie_names = [movie.text for movie in soup('h4', class_='name')]

    movie_datetimes = [[
        '{} @ {}'.format(date, time.attrs['value'])
        for time in times('input', class_='showtime reserved-seating')
    ] for times in soup('div', class_="showtimes-wrapper")]

    movie_times = filter_past(movie_datetimes)
    movie_names, movie_times = combine_times(
        *filter_movies(movie_names, movie_times))

    return movie_names, movie_times

Exemple #14

0

Afficher le fichier

Fichier : scrapers.py Projet : meereeum/cinematic

def get_movies_mfa(theater, date):
    """Get movie names and times from Museum of Fine Arts' website

    :theater: str
    :date: str (yyyy-mm-dd) (default: today)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'https://www.mfa.org/programs/film'

    PARAMS = {'field_date_value_1': date}

    soup = soup_me(BASE_URL, PARAMS)

    relevant_movies = [
        div for div in soup('div', class_='col-sm-8')
        if div.span and convert_date(div.span.contents[0]) == date
    ]
    movie_names = [m.a.text for m in relevant_movies]

    def convert(contentlst):
        date, _, timestr = contentlst
        start, end = timestr.split('–')
        return DATETIME_SEP.join((convert_date(date), start))

    movie_datetimes = [convert(m.span.contents) for m in relevant_movies]

    movie_times = filter_past(movie_datetimes)
    movie_names, movie_times = combine_times(
        *filter_movies(movie_names, movie_times))

    return movie_names, movie_times

Exemple #15

0

Afficher le fichier

Fichier : scrapers.py Projet : meereeum/cinematic

def get_movies_village_east_or_angelika(theater, date):
    """Get movie names and times from Village East Cinema or Angelika Film Center's website

    :theater: str
    :date: str (yyyy-mm-dd) (default: today)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'https://www.{}/showtimes-and-tickets/now-playing/{}'

    D_THEATERS = {
        'village east cinema': 'citycinemas.com/villageeast',
        'angelika film center': 'angelikafilmcenter.com/nyc'
    }

    soup = soup_me(BASE_URL.format(D_THEATERS[theater.lower()], date))

    movie_names = [movie.text for movie in soup('h4', class_='name')]

    movie_statuses = [
        first((cls for cls in d['class'] if cls.startswith('status')))
        for d in soup('div', class_=re.compile('^status'))
    ]

    assert len(movie_names) == len(
        movie_statuses), f'{len(movie_names)} != {len(movie_statuses)}'

    # filter for currently playing only
    movie_names = [
        m for m, status in zip(movie_names, movie_statuses)
        if not status.endswith('coming_soon')
    ]

    if not movie_names:
        return [], []

    movie_datetimes = [[
        DATETIME_SEP.join((date, time['value']))
        for time in times('input', class_='showtime reserved-seating')
    ] for times in soup('div', class_="showtimes-wrapper")]

    movie_times = filter_past(movie_datetimes)

    # extract format from name, if any
    PATTERN = re.compile('in ((35|70)mm)$', re.I)

    def extract_fmt(m):
        m, *fmt = re.split(PATTERN, m)[:2]  # only name and (35|70)mm, if any
        return m, ''.join(fmt).lower()  # (cleaned) movie name, movie fmt

    movie_names, movie_formats = zip(*(extract_fmt(m) for m in movie_names))

    # annotate with format
    movie_times = [(times if not times or not fmt else times + [f'[ {fmt} ]'])
                   for times, fmt in zip(movie_times, movie_formats)]

    movie_names, movie_times = combine_times(
        *filter_movies(movie_names, movie_times))

    return movie_names, movie_times

Exemple #16

0

Afficher le fichier

Fichier : scrapers.py Projet : meereeum/cinematic

def get_movies_amc(theater, date):
    """Get movie names and times from AMC's website

    :theater: str
    :date: str (yyyy-mm-dd) (default: today)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'https://www.amctheatres.com/movie-theatres/{}/{}/showtimes/all/{}/{}/all'

    D_THEATERS = {
        'amc boston common': ('boston', 'amc-boston-common-19'),
        'the waterfront': ('pittsburgh', 'amc-waterfront-22')
    }
    theaterplace, theatername = D_THEATERS[theater.lower()]

    soup = soup_me(
        BASE_URL.format(theaterplace, theatername, date, theatername))

    movies = soup('div', class_='ShowtimesByTheatre-film')

    movie_names = [m.h2.text for m in movies]  #soup('h2')]

    movie_datetimes = [
        [
            [
                DATETIME_SEP.join((date, clean_time(time.text)))
                for time in times('div', class_='Showtime')
                if not time.find('div', {
                    'aria-hidden': "true"
                }).text == 'Sold Out'
            ]
            # TODO print sold-out times as xed-out ?
            for times in
            m('div', class_=re.compile('^Showtimes-Section Showtimes-Section'))
        ] for m in movies
    ]

    # flatten timelists for movies with multiple formats
    # TODO sometimes lists separate times for same format -- combine ?
    n_timelists_per_movie = [len(timelsts) for timelsts in movie_datetimes]
    movie_names = list(
        chain.from_iterable(
            [name] * n for name, n in zip(movie_names, n_timelists_per_movie)))
    movie_datetimes = flatten(movie_datetimes)

    movie_times = filter_past(movie_datetimes)

    # annotate with format
    movie_formats = [[fmt.text for fmt in m('h4')] for m in movies]
    movie_times = [
        (times if fmt == 'Digital' or not times else times + [f'[ {fmt} ]'])
        for times, fmt in zip(movie_times, flatten(movie_formats))
    ]

    # movie_names, movie_times = combine_times(*filter_movies(movie_names, movie_times)) # TODO combine does not know formats
    movie_names, movie_times = filter_movies(movie_names, movie_times)

    return movie_names, movie_times

Exemple #17

0

Afficher le fichier

def get_beers(bar_url):
    """
    bar_url -> list of beernames, num on tap (listed 1st)
    """
    soup = soup_me(bar_url, from_headless=True)

    PATTERN = re.compile('^/beers/')
    beers = [
        li for li in soup('li', class_='pure-list-item')
        if li('a', href=PATTERN)
    ]

    beer_names = [beer.a.text for beer in beers]

    beer_infos = [
        (beer.find('p',
                   class_='caption text-gray mb-0').text.strip().split('·'))
        for beer in beers
    ]

    beer_servinginfos = [[
        p.text.strip().split()
        for p in beer('p', class_='caption text-right mb-0')
    ] for beer in beers]

    def make_tuple(lst, n):
        """ fill in missing info, and strip extraneous spaces
        """
        lst = list(lst) + [''] * n  # pad with fallback if missing
        return tuple(_.strip() for _ in lst)[:n]  # filter to n-tuple

    # TODO when info missing, detect abv ([0-9]%) and use to calibrate rest ?
    KEYS_INFO = ('style', 'abv', 'where')  #, 'serving')
    KEYS_SERVINGINFO = ('volume', 'type', 'price')

    # beerstyle, abv, beerplace
    beer_infos = [make_tuple(info, len(KEYS_INFO)) for info in beer_infos]
    # volume, servingtype, price
    beer_servinginfos = [[
        make_tuple(info, len(KEYS_SERVINGINFO))
        for info in filter(None, infos)
    ] for infos in beer_servinginfos]

    d_stats = {
        beername: dict(
            zip(
                chain(KEYS_INFO, ('serving', )),
                chain(info, ([
                    dict(zip(KEYS_SERVINGINFO, servinginfo))
                    for servinginfo in servinginfos
                ], ))))
        for beername, info, servinginfos in zip(beer_names, beer_infos,
                                                beer_servinginfos)
    }
    return d_stats

Exemple #18

0

Afficher le fichier

Fichier : scrapers.py Projet : meereeum/cinematic

def get_movies_moma(theater, date):
    """Get movie names and times from Museum of Modern Arts's website

    :theater: str
    :date: str (yyyy-mm-dd) (default: today)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'https://www.moma.org/calendar/?utf8=%E2%9C%93&happening_filter=Films&date={}&location=both'

    soup = soup_me(BASE_URL.format(date))

    relevant_movies = [
        m
        for m in soup('div', class_='calendar-tile calendar-tile--tall-image')
        if date == convert_date((
            m.find('div', class_='center balance-text').text.replace(
                u'\xa0', ' ')  # &nbsp; -> " "
            .split(', ')[1]))  # extract month & day from full datetime
    ]

    nested_movie_names = [  # list per showing.. some have multiple films
        [m.text for m in ms.h3('em')] if ms.h3('em') else [ms.h3.text]
        for ms in relevant_movies
    ]
    movie_names = [ms[-1] for ms in nested_movie_names
                   ]  # main attraction is the last film

    movie_formats = [
        '+ {}'.format(','.join(ms[:-1])) if len(ms) > 1 else ''
        for ms in nested_movie_names
    ]

    PATTERN = re.compile('–[0-9]*:?[0-9]*')
    movie_datetimes = [
        (
            dparser.parse(
                re.sub(
                    PATTERN,
                    '',  # remove any time ranges
                    m.find('div',
                           class_='center balance-text').text)).strftime(
                               DATETIME_SEP.join(('%Y-%m-%d', '%l:%M%P')))
        )  # yyyy-mm-dd @ hh:mm {a,p}m
        for m in relevant_movies
    ]
    movie_times = filter_past(movie_datetimes)

    # annotate with format
    movie_times = [(times if not times or not fmt else times + [f'[ {fmt} ]'])
                   for times, fmt in zip(movie_times, movie_formats)]

    movie_names, movie_times = filter_movies(movie_names, movie_times)

    return movie_names, movie_times

Exemple #19

0

Afficher le fichier

    def get_beerpage(query):
        """
        :query: beername query str
        :returns: (name, id)
        """
        PARAMS = {'q': safe_encode(query)}

        soup = soup_me(BASE_URL.format('search'), PARAMS)

        top_hit = soup.find('p', class_='name')

        return BASE_URL.format(top_hit.a['href'])

Exemple #20

0

Afficher le fichier

Fichier : scrapers.py Projet : meereeum/cinematic

def get_movies_nitehawk(theater, date):
    """Get movie names and times from Nitehawk's website

    :theater: str
    :date: str (yyyy-mm-dd) (default: today)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'https://nitehawkcinema.com/{}/{}'

    D_THEATERS = {
        'nitehawk': 'williamsburg',
        'nitehawk prospect park': 'prospectpark'
    }

    soup = soup_me(BASE_URL.format(D_THEATERS[theater.lower()], date))

    movie_names = [movie.text for movie in soup('div', class_='show-title')]

    if not movie_names:
        return [], []

    # extract format from name, if any
    PATTERN = re.compile(' \(.*(DCP|(35|70)mm)\)$', re.I)

    def extract_fmt(m):
        m, *fmt = re.split(PATTERN,
                           m)[:2]  # only name and DCP / (35|70)mm, if any
        return m, ''.join(fmt).lower()  # (cleaned) movie name, movie fmt

    movie_names, movie_formats = zip(*(extract_fmt(m) for m in movie_names))

    movie_datetimes = [
        [
            DATETIME_SEP.join((
                date,
                clean_time((
                    t.contents[0]  # ignore any junk after {a,p}m
                    .strip().lower().replace(
                        'midnite', '11:59pm')))))  # else, wld be next day
            for t in times('a', class_='showtime')
        ] for times in soup('div', class_='showtimes-container clearfix')
    ]
    movie_times = filter_past(movie_datetimes)

    # annotate with format
    movie_times = [(times if fmt == 'dcp' or not times or not fmt else times +
                    [f'[ {fmt} ]'])
                   for times, fmt in zip(movie_times, movie_formats)]

    # movie_names, movie_times = combine_times(*filter_movies(movie_names, movie_times))
    movie_names, movie_times = filter_movies(movie_names, movie_times)

    return movie_names, movie_times

Exemple #21

0

Afficher le fichier

def get_movies_showtimes(theater, date):
    """Get movie names and times from Showtimes' website

    :theater: str
    :date: str (yyyy-mm-dd) (default: today)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'https://www.showtimes.com/movie-theaters/{}'

    D_THEATERS = {
        'regal fenway': lambda *args: 'regal-fenway-stadium-13-rpx-6269',
        'ua court st': lambda *args: 'ua-court-street-stadium-12-rpx-6608'
    }

    try:
        soup = soup_me(
            BASE_URL.format(
                D_THEATERS.get(theater.lower(), get_theaterpg_showtimes)(
                    theater)))  # fallback for unlisted theater
        # (phrased as functions, so theaterpg scraper won't run until necessary)

        movies = soup('li', class_='movie-info-box')

    except (Exception) as e:
        print(error_str.format(e))  # error msg only
        movies = []  # no matching theater

    movie_names = [
        ''.join((re.sub('[\r\n].*', '', name.text.strip())
                 for name in m('h2', class_='media-heading'))) for m in movies
    ]

    nested_buttons = [  # [[day, time, time, day, time], ..] -> [[[day, time, time], [day, time]], ..]
        list(
            split_before((button.text
                          for button in m('button', type='button')),
                         lambda txt: ',' in txt)) for m in movies
    ]

    movie_datetimes = [
        flatten(
            [['{} @ {}'.format(day.replace(':', ''), time) for time in times]
             for day, *times in buttons
             if (convert_date(day.replace(':', '')) == date)])
        for buttons in nested_buttons
    ]

    movie_times = filter_past(movie_datetimes)
    movie_names, movie_times = combine_times(
        *filter_movies(movie_names, movie_times))

    return movie_names, movie_times

Exemple #22

0

Afficher le fichier

Fichier : scrapers.py Projet : meereeum/cinematic

def get_movies_somerville(theater, date):
    """Get movie names and times from Somerville Theater's website

    :theater: str
    :date: str (yyyy-mm-dd) (default: today)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'https://somervilletheatre.com/wp-content/themes/somerville/showtimes.xml'

    soup = soup_me(BASE_URL)

    movies = soup('filmtitle')

    movie_names = [m.shortname.text
                   for m in movies]  # /or/ m.find('name').text

    PATTERN = re.compile(' ((35|70)mm)$', re.I)

    def extract_fmt(m):
        m, *fmt = re.split(PATTERN, m)[:2]  # only name and (35|70)mm, if any
        return m, ''.join(fmt).lower()  # (cleaned) movie name, movie fmt

    movie_names, movie_formats = zip(*(extract_fmt(m) for m in movie_names))

    convert = lambda date: date[-4:] + date[:-4]  # mmddyyyy -> yyyymmdd

    movie_datetimes = [
        [
            (
                dparser.parse(' '.join(
                    (convert(d.text), t.text)))  # yyyymmdd hhmm ->
                .strftime(DATETIME_SEP.join(
                    ('%Y-%m-%d', '%l:%M%P'))))  # yyyy-mm-dd @ hh:mm {a,p}m
            for d, t in zip(m('date'), m('time'))
            if d.text == convert_date(date, fmt_out='%m%d%Y')
        ] for m in movies
    ]
    movie_times = filter_past(movie_datetimes)

    # annotate with formats
    movie_times = [(times if not times or not fmt else times + [f'[ {fmt} ]'])
                   for times, fmt in zip(movie_times, movie_formats)]

    movie_names, movie_times = combine_times(
        *filter_movies(movie_names, movie_times))

    return movie_names, movie_times

Exemple #23

0

Afficher le fichier

def get_movies_pghfilmmakers(theater, date):
    """Get movie names and times from Pittsburgh Filmmakers website

    :theater: str
    :date: str (yyyy-mm-dd) (default: today)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'http://cinema.pfpca.org/films/showtimes?location={}'

    D_THEATERS = {
        'regent square theater': 24,
        'harris theater': 20,
        'melwood screening room': 18
    }

    soup = soup_me(BASE_URL.format(D_THEATERS[theater.lower()]))

    # get date block
    try:
        block, = [
            day for day in soup('caption')
            if day.text == convert_date(date, fmt_out='%a, %b %-d')
        ]
    except (ValueError):  # indexing into empty list
        return [], []

    movie_names = [
        name.text
        for name in block.next.next.next('a', href=re.compile('/films/*'))
    ]

    movie_datetimes = [
        ' @ '.join((date, div.next.next.next.text.strip()))
        for div in block.next.next.next(
            'td', class_='views-field views-field-field-location')
    ]

    movie_times = filter_past(movie_datetimes)

    # filter movies with no future times
    # & combine times for same movie
    movie_names, movie_times = combine_times(
        *filter_movies(movie_names, movie_times))

    return movie_names, movie_times

Exemple #24

0

Afficher le fichier

    def get_beerpage(query):
        """
        :query: beername query str
        :returns: (name, id)
        """
        PARAMS = {'q': safe_encode(query), 'qt': 'beer'}

        soup = soup_me(BASE_URL.format('search/'), PARAMS)

        try:  # redirected directly to beerpage
            relative_url = re.match(
                '^/beer/profile/',
                soup.find('input', attrs={'name': 'redirect'})['value']).string
        except:
            top_hit = soup.find('a', href=re.compile("^/beer/profile/"))
            relative_url = top_hit['href']

        return BASE_URL.format(relative_url)

Exemple #25

0

Afficher le fichier

Fichier : scrapers.py Projet : meereeum/cinematic

def get_movies_quad(theater, date):
    """Get movie names and times from Quad's website

    :theater: str
    :date: str (yyyy-mm-dd) (default: today)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'https://quadcinema.com/all/'

    soup = soup_me(BASE_URL)

    try:
        day, = [
            d for d in soup('div', class_='now-single-day')
            if convert_date(d.h1.text) == date
        ]
    except (ValueError):  # no matching date listed yet
        return [], []

    movie_names = [movie.text for movie in day('h4')]

    movies = day('div', class_='single-listing')

    PATTERN = re.compile('^time')
    movie_datetimes = [[
        DATETIME_SEP.join((date, time.text.replace('.', ':')))
        for time in m('li', class_=PATTERN)
    ] for m in movies]
    movie_times = filter_past(movie_datetimes)

    ANTIPATTERN = re.compile('^[^(time)]')  # non-showtime `li`s
    movie_formats = [[fmt.text for fmt in m('li', class_=ANTIPATTERN)]
                     for m in movies]

    # annotate with formats
    movie_times = [(times if not times or not fmt else times +
                    ['[ {} ]'.format(','.join(fmt))])
                   for times, fmt in zip(movie_times, movie_formats)]

    movie_names, movie_times = combine_times(
        *filter_movies(movie_names, movie_times))

    return movie_names, movie_times

Exemple #26

0

Afficher le fichier

Fichier : scrapers.py Projet : meereeum/cinematic

def get_movies_syndicated(theater, date):
    """Get movie names and times from Syndicated's website

    :theater: str
    :date: str (yyyy-mm-dd) (default: today)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'https://syndicatedbk.com/events/'

    soup = soup_me(BASE_URL)

    movie_strs = [
        div.text.strip()
        for div in soup('div',
                        id=re.compile(f'tribe-events-event-[0-9]*-{date}'))
    ]

    if not movie_strs or movie_strs[0].lower() == 'closed for private event':
        return [], []

    matches = [
        re.search(' \([0-9:]* [ap]m\)', movie_str, re.I)
        for movie_str in movie_strs
    ]

    movie_names = [
        movie_str[:m.start(0)]  # extract name
        for m, movie_str in zip(matches, movie_strs)
    ]

    movie_datetimes = [
        DATETIME_SEP.join((date, time)) for time in (
            movie_str[m.start(0) + 2:m.end(0) -
                      1]  # extract time (while removing trailing " (" & ")")
            for m, movie_str in zip(matches, movie_strs))
    ]

    movie_times = filter_past(movie_datetimes)
    movie_names, movie_times = combine_times(
        *filter_movies(movie_names, movie_times))

    return movie_names, movie_times

Exemple #27

0

Afficher le fichier

Fichier : scrapers.py Projet : meereeum/cinematic

def get_movies_anthology(theater, date):
    """Get movie names and times from Anthology's website

    :theater: str
    :date: str (yyyy-mm-dd) (default: today)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'http://anthologyfilmarchives.org/film_screenings/calendar?view=list'

    soup = soup_me(BASE_URL.format(date))

    days = soup('h3', class_='current-day')
    try:
        iday = index_into_days([
            ''.join((_ for _ in day.contents if isinstance(_, str))).strip()
            for day in days
        ],
                               date=date)
    except (AssertionError):  # no matching days
        return [], []

    border = (days[iday + 1]
              if iday < len(days) - 1 else soup.find('div', id='footer'))

    next_movies = days[iday].find_all_next('div', class_='showing-details')
    prev_movies = border.find_all_previous('div', class_='showing-details')

    movies = list(set(next_movies)
                  & set(prev_movies))  # get intersection b/w borders

    movie_names = [m.find('span', class_='film-title').text for m in movies]

    movie_datetimes = [[
        DATETIME_SEP.join((date, time.text))
        for time in movie('a', {'name': re.compile("^showing-")})
    ] for movie in movies]

    movie_times = filter_past(movie_datetimes)
    movie_names, movie_times = combine_times(
        *filter_movies(movie_names, movie_times))

    return movie_names, movie_times

Exemple #28

0

Afficher le fichier

def get_movies_metrograph(theater, date):
    """Get movie names and times from Metrograph website

    :theater: str
    :date: str (yyyy-mm-dd) (default: today)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'http://metrograph.com/film'
    PARAMS = {'d': date}

    soup = soup_me(BASE_URL, PARAMS)

    movie_names = [
        movie_div.a.contents[0] for movie_div in soup('h4', class_='title')
    ]
    movie_times = [[time.contents[0] for time in time_div('a')]
                   for time_div in soup('div', class_='showtimes')]

    # filter movies with no future times
    movie_names, movie_times = filter_movies(movie_names, movie_times)

    return movie_names, movie_times

Exemple #29

0

Afficher le fichier

def get_movies_loews_theater(theater, date):
    """Get movie names and times from Landmark Loew's Jersey website

    :theater: str
    :date: str (yyyy-mm-dd) (default: today)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'http://loewsjersey.org/calendar/?tribe-bar-date={}'

    soup = soup_me(BASE_URL.format(date[:-3]))  # yyy-mm

    movie_headers = [
        h for h in soup('h3', class_="tribe-events-month-event-title")
        if h.text.lower().startswith("film screening")
    ]

    relevant_movies = [
        h for h in movie_headers if h.parent.attrs['id'][-10:] == date
    ]

    if relevant_movies:
        movie_names = [
            h.text.replace('Film Screening: “', '').replace('”', '')
            for h in relevant_movies
        ]
        movie_datetimes = [
            json.loads(
                h.parent.attrs['data-tribejson'])['startTime']  # date @ time
            for h in relevant_movies
        ]

        movie_times = filter_past(movie_datetimes)
        movie_names, movie_times = combine_times(
            *filter_movies(movie_names, movie_times))

    else:
        movie_names, movie_times = [], []

    return movie_names, movie_times

Exemple #30

0

Afficher le fichier

def get_theaterpg_showtimes(theater):
    """Get theater page from Showtimes' website

    :theater: str
    :returns: partial URL (str)
    """
    BASE_URL = 'https://www.showtimes.com/search'

    PARAMS = {'query': safe_encode(theater)}

    soup = soup_me(BASE_URL, PARAMS)

    PATTERN = re.compile('^/movie-theaters/')

    theater_pgs = [
        re.sub(PATTERN, '', hit.attrs['href'])
        for hit in soup('a', href=PATTERN)
    ]
    try:
        return theater_pgs[0]  # best hit
    except (IndexError) as e:
        e.args = ('No matching theater on showtimes.com', )
        raise (e)