コード例 #1
0
ファイル: scrapers.py プロジェクト: meereeum/cinematic
def get_movies_cinema_village(theater, date):
    """Scrape Cinema Village's showtimes page for one day's listings

    :theater: str (unused here; kept for the common scraper signature)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'https://www.cinemavillage.com/showtimes/'

    page = soup_me(BASE_URL)

    # one tab link per listed day; its label is the last child of the link
    tab_labels = []
    for tab in page('a', {'data-toggle': 'tab'}):
        tab_labels.append(tab.contents[-1].strip().replace('.', '-'))

    tab_index = index_into_days(tab_labels, date=date)
    day_panel = page.find('div', id=f'tab_default_{tab_index}')

    titles = [link.text for link in day_panel('a')]

    # one list of "<date><sep><time>" strings per `sel-time` block
    showings = []
    for time_block in day_panel('div', class_='sel-time'):
        showings.append(
            [DATETIME_SEP.join((date, span.text)) for span in time_block('span')])

    upcoming = filter_past(showings)
    kept = filter_movies(titles, upcoming)
    movie_names, movie_times = combine_times(*kept)

    return movie_names, movie_times
コード例 #2
0
ファイル: scrapers.py プロジェクト: meereeum/cinematic
def get_movies_film_noir(theater, date):
    """Scrape Film Noir Cinema's program page for one day's listings

    :theater: str (unused here; kept for the common scraper signature)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'https://www.filmnoircinema.com/program'

    page = soup_me(BASE_URL)

    when = dparser.parse(date)

    # event links carry the date in their path, without zero-padding
    day_href = re.compile(f'/program/{when.year}/{when.month}/{when.day}/')
    title_links = page('a', class_='eventlist-title-link', href=day_href)

    titles = [link.text for link in title_links]

    # NOTE(review): start times of all events end up in one flat list, so
    # names and times line up 1:1 only if every event has exactly one start
    # time -- confirm against filter_past / filter_movies
    stamps = []
    for link in title_links:
        details = link.next.next.next
        for tag in details('time', class_='event-time-12hr-start'):
            stamps.append(DATETIME_SEP.join((tag['datetime'], tag.text)))

    # drop showings that already started
    upcoming = filter_past(stamps)

    # drop movies with no remaining times, then merge duplicate titles
    kept = filter_movies(titles, upcoming)
    movie_names, movie_times = combine_times(*kept)

    return movie_names, movie_times
コード例 #3
0
ファイル: scrapers.py プロジェクト: meereeum/cinematic
def get_movies_ifc(theater, date):
    """Scrape the IFC Center homepage for one day's listings

    :theater: str (unused here; kept for the common scraper signature)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'http://www.ifccenter.com/'

    page = soup_me(BASE_URL)

    schedule_class = re.compile('^daily-schedule')
    try:
        matching = []
        for block in page('div', class_=schedule_class):
            heading = block.h3.text
            if heading == 'Coming Soon':
                continue
            if convert_date(heading) == date:
                matching.append(block)
        # exactly one schedule block is expected for the requested day
        (day,) = matching
    except ValueError:  # no matching date listed yet
        return [], []

    film_divs = day('div')

    titles = [div.h3.text for div in film_divs]

    showings = []
    for div in film_divs:
        showings.append(
            [DATETIME_SEP.join((date, item.text)) for item in div('li')])

    upcoming = filter_past(showings)
    kept = filter_movies(titles, upcoming)
    movie_names, movie_times = combine_times(*kept)

    return movie_names, movie_times
コード例 #4
0
ファイル: scrapers.py プロジェクト: meereeum/cinematic
def get_movies_landmark(theater, date):
    """Fetch one day's listings for Kendall Landmark from its JSON feed

    :theater: str (unused here; kept for the common scraper signature)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'https://movie-lmt.peachdigital.com/movies/GetFilmsByCinema/21/151'

    feed = json_me(BASE_URL)
    films = feed['Result']

    titles = [film['Title'] for film in films]

    showings = []
    for film in films:
        per_session = []
        for session in film['Sessions']:
            # keep only the start times of sessions on the requested day
            per_session.append([
                DATETIME_SEP.join((date, slot['StartTime']))
                for slot in session['Times']
                if convert_date(session['DisplayDate']) == date
            ])
        showings.append(flatten(per_session))

    upcoming = filter_past(showings)
    kept = filter_movies(titles, upcoming)
    movie_names, movie_times = combine_times(*kept)

    return movie_names, movie_times
コード例 #5
0
ファイル: scrapers.py プロジェクト: meereeum/cinematic
def get_movies_momi(theater, date):
    """Scrape Museum of the Moving Image's calendar for one day's listings

    :theater: str (unused here; kept for the common scraper signature)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'http://www.movingimage.us/visit/calendar/{}/day/type/1'

    date_path = date.replace('-', '/')  # yyyy-mm-dd -> yyyy/mm/dd

    page = soup_me(BASE_URL.format(date_path))

    # event links point back into the calendar path for the same day
    PATTERN = re.compile('calendar/{}'.format(date_path))
    event_links = page('a', href=PATTERN)

    color_class = re.compile("^color")
    titles = []
    for link in event_links:
        titles.append(link.find('span', class_=color_class).text)

    showings = []
    for link in event_links:
        # text before the first ' | ' is the time; periods are dropped
        start = link.em.text.split(' | ')[0].replace('.', '')
        showings.append([DATETIME_SEP.join((date, start))])

    upcoming = filter_past(showings)
    kept = filter_movies(titles, upcoming)
    movie_names, movie_times = combine_times(*kept)

    return movie_names, movie_times
コード例 #6
0
ファイル: scrapers.py プロジェクト: meereeum/cinematic
def get_movies_filmlinc(theater, date):
    """Fetch one day's listings from Film at Lincoln Center's events API

    :theater: str (unused here; kept for the common scraper signature)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'https://www.filmlinc.org/wp-content/themes/filmlinc/api-events.php'

    # single-day window
    PARAMS = {'start': date, 'end': date}

    events = json_me(BASE_URL, PARAMS)

    titles = [event['title'] for event in events]

    # yyyy-mm-dd @ hh:mm {a,p}m
    out_fmt = DATETIME_SEP.join(('%Y-%m-%d', '%l:%M%P'))

    stamps = []
    for event in events:
        started = datetime.fromtimestamp(event['start'] / 1000)  # epoch in ms
        stamps.append(started.strftime(out_fmt))

    upcoming = filter_past(stamps)
    kept = filter_movies(titles, upcoming)
    movie_names, movie_times = combine_times(*kept)

    return movie_names, movie_times
コード例 #7
0
ファイル: scrapers.py プロジェクト: meereeum/cinematic
def get_movies_hfa(theater, date):
    """Scrape Harvard Film Archive's homepage calendar for one day's listings

    :theater: str (unused here; kept for the common scraper signature)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'https://harvardfilmarchive.org'

    page = soup_me(BASE_URL)

    try:
        matching = []
        for row in page('div', class_='grid m-calendar__row'):
            if row.time.attrs['datetime'] == date:
                matching.append(row)
        # exactly one calendar row is expected for the requested day
        (day,) = matching
    except ValueError:  # no matching days
        return [], []

    titles = [heading.text.strip() for heading in day('h5')]

    stamps = []
    for slot in day('div', class_='event__time'):
        stamps.append(DATETIME_SEP.join((date, slot.text)))

    upcoming = filter_past(stamps)
    kept = filter_movies(titles, upcoming)
    movie_names, movie_times = combine_times(*kept)

    return movie_names, movie_times
コード例 #8
0
ファイル: scrapers.py プロジェクト: meereeum/cinematic
def get_movies_village_east_or_angelika(theater, date):
    """Scrape Village East Cinema's or Angelika Film Center's now-playing page

    :theater: str (case-insensitive key into the theater -> site table)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'https://www.{}/showtimes-and-tickets/now-playing/{}'

    D_THEATERS = {
        'village east cinema': 'citycinemas.com/villageeast',
        'angelika film center': 'angelikafilmcenter.com/nyc'
    }

    site = D_THEATERS[theater.lower()]
    page = soup_me(BASE_URL.format(site, date))

    titles = [heading.text for heading in page('h4', class_='name')]

    # first `status...` css class of every status div
    statuses = []
    for div in page('div', class_=re.compile('^status')):
        statuses.append(
            first(cls for cls in div['class'] if cls.startswith('status')))

    assert len(titles) == len(
        statuses), f'{len(titles)} != {len(statuses)}'

    # keep currently playing movies only
    now_playing = []
    for title, status in zip(titles, statuses):
        if status.endswith('coming_soon'):
            continue
        now_playing.append(title)

    if not now_playing:
        return [], []

    showings = []
    for wrapper in page('div', class_="showtimes-wrapper"):
        showings.append([
            DATETIME_SEP.join((date, field['value']))
            for field in wrapper('input', class_='showtime reserved-seating')
        ])

    upcoming = filter_past(showings)

    # a trailing "in 35mm" / "in 70mm" on the title names the print format
    PATTERN = re.compile('in ((35|70)mm)$', re.I)

    def split_format(title):
        pieces = PATTERN.split(title)[:2]  # name and (35|70)mm, if any
        name, fmt_parts = pieces[0], pieces[1:]
        return name, ''.join(fmt_parts).lower()

    pairs = [split_format(title) for title in now_playing]
    movie_names, movie_formats = zip(*pairs)

    # append the format as a trailing annotation on non-empty time lists
    annotated = []
    for times, fmt in zip(upcoming, movie_formats):
        if times and fmt:
            annotated.append(times + [f'[ {fmt} ]'])
        else:
            annotated.append(times)

    kept = filter_movies(movie_names, annotated)
    movie_names, movie_times = combine_times(*kept)

    return movie_names, movie_times
コード例 #9
0
ファイル: scrapers.py プロジェクト: meereeum/cinematic
def get_movies_amc(theater, date):
    """Scrape an AMC theatre's showtimes page for one day's listings

    :theater: str (case-insensitive key into the theater -> url-parts table)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'https://www.amctheatres.com/movie-theatres/{}/{}/showtimes/all/{}/{}/all'

    D_THEATERS = {
        'amc boston common': ('boston', 'amc-boston-common-19'),
        'the waterfront': ('pittsburgh', 'amc-waterfront-22')
    }
    city, slug = D_THEATERS[theater.lower()]

    page = soup_me(BASE_URL.format(city, slug, date, slug))

    films = page('div', class_='ShowtimesByTheatre-film')

    titles = [film.h2.text for film in films]

    # per film: one list of times per showtimes section
    section_class = re.compile('^Showtimes-Section Showtimes-Section')
    per_film = []
    for film in films:
        sections = []
        for section in film('div', class_=section_class):
            slots = []
            for slot in section('div', class_='Showtime'):
                # TODO print sold-out times as xed-out ?
                if slot.find('div', {'aria-hidden': "true"}).text == 'Sold Out':
                    continue
                slots.append(DATETIME_SEP.join((date, clean_time(slot.text))))
            sections.append(slots)
        per_film.append(sections)

    # one (name, timelist) entry per section, so a film with several
    # sections is repeated
    # TODO sometimes lists separate times for same format -- combine ?
    repeated_titles = []
    for title, sections in zip(titles, per_film):
        repeated_titles.extend([title] * len(sections))
    flat_showings = flatten(per_film)

    upcoming = filter_past(flat_showings)

    # the `h4` texts of each film are used as per-section format labels
    format_labels = flatten([[h4.text for h4 in film('h4')] for film in films])

    annotated = []
    for times, fmt in zip(upcoming, format_labels):
        if fmt != 'Digital' and times:
            annotated.append(times + [f'[ {fmt} ]'])
        else:
            annotated.append(times)

    # TODO combine_times does not know formats, so entries are not merged
    movie_names, movie_times = filter_movies(repeated_titles, annotated)

    return movie_names, movie_times
コード例 #10
0
ファイル: scrapers.py プロジェクト: meereeum/cinematic
def get_movies_moma(theater, date):
    """Scrape Museum of Modern Art's film calendar for one day's listings

    :theater: str (unused here; kept for the common scraper signature)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'https://www.moma.org/calendar/?utf8=%E2%9C%93&happening_filter=Films&date={}&location=both'

    page = soup_me(BASE_URL.format(date))

    # keep only the tiles whose displayed day is the requested one
    tiles = []
    for tile in page('div', class_='calendar-tile calendar-tile--tall-image'):
        when_text = tile.find('div', class_='center balance-text').text
        # non-breaking space -> " ", then take month & day from full datetime
        month_day = when_text.replace(u'\xa0', ' ').split(', ')[1]
        if date == convert_date(month_day):
            tiles.append(tile)

    # one list of titles per showing.. some showings have multiple films
    titles_per_showing = []
    for tile in tiles:
        emphasized = tile.h3('em')
        if emphasized:
            titles_per_showing.append([em.text for em in emphasized])
        else:
            titles_per_showing.append([tile.h3.text])

    # main attraction is the last film; the rest become a "+ ..." annotation
    main_titles = [names[-1] for names in titles_per_showing]
    extras = []
    for names in titles_per_showing:
        if len(names) > 1:
            extras.append('+ {}'.format(','.join(names[:-1])))
        else:
            extras.append('')

    PATTERN = re.compile('–[0-9]*:?[0-9]*')  # end of a time range
    out_fmt = DATETIME_SEP.join(('%Y-%m-%d', '%l:%M%P'))  # yyyy-mm-dd @ hh:mm {a,p}m

    stamps = []
    for tile in tiles:
        when_text = tile.find('div', class_='center balance-text').text
        start_only = re.sub(PATTERN, '', when_text)  # remove any time ranges
        stamps.append(dparser.parse(start_only).strftime(out_fmt))

    upcoming = filter_past(stamps)

    # append the extras as a trailing annotation on non-empty time lists
    annotated = []
    for times, fmt in zip(upcoming, extras):
        if times and fmt:
            annotated.append(times + [f'[ {fmt} ]'])
        else:
            annotated.append(times)

    movie_names, movie_times = filter_movies(main_titles, annotated)

    return movie_names, movie_times
コード例 #11
0
ファイル: scrapers.py プロジェクト: meereeum/cinematic
def get_movies_nitehawk(theater, date):
    """Get movie names and times from Nitehawk's website

    :theater: str (case-insensitive key into the theater -> location table)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times)
    :raises KeyError: if `theater` is not one of the known Nitehawk locations
    """
    BASE_URL = 'https://nitehawkcinema.com/{}/{}'

    D_THEATERS = {
        'nitehawk': 'williamsburg',
        'nitehawk prospect park': 'prospectpark'
    }

    soup = soup_me(BASE_URL.format(D_THEATERS[theater.lower()], date))

    movie_names = [movie.text for movie in soup('div', class_='show-title')]

    if not movie_names:  # nothing listed; zip(*...) below needs >= 1 item
        return [], []

    # extract format from name, if any
    # (raw string: `\(` is not a valid str escape and warns on newer Pythons)
    PATTERN = re.compile(r' \(.*(DCP|(35|70)mm)\)$', re.I)

    def extract_fmt(m):
        m, *fmt = re.split(PATTERN,
                           m)[:2]  # only name and DCP / (35|70)mm, if any
        return m, ''.join(fmt).lower()  # (cleaned) movie name, movie fmt

    movie_names, movie_formats = zip(*(extract_fmt(m) for m in movie_names))

    movie_datetimes = [
        [
            DATETIME_SEP.join((
                date,
                clean_time((
                    t.contents[0]  # ignore any junk after {a,p}m
                    .strip().lower().replace(
                        'midnite', '11:59pm')))))  # else, wld be next day
            for t in times('a', class_='showtime')
        ] for times in soup('div', class_='showtimes-container clearfix')
    ]
    movie_times = filter_past(movie_datetimes)

    # annotate with format (dcp is not annotated)
    movie_times = [(times if fmt == 'dcp' or not times or not fmt else times +
                    [f'[ {fmt} ]'])
                   for times, fmt in zip(movie_times, movie_formats)]

    # times are not combined across same-named entries here
    movie_names, movie_times = filter_movies(movie_names, movie_times)

    return movie_names, movie_times
コード例 #12
0
ファイル: scrapers.py プロジェクト: meereeum/cinematic
def get_movies_showtimes(theater, date):
    """Scrape a theater's page on showtimes.com for one day's listings

    :theater: str (known theaters are looked up case-insensitively;
                   any other goes through `get_theaterpg_showtimes`)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'https://www.showtimes.com/movie-theaters/{}'

    # values are callables, so the fallback lookup only runs when needed
    D_THEATERS = {
        'regal fenway': lambda *args: 'regal-fenway-stadium-13-rpx-6269',
        'ua court st': lambda *args: 'ua-court-street-stadium-12-rpx-6608'
    }

    try:
        resolve = D_THEATERS.get(theater.lower(), get_theaterpg_showtimes)
        page = soup_me(BASE_URL.format(resolve(theater)))
        film_boxes = page('li', class_='movie-info-box')
    except Exception as e:
        print(error_str.format(e))  # error msg only
        film_boxes = []  # no matching theater

    titles = []
    for box in film_boxes:
        pieces = (re.sub('[\r\n].*', '', heading.text.strip())
                  for heading in box('h2', class_='media-heading'))
        titles.append(''.join(pieces))

    # per film: [day, time, time, day, time] -> [[day, time, time], [day, time]]
    # (a button whose text contains ',' starts a new group)
    grouped = []
    for box in film_boxes:
        labels = (button.text for button in box('button', type='button'))
        grouped.append(list(split_before(labels, lambda txt: ',' in txt)))

    showings = []
    for groups in grouped:
        kept_groups = []
        for group in groups:
            day_label, *slots = group
            day_clean = day_label.replace(':', '')
            if convert_date(day_clean) == date:
                kept_groups.append(
                    [DATETIME_SEP.join((day_clean, slot)) for slot in slots])
        showings.append(flatten(kept_groups))

    upcoming = filter_past(showings)
    kept = filter_movies(titles, upcoming)
    movie_names, movie_times = combine_times(*kept)

    return movie_names, movie_times
コード例 #13
0
ファイル: scrapers.py プロジェクト: meereeum/cinematic
def get_movies_somerville(theater, date):
    """Get movie names and times from Somerville Theater's website

    :theater: str (unused here; kept for the common scraper signature)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times);
              ([], []) if the feed lists no films
    """
    BASE_URL = 'https://somervilletheatre.com/wp-content/themes/somerville/showtimes.xml'

    soup = soup_me(BASE_URL)

    movies = soup('filmtitle')

    movie_names = [m.shortname.text
                   for m in movies]  # /or/ m.find('name').text

    # an empty feed would otherwise crash the `zip(*...)` unpacking below
    if not movie_names:
        return [], []

    # a trailing " 35mm" / " 70mm" on the title names the print format
    PATTERN = re.compile(' ((35|70)mm)$', re.I)

    def extract_fmt(m):
        m, *fmt = re.split(PATTERN, m)[:2]  # only name and (35|70)mm, if any
        return m, ''.join(fmt).lower()  # (cleaned) movie name, movie fmt

    movie_names, movie_formats = zip(*(extract_fmt(m) for m in movie_names))

    def to_yyyymmdd(mmddyyyy):
        return mmddyyyy[-4:] + mmddyyyy[:-4]  # mmddyyyy -> yyyymmdd

    # the feed's <date> text is compared in mmddyyyy form; convert once
    target_day = convert_date(date, fmt_out='%m%d%Y')

    # yyyy-mm-dd @ hh:mm {a,p}m
    out_fmt = DATETIME_SEP.join(('%Y-%m-%d', '%l:%M%P'))

    movie_datetimes = [
        [
            dparser.parse(' '.join((to_yyyymmdd(d.text), t.text)))
            .strftime(out_fmt)  # yyyymmdd hhmm -> formatted datetime
            for d, t in zip(m('date'), m('time'))
            if d.text == target_day
        ] for m in movies
    ]
    movie_times = filter_past(movie_datetimes)

    # annotate with formats
    movie_times = [(times if not times or not fmt else times + [f'[ {fmt} ]'])
                   for times, fmt in zip(movie_times, movie_formats)]

    movie_names, movie_times = combine_times(
        *filter_movies(movie_names, movie_times))

    return movie_names, movie_times
コード例 #14
0
ファイル: scrapers.py プロジェクト: meereeum/cinematic
def get_movies_pghfilmmakers(theater, date):
    """Scrape a Pittsburgh Filmmakers venue's showtimes for one day

    :theater: str (case-insensitive key into the theater -> location id table)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'http://cinema.pfpca.org/films/showtimes?location={}'

    D_THEATERS = {
        'regent square theater': 24,
        'harris theater': 20,
        'melwood screening room': 18
    }

    location_id = D_THEATERS[theater.lower()]
    page = soup_me(BASE_URL.format(location_id))

    # find the caption whose text is the requested day
    try:
        matching = []
        for caption in page('caption'):
            if caption.text == convert_date(date, fmt_out='%a, %b %-d'):
                matching.append(caption)
        # exactly one caption is expected for the requested day
        (block,) = matching
    except ValueError:
        return [], []

    # both the titles and the times are searched for three nodes past the
    # caption
    listing = block.next.next.next

    titles = [
        link.text for link in listing('a', href=re.compile('/films/*'))
    ]

    stamps = []
    for cell in listing('td',
                        class_='views-field views-field-field-location'):
        stamps.append(
            DATETIME_SEP.join((date, cell.next.next.next.text.strip())))

    upcoming = filter_past(stamps)

    # drop movies with no remaining times, then merge duplicate titles
    kept = filter_movies(titles, upcoming)
    movie_names, movie_times = combine_times(*kept)

    return movie_names, movie_times
コード例 #15
0
ファイル: scrapers.py プロジェクト: meereeum/cinematic
def get_movies_quad(theater, date):
    """Get movie names and times from Quad's website

    :theater: str (unused here; kept for the common scraper signature)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times);
              ([], []) if the requested day is not listed
    """
    BASE_URL = 'https://quadcinema.com/all/'

    soup = soup_me(BASE_URL)

    try:
        # exactly one block is expected for the requested day
        day, = [
            d for d in soup('div', class_='now-single-day')
            if convert_date(d.h1.text) == date
        ]
    except ValueError:  # no matching date listed yet
        return [], []

    movie_names = [movie.text for movie in day('h4')]

    movies = day('div', class_='single-listing')

    PATTERN = re.compile('^time')  # showtime `li`s
    movie_datetimes = [[
        DATETIME_SEP.join((date, time.text.replace('.', ':')))
        for time in m('li', class_=PATTERN)
    ] for m in movies]
    movie_times = filter_past(movie_datetimes)

    # non-showtime `li`s: css class does not start with "time"
    # (the previous '^[^(time)]' was a negated character class, so it also
    # rejected every class beginning with t, i, m, e, "(" or ")")
    ANTIPATTERN = re.compile('^(?!time)')
    movie_formats = [[fmt.text for fmt in m('li', class_=ANTIPATTERN)]
                     for m in movies]

    # annotate with formats
    movie_times = [(times if not times or not fmt else times +
                    ['[ {} ]'.format(','.join(fmt))])
                   for times, fmt in zip(movie_times, movie_formats)]

    movie_names, movie_times = combine_times(
        *filter_movies(movie_names, movie_times))

    return movie_names, movie_times
コード例 #16
0
ファイル: scrapers.py プロジェクト: meereeum/cinematic
def get_movies_syndicated(theater, date):
    """Get movie names and times from Syndicated's website

    :theater: str (unused here; kept for the common scraper signature)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times);
              ([], []) if nothing is listed or the first entry is a
              private-event closure
    """
    BASE_URL = 'https://syndicatedbk.com/events/'

    soup = soup_me(BASE_URL)

    # event divs carry the date at the end of their id
    movie_strs = [
        div.text.strip()
        for div in soup('div',
                        id=re.compile(f'tribe-events-event-[0-9]*-{date}'))
    ]

    if not movie_strs or movie_strs[0].lower() == 'closed for private event':
        return [], []

    # entries look like "<name> (<h:mm> <am|pm>)"
    # (raw string: `\(` is not a valid str escape and warns on newer Pythons)
    TIME_PATTERN = re.compile(r' \([0-9:]* [ap]m\)', re.I)

    # skip entries without a showtime instead of crashing on a None match
    parsed = [(movie_str, m) for movie_str, m in (
        (movie_str, TIME_PATTERN.search(movie_str))
        for movie_str in movie_strs) if m is not None]

    movie_names = [
        movie_str[:m.start(0)]  # everything before " (<time>)"
        for movie_str, m in parsed
    ]

    movie_datetimes = [
        DATETIME_SEP.join((
            date,
            movie_str[m.start(0) + 2:m.end(0) - 1]  # drop leading " (" & trailing ")"
        )) for movie_str, m in parsed
    ]

    movie_times = filter_past(movie_datetimes)
    movie_names, movie_times = combine_times(
        *filter_movies(movie_names, movie_times))

    return movie_names, movie_times
コード例 #17
0
ファイル: scrapers.py プロジェクト: meereeum/cinematic
def get_movies_manor(theater, date):
    """Fetch one day's listings for The Manor from its schedule API

    :theater: str (unused here; kept for the common scraper signature)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'https://plugin.retrieverapi.com/getSchedule'

    PARAMS = {'date': date}

    # request headers sent along with the schedule query
    headers = {
        'Host': 'plugin.retrieverapi.com',
        'User-Agent':
        'Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0',
        'Accept': 'application/json',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'Referer': 'https://plugin.retrieverapi.com/embed/4227729?print',
        'Authorization': 'Basic NDIyNzcyOToxMjM=',
        'DNT': '1',
        'Connection': 'keep-alive'
    }
    schedule = json_me(BASE_URL, PARAMS, headers=headers)

    films = schedule['movies']

    titles = [film['movie_name'] for film in films]

    # yyyy-mm-dd @ hh:mm {a,p}m
    out_fmt = DATETIME_SEP.join(('%Y-%m-%d', '%l:%M%P'))

    showings = []
    for film in films:
        stamps = []
        for show in film['showtimes']:
            stamps.append(dparser.parse(show['date_time']).strftime(out_fmt))
        showings.append(stamps)

    upcoming = filter_past(showings)
    kept = filter_movies(titles, upcoming)
    movie_names, movie_times = combine_times(*kept)

    return movie_names, movie_times
コード例 #18
0
ファイル: scrapers.py プロジェクト: meereeum/cinematic
def get_movies_anthology(theater, date):
    """Get movie names and times from Anthology's website

    :theater: str (unused here; kept for the common scraper signature)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times);
              ([], []) if the requested day is not listed
    """
    # the calendar url takes no date; the day is located within the page
    BASE_URL = 'http://anthologyfilmarchives.org/film_screenings/calendar?view=list'

    soup = soup_me(BASE_URL)

    days = soup('h3', class_='current-day')
    try:
        # day label = the bare-string children of each day header
        iday = index_into_days([
            ''.join((_ for _ in day.contents if isinstance(_, str))).strip()
            for day in days
        ],
                               date=date)
    except AssertionError:  # no matching days
        return [], []

    # the day's showings sit between its header and the next day's header
    # (or the page footer, for the last listed day)
    border = (days[iday + 1]
              if iday < len(days) - 1 else soup.find('div', id='footer'))

    next_movies = days[iday].find_all_next('div', class_='showing-details')
    prev_movies = border.find_all_previous('div', class_='showing-details')

    # intersection b/w borders, by node identity and in document order
    # (a set intersection compared tags by markup and scrambled their order)
    prev_ids = {id(m) for m in prev_movies}
    movies = [m for m in next_movies if id(m) in prev_ids]

    movie_names = [m.find('span', class_='film-title').text for m in movies]

    movie_datetimes = [[
        DATETIME_SEP.join((date, time.text))
        for time in movie('a', {'name': re.compile("^showing-")})
    ] for movie in movies]

    movie_times = filter_past(movie_datetimes)
    movie_names, movie_times = combine_times(
        *filter_movies(movie_names, movie_times))

    return movie_names, movie_times
コード例 #19
0
ファイル: scrapers.py プロジェクト: meereeum/cinematic
def get_movies_cobble_hill(theater, date):
    """Scrape Cobble Hill Cinema's ticketing schedule for one day's listings

    :theater: str (unused here; kept for the common scraper signature)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'https://64785.formovietickets.com:2235/T.ASP?WCI=BT&Page=schedule&SelectedDate={}'

    compact_date = date.replace('-', '')  # yyyy-mm-dd -> yyyymmdd
    page = soup_me(BASE_URL.format(compact_date))

    titles = [link.text for link in page('a', class_='displaytitle')]

    showings = []
    for block in page('div', class_='showings'):
        stamps = []
        for link in block('a', class_='showtime'):
            # an 'm' is appended to the link text to form the time
            stamps.append(DATETIME_SEP.join((date, link.text + 'm')))
        showings.append(stamps)

    upcoming = filter_past(showings)
    kept = filter_movies(titles, upcoming)
    movie_names, movie_times = combine_times(*kept)

    return movie_names, movie_times
コード例 #20
0
ファイル: scrapers.py プロジェクト: meereeum/cinematic
def get_movies_coolidge(theater, date):
    """Scrape Coolidge Corner's showtimes page for one day's listings

    :theater: str (unused here; kept for the common scraper signature)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'https://coolidge.org/showtimes'
    PARAMS = {'date': date}

    page = soup_me(BASE_URL, PARAMS)

    cards = page('div', class_='film-card')

    titles = [card.h2.text for card in cards]

    showings = []
    for card in cards:
        showings.append([
            DATETIME_SEP.join((date, span.text))
            for span in card('span', class_='showtime-ticket__time')
        ])
    upcoming = filter_past(showings)

    # program tags ending in "mm" are kept as the print format
    PATTERN = re.compile('^film-program__title')
    formats = []
    for card in cards:
        labels = []
        for tag in card('span', class_=PATTERN):
            if tag.text.endswith('mm'):
                labels.append(tag.text)
        formats.append(', '.join(labels))

    # append the format as a trailing annotation on non-empty time lists
    annotated = []
    for times, fmt in zip(upcoming, formats):
        if times and fmt:
            annotated.append(times + [f'[ {fmt} ]'])
        else:
            annotated.append(times)

    kept = filter_movies(titles, annotated)
    movie_names, movie_times = combine_times(*kept)

    return movie_names, movie_times
コード例 #21
0
ファイル: scrapers.py プロジェクト: meereeum/cinematic
def get_movies_bam(theater, date):
    """Get movie names and times from BAM Rose Cinema's website

    :theater: str (unused here; kept for the common scraper signature)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times)
    :raises ValueError: if a listed time is not of the form h[:mm][am|pm]
    """
    BASE_URL = 'https://www.bam.org/Filmsection'

    soup = soup_me(BASE_URL)

    # listings carry a sort key that begins with the date as yyyymmdd
    relevant_movies = soup(
        'div',
        {'data-sort-date': re.compile('^{}'.format(date.replace('-', '')))})
    movie_names = [
        m.find('div', class_='listModuleTitleMed listBlock').text
        for m in relevant_movies
    ]

    PATTERN = re.compile(r'(\d{1,2})(?::(\d{1,2}))?\s*([ap]m)?$', re.I)

    def minutes_since_midnight(t):
        """Sort key for times like '7:40PM' that honors am/pm

        (a plain 7:40PM -> 7.4 key would put 12:30PM after 9:30PM)
        """
        match = PATTERN.match(t)
        if match is None:
            raise ValueError(f'unrecognized showtime: {t!r}')
        hour, minute, meridiem = match.groups()
        hour = int(hour)
        if meridiem:  # 12-hour clock: 12am -> 0, 12pm -> 12
            hour = hour % 12 + (12 if meridiem.lower() == 'pm' else 0)
        return hour * 60 + int(minute or 0)

    movie_sortedtimes = [
        sorted(  # not always time-ordered
            [time.text.strip().replace(',', '') for time in m('li')],
            key=minutes_since_midnight)
        for m in relevant_movies
    ]
    movie_datetimes = [[DATETIME_SEP.join((date, time)) for time in times]
                       for times in movie_sortedtimes]

    movie_times = filter_past(movie_datetimes)
    movie_names, movie_times = combine_times(
        *filter_movies(movie_names, movie_times))

    return movie_names, movie_times
コード例 #22
0
ファイル: scrapers.py プロジェクト: meereeum/cinematic
def get_movies_rowhouse(theater, date):
    """Scrape Row House Cinema's page for one day's listings

    :theater: str (unused here; kept for the common scraper signature)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'https://rowhousecinema.com/{}'

    page = soup_me(BASE_URL.format(date))

    film_blocks = page('div', class_='showtimes-description')

    titles = [block.h2.text.strip() for block in film_blocks]

    showings = []
    for block in film_blocks:
        stamps = []
        for link in block('a', class_='showtime'):
            stamps.append(DATETIME_SEP.join((date, link.text.strip())))
        showings.append(stamps)

    upcoming = filter_past(showings)
    kept = filter_movies(titles, upcoming)
    movie_names, movie_times = combine_times(*kept)

    return movie_names, movie_times
コード例 #23
0
ファイル: scrapers.py プロジェクト: meereeum/cinematic
def get_movies_brattle(theater, date):
    """Get movie names and times from Brattle Theatre's website

    :theater: str
    :date: str (yyyy-mm-dd) (default: today)
    :returns: (list of movie names, list of lists of movie times);
              ([], []) if nothing (non-hidden) is listed for `date`
    """
    BASE_URL = 'https://www.brattlefilm.org/category/calendar-2'
    VIRTUAL = 'category-virtual-programs'

    soup = soup_me(BASE_URL)

    # listings are selected by date-derived classes ("y<yyyy> m<mm> d<dd>")
    PATTERN = re.compile('y{} m{} d{}'.format(*date.split('-')))
    relevant_movies = soup('div', class_=PATTERN)

    movie_names = [m.h2.text for m in relevant_movies]

    movie_formats_nested = [  # list of lists
        [
            tag.replace('tag-', '') for tag in m['class']
            if tag.startswith('tag-') or tag == VIRTUAL
        ] for m in relevant_movies
    ]
    # sometimes, tags are just the movie name.. so drop those
    movie_formats = [  # (filtered) list of strs
        ', '.join(fmt for fmt in fmts
                  if fmt.lower() != name.lower().replace(' ', ''))
        for fmts, name in zip(movie_formats_nested, movie_names)
    ]

    # filter `hidden` (e.g. cancelled series)
    visible = [
        (m, fmt, name)
        for m, fmt, name in zip(relevant_movies, movie_formats, movie_names)
        if 'hidden' not in fmt
    ]
    if not visible:
        # nothing left to show; unpacking an empty `zip(*...)` into three
        # names would otherwise raise ValueError
        return [], []
    relevant_movies, movie_formats, movie_names = map(list, zip(*visible))

    # strange days -- N.B. if one virtual, assume all virtual (for now)
    # (membership test: matches an entry whose format string is exactly VIRTUAL)
    if VIRTUAL in movie_formats:
        return movie_names, [['virtual'] for _ in movie_names]

    # only last time is labeled explicitly -- assume rest are p.m. (unless already annotated)
    DEFAULT_TIME_OF_DAY = 'pm'
    PATTERN1 = re.compile(r'^([0-9: apm\.]*)', re.I)  # capture time
    PATTERN2 = re.compile(rf'([apm\.]+) ?{DEFAULT_TIME_OF_DAY}',
                          re.I)  # rm extraneous
    PADDED = rf'\1{DEFAULT_TIME_OF_DAY}'

    movie_datetimes = [
        [
            DATETIME_SEP.join((
                date,
                # 2. strip extraneous default (i.e. if already labeled)
                PATTERN2.sub(
                    r'\1',
                    # 1. pad with default time just in case
                    PATTERN1.sub(PADDED, time))))
            for time in m.li.text.replace('at ', '').split(',')
        ] for m in relevant_movies
    ]
    movie_times = filter_past(movie_datetimes)

    TIME_ONLY = re.compile(r'^[0-9:]*((p|a)m)?')  # time only
    STRIP_EDGES = re.compile(r'^[^a-z0-9]*(.*[a-z0-9])[^a-z0-9]*$'
                             )  # string format only (e.g. no parens)

    # capture extra showing info:
    # whatever follows the leading time is the dirty format, then clean it
    movie_formats_extra = [
        [STRIP_EDGES.sub(r'\1', TIME_ONLY.sub('', t)) for t in ts]
        for ts in movie_times
    ]

    # .. & further clean times (TIME_ONLY can match the empty string, so
    # `match` never returns None here)
    movie_times = [[TIME_ONLY.match(t).group(0) for t in ts]
                   for ts in movie_times]
    # before possibly re-annotating (per-showtime)
    movie_times = [[
        t if not fmt else t + f' [ {fmt} ]' for t, fmt in zip(ts, fmts)
    ] for ts, fmts in zip(movie_times, movie_formats_extra)]

    # annotate with (per-movie) format
    movie_times = [
        (times if not times or not fmt else times + [f'[ {fmt} ]'])
        for times, fmt in zip(movie_times, movie_formats)
    ]

    movie_names, movie_times = combine_times(
        *filter_movies(movie_names, movie_times))

    return movie_names, movie_times
コード例 #24
0
ファイル: scrapers.py プロジェクト: meereeum/cinematic
 def convert(contentlst):
     """Build a datetime string from a (date, <ignored>, 'start–end') triple.

     Only the start of the time span is kept; the date goes through
     `convert_date` before being joined to it with DATETIME_SEP.
     """
     raw_date, _, span = contentlst
     # en-dash separated span; unpacking requires exactly two parts
     begin, _finish = span.split('–')
     return DATETIME_SEP.join([convert_date(raw_date), begin])
コード例 #25
0
ファイル: scrapers.py プロジェクト: meereeum/cinematic
def get_movies_film_forum(theater, date):
    """Get movie names and times from Film Forum's website

    :theater: str
    :date: str (yyyy-mm-dd) (default: today)
    :returns: (list of movie names, list of lists of movie times);
              ([], []) if the response looks like a robots block
    """
    BASE_URL = 'https://filmforum.org/'

    soup = soup_me(BASE_URL, from_headless=True)

    # blocked responses lead with <meta name="robots">.
    # checked explicitly (not via `assert`) so the guard survives `python -O`,
    # and a page without any <meta> no longer raises AttributeError
    meta = soup.meta
    if meta is not None and meta.attrs.get('name', '').lower() == 'robots':
        print(error_str.format('robots'))  # error msg only
        return [], []  # blocked from getting movies :(

    days = [
        d.text
        for d in (soup.find('div', class_='sidebar-container').find_all('li'))
    ]
    iday = index_into_days(days, date=date)

    # each day's listings live in their own tab
    day = soup.find('div', id=f'tabs-{iday}')

    movie_names = [
        ''.join(
            (txt for txt in mdiv.contents
             if isinstance(txt, str))).strip()  # ignore txt in extra <span>s
        for mdiv in day('a', href=re.compile('^https://filmforum.org/film'))
    ]

    # N.B. could have modifier like "♪" after time
    # a time ending in a digit (optionally followed by "*") gets " pm"
    # appended, and the "*" is dropped
    PATTERN = re.compile(r'([0-9])\*?$')

    movie_datetimes = [
        [
            DATETIME_SEP.join(
                (date, PATTERN.sub(r'\1 pm',
                                   time.text)))  # only AM is labeled explicitly
            for time in p('span', class_=None)
        ] for p in day('p')
    ]

    movie_times = filter_past(movie_datetimes)
    movie_names, movie_times = combine_times(
        *filter_movies(movie_names, movie_times))

    return movie_names, movie_times