def get_movies_cinema_village(theater, date):
    """Get movie names and times from Cinema Village's website

    :theater: str (not used by this scraper)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times)
    """
    page = soup_me('https://www.cinemavillage.com/showtimes/')

    # the last child of each tab link is taken as that tab's date
    # (dots are swapped for dashes before matching)
    tab_dates = []
    for tab_link in page('a', {'data-toggle': 'tab'}):
        tab_dates.append(tab_link.contents[-1].strip().replace('.', '-'))

    # locate the tab panel that belongs to the requested date
    tab_idx = index_into_days(tab_dates, date=date)
    panel = page.find('div', id=f'tab_default_{tab_idx}')

    titles = [link.text for link in panel('a')]

    # one list of "date <sep> time" strings per showtime block
    showings = []
    for time_block in panel('div', class_='sel-time'):
        showings.append([
            DATETIME_SEP.join((date, slot.text))
            for slot in time_block('span')
        ])

    upcoming = filter_past(showings)
    titles, upcoming = combine_times(*filter_movies(titles, upcoming))
    return titles, upcoming
def get_movies_film_noir(theater, date):
    """Get movie names and times from Film Noir website

    :theater: str (not used by this scraper)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times)
    """
    page = soup_me('https://www.filmnoircinema.com/program')

    when = dparser.parse(date)
    # event links embed the date as /program/<year>/<month>/<day>/
    # (no zero-padding)
    href_pattern = re.compile(
        f'/program/{when.year}/{when.month}/{when.day}/')
    title_links = page(
        'a', class_='eventlist-title-link', href=href_pattern)

    titles = [link.text for link in title_links]

    # collect the start times of every matched event into ONE flat list
    showings = []
    for link in title_links:
        start_tags = link.next.next.next(
            'time', class_='event-time-12hr-start')
        for start_tag in start_tags:
            showings.append(
                DATETIME_SEP.join((start_tag['datetime'], start_tag.text)))

    upcoming = filter_past(showings)

    # drop movies with no future times & merge times for the same movie
    titles, upcoming = combine_times(*filter_movies(titles, upcoming))
    return titles, upcoming
def get_movies_ifc(theater, date):
    """Get movie names and times from IFC's website

    :theater: str (not used by this scraper)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times)
    """
    page = soup_me('http://www.ifccenter.com/')

    try:
        # exactly one daily-schedule block must carry the requested date;
        # unpacking raises ValueError for zero (or several) matches
        schedule, = (
            block
            for block in page('div', class_=re.compile('^daily-schedule'))
            if block.h3.text != 'Coming Soon'
            and convert_date(block.h3.text) == date
        )
    except ValueError:  # no matching date listed yet
        return [], []

    film_divs = schedule('div')
    titles = [film.h3.text for film in film_divs]

    showings = []
    for film in film_divs:
        showings.append(
            [DATETIME_SEP.join((date, item.text)) for item in film('li')])

    upcoming = filter_past(showings)
    titles, upcoming = combine_times(*filter_movies(titles, upcoming))
    return titles, upcoming
def get_movies_landmark(theater, date):
    """Get movie names and times from Kendall Landmark's website

    :theater: str (not used by this scraper)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times)
    """
    payload = json_me(
        'https://movie-lmt.peachdigital.com/movies/GetFilmsByCinema/21/151')
    films = payload['Result']

    titles = [film['Title'] for film in films]

    # per film: gather the start times of every session on the requested
    # date, then flatten the per-session lists into one list
    showings = []
    for film in films:
        per_session = []
        for session in film['Sessions']:
            per_session.append([
                DATETIME_SEP.join((date, slot['StartTime']))
                for slot in session['Times']
                if convert_date(session['DisplayDate']) == date
            ])
        showings.append(flatten(per_session))

    upcoming = filter_past(showings)
    titles, upcoming = combine_times(*filter_movies(titles, upcoming))
    return titles, upcoming
def get_movies_momi(theater, date):
    """Get movie names and times from Museum of the Moving Image's website

    :theater: str (not used by this scraper)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'http://www.movingimage.us/visit/calendar/{}/day/type/1'

    # the site addresses days as yyyy/mm/dd in both the page URL and hrefs
    slashed_date = date.replace('-', '/')
    page = soup_me(BASE_URL.format(slashed_date))

    event_links = page(
        'a', href=re.compile('calendar/{}'.format(slashed_date)))

    titles = []
    for link in event_links:
        titles.append(link.find('span', class_=re.compile("^color")).text)

    # the <em> text is split on ' | '; its first field (dots removed)
    # is used as the single showtime
    showings = []
    for link in event_links:
        start = link.em.text.split(' | ')[0].replace('.', '')
        showings.append([DATETIME_SEP.join((date, start))])

    upcoming = filter_past(showings)
    titles, upcoming = combine_times(*filter_movies(titles, upcoming))
    return titles, upcoming
def get_movies_filmlinc(theater, date):
    """Get movie names and times from Film at Lincoln Center's website

    :theater: str (not used by this scraper)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times)
    """
    events = json_me(
        'https://www.filmlinc.org/wp-content/themes/filmlinc/api-events.php',
        {'start': date, 'end': date})

    titles = [event['title'] for event in events]

    # yyyy-mm-dd @ hh:mm {a,p}m
    stamp_format = DATETIME_SEP.join(('%Y-%m-%d', '%l:%M%P'))

    showings = []
    for event in events:
        # 'start' is an epoch timestamp in ms
        started = datetime.fromtimestamp(event['start'] / 1000)
        showings.append(started.strftime(stamp_format))

    upcoming = filter_past(showings)
    titles, upcoming = combine_times(*filter_movies(titles, upcoming))
    return titles, upcoming
def get_movies_hfa(theater, date):
    """Get movie names and times from Harvard Film Archive's website

    :theater: str (not used by this scraper)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times)
    """
    page = soup_me('https://harvardfilmarchive.org')

    try:
        # exactly one calendar row must be stamped with the requested date
        row, = (
            candidate
            for candidate in page('div', class_='grid m-calendar__row')
            if candidate.time.attrs['datetime'] == date
        )
    except ValueError:  # no matching days
        return [], []

    titles = [heading.text.strip() for heading in row('h5')]

    # one datetime string per event
    showings = []
    for time_div in row('div', class_='event__time'):
        showings.append(DATETIME_SEP.join((date, time_div.text)))

    upcoming = filter_past(showings)
    titles, upcoming = combine_times(*filter_movies(titles, upcoming))
    return titles, upcoming
def get_movies_village_east_or_angelika(theater, date):
    """Get movie names and times from Village East Cinema or Angelika Film Center's website

    :theater: str (case-insensitive; 'village east cinema' or
              'angelika film center' -- anything else raises KeyError)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'https://www.{}/showtimes-and-tickets/now-playing/{}'
    D_THEATERS = {
        'village east cinema': 'citycinemas.com/villageeast',
        'angelika film center': 'angelikafilmcenter.com/nyc'
    }

    page = soup_me(BASE_URL.format(D_THEATERS[theater.lower()], date))

    all_titles = [heading.text for heading in page('h4', class_='name')]

    # for each status <div>, keep its first class starting with 'status'
    statuses = []
    for status_div in page('div', class_=re.compile('^status')):
        statuses.append(
            first((cls for cls in status_div['class']
                   if cls.startswith('status'))))

    assert len(all_titles) == len(
        statuses), f'{len(all_titles)} != {len(statuses)}'

    # filter for currently playing only
    playing = []
    for title, status in zip(all_titles, statuses):
        if status.endswith('coming_soon'):
            continue
        playing.append(title)

    if not playing:
        return [], []

    showings = []
    for wrapper in page('div', class_="showtimes-wrapper"):
        showings.append([
            DATETIME_SEP.join((date, box['value']))
            for box in wrapper('input', class_='showtime reserved-seating')
        ])
    upcoming = filter_past(showings)

    # extract format from name, if any
    PATTERN = re.compile('in ((35|70)mm)$', re.I)

    def split_title(title):
        # only name and (35|70)mm, if any
        pieces = PATTERN.split(title)[:2]
        name, fmt_parts = pieces[0], pieces[1:]
        return name, ''.join(fmt_parts).lower()  # (cleaned) name, fmt

    movie_names, movie_formats = zip(*(split_title(t) for t in playing))

    # annotate with format
    annotated = []
    for times, fmt in zip(upcoming, movie_formats):
        annotated.append(times + [f'[ {fmt} ]'] if times and fmt else times)

    movie_names, movie_times = combine_times(
        *filter_movies(movie_names, annotated))
    return movie_names, movie_times
def get_movies_amc(theater, date):
    """Get movie names and times from AMC's website

    :theater: str (case-insensitive; 'amc boston common' or
              'the waterfront' -- anything else raises KeyError)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'https://www.amctheatres.com/movie-theatres/{}/{}/showtimes/all/{}/{}/all'
    D_THEATERS = {
        'amc boston common': ('boston', 'amc-boston-common-19'),
        'the waterfront': ('pittsburgh', 'amc-waterfront-22')
    }

    city, slug = D_THEATERS[theater.lower()]
    page = soup_me(BASE_URL.format(city, slug, date, slug))

    films = page('div', class_='ShowtimesByTheatre-film')
    titles = [film.h2.text for film in films]

    # film -> showtime section -> times (sold-out showings are skipped)
    # TODO print sold-out times as xed-out ?
    section_class = re.compile('^Showtimes-Section Showtimes-Section')
    nested_showings = []
    for film in films:
        per_section = []
        for section in film('div', class_=section_class):
            per_section.append([
                DATETIME_SEP.join((date, clean_time(slot.text)))
                for slot in section('div', class_='Showtime')
                if slot.find('div', {'aria-hidden': "true"}).text != 'Sold Out'
            ])
        nested_showings.append(per_section)

    # flatten timelists for movies with multiple formats: repeat each
    # title once per section so names and timelists stay aligned
    # TODO sometimes lists separate times for same format -- combine ?
    repeated_titles = []
    for title, per_section in zip(titles, nested_showings):
        repeated_titles.extend([title] * len(per_section))

    upcoming = filter_past(flatten(nested_showings))

    # annotate with format ('Digital' is left unlabeled)
    formats = flatten([[tag.text for tag in film('h4')] for film in films])
    annotated = []
    for times, fmt in zip(upcoming, formats):
        if fmt == 'Digital' or not times:
            annotated.append(times)
        else:
            annotated.append(times + [f'[ {fmt} ]'])

    # TODO combine_times does not know formats, so only filter here
    movie_names, movie_times = filter_movies(repeated_titles, annotated)
    return movie_names, movie_times
def get_movies_moma(theater, date):
    """Get movie names and times from Museum of Modern Art's website

    :theater: str (not used by this scraper)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'https://www.moma.org/calendar/?utf8=%E2%9C%93&happening_filter=Films&date={}&location=both'
    page = soup_me(BASE_URL.format(date))

    def when_text(tile):
        # full datetime text of a calendar tile
        return tile.find('div', class_='center balance-text').text

    # keep only tiles whose month & day (2nd comma-separated field, after
    # turning non-breaking spaces into plain ones) match the requested date
    tiles = []
    for tile in page('div', class_='calendar-tile calendar-tile--tall-image'):
        month_day = when_text(tile).replace(u'\xa0', ' ').split(', ')[1]
        if date == convert_date(month_day):
            tiles.append(tile)

    # list per showing.. some have multiple films
    billed = []
    for tile in tiles:
        emphasized = tile.h3('em')
        if emphasized:
            billed.append([em.text for em in emphasized])
        else:
            billed.append([tile.h3.text])

    # main attraction is the last film; anything before it becomes "+ ..."
    titles = [films[-1] for films in billed]
    extras = []
    for films in billed:
        extras.append(
            '+ {}'.format(','.join(films[:-1])) if len(films) > 1 else '')

    PATTERN = re.compile('–[0-9]*:?[0-9]*')  # end of a time range
    stamp_format = DATETIME_SEP.join(
        ('%Y-%m-%d', '%l:%M%P'))  # yyyy-mm-dd @ hh:mm {a,p}m
    showings = []
    for tile in tiles:
        start_only = PATTERN.sub('', when_text(tile))  # remove time ranges
        showings.append(dparser.parse(start_only).strftime(stamp_format))

    upcoming = filter_past(showings)

    # annotate with format
    annotated = [
        times + [f'[ {extra} ]'] if times and extra else times
        for times, extra in zip(upcoming, extras)
    ]

    movie_names, movie_times = filter_movies(titles, annotated)
    return movie_names, movie_times
def get_movies_nitehawk(theater, date):
    """Get movie names and times from Nitehawk's website

    :theater: str (case-insensitive; 'nitehawk' or
              'nitehawk prospect park' -- anything else raises KeyError)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'https://nitehawkcinema.com/{}/{}'
    D_THEATERS = {
        'nitehawk': 'williamsburg',
        'nitehawk prospect park': 'prospectpark'
    }

    soup = soup_me(BASE_URL.format(D_THEATERS[theater.lower()], date))

    movie_names = [movie.text for movie in soup('div', class_='show-title')]

    if not movie_names:  # nothing listed; also keeps zip(*...) below safe
        return [], []

    # extract format from name, if any
    # (raw string: '\(' and '\)' are regex escapes, not str escapes)
    PATTERN = re.compile(r' \(.*(DCP|(35|70)mm)\)$', re.I)

    def extract_fmt(m):
        # only name and DCP / (35|70)mm, if any
        m, *fmt = re.split(PATTERN, m)[:2]
        return m, ''.join(fmt).lower()  # (cleaned) movie name, movie fmt

    movie_names, movie_formats = zip(*(extract_fmt(m) for m in movie_names))

    movie_datetimes = [
        [
            DATETIME_SEP.join((
                date,
                clean_time((
                    t.contents[0]  # ignore any junk after {a,p}m
                    .strip().lower().replace(
                        'midnite', '11:59pm')))))  # else, wld be next day
            for t in times('a', class_='showtime')
        ] for times in soup('div', class_='showtimes-container clearfix')
    ]
    movie_times = filter_past(movie_datetimes)

    # annotate with format ('dcp' is left unlabeled)
    movie_times = [(times if fmt == 'dcp' or not times or not fmt else
                    times + [f'[ {fmt} ]'])
                   for times, fmt in zip(movie_times, movie_formats)]

    # combine_times is deliberately not applied here; only filter
    movie_names, movie_times = filter_movies(movie_names, movie_times)
    return movie_names, movie_times
def get_movies_showtimes(theater, date):
    """Get movie names and times from Showtimes' website

    :theater: str (case-insensitive; theaters missing from the built-in
              table are resolved through `get_theaterpg_showtimes`)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'https://www.showtimes.com/movie-theaters/{}'
    D_THEATERS = {
        'regal fenway': lambda *args: 'regal-fenway-stadium-13-rpx-6269',
        'ua court st': lambda *args: 'ua-court-street-stadium-12-rpx-6608'
    }

    try:
        # entries are phrased as functions, so the theater-page scraper
        # (the fallback for unlisted theaters) won't run until necessary
        resolve_slug = D_THEATERS.get(theater.lower(), get_theaterpg_showtimes)
        page = soup_me(BASE_URL.format(resolve_slug(theater)))
        film_boxes = page('li', class_='movie-info-box')
    except (Exception) as e:
        print(error_str.format(e))  # error msg only
        film_boxes = []  # no matching theater

    titles = []
    for box in film_boxes:
        heading_bits = (
            re.sub('[\r\n].*', '', heading.text.strip())
            for heading in box('h2', class_='media-heading'))
        titles.append(''.join(heading_bits))

    def starts_new_day(label):
        # button labels containing a comma are treated as day headers
        return ',' in label

    # [[day, time, time, day, time], ..]
    #   -> [[[day, time, time], [day, time]], ..]
    grouped_buttons = []
    for box in film_boxes:
        labels = (button.text for button in box('button', type='button'))
        grouped_buttons.append(list(split_before(labels, starts_new_day)))

    showings = []
    for groups in grouped_buttons:
        matching_days = [
            [DATETIME_SEP.join((day.replace(':', ''), slot)) for slot in slots]
            for day, *slots in groups
            if convert_date(day.replace(':', '')) == date
        ]
        showings.append(flatten(matching_days))

    upcoming = filter_past(showings)
    titles, upcoming = combine_times(*filter_movies(titles, upcoming))
    return titles, upcoming
def get_movies_somerville(theater, date):
    """Get movie names and times from Somerville Theater's website

    :theater: str (not used by this scraper)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times);
              two empty lists when the feed lists no films
    """
    BASE_URL = 'https://somervilletheatre.com/wp-content/themes/somerville/showtimes.xml'

    soup = soup_me(BASE_URL)

    movies = soup('filmtitle')

    movie_names = [m.shortname.text
                   for m in movies]  # /or/ m.find('name').text

    if not movie_names:
        # zip(*...) below cannot be unpacked from an empty sequence
        return [], []

    # extract format from name, if any
    PATTERN = re.compile(' ((35|70)mm)$', re.I)

    def extract_fmt(m):
        m, *fmt = re.split(PATTERN, m)[:2]  # only name and (35|70)mm, if any
        return m, ''.join(fmt).lower()  # (cleaned) movie name, movie fmt

    movie_names, movie_formats = zip(*(extract_fmt(m) for m in movie_names))

    def to_year_first(mmddyyyy):
        # mmddyyyy -> yyyymmdd
        return mmddyyyy[-4:] + mmddyyyy[:-4]

    # loop invariants: requested date in the feed's format & output format
    target_day = convert_date(date, fmt_out='%m%d%Y')
    stamp_format = DATETIME_SEP.join(
        ('%Y-%m-%d', '%l:%M%P'))  # yyyy-mm-dd @ hh:mm {a,p}m

    movie_datetimes = [
        [
            dparser.parse(' '.join(
                (to_year_first(d.text), t.text)))  # yyyymmdd hhmm ->
            .strftime(stamp_format)
            for d, t in zip(m('date'), m('time'))
            if d.text == target_day
        ] for m in movies
    ]
    movie_times = filter_past(movie_datetimes)

    # annotate with formats
    movie_times = [(times if not times or not fmt else times + [f'[ {fmt} ]'])
                   for times, fmt in zip(movie_times, movie_formats)]

    movie_names, movie_times = combine_times(
        *filter_movies(movie_names, movie_times))
    return movie_names, movie_times
def get_movies_pghfilmmakers(theater, date):
    """Get movie names and times from Pittsburgh Filmmakers website

    :theater: str (case-insensitive; 'regent square theater',
              'harris theater' or 'melwood screening room' -- anything
              else raises KeyError)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'http://cinema.pfpca.org/films/showtimes?location={}'
    D_THEATERS = {
        'regent square theater': 24,
        'harris theater': 20,
        'melwood screening room': 18
    }

    page = soup_me(BASE_URL.format(D_THEATERS[theater.lower()]))

    # get date block: exactly one <caption> must read like "Mon, Jan 1"
    try:
        caption, = (
            cap for cap in page('caption')
            if cap.text == convert_date(date, fmt_out='%a, %b %-d')
        )
    except ValueError:  # zero (or several) captions matched
        return [], []

    # showings are searched three parse-steps past the caption
    listing = caption.next.next.next

    titles = [
        link.text for link in listing('a', href=re.compile('/films/*'))
    ]

    showings = []
    for cell in listing(
            'td', class_='views-field views-field-field-location'):
        showings.append(
            DATETIME_SEP.join((date, cell.next.next.next.text.strip())))

    upcoming = filter_past(showings)

    # drop movies with no future times & merge times for the same movie
    titles, upcoming = combine_times(*filter_movies(titles, upcoming))
    return titles, upcoming
def get_movies_quad(theater, date):
    """Get movie names and times from Quad's website

    :theater: str (not used by this scraper)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'https://quadcinema.com/all/'

    soup = soup_me(BASE_URL)

    try:
        # exactly one day block must be headed by the requested date
        day, = [
            d for d in soup('div', class_='now-single-day')
            if convert_date(d.h1.text) == date
        ]
    except ValueError:  # no matching date listed yet
        return [], []

    movie_names = [movie.text for movie in day('h4')]

    movies = day('div', class_='single-listing')
    PATTERN = re.compile('^time')  # showtime `li`s
    movie_datetimes = [[
        DATETIME_SEP.join((date, time.text.replace('.', ':')))
        for time in m('li', class_=PATTERN)
    ] for m in movies]
    movie_times = filter_past(movie_datetimes)

    # non-showtime `li`s: a negative lookahead rejects classes that begin
    # with "time". (A character class such as [^(time)] would instead
    # reject any class whose FIRST LETTER is one of t, i, m, e, '(' or ')'.)
    ANTIPATTERN = re.compile('^(?!time)')
    movie_formats = [[fmt.text for fmt in m('li', class_=ANTIPATTERN)]
                     for m in movies]

    # annotate with formats
    movie_times = [(times if not times or not fmt else times +
                    ['[ {} ]'.format(','.join(fmt))])
                   for times, fmt in zip(movie_times, movie_formats)]

    movie_names, movie_times = combine_times(
        *filter_movies(movie_names, movie_times))
    return movie_names, movie_times
def get_movies_syndicated(theater, date):
    """Get movie names and times from Syndicated's website

    :theater: str (not used by this scraper)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times);
              two empty lists when nothing is listed or the first entry
              is a private-event closure
    """
    BASE_URL = 'https://syndicatedbk.com/events/'

    soup = soup_me(BASE_URL)

    movie_strs = [
        div.text.strip() for div in soup(
            'div', id=re.compile(f'tribe-events-event-[0-9]*-{date}'))
    ]

    if not movie_strs or movie_strs[0].lower() == 'closed for private event':
        return [], []

    # entries are expected to read "<name> (<h:mm> <am|pm>)"; group 1 is
    # the time without the surrounding " (" and ")"
    # (raw string: '\(' and '\)' are regex escapes, not str escapes)
    PATTERN = re.compile(r' \(([0-9:]* [ap]m)\)', re.I)

    # pair each entry with its match, skipping entries that carry no
    # recognizable time (they used to crash on `None.start()`)
    parsed = [(movie_str, PATTERN.search(movie_str))
              for movie_str in movie_strs]
    parsed = [(movie_str, m) for movie_str, m in parsed if m is not None]

    movie_names = [movie_str[:m.start(0)]  # everything before the time
                   for movie_str, m in parsed]

    movie_datetimes = [
        DATETIME_SEP.join((date, m.group(1))) for _, m in parsed
    ]
    movie_times = filter_past(movie_datetimes)

    movie_names, movie_times = combine_times(
        *filter_movies(movie_names, movie_times))
    return movie_names, movie_times
def get_movies_manor(theater, date):
    """Get movie names and times from The Manor's website

    :theater: str (not used by this scraper)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times)
    """
    # request headers sent along with the schedule query
    request_headers = {
        'Host': 'plugin.retrieverapi.com',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0',
        'Accept': 'application/json',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'Referer': 'https://plugin.retrieverapi.com/embed/4227729?print',
        'Authorization': 'Basic NDIyNzcyOToxMjM=',
        'DNT': '1',
        'Connection': 'keep-alive'
    }

    payload = json_me(
        'https://plugin.retrieverapi.com/getSchedule',
        {'date': date},
        headers=request_headers)
    films = payload['movies']

    titles = [film['movie_name'] for film in films]

    # yyyy-mm-dd @ hh:mm {a,p}m
    stamp_format = DATETIME_SEP.join(('%Y-%m-%d', '%l:%M%P'))
    showings = []
    for film in films:
        showings.append([
            dparser.parse(show['date_time']).strftime(stamp_format)
            for show in film['showtimes']
        ])

    upcoming = filter_past(showings)
    titles, upcoming = combine_times(*filter_movies(titles, upcoming))
    return titles, upcoming
def get_movies_anthology(theater, date):
    """Get movie names and times from Anthology's website

    :theater: str (not used by this scraper)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times);
              two empty lists when no listed day matches
    """
    # N.B. the URL has no placeholders, so it is used as-is
    BASE_URL = 'http://anthologyfilmarchives.org/film_screenings/calendar?view=list'

    soup = soup_me(BASE_URL)

    days = soup('h3', class_='current-day')
    try:
        # match on each heading's own text, ignoring nested tags
        iday = index_into_days([
            ''.join(
                (piece for piece in day.contents if isinstance(piece, str))
            ).strip() for day in days
        ], date=date)
    except AssertionError:  # no matching days
        return [], []

    # the day's showings sit between its heading and the next heading
    # (or the footer, for the last listed day)
    border = (days[iday + 1]
              if iday < len(days) - 1 else soup.find('div', id='footer'))

    next_movies = days[iday].find_all_next('div', class_='showing-details')
    prev_movies = border.find_all_previous('div', class_='showing-details')

    # intersection b/w borders, by object identity and in document order.
    # (A set intersection would compare tags by their rendered content and
    # return them in arbitrary order.)
    before_border = {id(m) for m in prev_movies}
    movies = [m for m in next_movies if id(m) in before_border]

    movie_names = [m.find('span', class_='film-title').text for m in movies]

    movie_datetimes = [[
        DATETIME_SEP.join((date, time.text))
        for time in movie('a', {'name': re.compile("^showing-")})
    ] for movie in movies]
    movie_times = filter_past(movie_datetimes)

    movie_names, movie_times = combine_times(
        *filter_movies(movie_names, movie_times))
    return movie_names, movie_times
def get_movies_cobble_hill(theater, date):
    """Get movie names and times from Cobble Hill Cinema's website

    :theater: str (not used by this scraper)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times)
    """
    BASE_URL = 'https://64785.formovietickets.com:2235/T.ASP?WCI=BT&Page=schedule&SelectedDate={}'

    # the schedule page takes the date without dashes (yyyymmdd)
    page = soup_me(BASE_URL.format(date.replace('-', '')))

    titles = [link.text for link in page('a', class_='displaytitle')]

    showings = []
    for block in page('div', class_='showings'):
        # an 'm' is appended to every listed time
        # (presumably they end in a bare 'a' / 'p' -- TODO confirm)
        showings.append([
            DATETIME_SEP.join((date, slot.text + 'm'))
            for slot in block('a', class_='showtime')
        ])

    upcoming = filter_past(showings)
    titles, upcoming = combine_times(*filter_movies(titles, upcoming))
    return titles, upcoming
def get_movies_coolidge(theater, date):
    """Get movie names and times from Coolidge Corner's website

    :theater: str (not used by this scraper)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times)
    """
    page = soup_me('https://coolidge.org/showtimes', {'date': date})

    cards = page('div', class_='film-card')

    titles = [card.h2.text for card in cards]

    showings = []
    for card in cards:
        showings.append([
            DATETIME_SEP.join((date, slot.text))
            for slot in card('span', class_='showtime-ticket__time')
        ])
    upcoming = filter_past(showings)

    # program tags ending in 'mm' (e.g. 35mm) are kept as the format
    PATTERN = re.compile('^film-program__title')
    formats = []
    for card in cards:
        film_gauges = [
            tag.text for tag in card('span', class_=PATTERN)
            if tag.text.endswith('mm')
        ]
        formats.append(', '.join(film_gauges))

    # annotate with format
    annotated = [
        times + [f'[ {fmt} ]'] if times and fmt else times
        for times, fmt in zip(upcoming, formats)
    ]

    movie_names, movie_times = combine_times(
        *filter_movies(titles, annotated))
    return movie_names, movie_times
def get_movies_bam(theater, date):
    """Get movie names and times from BAM Rose Cinema's website

    :theater: str (not used by this scraper)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times)
    :raises: ValueError if a listed showtime cannot be parsed for sorting
    """
    BASE_URL = 'https://www.bam.org/Filmsection'

    soup = soup_me(BASE_URL)

    # listings carry a sortable date attribute beginning with yyyymmdd
    relevant_movies = soup(
        'div',
        {'data-sort-date': re.compile('^{}'.format(date.replace('-', '')))})

    movie_names = [
        m.find('div', class_='listModuleTitleMed listBlock').text
        for m in relevant_movies
    ]

    # h[:mm][am|pm], e.g. "7:40PM", "11AM", "9:15"
    TIME_PATTERN = re.compile(r'(\d{1,2})(?:[:.](\d{1,2}))?\s*([ap]m)?', re.I)

    def minutes_since_midnight(t):
        """Sort key: 12-hour clock time -> minutes since midnight.

        Honors am/pm (and 12 o'clock wrapping), so "11:00AM" sorts before
        "2:00PM" and "12:30PM" before "7:40PM". Times without am/pm are
        ordered by their face value.
        """
        parsed = TIME_PATTERN.fullmatch(t.strip())
        if parsed is None:
            raise ValueError(f'unrecognized showtime: {t!r}')
        hour, minute, meridiem = parsed.groups()
        hour, minute = int(hour), int(minute or 0)
        if meridiem:
            hour = hour % 12 + (12 if meridiem.lower() == 'pm' else 0)
        return 60 * hour + minute

    movie_sortedtimes = [
        sorted(  # not always time-ordered
            [time.text.strip().replace(',', '') for time in m('li')],
            key=minutes_since_midnight) for m in relevant_movies
    ]

    movie_datetimes = [[DATETIME_SEP.join((date, time)) for time in times]
                       for times in movie_sortedtimes]
    movie_times = filter_past(movie_datetimes)

    movie_names, movie_times = combine_times(
        *filter_movies(movie_names, movie_times))
    return movie_names, movie_times
def get_movies_rowhouse(theater, date):
    """Get movie names and times from Row House Cinema's website

    :theater: str (not used by this scraper)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times)
    """
    # the requested date is appended to the site root
    page = soup_me('https://rowhousecinema.com/{}'.format(date))

    film_blocks = page('div', class_='showtimes-description')

    titles = []
    showings = []
    for block in film_blocks:
        titles.append(block.h2.text.strip())
    for block in film_blocks:
        showings.append([
            DATETIME_SEP.join((date, link.text.strip()))
            for link in block('a', class_='showtime')
        ])

    upcoming = filter_past(showings)
    titles, upcoming = combine_times(*filter_movies(titles, upcoming))
    return titles, upcoming
def get_movies_brattle(theater, date):
    """Get movie names and times from Brattle Theatre's website

    :theater: str (not used by this scraper)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times);
              two empty lists when no (non-hidden) movie matches the date
    """
    BASE_URL = 'https://www.brattlefilm.org/category/calendar-2'

    soup = soup_me(BASE_URL)

    # calendar entries are matched on a "y<yyyy> m<mm> d<dd>" class string
    PATTERN = re.compile('y{} m{} d{}'.format(*date.split('-')))
    relevant_movies = soup('div', class_=PATTERN)

    VIRTUAL = 'category-virtual-programs'

    movie_names = [m.h2.text for m in relevant_movies]

    movie_formats_nested = [  # list of lists
        [
            tag.replace('tag-', '')
            for tag in m['class']
            if tag.startswith('tag-') or tag == VIRTUAL
        ] for m in relevant_movies
    ]
    movie_formats = [  # (filtered) list of strs
        ', '.join((fmt for fmt in fmts
                   if not fmt.lower() == name.lower().replace(' ', '')
                   ))  # sometimes, tags are just the movie name..
        for fmts, name in zip(movie_formats_nested, movie_names)
    ]

    # filter `hidden` (e.g. cancelled series)
    visible = [
        (m, fmt, name)
        for m, fmt, name in zip(relevant_movies, movie_formats, movie_names)
        if 'hidden' not in fmt
    ]
    if not visible:
        # zip(*...) below cannot be unpacked from an empty sequence
        return [], []
    relevant_movies, movie_formats, movie_names = zip(*visible)

    if VIRTUAL not in movie_formats:
        # only last time is labeled explicitly -- assume rest are p.m.
        # (unless already annotated)
        DEFAULT_TIME_OF_DAY = 'pm'

        # (raw strings: '\.' is a regex escape, not a str escape)
        PATTERN1 = re.compile(r'^([0-9: apm\.]*)', re.I)  # capture time
        PATTERN2 = re.compile(rf'([apm\.]+) ?{DEFAULT_TIME_OF_DAY}',
                              re.I)  # rm extraneous

        movie_datetimes = [
            [
                DATETIME_SEP.join((
                    date,
                    re.sub(
                        PATTERN2,
                        r'\1',  # 2. strip extraneous default
                        #          (i.e. if already labeled)
                        re.sub(
                            PATTERN1,
                            # 1. pad with default time just in case
                            r'\1{}'.format(DEFAULT_TIME_OF_DAY),
                            time))))
                for time in m.li.text.replace('at ', '').split(',')
            ] for m in relevant_movies
        ]
        movie_times = filter_past(movie_datetimes)

        PATTERN1 = re.compile('^[0-9:]*((p|a)m)?')  # time only
        PATTERN2 = re.compile('^[^a-z0-9]*(.*[a-z0-9])[^a-z0-9]*$'
                              )  # string format only (e.g. no parens)

        # capture extra showing info
        movie_formats_extra = [
            [
                re.sub(PATTERN2, r'\1', re.sub(
                    PATTERN1, '', t))  # extract dirty format, then clean
                for t in ts
            ] for ts in movie_times
        ]
        # .. & further clean times
        movie_times = [[re.match(PATTERN1, t).group(0) for t in ts]
                       for ts in movie_times]

        # before possibly re-annotating (per-showtime)
        movie_times = [[
            t if not fmt else t + f' [ {fmt} ]' for t, fmt in zip(ts, fmts)
        ] for ts, fmts in zip(movie_times, movie_formats_extra)]

        # annotate with (per-movie) format
        movie_times = [
            (times if not times or not fmt else times + [f'[ {fmt} ]'])
            for times, fmt in zip(movie_times, movie_formats)
        ]

        movie_names, movie_times = combine_times(
            *filter_movies(movie_names, movie_times))

    else:
        # strange days -- N.B. if one virtual, assume all virtual (for now)
        movie_times = [['virtual'] for _ in movie_names]

    return movie_names, movie_times
def convert(contentlst):
    """Build a datetime string from a 3-item sequence

    :contentlst: (date, <ignored>, "start–end"), where the last item holds
                 exactly one en dash (anything else raises ValueError)
    :returns: converted date and start time, joined by DATETIME_SEP
    """
    day_text, _, time_range = contentlst
    begin, _finish = time_range.split('–')  # only the start time is kept
    return DATETIME_SEP.join([convert_date(day_text), begin])
def get_movies_film_forum(theater, date):
    """Get movie names and times from Film Forum's website

    :theater: str (not used by this scraper)
    :date: str (yyyy-mm-dd)
    :returns: (list of movie names, list of lists of movie times);
              two empty lists when the site serves its robots page instead
    """
    BASE_URL = 'https://filmforum.org/'

    # fetched via the headless route of `soup_me`
    soup = soup_me(BASE_URL, from_headless=True)

    # a first <meta name="robots"> is taken to mean the request was blocked.
    # Checked with a plain `if`: an `assert` would be stripped under
    # `python -O`, silently disabling this guard.
    if soup.meta.attrs.get('name', '').lower() == 'robots':
        # error msg only (same exception object the message was
        # previously formatted from)
        print(error_str.format(AssertionError('robots')))
        return [], []  # blocked from getting movies :(

    days = [
        d.text
        for d in (soup.find('div', class_='sidebar-container').find_all('li'))
    ]
    iday = index_into_days(days, date=date)

    day = soup.find('div', id=f'tabs-{iday}')

    movie_names = [
        ''.join(
            (txt for txt in mdiv.contents if isinstance(txt, str))
        ).strip()  # ignore txt in extra <span>s
        for mdiv in day('a', href=re.compile('^https://filmforum.org/film'))
    ]

    # N.B. could have modifier like "♪" after time
    # (raw string: '\*' is a regex escape, not a str escape)
    PATTERN = re.compile(r'([0-9])\*?$')

    movie_datetimes = [
        [
            DATETIME_SEP.join(
                (date, re.sub(PATTERN, r'\1 pm',
                              time.text)))  # only AM is labeled explicitly
            for time in p('span', class_=None)
        ] for p in day('p')
    ]
    movie_times = filter_past(movie_datetimes)

    movie_names, movie_times = combine_times(
        *filter_movies(movie_names, movie_times))
    return movie_names, movie_times