def get_optimal_departures(self):
    """Return sampled departures that are as fast as possible, ignoring route.

    This is the status quo: at every sample moment the traveller takes the
    next-departing trip from any alternative itinerary, or walks end to end
    when no trip is left or when the walk is shorter than the time until
    that trip arrives.

    Returns:
        list: one ``Departure`` per value produced by
        ``triptools.sample_times()``, in the same order.
    """
    # get all possible trips and any walking alternatives
    all_trips, walk = [], None
    for itin in self.alter_itins():
        if not walk and itin.is_walking:
            walk = itin
        else:
            # everything else (including any further walking itinerary once
            # one has been kept) contributes its trips
            all_trips.extend(itin.get_trips())

    # if we have only walking, then all trips will be walking
    if walk and len(self.alter_itins()) == 1:
        return [Departure(t, None, walk) for t in triptools.sample_times()]

    # we now have only trips or trips and a walking option
    # the helper's return value is ignored, so it presumably prunes
    # all_trips in place -- TODO confirm
    triptools.remove_premature_departures(all_trips)
    # ensure trips are sorted by departure, ASC
    optimal_trips = sorted(all_trips, key=lambda x: x.depart_ts)

    # iterate over sample moments looking for arrival of next-departing trip
    departures, i = [], 0
    for time in triptools.sample_times():
        # move the trip index up to the present time if necessary
        while i < len(optimal_trips) and optimal_trips[i].depart <= time:
            i += 1
        out_of_trips = i >= len(optimal_trips)
        if out_of_trips or (
            walk and (optimal_trips[i].arrive - time) > walk.walk_time
        ):
            # no trips left (walk may be None here) or walking is faster
            departures.append(Departure(time, None, walk))
        else:
            # a trip exists and is at least as good as walking, if that was
            # available.  The original code carried a third branch after
            # this one, but the two conditions above are exhaustive, so it
            # could never run and has been removed.
            departures.append(Departure(time, optimal_trips[i]))
    return departures
def realtime_departures(self):
    """Pick, per sample moment, the trip that can be boarded soonest.

    Initial walking and waiting are treated indifferently, so trips are
    ranked by their first boarding time.  Where several trips share that
    time (a shared first leg), the one belonging to the itinerary with the
    better mean travel time wins, because itineraries are visited in mean
    travel time order and the later sort is stable.

    Returns a list with one ``Departure`` per value of
    ``triptools.sample_times()``.
    """
    walk_itin = None
    candidate_trips = []

    # visit itineraries from best to worst mean travel time so that the
    # stable sort further down keeps that order among ties
    ranked_itins = sorted(self.alter_itins(), key=lambda i: i.mean_travel_time)
    for itin in ranked_itins:
        if walk_itin or not itin.is_walking:
            candidate_trips.extend(itin.get_trips())
        else:
            # first end-to-end walking option encountered
            walk_itin = itin

    # a lone walking itinerary means every departure is that walk
    if walk_itin and len(self.alter_itins()) == 1:
        return [
            Departure(moment, None, walk_itin)
            for moment in triptools.sample_times()
        ]

    # order by departure minus initial walk; list.sort is stable
    candidate_trips.sort(key=lambda t: t.first_boarding_time)
    trip_count = len(candidate_trips)

    results = []
    cursor = 0
    for moment in triptools.sample_times():
        # skip every trip that has already left by this moment
        while cursor < trip_count and candidate_trips[cursor].depart <= moment:
            cursor += 1

        take_trip = cursor < trip_count and (
            (not walk_itin)
            or candidate_trips[cursor].first_boarding_time
            < moment + walk_itin.walk_time
        )
        if take_trip:
            results.append(Departure(moment, candidate_trips[cursor]))
        else:
            # either no trips remain or walking is the better option
            results.append(Departure(moment, None, walk_itin))
    return results
def departures(self):
    """Departures in the time window using only this itinerary.

    The list is stored on ``self.DB_departures``; a non-empty stored list is
    returned as is, otherwise it is built, stored and returned.
    """
    # reuse what is already held in memory
    if self.DB_departures:
        return self.DB_departures

    from triptools import sample_times
    from departure import Departure

    if self.is_walking:
        # a walking itinerary gives the same departure at every sample moment
        self.DB_departures = [
            Departure(moment, None, self) for moment in sample_times()
        ]
        return self.DB_departures

    # trip based departures: order trips first to last by departure
    ordered_trips = sorted(self.get_trips(), key=lambda t: t.depart_ts)
    self.DB_departures = []
    cursor = 0
    for moment in sample_times():
        # advance past trips that left before this moment
        while cursor < len(ordered_trips) and ordered_trips[cursor].depart < moment:
            cursor += 1
        # None once we have run out of trips
        next_trip = ordered_trips[cursor] if cursor < len(ordered_trips) else None
        self.DB_departures.append(Departure(moment, next_trip))
    return self.DB_departures
def habit_departures(self):
    """Return departures for travellers who always use the same itinerary.

    The habitual itinerary is the alternative with the smallest
    ``mean_travel_time``; on a tie the first one encountered is kept.

    Returns:
        The chosen itinerary's ``departures`` attribute, exactly as accessed
        (presumably a property yielding a list -- TODO confirm), or, when
        there are no alternative itineraries, a list with one bare
        ``Departure(time)`` per value of ``triptools.sample_times()``.
    """
    habit_itin = None
    best_time = None
    # find the best mean travel time
    for itin in self.alter_itins():
        # Compare with None explicitly: a mean travel time of 0 is a valid
        # best value, and a truthiness test would treat it as "not set yet"
        # and let any later itinerary replace it.
        if best_time is None or itin.mean_travel_time < best_time:
            best_time = itin.mean_travel_time
            habit_itin = itin
    if habit_itin is not None:
        return habit_itin.departures
    return [Departure(time) for time in triptools.sample_times()]
def main():
    """Scrape Gate 1 Travel tour departures and write them out per trip.

    The run has four stages:

    1. Fetch each continent landing page with ``requests`` and collect the
       region links found in ``div.region-thumbnail`` elements.
    2. Open every region page in Chrome, click through each season button
       and collect the trip links listed in the ``panel-body`` panels.
    3. Fetch every distinct trip page, parse its ``date-price-table`` into
       ``Departure`` objects and wrap them in a ``Trip``.
    4. Call ``print_deps`` on every trip with a dated file name
       (``gate1_raw_data_<mm-dd-yy>.csv``) and print the error log.

    Trip pages that cannot be fetched, or that raise ``AttributeError``
    while being parsed, are recorded in the error log and in
    ``update_competitor_data.log``; the run then continues with the next
    link.
    """
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(asctime)s:%(name)s:%(message)s')
    file_handler = logging.FileHandler(filename='update_competitor_data.log',
                                       mode='a')
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    max_fetch_retries = 10

    def get_html(url, retry_count=0):
        """Fetch ``url``, retrying every 10 seconds on connection errors.

        Returns the ``requests`` response, or ``None`` once the attempt
        numbered ``max_fetch_retries`` has failed as well.  The original
        checked the cap but then retried anyway, recursing until the
        interpreter's recursion limit was hit.
        """
        while True:
            try:
                return requests.get(url)
            except (requests.exceptions.RequestException, ConnectionError):
                print('Retry Count: {}'.format(retry_count))
                logger.error('Retry Count: {}'.format(retry_count))
                if retry_count >= max_fetch_retries:
                    return None
                sleep(10)
                retry_count += 1

    today = date.today()
    file_name = 'gate1_raw_data_{}.csv'.format(today.strftime("%m-%d-%y"))
    link_prefix = 'https://www.gate1travel.com'
    regions_US = []
    trips_US = []
    trips_set = set()
    trips = []
    error_log = dict()
    trip_continent = [{
        'continent_name': 'USA & Canada',
        'US_link': 'https://www.gate1travel.com/usa-canada?Brand=GATE1',
        'AU_link': ''
    }, {
        'continent_name': 'Latin America',
        'US_link': 'https://www.gate1travel.com/latin-america?Brand=GATE1',
        'AU_link': ''
    }]

    # Stage 1: continent pages -> region links
    for continent in tqdm(trip_continent):
        if continent['US_link']:
            res = requests.get(continent['US_link'])
            soup = bs4.BeautifulSoup(res.text, 'lxml')
            trip_regions = soup.find_all('div', class_='region-thumbnail')
            for region in trip_regions:
                title = region.text.strip()
                link = link_prefix + region.find('a')['href']
                regions_US.append({'region_name': title, 'region_link': link})

    # Stage 2: region pages -> trip links (one browser per region)
    for region in tqdm(regions_US):
        driver = webdriver.Chrome()
        try:
            # navigation happens inside the try so the browser is always
            # closed, even when the page fails to load
            driver.get(region['region_link'])
            season_buttons = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located(
                    (By.CLASS_NAME, 'season-buttons-inner')))
            seasons = season_buttons.find_elements_by_class_name('btn')
            num_of_years = len(seasons)
            for year_num in range(num_of_years):
                try:
                    sleep(2)
                    # look the buttons up again on every pass rather than
                    # reusing the elements found before the previous click
                    season_buttons = driver.find_element_by_class_name(
                        'season-buttons-inner')
                    seasons = season_buttons.find_elements_by_class_name('btn')
                    sleep(2)
                    seasons[year_num].click()
                except ElementClickInterceptedException:
                    # the click was blocked; the panels currently displayed
                    # are still scraped below
                    pass
                finally:
                    region_panel = driver.find_elements_by_class_name(
                        'panel-body')
                    num_of_regions = len(region_panel)
                    for region_num in range(num_of_regions):
                        region_panel = driver.find_elements_by_class_name(
                            'panel-body')
                        region_panel_soup = bs4.BeautifulSoup(
                            region_panel[region_num].get_attribute(
                                'innerHTML'), 'lxml')
                        trip_panels_soup = region_panel_soup.find_all('li')
                        for trip in trip_panels_soup:
                            trip_name = trip.find('a').text
                            trip_link = link_prefix + trip.find('a').get(
                                'href')
                            trips_US.append({
                                'trip_name': trip_name,
                                'trip_link': trip_link
                            })
                sleep(2)
        finally:
            driver.quit()

    # the same trip can be listed under several seasons/regions
    for trip in trips_US:
        trips_set.add(trip['trip_link'])

    # Stage 3: trip pages -> Trip objects
    for link in tqdm(trips_set):
        departures = []
        res = get_html(link)
        if res is None:
            error_log['{} - US'.format(link)] = 'Retry timeout'
            logger.debug('{} - US - Retry timeout'.format(link))
            continue
        soup = bs4.BeautifulSoup(res.text, 'lxml')
        try:
            trip_name = soup.find("h2").text.strip()
            trip_code = 'Gate1{}'.format(
                link.split('.')[-2].split('-')[-1][:-2].upper())
            data_table = soup.find('table', class_='date-price-table')
            hidden_xs_items = data_table.find_all(class_='hidden-xs')
            # two-digit year taken from the first header cell of the table
            year = data_table.find('th').text.split()[0][-2:]
            for hidden_xs_item in hidden_xs_items:
                table_rows = hidden_xs_item.find_all('tr')
                for row in table_rows:
                    if row.find(class_='h4'):
                        # "YEAR Dates & Prices" header: several years can
                        # share one page
                        year = row.find('th').text.split()[0][-2:]
                    elif row.get('class') == ['pricerow']:
                        # departure row
                        departure = row
                        crossed_off = departure.find('del',
                                                     class_='text-muted')
                        if crossed_off:
                            # crossed-off date: Sold Out or Cancelled
                            date_numbers = crossed_off.text.split()
                            available = False
                        else:
                            date_button = departure.find(
                                'button', class_='serviceDate')
                            if not date_button:
                                # Neither date marker is present: skip the
                                # row instead of silently reusing the date
                                # parsed from the previous row.
                                continue
                            date_numbers = date_button.text.split()
                            available = True

                        if len(date_numbers) == 3:
                            # date text includes the day of the week
                            departure_date = '{}-{}-20{}'.format(
                                date_numbers[2], date_numbers[1], year)
                        else:
                            departure_date = '{}-{}-20{}'.format(
                                date_numbers[1], date_numbers[0], year)

                        danger = departure.find('span', class_='text-danger')
                        if danger:
                            notes = danger.text
                            if notes == '(Sold Out)':
                                status = 'Sold Out'
                            else:
                                # "Only x seats left!"
                                status = 'Limited'
                        else:
                            notes = ''
                            status = 'Available' if available else 'Cancelled'

                        if departure.find('td', class_='bookby-price'):
                            prices = departure.find_all('td',
                                                        class_='text-center')
                            actual_price_usd = prices[0].text.strip().strip(
                                '*').replace(',', '')
                            original_price_usd = prices[1].text.strip().strip(
                                '*').replace(',', '')
                        else:
                            actual_price_usd = departure.find(
                                'td', class_='text-center').text.strip().strip(
                                    '*').replace(',', '')
                            original_price_usd = actual_price_usd

                        new_dep = Departure(
                            date=departure_date,
                            actual_price_usd=actual_price_usd,
                            original_price_usd=original_price_usd,
                            notes=notes,
                            status=status,
                            available=available)
                        departures.append(new_dep)
            new_trip = Trip(trip_name, trip_code, departures)
            trips.append(new_trip)
            sleep(5)
        except AttributeError:
            # an expected element was missing (find() returned None)
            error_log['{} - US'.format(link)] = 'Missing from Website'
            logger.debug('{} - US - Missing from Website'.format(link))

    # Stage 4: output
    for trip in trips:
        trip.print_deps(file_name)
    print('\n\n*** Error Log ***')
    for code, error in error_log.items():
        print('{}: {}'.format(code, error))
    print('\n\n*** ***')
    print("\nGate1, Done!\n")
def get_trip(url, retry_count=0):
    """Scrape one Cosmos trip page into a ``Trip``.

    Opens ``url`` in a fresh Chrome session, selects every entry of the year
    dropdown in turn and converts each ``dapm__departures`` element into a
    ``Departure``.

    Args:
        url: address of the trip page.
        retry_count: number of attempts already made; callers normally
            leave this at 0.

    Returns:
        A ``Trip`` on success.  After any failure the browser is closed, the
        function waits 5 seconds and tries again; once ``retry_count``
        reaches 5 the URL is recorded in ``error_log`` as ``'Retry timeout'``
        and ``None`` is returned.
    """
    driver = webdriver.Chrome()
    departures = []
    try:
        # navigation is inside the try so a failed page load is retried and
        # the browser is still closed
        driver.get(url)
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'h1')))
        sleep(1)
        # result unused: raises (and so triggers a retry) when the year
        # dropdown is not on the page
        driver.find_element_by_class_name('dropdown')
        year_options = driver.find_elements_by_css_selector('option')
        num_of_years = len(year_options)
        trip_title_element = driver.find_element_by_css_selector('h1')
        trip_name = BeautifulSoup(
            trip_title_element.get_attribute('innerHTML'),
            'lxml').text.strip()
        trip_code = 'Cosmos{}'.format(
            driver.find_element_by_class_name('dph__subtitle').
            find_element_by_class_name('text-secondary-dark').text)

        for year_num in range(num_of_years):
            sleep(1)
            years_dropdown = driver.find_element_by_class_name('dropdown')
            years_dropdown.click()
            sleep(1)
            # options are looked up again after every click
            year_options = driver.find_elements_by_css_selector('option')
            year_options[year_num].click()
            sleep(1)
            departure_elements = driver.find_elements_by_class_name(
                'dapm__departures')
            for departure_element in departure_elements:
                departure = BeautifulSoup(
                    departure_element.get_attribute('innerHTML'), 'lxml')
                date_numbers = departure.find(
                    'span', class_='dapm__date').text.split()
                departure_date = '{:02}-{}-{}'.format(
                    int(date_numbers[0]), date_numbers[1][:3],
                    date_numbers[2])

                discount = departure.find('div', class_='dapm__room-discount')
                if discount:
                    actual_price_usd = discount.text.strip().replace(
                        ',', '').replace('$', '')
                else:
                    actual_price_usd = ''
                room_price = departure.find('span', class_='dapm__room-price')
                if room_price:
                    original_price_usd = room_price.text.strip().replace(
                        ',', '').replace('$', '')
                else:
                    original_price_usd = actual_price_usd

                small_group = departure.find('div', class_='small-group')
                popular = departure.find('div', class_='popular')
                if small_group and popular:
                    dep_type = 'Small-Group Discovery & Popular'
                elif small_group:
                    dep_type = 'Small-Group Discovery'
                elif popular:
                    dep_type = 'Popular'
                else:
                    dep_type = ''

                seat_counter = departure.find('div',
                                              class_='dapm__seat-counter')
                notes = seat_counter.text.strip() if seat_counter else ''

                # any() also gives a defined result when no column is found,
                # where a plain loop would leave status/available unset or
                # carried over from the previous departure
                dep_columns = departure.find_all('div',
                                                 class_='ng-star-inserted')
                sold_out = any(
                    dep_column.text == ' sold out '
                    for dep_column in dep_columns)
                status = 'Not Available' if sold_out else 'Available'
                available = not sold_out

                new_dep = Departure(date=departure_date,
                                    actual_price_usd=actual_price_usd,
                                    original_price_usd=original_price_usd,
                                    type=dep_type,
                                    notes=notes,
                                    status=status,
                                    available=available)
                departures.append(new_dep)
        return Trip(trip_name, trip_code, departures)
    except Exception:
        # Exception rather than a bare except, so Ctrl-C and interpreter
        # exit are not turned into retries; the retry itself happens below,
        # after the browser has been closed exactly once.
        logger.debug('{} - attempt {} failed'.format(url, retry_count),
                     exc_info=True)
    finally:
        driver.quit()

    # only reached when the attempt above failed
    if retry_count >= 5:
        error_log['{}'.format(url)] = 'Retry timeout'
        logger.debug('{} - Retry timeout'.format(url))
        return None
    sleep(5)
    return get_trip(url, retry_count + 1)
def main():
    """Scrape Collette tour departures and write them out per trip.

    For each continent the US "find your tour" page is opened in Chrome and
    the 'VIEW MORE RESULTS' button is clicked until its text is empty; every
    ``tour-body`` card then yields a trip name and 'Book Now' link.  Each
    linked trip page is opened in its own browser and its ``date-group``
    entries are converted to ``Departure`` objects.  Finally ``print_deps``
    is called on every trip with a dated file name
    (``collette_raw_data_<mm-dd-yy>.csv``) and the error log is printed.

    Trips without a 'Book Now' link, and trip pages on which no departure
    dates appear within 10 seconds, are recorded in the error log and in
    ``update_competitor_data.log``.
    """
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(asctime)s:%(name)s:%(message)s')
    file_handler = logging.FileHandler(filename='update_competitor_data.log',
                                       mode='a')
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    today = datetime.now()
    file_name = 'collette_raw_data_{}.csv'.format(today.strftime("%m-%d-%y"))
    link_prefix = 'https://www.gocollette.com'
    trips_US = []
    trips = []
    error_log = dict()
    # The query fragments previously read "...America¤tPage=1": the text
    # "&curren" of "&currentPage" had been decoded as the HTML entity for the
    # currency sign.  The intended "&currentPage=1" parameter is restored.
    trip_continent = [{
        'region_name': 'North America',
        'US_link': 'https://www.gocollette.com/en/find-your-tour#q/continentnames=North%20America&currentPage=1&sortDirection=desc&sortBy=',
        'AU_link': 'https://www.gocollette.com/en-au%2Ffind-your-tour%3Fsite%3Dcollette-au#q/continentnames=North%20America&currentPage=1&sortDirection=desc&sortBy='
    }, {
        'region_name': 'South America',
        'US_link': 'https://www.gocollette.com/en%2Ffind-your-tour%3Fsite%3Dcollette-us#q/continentnames=South%20America&currentPage=1&sortDirection=desc&sortBy=',
        'AU_link': 'https://www.gocollette.com/en-au/find-your-tour#q/continentnames=South%20America&currentPage=1&sortDirection=desc&sortBy='
    }]

    # Stage 1: continent listing pages -> trip names and links
    for continent in trip_continent:
        driver = webdriver.Chrome()
        try:
            # navigation is inside the try so the browser is always closed
            driver.get(continent['US_link'])
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'tour-body')))
            # keep clicking while the 'VIEW MORE RESULTS' button has text
            while driver.find_element_by_class_name('grey_block_arrow').text:
                driver.find_element_by_class_name('grey_block_arrow').click()
                time.sleep(5)
            # all trips have been loaded
            tours = driver.find_elements_by_class_name('tour-body')
            for tour in tours:
                soup = BeautifulSoup(tour.get_attribute('innerHTML'), 'lxml')
                title = soup.find('h3', class_='tour-title').text.strip()
                book_now = soup.find('a', class_='bookNowButton')
                if book_now:
                    link = '{}{}'.format(link_prefix, book_now.get('href'))
                else:
                    link = ''
                trips_US.append({'trip_name': title, 'link': link})
        finally:
            driver.quit()

    # for Error Log
    # trips_US = [
    #     {'trip_name':'', 'link':''},
    # ]

    # Stage 2: trip pages -> Trip objects
    for trip in tqdm(trips_US):
        if not trip['link']:
            error_log['{} - US'.format(
                trip['trip_name'])] = 'Missing US \'Book Now\' link'
            logger.error('{} - US - Missing US \'Book Now\' link'.format(
                trip['trip_name']))
            continue

        departures = []
        # the trip name doubles as the trip code
        trip_name = trip['trip_name']
        trip_code = trip['trip_name']
        driver = webdriver.Chrome()
        try:
            # An unused <h3> lookup used to run here, outside the try; a
            # page without that tag aborted the whole run and left the
            # browser open, so it has been dropped.
            driver.get(trip['link'])
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located(
                    (By.CLASS_NAME, 'date-group-dates')))
            monthElements = driver.find_elements_by_class_name(
                'date-group-dates')
            for month in monthElements:
                departureElements = month.find_elements_by_class_name(
                    'date-group')
                for departure in departureElements:
                    soup = BeautifulSoup(
                        departure.get_attribute('innerHTML'), 'lxml')
                    date_numbers = soup.find('div', class_='date').text.split()
                    departure_date = '{:02}-{}-{}'.format(
                        int(date_numbers[1].strip(',')), date_numbers[0],
                        date_numbers[2])

                    notes = ''
                    dep_type = ''
                    status = 'Available'
                    danger = soup.find('div', class_='danger')
                    if danger:
                        # 'Only x seats remaining'
                        notes = danger.text.strip()
                        status = 'Limited'
                    else:
                        alert = soup.find('div', class_='date-alert')
                        if alert:
                            # Cancelled, Guaranteed, Sold Out, ...
                            status = alert.text.strip()
                            if status == 'Call 800.340.5158 for details':
                                notes = status
                                status = 'Cancelled'
                            elif status == 'Guaranteed':
                                dep_type = status
                                status = 'Available'
                            elif re.search("Expires", status):
                                notes = status
                                status = 'Available'
                            # any other alert text is kept verbatim as the
                            # status (e.g. 'Sold Out')
                    available = status not in ('Cancelled', 'Sold Out')

                    actual_price_usd = soup.find(
                        'span', class_='discountedPrice').text.strip().replace(
                            ',', '')
                    crossout = soup.find('span', class_='crossout')
                    if crossout:
                        original_price_usd = crossout.text.strip().replace(
                            ',', '')
                    else:
                        original_price_usd = actual_price_usd

                    new_dep = Departure(
                        date=departure_date,
                        actual_price_usd=actual_price_usd,
                        original_price_usd=original_price_usd,
                        type=dep_type,
                        notes=notes,
                        status=status,
                        available=available)
                    departures.append(new_dep)
        except TimeoutException:
            # no departure dates appeared within the 10 second wait
            error_log['{} - US'.format(trip_code)] = 'Missing from Website'
            logger.error(
                '{} - US - Missing from Website'.format(trip_code))
        finally:
            driver.quit()
        # a trip is recorded even when no departures were found
        new_trip = Trip(trip_name, trip_code, departures)
        trips.append(new_trip)

    # Stage 3: output
    for trip in trips:
        trip.print_deps(file_name)
    print('\n\n*** Error Log ***')
    for code, error in error_log.items():
        print('{}: {}'.format(code, error))
    print('\n\n*** ***')
    print("\nCollette, Done!\n")
def get_trip(url, retry_count=0):
    """Scrape one Tauck trip page and append the result to ``trips``.

    Opens ``url`` in a fresh Chrome session, steps through every year offered
    by the date picker and converts each ``sheet__data__wrapper`` entry into
    a ``Departure``.  On success a ``Trip`` is appended to the module-level
    ``trips`` list; the trip name is also appended to ``trip_list``.

    Args:
        url: address of the trip page; the trip code is derived from the
            text after its first ``=``.
        retry_count: number of attempts already made; callers normally
            leave this at 0.

    Returns:
        Always ``None``.  A missing page element is logged as ``'Bad Link'``
        without retrying.  Any other failure closes the browser, waits 5
        seconds and tries again; once ``retry_count`` reaches 5 the URL is
        recorded as ``'Retry timeout'``.
    """
    years_class = 'c-search-filters__section__content__years'
    # page note text -> (status, available)
    status_by_note = {
        'Soldout': ('Sold Out', False),
        'Not Available': ('Cancelled', False),
        'Limited': ('Limited', True),
        'Available': ('Available', True),
    }

    driver = webdriver.Chrome()
    departures = []
    needs_retry = False
    try:
        # navigation is inside the try so a failed page load is retried and
        # the browser is still closed
        driver.get(url)
        trip_name_element = driver.find_element_by_tag_name('h1')
        trip_name_soup = bs4.BeautifulSoup(
            trip_name_element.get_attribute('innerHTML'), 'lxml')
        trip_name = trip_name_soup.contents[0].text.strip()
        # Built from the url argument.  This used to read a name called
        # ``link`` that is not defined in this function, so it only worked
        # when the caller happened to have such a variable in scope.
        trip_code = 'Tauck{}'.format(url.split('=')[1][:-4].upper())
        # NOTE(review): this runs on every attempt, so a retried page adds
        # its name to trip_list more than once.
        trip_list.append(trip_name)

        years_holder_element = driver.find_element_by_class_name(years_class)
        years_elements = years_holder_element.find_elements_by_tag_name('span')
        num_of_years = len(years_elements)
        datepicker_button_element = driver.find_element_by_class_name(
            'c-btn-primary-b.datepicker__button.theme--light')

        for year_num in range(num_of_years):
            datepicker_button_element.click()
            sleep(1)
            # the year elements are looked up again before each use
            year_element = driver.find_element_by_class_name(
                years_class).find_elements_by_tag_name('span')[year_num]
            year_element.click()
            year = driver.find_element_by_class_name(
                years_class).find_elements_by_tag_name('span')[year_num].text
            sleep(1)
            calendar_element = driver.find_element_by_class_name(
                'sheet__data.ani-y.ani-timing-a.ani--in')
            departure_elements = calendar_element.find_elements_by_class_name(
                'sheet__data__wrapper')
            for departure_element in departure_elements:
                departure_data = departure_element.find_elements_by_class_name(
                    'data-label')
                date_numbers = departure_data[0].get_attribute(
                    'innerHTML').split()
                departure_date = '{:02}-{}-{}'.format(
                    int(date_numbers[1]), date_numbers[0], year)
                dep_type = departure_data[2].get_attribute('innerHTML') or ''
                actual_price_usd = departure_data[4].get_attribute(
                    'innerHTML').replace(',', '').replace('$', '').split()[0]
                notes = departure_data[5].get_attribute('innerHTML')
                status, available = status_by_note.get(
                    notes, ('UNRECOGNIZED STATUS', False))
                new_dep = Departure(date=departure_date,
                                    actual_price_usd=actual_price_usd,
                                    type=dep_type,
                                    notes=notes,
                                    status=status,
                                    available=available)
                departures.append(new_dep)
            # second click on the picker button before the next year's pass
            # clicks it again (presumably toggles the picker closed --
            # TODO confirm)
            datepicker_button_element.click()

        new_trip = Trip(trip_name, trip_code, departures)
        trips.append(new_trip)
    except NoSuchElementException:
        error_log['{}'.format(url)] = 'Bad Link'
        logger.debug('{} - Bad Link'.format(url))
    except Exception:
        # Exception rather than a bare except, so Ctrl-C and interpreter
        # exit are not turned into retries; the retry happens below, after
        # the browser has been closed exactly once.
        logger.debug('{} - attempt {} failed'.format(url, retry_count),
                     exc_info=True)
        needs_retry = True
    finally:
        driver.quit()

    if not needs_retry:
        return None
    if retry_count >= 5:
        error_log['{}'.format(url)] = 'Retry timeout'
        logger.debug('{} - Retry timeout'.format(url))
        return None
    sleep(5)
    return get_trip(url, retry_count + 1)