Beispiel #1
0
	def get_optimal_departures(self):
		"""Return one Departure per sampled time, always using the fastest option.

		Trips from every non-walking alternative itinerary are pooled and, for
		each sample time, the next trip departing after that time is compared
		with the end-to-end walking itinerary (if there is one). Walking is
		used when no trip is left or when the trip would arrive later than
		walking would. Route choice is otherwise indifferent; this is the
		status quo.

		Returns:
			A list of Departure objects, one per value from
			triptools.sample_times().
		"""
		# fetch the alternatives once; the result is needed for both the loop
		# and the "walking only" check below
		itins = self.alter_itins()
		# get all possible trips and any walking alternative
		all_trips, walk = [], None
		for itin in itins:
			if not walk and itin.is_walking:
				walk = itin
			else: # itin has transit
				all_trips.extend( itin.get_trips() )
		# if we have only walking, then all departures will be walking
		if walk and len(itins) == 1:
			return [ Departure(t,None,walk) for t in triptools.sample_times() ]
		# we now have only trips, or trips and a walking option
		# NOTE(review): presumably prunes, in place, trips that are beaten by a
		# later departure -- confirm against triptools
		triptools.remove_premature_departures(all_trips)
		# ensure trips are sorted by departure, ASC
		optimal_trips = sorted(all_trips, key=lambda x: x.depart_ts)
		n_trips = len(optimal_trips)
		# iterate over sample moments looking for the next-departing trip
		departures, i = [], 0
		for time in triptools.sample_times():
			# move the trip index up to the present time if necessary
			while i < n_trips and optimal_trips[i].depart <= time:
				i += 1
			# no trips left, or walking gets there sooner than the next trip;
			# `walk` may be None here, which yields a Departure with neither
			# trip nor walking itinerary
			if i >= n_trips or (
				walk and (optimal_trips[i].arrive-time) > walk.walk_time
			):
				departures.append( Departure( time, None, walk ) )
			# the next trip is at least as good as walking (or there is no walk)
			else:
				departures.append( Departure( time, optimal_trips[i] ) )
		return departures
Beispiel #2
0
	def realtime_departures(self):
		"""Build departures for a traveller who boards whatever comes first.

		The itinerary is chosen by trying to minimize the time before first
		boarding a vehicle; initial walking and waiting are treated
		indifferently. When several trips share a first-boarding time (for
		example because of a shared first leg), the one belonging to the
		itinerary with the better mean travel time wins.

		Returns a list with one Departure per value of triptools.sample_times().
		"""
		walk = None
		all_trips = []
		# visit itineraries from best to worst mean travel time so that the
		# pooled trip list starts out in that order
		ranked_itins = sorted(
			self.alter_itins(), key=lambda itin: itin.mean_travel_time
		)
		for itin in ranked_itins:
			if not walk and itin.is_walking:
				# remember the first end-to-end walking option
				walk = itin
			else:
				all_trips.extend( itin.get_trips() )
		# walking is the only alternative: every departure is a walk
		if walk and len(self.alter_itins()) == 1:
			return [
				Departure(moment, None, walk)
				for moment in triptools.sample_times()
			]
		# sorted() is stable, so trips with equal first-boarding times keep
		# the mean-travel-time ordering established above
		trips = sorted(all_trips, key=lambda trip: trip.first_boarding_time)
		trip_count = len(trips)
		departures = []
		idx = 0
		for moment in triptools.sample_times():
			# skip trips that have already departed at this moment
			while idx < trip_count and trips[idx].depart <= moment:
				idx += 1
			# ride the next trip when one exists and either there is no
			# walking option or boarding happens before a walk would finish
			ride = idx < trip_count and (
				(not walk)
				or trips[idx].first_boarding_time < moment + walk.walk_time
			)
			if ride:
				departures.append( Departure( moment, trips[idx] ) )
			else:
				# out of trips, or walking is the better option
				departures.append( Departure( moment, None, walk ) )
		return departures
Beispiel #3
0
	def departures(self):
		"""Departures in the time window using only this itinerary.

		The list is built on first use, stored on self.DB_departures and
		returned from there on later calls.
		"""
		# already computed: hand back the stored list
		if self.DB_departures:
			return self.DB_departures
		from triptools import sample_times
		from departure import Departure
		if self.is_walking:
			# a walking itinerary gives the same departure at every moment
			self.DB_departures = [
				Departure(moment, None, self) for moment in sample_times()
			]
			return self.DB_departures
		# trip based departures: order trips first to last by departure
		ordered = sorted( self.get_trips(), key=lambda trip: trip.depart_ts )
		trip_count = len(ordered)
		self.DB_departures = []
		idx = 0
		for moment in sample_times():
			# advance past trips that left before this moment
			while idx < trip_count and ordered[idx].depart < moment:
				idx += 1
			# use the next trip, or None once we have run out of trips
			next_trip = ordered[idx] if idx < trip_count else None
			self.DB_departures.append( Departure(moment, next_trip) )
		return self.DB_departures
Beispiel #4
0
	def habit_departures(self):
		"""Return departures over the time window for a habitual traveller.

		The traveller is assumed to consistently take the alternative
		itinerary with the lowest mean travel time (the first one wins on
		ties). If there are no alternative itineraries, a list of empty
		Departure objects, one per sample time, is returned instead.
		"""
		habit_itin = None
		best_time = None
		# find the itinerary with the best mean travel time
		for itin in self.alter_itins():
			# compare with None explicitly: a mean travel time of 0 is a valid
			# best value and must not be mistaken for "nothing found yet"
			if best_time is None or itin.mean_travel_time < best_time:
				best_time = itin.mean_travel_time
				habit_itin = itin
		if habit_itin:
			# NOTE(review): read as an attribute, so `departures` is presumably
			# a property on the itinerary -- confirm
			return habit_itin.departures
		return [ Departure(time) for time in triptools.sample_times() ]
def main():
    """Scrape Gate 1 Travel (US site) departures and write them out.

    Steps:
      1. Fetch each continent page and collect its region links.
      2. Open every region page in Chrome, click through the season buttons
         and collect the trip links listed in the panels.
      3. Fetch every unique trip page, parse its date/price table into
         Departure objects and wrap them in a Trip.
      4. Call ``print_deps(file_name)`` on every Trip and print the error log.

    Problems with individual trip pages are recorded in ``error_log`` and in
    ``update_competitor_data.log``; they do not stop the run.
    """
    max_retries = 10  # failed retries allowed per URL before giving up
    retry_delay = 10  # seconds to wait between attempts

    def get_html(url, retry_count=0):
        """Fetch ``url`` with retries.

        Returns the ``requests`` response, or ``None`` once ``max_retries``
        retries have failed, so that the caller can skip the page instead of
        retrying forever.
        """
        while True:
            try:
                return requests.get(url)
            except (requests.exceptions.RequestException, ConnectionError):
                print('Retry Count: {}'.format(retry_count))
                logger.error('Retry Count: {}'.format(retry_count))
                if retry_count >= max_retries:
                    return None
                sleep(retry_delay)
                retry_count += 1

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)

    formatter = logging.Formatter('%(asctime)s:%(name)s:%(message)s')

    file_handler = logging.FileHandler(filename='update_competitor_data.log',
                                       mode='a')
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(formatter)

    logger.addHandler(file_handler)

    today = date.today()
    file_name = 'gate1_raw_data_{}.csv'.format(today.strftime("%m-%d-%y"))
    link_prefix = 'https://www.gate1travel.com'
    regions_US = []
    trips_US = []
    trips = []
    error_log = dict()

    trip_continent = [{
        'continent_name': 'USA & Canada',
        'US_link': 'https://www.gate1travel.com/usa-canada?Brand=GATE1',
        'AU_link': ''
    }, {
        'continent_name': 'Latin America',
        'US_link': 'https://www.gate1travel.com/latin-america?Brand=GATE1',
        'AU_link': ''
    }]

    # --- 1. region links per continent ---------------------------------
    for continent in tqdm(trip_continent):

        if not continent['US_link']:
            continue

        res = requests.get(continent['US_link'])
        soup = bs4.BeautifulSoup(res.text, 'lxml')

        for region in soup.find_all('div', class_='region-thumbnail'):
            title = region.text.strip()
            link = link_prefix + region.find('a')['href']
            regions_US.append({'region_name': title, 'region_link': link})

    # --- 2. trip links per region (needs a real browser) ----------------
    for region in tqdm(regions_US):

        driver = webdriver.Chrome()

        try:
            # inside the try so the browser is closed even if loading fails
            driver.get(region['region_link'])

            season_buttons = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located(
                    (By.CLASS_NAME, 'season-buttons-inner')))
            seasons = season_buttons.find_elements_by_class_name('btn')

            for year_num in range(len(seasons)):

                try:
                    sleep(2)
                    # look the buttons up again before every click
                    season_buttons = driver.find_element_by_class_name(
                        'season-buttons-inner')
                    seasons = season_buttons.find_elements_by_class_name('btn')
                    sleep(2)
                    seasons[year_num].click()
                except ElementClickInterceptedException:
                    # the click was blocked; scrape whatever season is
                    # currently displayed instead
                    pass

                num_of_regions = len(
                    driver.find_elements_by_class_name('panel-body'))

                for region_num in range(num_of_regions):

                    # panels are looked up again on every pass, as before
                    region_panel = driver.find_elements_by_class_name(
                        'panel-body')
                    region_panel_soup = bs4.BeautifulSoup(
                        region_panel[region_num].get_attribute('innerHTML'),
                        'lxml')

                    for trip in region_panel_soup.find_all('li'):
                        anchor = trip.find('a')
                        trips_US.append({
                            'trip_name': anchor.text,
                            'trip_link': link_prefix + anchor.get('href')
                        })

                sleep(2)

        finally:
            driver.quit()

    # the same trip is listed under several seasons/regions: de-duplicate
    trips_set = {trip['trip_link'] for trip in trips_US}

    # --- 3. departures per trip ------------------------------------------
    for link in tqdm(trips_set):

        departures = []

        res = get_html(link)
        if res is None:
            # retries exhausted: record it and move on to the next trip
            error_log['{} - US'.format(link)] = 'Request failed'
            logger.error('{} - US - Request failed'.format(link))
            continue

        soup = bs4.BeautifulSoup(res.text, 'lxml')

        try:
            trip_name = soup.find("h2").text.strip()
            trip_code = 'Gate1{}'.format(
                link.split('.')[-2].split('-')[-1][:-2].upper())

            data_table = soup.find('table', class_='date-price-table')
            hidden_xs_items = data_table.find_all(class_='hidden-xs')
            year = data_table.find('th').text.split()[0][-2:]

            for hidden_xs_item in hidden_xs_items:

                for row in hidden_xs_item.find_all('tr'):

                    # "YEAR Dates & Prices" header when several years share
                    # one page
                    if row.find(class_='h4'):
                        year = row.find('th').text.split()[0][-2:]
                        continue

                    # only departure rows are of interest from here on
                    if row.get('class') != ['pricerow']:
                        continue

                    departure = row

                    crossed_off = departure.find('del', class_='text-muted')
                    date_button = departure.find('button',
                                                 class_='serviceDate')

                    if crossed_off is not None:
                        # date is crossed off (Sold Out or Cancelled)
                        date_numbers = crossed_off.text.split()
                        available = False
                    elif date_button is not None:
                        date_numbers = date_button.text.split()
                        available = True
                    else:
                        # no date on this row: skip it rather than reuse the
                        # date of the previous row
                        logger.debug(
                            '{} - US - price row without a date skipped'.
                            format(link))
                        continue

                    # three tokens means the day of week comes first
                    if len(date_numbers) == 3:
                        departure_date = '{}-{}-20{}'.format(
                            date_numbers[2], date_numbers[1], year)
                    else:
                        departure_date = '{}-{}-20{}'.format(
                            date_numbers[1], date_numbers[0], year)

                    danger_note = departure.find('span', class_='text-danger')
                    if danger_note is not None:
                        notes = danger_note.text
                        if notes == '(Sold Out)':
                            status = 'Sold Out'
                        else:  # "Only x seats left!"
                            status = 'Limited'
                    else:
                        notes = ''
                        status = 'Available' if available else 'Cancelled'

                    if departure.find('td', class_='bookby-price'):
                        # discounted row: first cell is the current price,
                        # second the original one
                        prices = departure.find_all('td',
                                                    class_='text-center')
                        actual_price_usd = prices[0].text.strip().strip(
                            '*').replace(',', '')
                        original_price_usd = prices[1].text.strip().strip(
                            '*').replace(',', '')
                    else:
                        actual_price_usd = departure.find(
                            'td', class_='text-center').text.strip().strip(
                                '*').replace(',', '')
                        original_price_usd = actual_price_usd

                    departures.append(
                        Departure(date=departure_date,
                                  actual_price_usd=actual_price_usd,
                                  original_price_usd=original_price_usd,
                                  notes=notes,
                                  status=status,
                                  available=available))

            trips.append(Trip(trip_name, trip_code, departures))

            # pause between trip pages
            sleep(5)

        except AttributeError:
            # an expected element was missing (find() returned None)
            error_log['{} - US'.format(link)] = 'Missing from Website'
            logger.debug('{} - US - Missing from Website'.format(link))

    # --- 4. output ---------------------------------------------------------
    for trip in trips:
        trip.print_deps(file_name)

    print('\n\n*** Error Log ***')
    for code, error in error_log.items():
        print('{}: {}'.format(code, error))
    print('\n\n***           ***')

    print("\nGate1, Done!\n")
Beispiel #6
0
    def get_trip(url, retry_count=0):
        """Scrape one Cosmos trip page and return it as a Trip.

        The page is opened in a fresh Chrome session, every entry of the year
        dropdown is selected in turn and all listed departures are collected.
        Any failure closes the browser, waits 5 seconds and tries again; after
        the fifth retry has failed the URL is recorded in ``error_log`` as
        'Retry timeout'.

        Args:
            url: address of the trip page.
            retry_count: number of retries already made (used by the
                recursive retry; callers normally leave it at 0).

        Returns:
            A Trip, or None when all retries failed.
        """
        driver = webdriver.Chrome()
        departures = []

        try:
            # inside the try so a failed page load is retried and the
            # browser is always closed
            driver.get(url)

            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'h1')))
            sleep(1)
            # existence check only: raises (and so triggers a retry) when the
            # year dropdown is missing
            driver.find_element_by_class_name('dropdown')
            num_of_years = len(driver.find_elements_by_css_selector('option'))

            trip_title_element = driver.find_element_by_css_selector('h1')
            trip_name = BeautifulSoup(
                trip_title_element.get_attribute('innerHTML'),
                'lxml').text.strip()
            trip_code = 'Cosmos{}'.format(
                driver.find_element_by_class_name('dph__subtitle').
                find_element_by_class_name('text-secondary-dark').text)

            for year_num in range(num_of_years):
                sleep(1)
                # elements are looked up again for every year before use
                years_dropdown = driver.find_element_by_class_name('dropdown')
                years_dropdown.click()
                sleep(1)
                year_options = driver.find_elements_by_css_selector('option')
                year_options[year_num].click()
                sleep(1)

                departure_elements = driver.find_elements_by_class_name(
                    'dapm__departures')

                for departure_element in departure_elements:

                    departure = BeautifulSoup(
                        departure_element.get_attribute('innerHTML'), 'lxml')
                    date_numbers = departure.find(
                        'span', class_='dapm__date').text.split()
                    departure_date = '{:02}-{}-{}'.format(
                        int(date_numbers[0]), date_numbers[1][:3],
                        date_numbers[2])

                    discount = departure.find('div',
                                              class_='dapm__room-discount')
                    if discount is not None:
                        actual_price_usd = discount.text.strip().replace(
                            ',', '').replace('$', '')
                    else:
                        actual_price_usd = ''

                    room_price = departure.find('span',
                                                class_='dapm__room-price')
                    if room_price is not None:
                        original_price_usd = room_price.text.strip().replace(
                            ',', '').replace('$', '')
                    else:
                        original_price_usd = actual_price_usd

                    is_small_group = departure.find(
                        'div', class_='small-group') is not None
                    is_popular = departure.find(
                        'div', class_='popular') is not None
                    if is_small_group and is_popular:
                        dep_type = 'Small-Group Discovery & Popular'
                    elif is_small_group:
                        dep_type = 'Small-Group Discovery'
                    elif is_popular:
                        dep_type = 'Popular'
                    else:
                        dep_type = ''

                    seat_counter = departure.find('div',
                                                  class_='dapm__seat-counter')
                    notes = seat_counter.text.strip() if seat_counter else ''

                    dep_columns = departure.find_all('div',
                                                     class_='ng-star-inserted')

                    # decided per departure, so that a row without any status
                    # column never inherits the previous row's status
                    sold_out = any(dep_column.text == ' sold out '
                                   for dep_column in dep_columns)
                    status = 'Not Available' if sold_out else 'Available'
                    available = not sold_out

                    departures.append(
                        Departure(date=departure_date,
                                  actual_price_usd=actual_price_usd,
                                  original_price_usd=original_price_usd,
                                  type=dep_type,
                                  notes=notes,
                                  status=status,
                                  available=available))

            return Trip(trip_name, trip_code, departures)

        except Exception as err:
            # any scraping failure falls through to the retry logic below
            logger.debug('{} - attempt {} failed: {!r}'.format(
                url, retry_count, err))

        finally:
            # single place where the browser is closed, success or not
            driver.quit()

        if retry_count >= 5:
            error_log['{}'.format(url)] = 'Retry timeout'
            logger.debug('{} - Retry timeout'.format(url))
            return None

        sleep(5)
        return get_trip(url, retry_count + 1)
def main():
    """Scrape Collette (US site) departures and write them out.

    Steps:
      1. Open each continent's tour finder in Chrome, keep clicking
         'VIEW MORE RESULTS' until all tours are loaded and collect every
         tour's title and 'Book Now' link.
      2. Open each linked tour page, read its departure dates, prices and
         status and wrap them in a Trip. Tours without a link, or whose
         date list never appears, are recorded in ``error_log``.
      3. Call ``print_deps(file_name)`` on every Trip and print the error log.
    """

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)

    formatter = logging.Formatter('%(asctime)s:%(name)s:%(message)s')

    file_handler = logging.FileHandler(filename='update_competitor_data.log',
                                       mode='a')
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(formatter)

    logger.addHandler(file_handler)

    today = datetime.now()
    file_name = 'collette_raw_data_{}.csv'.format(today.strftime("%m-%d-%y"))
    link_prefix = 'https://www.gocollette.com'
    trips_US = []
    trips = []
    error_log = dict()

    trip_continent = [{
        'region_name':
        'North America',
        'US_link':
        'https://www.gocollette.com/en/find-your-tour#q/continentnames=North%20America&currentPage=1&sortDirection=desc&sortBy=',
        'AU_link':
        'https://www.gocollette.com/en-au%2Ffind-your-tour%3Fsite%3Dcollette-au#q/continentnames=North%20America&currentPage=1&sortDirection=desc&sortBy='
    }, {
        'region_name':
        'South America',
        'US_link':
        'https://www.gocollette.com/en%2Ffind-your-tour%3Fsite%3Dcollette-us#q/continentnames=South%20America&currentPage=1&sortDirection=desc&sortBy=',
        'AU_link':
        'https://www.gocollette.com/en-au/find-your-tour#q/continentnames=South%20America&currentPage=1&sortDirection=desc&sortBy='
    }]

    # --- 1. tour titles and links per continent --------------------------
    for continent in trip_continent:

        driver = webdriver.Chrome()

        try:
            # inside the try so the browser is closed even if loading fails
            driver.get(continent['US_link'])

            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'tour-body')))

            # keep loading while the 'VIEW MORE RESULTS' button has text
            while driver.find_element_by_class_name('grey_block_arrow').text:
                driver.find_element_by_class_name('grey_block_arrow').click()
                time.sleep(5)

            # all trips have been loaded
            for tour in driver.find_elements_by_class_name('tour-body'):

                soup = BeautifulSoup(tour.get_attribute('innerHTML'), 'lxml')

                title = soup.find('h3', class_='tour-title').text.strip()

                book_now = soup.find('a', class_='bookNowButton')
                if book_now is not None:
                    link = '{}{}'.format(link_prefix, book_now.get('href'))
                else:
                    link = ''

                trips_US.append({'trip_name': title, 'link': link})

        finally:
            driver.quit()

    # for Error Log
    # trips_US = [
    #     {'trip_name':'', 'link':''},
    # ]

    # --- 2. departures per tour --------------------------------------------
    for trip in tqdm(trips_US):

        if not trip['link']:
            error_log['{} - US'.format(
                trip['trip_name'])] = 'Missing US \'Book Now\' link'
            logger.error('{} - US - Missing US \'Book Now\' link'.format(
                trip['trip_name']))
            continue

        departures = []

        # the tour title doubles as the trip code
        trip_name = trip['trip_name']
        trip_code = trip['trip_name']

        driver = webdriver.Chrome()

        try:
            # inside the try so the browser is closed even if loading fails
            driver.get(trip['link'])

            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located(
                    (By.CLASS_NAME, 'date-group-dates')))

            monthElements = driver.find_elements_by_class_name(
                'date-group-dates')
            for month in monthElements:

                departureElements = month.find_elements_by_class_name(
                    'date-group')
                for departure in departureElements:

                    soup = BeautifulSoup(
                        departure.get_attribute('innerHTML'), 'lxml')

                    # text is split as "<month> <day>, <year>"
                    date_numbers = soup.find('div',
                                             class_='date').text.split()
                    departure_date = '{:02}-{}-{}'.format(
                        int(date_numbers[1].strip(',')), date_numbers[0],
                        date_numbers[2])

                    danger = soup.find('div', class_='danger')
                    date_alert = soup.find('div', class_='date-alert')

                    notes = ''
                    dep_type = ''
                    if danger is not None:
                        # 'Only x seats remaining'
                        notes = danger.text.strip()
                        status = 'Limited'
                    elif date_alert is not None:
                        # Cancelled, Guaranteed, Sold Out or an expiring offer
                        alert_text = date_alert.text.strip()
                        if alert_text == 'Call 800.340.5158 for details':
                            notes = alert_text
                            status = 'Cancelled'
                        elif alert_text == 'Guaranteed':
                            dep_type = alert_text
                            status = 'Available'
                        elif 'Expires' in alert_text:
                            notes = alert_text
                            status = 'Available'
                        else:
                            # any other alert text is used as the status
                            # itself (e.g. 'Sold Out')
                            status = alert_text
                    else:
                        status = 'Available'

                    available = status not in ('Cancelled', 'Sold Out')

                    actual_price_usd = soup.find(
                        'span',
                        class_='discountedPrice').text.strip().replace(
                            ',', '')

                    crossout = soup.find('span', class_='crossout')
                    if crossout is not None:
                        original_price_usd = crossout.text.strip().replace(
                            ',', '')
                    else:
                        original_price_usd = actual_price_usd

                    departures.append(
                        Departure(date=departure_date,
                                  actual_price_usd=actual_price_usd,
                                  original_price_usd=original_price_usd,
                                  type=dep_type,
                                  notes=notes,
                                  status=status,
                                  available=available))

        except TimeoutException:
            # the date list never appeared on the page
            error_log['{} - US'.format(trip_code)] = 'Missing from Website'
            logger.error('{} - US - Missing from Website'.format(trip_code))

        finally:
            driver.quit()

        # a timed-out tour is still recorded, with whatever was collected
        trips.append(Trip(trip_name, trip_code, departures))

    # --- 3. output ---------------------------------------------------------
    for trip in trips:
        trip.print_deps(file_name)

    print('\n\n*** Error Log ***')
    for code, error in error_log.items():
        print('{}: {}'.format(code, error))
    print('\n\n***           ***')

    print("\nCollette, Done!\n")
Beispiel #8
0
    def get_trip(url, retry_count=0):
        """Scrape one Tauck trip page and append the result to ``trips``.

        The page is opened in a fresh Chrome session, every year offered by
        the date picker is selected in turn and all departures of that year
        are collected. A missing page element is recorded in ``error_log``
        as 'Bad Link' without retrying; any other failure closes the browser,
        waits 5 seconds and tries again, and after the fifth retry has failed
        the URL is recorded as 'Retry timeout'.

        Args:
            url: address of the trip page; the trip code is derived from the
                text after its first '='.
            retry_count: number of retries already made (used by the
                recursive retry; callers normally leave it at 0).

        Returns:
            None. Results are appended to the enclosing ``trips`` and
            ``trip_list`` lists.
        """
        # raw availability text -> (status, available)
        status_by_note = {
            'Soldout': ('Sold Out', False),
            'Not Available': ('Cancelled', False),
            'Limited': ('Limited', True),
            'Available': ('Available', True),
        }
        years_class = 'c-search-filters__section__content__years'

        driver = webdriver.Chrome()
        departures = []

        try:
            # inside the try so a failed page load is retried and the
            # browser is always closed
            driver.get(url)

            trip_name_element = driver.find_element_by_tag_name('h1')
            trip_name_soup = bs4.BeautifulSoup(
                trip_name_element.get_attribute('innerHTML'), 'lxml')
            trip_name = trip_name_soup.contents[0].text.strip()
            # derive the code from this call's own argument
            trip_code = 'Tauck{}'.format(url.split('=')[1][:-4].upper())

            # NOTE(review): runs before the scrape has succeeded, so a retried
            # page adds its name again -- confirm how trip_list is consumed
            trip_list.append(trip_name)

            years_holder_element = driver.find_element_by_class_name(
                years_class)
            num_of_years = len(
                years_holder_element.find_elements_by_tag_name('span'))
            datepicker_button_element = driver.find_element_by_class_name(
                'c-btn-primary-b.datepicker__button.theme--light')

            for year_num in range(num_of_years):

                # open the date picker and choose the year
                datepicker_button_element.click()
                sleep(1)
                year_element = driver.find_element_by_class_name(
                    years_class).find_elements_by_tag_name('span')[year_num]
                year_element.click()
                # the element is looked up again after the click before its
                # text is read
                year = driver.find_element_by_class_name(
                    years_class).find_elements_by_tag_name(
                        'span')[year_num].text
                sleep(1)

                calendar_element = driver.find_element_by_class_name(
                    'sheet__data.ani-y.ani-timing-a.ani--in')
                departure_elements = (
                    calendar_element.find_elements_by_class_name(
                        'sheet__data__wrapper'))

                for departure_element in departure_elements:

                    # cells used below: 0 = date, 2 = type, 4 = price,
                    # 5 = availability text
                    departure_data = (
                        departure_element.find_elements_by_class_name(
                            'data-label'))

                    date_numbers = departure_data[0].get_attribute(
                        'innerHTML').split()
                    departure_date = '{:02}-{}-{}'.format(
                        int(date_numbers[1]), date_numbers[0], year)

                    dep_type = departure_data[2].get_attribute(
                        'innerHTML') or ''

                    actual_price_usd = departure_data[4].get_attribute(
                        'innerHTML').replace(',', '').replace('$',
                                                              '').split()[0]

                    notes = departure_data[5].get_attribute('innerHTML')

                    status, available = status_by_note.get(
                        notes, ('UNRECOGNIZED STATUS', False))

                    departures.append(
                        Departure(date=departure_date,
                                  actual_price_usd=actual_price_usd,
                                  type=dep_type,
                                  notes=notes,
                                  status=status,
                                  available=available))

                # click the date picker button again before the next year
                datepicker_button_element.click()

            trips.append(Trip(trip_name, trip_code, departures))
            return None

        except NoSuchElementException:
            # an expected element is missing from the page; not retried
            error_log['{}'.format(url)] = 'Bad Link'
            logger.debug('{} - Bad Link'.format(url))
            return None

        except Exception as err:
            # anything else falls through to the retry logic below
            logger.debug('{} - attempt {} failed: {!r}'.format(
                url, retry_count, err))

        finally:
            # single place where the browser is closed, success or not
            driver.quit()

        if retry_count >= 5:
            error_log['{}'.format(url)] = 'Retry timeout'
            logger.debug('{} - Retry timeout'.format(url))
            return None

        sleep(5)
        return get_trip(url, retry_count + 1)