def get_random_proxy():
    """Scrape free-proxy-list.net and return one https-capable proxy at random.

    Returns:
        str: a proxy in "ip:port" form, picked from the first 20 table rows
        whose "Https" column reads "yes".
    """
    response = requests.get('https://free-proxy-list.net/')
    parser = fromstring(response.text)
    https_proxies = set()
    # Only the first 20 rows of the proxy table are considered.
    for row in parser.xpath('//tbody/tr')[:20]:
        # Column 7 is the "Https" flag; skip rows that are not https-capable.
        if not row.xpath('.//td[7][contains(text(),"yes")]'):
            continue
        ip = row.xpath('.//td[1]/text()')[0]
        port = row.xpath('.//td[2]/text()')[0]
        https_proxies.add(":".join([ip, port]))
    return choice(tuple(https_proxies))
def fetchHotelPrices(dateHotel):
    """Scrape average nightly London hotel rates from Expedia for each date.

    Args:
        dateHotel: iterable of check-in date strings in '%m/%d/%Y' format;
            each stay is one night (checkout = check-in + 1 day).

    Returns:
        pandas.DataFrame with columns 'Date' (ISO '%Y-%m-%d') and 'HotelRate'
        (mean of the listed prices). Dates with no parsable prices are skipped.

    NOTE(review): relies on a module-level Selenium `driver` — confirm it is
    initialised before this is called.
    """
    frames = []
    for start_date in dateHotel:
        start_dt = datetime.datetime.strptime(start_date, '%m/%d/%Y')
        end_date = datetime.datetime.strftime(start_dt + timedelta(days=1),
                                              "%m/%d/%Y")
        url = "https://www.expedia.com/Hotel-Search?destination=london&startDate={}&endDate={}&adults=2&star=50,40&lodging=hotels".format(
            start_date, end_date)
        iso_date = start_dt.strftime('%Y-%m-%d')
        driver.get(url)
        sleep(20)  # crude wait for the JS-rendered results to load
        parser = html.fromstring(driver.page_source, driver.current_url)
        hotels = parser.xpath(".//*[@id='resultsContainer']/section/article")
        prices = []
        for hotel in hotels:
            lowestPrice = hotel.xpath(
                "normalize-space(div[2]/div/div[1]/div[3]/div/div[1]/span/ul/li[@data-automation='actual-price']/span[2]/text())"
            )
            if not lowestPrice:
                # Fallback layout: the price sits inside an anchor, not a span.
                lowestPrice = hotel.xpath(
                    "normalize-space(div[2]/div/div[1]/div[3]/div/div[1]/span/ul/li[@data-automation='actual-price']/a/text())"
                )
            if not lowestPrice:
                continue
            prices.append(int(lowestPrice.replace("$", "").replace(",", "")))
        if not prices:
            continue
        frames.append(pd.DataFrame({'Date': iso_date,
                                    'HotelRate': sum(prices) / len(prices)},
                                   index=[0]))
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    if not frames:
        return pd.DataFrame([])
    return pd.concat(frames, ignore_index=True)
def fetchFlightFare(dateFlight):
    """Scrape average one-way economy fares to London from Expedia's cached JSON.

    Args:
        dateFlight: iterable of departure date strings in '%m/%d/%Y' format.

    Returns:
        pandas.DataFrame with columns 'Date' (ISO '%Y-%m-%d'), 'FlightRate'
        (mean totalPriceAsDecimal over all returned legs) and 'Carrier'.

    NOTE(review): relies on a module-level Selenium `driver` — confirm it is
    initialised before this is called.
    """
    # Only Qatar Airways ex-Doha for now; Turkish ex-Istanbul kept for reference:
    # [{'origin': 'doha', 'carrier': 'QR'}, {'origin': 'istanbul', 'carrier': 'TK'}]
    carrierDetails = [{'origin': 'doha', 'carrier': 'QR'}]
    frames = []
    for carrier_info in carrierDetails:
        source = carrier_info['origin']
        destination = 'london'
        carrierFlight = carrier_info['carrier']
        for start_date in dateFlight:
            iso_date = datetime.datetime.strptime(
                start_date, '%m/%d/%Y').strftime('%Y-%m-%d')
            url = "https://www.expedia.com/Flights-Search?trip=oneway&leg1=from:{0},to:{1},departure:{2}TANYT&passengers=adults:1,children:0,seniors:0,infantinlap:Y&options=carrier:{3},cabinclass:economy,maxhops:0&mode=search&origref=www.expedia.com".format(
                source, destination, start_date, carrierFlight)
            driver.get(url)
            parser = html.fromstring(driver.page_source, driver.current_url)
            json_data_xpath = parser.xpath(
                "//script[@id='cachedResultsJson']//text()")
            # The page embeds the search results as JSON-in-JSON.
            raw_json = json.loads(json_data_xpath[0])
            flight_data = json.loads(raw_json["content"])
            fares = [flight_data['legs'][leg]['price']['totalPriceAsDecimal']
                     for leg in flight_data['legs']]
            if not fares:
                continue
            frames.append(pd.DataFrame({'Date': iso_date,
                                        'FlightRate': sum(fares) / len(fares),
                                        'Carrier': carrierFlight},
                                       index=[0]))
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    if not frames:
        return pd.DataFrame([])
    return pd.concat(frames, ignore_index=True)
def _parse_review_data(asin, page_number=1):
    """Scrape one page of Amazon UK reviews for a product.

    Args:
        asin: Amazon product identifier.
        page_number: 1-based review page to fetch (default 1).

    Returns:
        list of dicts with keys 'id', 'comment_count', 'content',
        'post_date' ('%d %b %Y'), 'title', 'rating' and 'author'.
    """
    amazon_url = 'https://www.amazon.co.uk/product-reviews/%s/ref=cm_cr_arp_d_viewopt_sr?ie=UTF8&reviewerType=all_reviews&showViewpoints=1&sortBy=recent&pageNumber=%s&filterByStar=all_stars' % (
        asin, page_number)
    print(amazon_url)  # print() form works on both Python 2 and 3
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'
    }
    page = requests.get(amazon_url, headers=headers)
    parser = html.fromstring(page.text)

    # XPaths are constant; hoisted out of the per-review loop.
    XPATH_ID = '@id'
    XPATH_RATING = './/i[@data-hook="review-star-rating"]//text()'
    XPATH_REVIEW_HEADER = './/a[@data-hook="review-title"]//text()'
    XPATH_REVIEW_POSTED_DATE = './/a[contains(@href,"/profile/")]/parent::span/following-sibling::span/text()'
    XPATH_REVIEW_TEXT = './/span[@data-hook="review-body"]//text()'
    XPATH_REVIEW_COMMENTS = './/span[@class="review-comment-total"]//text()'
    XPATH_AUTHOR = './/a[contains(@href,"/profile/")]/parent::span//text()'

    reviews_list = []
    for review in parser.xpath('//div[@data-hook="review"]'):
        raw_review_id = review.xpath(XPATH_ID)
        raw_review_author = review.xpath(XPATH_AUTHOR)
        raw_review_rating = review.xpath(XPATH_RATING)
        raw_review_header = review.xpath(XPATH_REVIEW_HEADER)
        raw_review_posted_date = review.xpath(XPATH_REVIEW_POSTED_DATE)
        raw_review_text = review.xpath(XPATH_REVIEW_TEXT)
        raw_review_comments = review.xpath(XPATH_REVIEW_COMMENTS)

        # cleaning data
        review_id = ''.join(raw_review_id)  # renamed: `id` shadows the builtin
        rating = ''.join(raw_review_rating).replace('out of 5 stars', '')
        title = ' '.join(' '.join(raw_review_header).split())
        content = ' '.join(' '.join(raw_review_text).split())
        post_date = dateparser.parse(
            ''.join(raw_review_posted_date)).strftime('%d %b %Y')
        author = ' '.join(' '.join(raw_review_author).split())
        # strip('By') strips any leading/trailing 'B'/'y' CHARACTERS and
        # mangles names like "Bobby" or "Sandy"; remove only a literal
        # "By" prefix instead.
        if author.startswith('By'):
            author = author[2:].strip()
        comment_count = ''.join(raw_review_comments)

        reviews_list.append({
            'id': review_id,
            'comment_count': comment_count,
            'content': content,
            'post_date': post_date,
            'title': title,
            'rating': rating,
            'author': author,
        })
    return reviews_list
def _parse_total_review_count(asin):
    """Return the total number of reviews listed for `asin` on Amazon UK.

    Returns:
        int: the review count, or 0 when the count element is absent or
        contains no digits.
    """
    total_review_count = 0
    amazon_url = 'https://www.amazon.co.uk/product-reviews/%s/ref=cm_cr_arp_d_viewopt_sr?ie=UTF8&reviewerType=all_reviews&showViewpoints=1&sortBy=helpful&pageNumber=1&filterByStar=all_stars' % asin
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'
    }
    page = requests.get(amazon_url, headers=headers)
    parser = html.fromstring(page.text)
    TOTAL_REVIEWS = '//span[@data-hook="total-review-count"]/text()'
    total_reviews = parser.xpath(TOTAL_REVIEWS)
    if total_reviews:
        # The count text may contain thousands separators or labels
        # (e.g. "1,234 global ratings"), on which a bare int() raises
        # ValueError — keep digits only before converting.
        digits = ''.join(ch for ch in total_reviews[0] if ch.isdigit())
        if digits:
            total_review_count = int(digits)
    return total_review_count
def _parse_review_data_page(asin):
    """Return the number of review pages available for `asin` (1 when the
    pagination bar is missing or empty)."""
    amazon_url = 'https://www.amazon.co.uk/product-reviews/%s/ref=cm_cr_arp_d_viewopt_sr?ie=UTF8&reviewerType=all_reviews&showViewpoints=1&sortBy=recent&pageNumber=1&filterByStar=all_stars' % asin
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'
    }
    response = requests.get(amazon_url, headers=headers)
    tree = html.fromstring(response.text)
    bars = tree.xpath('//div[@id="cm_cr-pagination_bar"]')
    if not bars:
        return 1
    page_links = bars[0].xpath(
        './/li[@data-reftag="cm_cr_arp_d_paging_btm"]/a/text()')
    if not page_links:
        return 1
    # The last pagination link carries the highest page number.
    return int(page_links[-1])
def get_proxies(self, number_of_proxies):
    """Returns max 10 free https proxies by scraping free-proxy website.
    @arg number_of_proxies to be returned"""
    response = requests.get('https://free-proxy-list.net/')
    parser = fromstring(response.text)
    collected = set()
    for row in parser.xpath('//tbody/tr'):
        # Stop as soon as we have gathered enough working proxies.
        if len(collected) >= number_of_proxies:
            break
        # Column 7 is the "Https" flag; only https-capable rows qualify.
        if not row.xpath('.//td[7][contains(text(),"yes")]'):
            continue
        #Grabbing IP and corresponding PORT
        ip = row.xpath('.//td[1]/text()')[0]
        port = row.xpath('.//td[2]/text()')[0]
        candidate = ":".join([ip, port])
        # Try to get google.com with the proxy to check if this proxy is ok.
        if self.valid_proxy(candidate):
            collected.add(candidate)
    return collected