Beispiel #1
0
def get_random_proxy():
    """Return one randomly chosen proxy scraped from free-proxy-list.net.

    Only the first 20 table rows of the listing are inspected, and only rows
    whose seventh cell contains "yes" are kept (presumably the site's HTTPS
    column -- confirm against the live page layout).

    Returns:
        A ``"host:port"`` string built from the first two cells of a row.

    Raises:
        IndexError: if none of the inspected rows yields a usable proxy.
    """
    url = 'https://free-proxy-list.net/'
    # Without a timeout, requests waits indefinitely on a stalled server.
    response = requests.get(url, timeout=10)
    parser = fromstring(response.text)
    proxies = set()
    for row in parser.xpath('//tbody/tr')[:20]:
        if not row.xpath('.//td[7][contains(text(),"yes")]'):
            continue
        host = row.xpath('.//td[1]/text()')
        port = row.xpath('.//td[2]/text()')
        # Skip malformed rows instead of failing on a missing cell.
        if host and port:
            proxies.add(":".join([host[0], port[0]]))
    if not proxies:
        # Same exception type choice() raises on an empty sequence, but with
        # a message that points at the real cause.
        raise IndexError('no usable proxies found on ' + url)
    return choice(tuple(proxies))
def fetchHotelPrices(dateHotel):
    """Scrape Expedia for the average London hotel price on each given date.

    For every check-in date a one-night hotel search is opened through the
    module-level ``driver`` (presumably a Selenium WebDriver), the rendered
    page is parsed, and the prices found on the result cards are averaged.

    Args:
        dateHotel: iterable of check-in dates as ``'%m/%d/%Y'`` strings.

    Returns:
        A ``pandas.DataFrame`` with one row per date that produced at least
        one price, holding ``Date`` (``'%Y-%m-%d'`` string) and ``HotelRate``
        (mean of the prices found). Dates without any price are omitted; the
        frame is empty (no columns) when nothing was found.

    Raises:
        ValueError: if a date string does not match ``'%m/%d/%Y'`` or a price
            text is not an integer once ``$`` and ``,`` are removed.
    """
    # The price is rendered either inside a <span> or inside an <a>.
    price_in_span = (
        "normalize-space(div[2]/div/div[1]/div[3]/div/div[1]/span/ul/li[@data-automation='actual-price']/span[2]/text())"
    )
    price_in_link = (
        "normalize-space(div[2]/div/div[1]/div[3]/div/div[1]/span/ul/li[@data-automation='actual-price']/a/text())"
    )

    # Rows are collected and turned into a frame once at the end:
    # DataFrame.append was removed in pandas 2.0.
    rows = []

    for start_date in dateHotel:
        check_in = datetime.datetime.strptime(start_date, '%m/%d/%Y')
        end_date = (check_in + timedelta(days=1)).strftime('%m/%d/%Y')

        url = "https://www.expedia.com/Hotel-Search?destination=london&startDate={}&endDate={}&adults=2&star=50,40&lodging=hotels".format(
            start_date, end_date)
        driver.get(url)
        # Fixed pause before reading page_source, giving the page time to load.
        sleep(20)
        parser = html.fromstring(driver.page_source, driver.current_url)

        prices = []
        for hotel in parser.xpath(".//*[@id='resultsContainer']/section/article"):
            # normalize-space() always yields a string, so an empty string
            # is the only "not found" signal.
            price_text = hotel.xpath(price_in_span) or hotel.xpath(price_in_link)
            if not price_text:
                continue
            prices.append(int(price_text.replace("$", "").replace(",", "")))

        if not prices:
            continue

        rows.append({
            'Date': check_in.strftime('%Y-%m-%d'),
            'HotelRate': sum(prices) / len(prices),
        })

    return pd.DataFrame(rows)
def fetchFlightFare(dateFlight):
    """Scrape Expedia for the average one-way flight fare to London per date.

    For each configured origin/carrier pair and each departure date, the
    flight search page is opened through the module-level ``driver``
    (presumably a Selenium WebDriver) and the JSON embedded in the
    ``cachedResultsJson`` script tag is read; the ``totalPriceAsDecimal`` of
    every leg is averaged.

    Args:
        dateFlight: iterable of departure dates as ``'%m/%d/%Y'`` strings.

    Returns:
        A ``pandas.DataFrame`` with one row per (carrier, date) that had at
        least one leg, holding ``Date`` (``'%Y-%m-%d'`` string),
        ``FlightRate`` (mean leg price) and ``Carrier``. The frame is empty
        (no columns) when nothing was found.

    Raises:
        ValueError: if a date string does not match ``'%m/%d/%Y'``.
        IndexError: if the page has no ``cachedResultsJson`` script.
    """
    # Add further {'origin': ..., 'carrier': ...} entries here (for example
    # istanbul / TK) to cover more routes.
    carrierDetails = [{'origin': 'doha', 'carrier': 'QR'}]
    destination = 'london'

    # Rows are collected and turned into a frame once at the end:
    # DataFrame.append was removed in pandas 2.0.
    rows = []

    for carriers in carrierDetails:
        source = carriers['origin']
        carrierFlight = carriers['carrier']

        for start_date in dateFlight:
            dates = datetime.datetime.strptime(
                start_date, '%m/%d/%Y').strftime('%Y-%m-%d')

            url = "https://www.expedia.com/Flights-Search?trip=oneway&leg1=from:{0},to:{1},departure:{2}TANYT&passengers=adults:1,children:0,seniors:0,infantinlap:Y&options=carrier:{3},cabinclass:economy,maxhops:0&mode=search&origref=www.expedia.com".format(
                source, destination, start_date, carrierFlight)

            driver.get(url)
            parser = html.fromstring(driver.page_source, driver.current_url)

            json_data_xpath = parser.xpath(
                "//script[@id='cachedResultsJson']//text()")

            # The script holds a JSON document whose "content" field is
            # itself a JSON-encoded string, hence the two decoding steps.
            raw_json = json.loads(json_data_xpath[0])
            flight_data = json.loads(raw_json["content"])

            prices = [
                leg['price']['totalPriceAsDecimal']
                for leg in flight_data['legs'].values()
            ]
            if not prices:
                continue

            rows.append({
                'Date': dates,
                'FlightRate': sum(prices) / len(prices),
                'Carrier': carrierFlight,
            })

    return pd.DataFrame(rows)
Beispiel #4
0
def _parse_review_data(asin, page_number=1):
    """Fetch one page of Amazon UK reviews for a product and parse them.

    The reviews listing (sorted by most recent) is downloaded with a
    browser-like User-Agent and every ``data-hook="review"`` block is turned
    into a dictionary.

    Args:
        asin: product identifier inserted into the reviews URL.
        page_number: 1-based page of the listing to fetch.

    Returns:
        A list of dicts with the keys ``id``, ``comment_count``, ``content``,
        ``post_date``, ``title``, ``rating`` and ``author``. ``post_date`` is
        formatted as ``'%d %b %Y'``; when the date text cannot be parsed the
        raw text is kept instead.
    """
    amazon_url = 'https://www.amazon.co.uk/product-reviews/%s/ref=cm_cr_arp_d_viewopt_sr?ie=UTF8&reviewerType=all_reviews&showViewpoints=1&sortBy=recent&pageNumber=%s&filterByStar=all_stars' % (
        asin, page_number)
    # Function-call form prints the same text on Python 2 and Python 3.
    print(amazon_url)
    headers = {
        'User-Agent':
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'
    }
    page = requests.get(amazon_url, headers=headers)
    parser = html.fromstring(page.text)

    # Selectors are constant, so they are defined once outside the loop.
    REVIEW_LIST = '//div[@data-hook="review"]'
    XPATH_ID = '@id'
    XPATH_RATING = './/i[@data-hook="review-star-rating"]//text()'
    XPATH_REVIEW_HEADER = './/a[@data-hook="review-title"]//text()'
    XPATH_REVIEW_POSTED_DATE = './/a[contains(@href,"/profile/")]/parent::span/following-sibling::span/text()'
    XPATH_REVIEW_TEXT = './/span[@data-hook="review-body"]//text()'
    XPATH_REVIEW_COMMENTS = './/span[@class="review-comment-total"]//text()'
    XPATH_AUTHOR = './/a[contains(@href,"/profile/")]/parent::span//text()'

    reviews_list = []
    for review in parser.xpath(REVIEW_LIST):
        raw_review_id = review.xpath(XPATH_ID)
        raw_review_author = review.xpath(XPATH_AUTHOR)
        raw_review_rating = review.xpath(XPATH_RATING)
        raw_review_header = review.xpath(XPATH_REVIEW_HEADER)
        raw_review_posted_date = review.xpath(XPATH_REVIEW_POSTED_DATE)
        raw_review_text = review.xpath(XPATH_REVIEW_TEXT)
        raw_review_comments = review.xpath(XPATH_REVIEW_COMMENTS)

        # cleaning data
        review_id = ''.join(raw_review_id)
        rating = ''.join(raw_review_rating).replace('out of 5 stars', '')
        title = ' '.join(' '.join(raw_review_header).split())
        content = ' '.join(' '.join(raw_review_text).split())

        # dateparser.parse returns None for text it cannot interpret; keep
        # the raw text in that case instead of failing on None.strftime.
        raw_date = ''.join(raw_review_posted_date)
        parsed_date = dateparser.parse(raw_date)
        if parsed_date is not None:
            post_date = parsed_date.strftime('%d %b %Y')
        else:
            post_date = raw_date.strip()

        # Drop only the leading "By" label. str.strip('By') would remove any
        # run of the characters 'B' and 'y' from BOTH ends and so corrupt
        # names such as "Bobby".
        author = ' '.join(' '.join(raw_review_author).split())
        if author == 'By' or author.startswith('By '):
            author = author[len('By'):]
        author = author.strip()

        comment_count = ''.join(raw_review_comments)

        reviews_list.append({
            'id': review_id,
            'comment_count': comment_count,
            'content': content,
            'post_date': post_date,
            'title': title,
            'rating': rating,
            'author': author,
        })
    return reviews_list
Beispiel #5
0
def _parse_total_review_count(asin):
    """Return the total number of reviews Amazon UK reports for a product.

    The first page of the reviews listing is downloaded with a browser-like
    User-Agent and the text of the ``data-hook="total-review-count"`` span is
    converted to an integer.

    Args:
        asin: product identifier inserted into the reviews URL.

    Returns:
        The review count as an ``int``, or 0 when the span is not present.

    Raises:
        ValueError: if the span text is not a number once thousands
            separators are removed.
    """
    total_review_count = 0
    amazon_url = 'https://www.amazon.co.uk/product-reviews/%s/ref=cm_cr_arp_d_viewopt_sr?ie=UTF8&reviewerType=all_reviews&showViewpoints=1&sortBy=helpful&pageNumber=1&filterByStar=all_stars' % asin
    headers = {
        'User-Agent':
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'
    }
    page = requests.get(amazon_url, headers=headers)
    parser = html.fromstring(page.text)

    TOTAL_REVIEWS = '//span[@data-hook="total-review-count"]/text()'
    total_reviews = parser.xpath(TOTAL_REVIEWS)
    if total_reviews:
        # Large counts may be rendered with thousands separators ("1,234"),
        # which int() rejects; remove them before converting.
        total_review_count = int(total_reviews[0].replace(',', ''))
    return total_review_count
Beispiel #6
0
def _parse_review_data_page(asin):
    """Return how many review pages Amazon UK lists for a product.

    The first page of the reviews listing (sorted by most recent) is fetched
    with a browser-like User-Agent, and the text of the last numbered link in
    the ``cm_cr-pagination_bar`` element is read as the page count.

    Args:
        asin: product identifier inserted into the reviews URL.

    Returns:
        The page count as an ``int``; 1 when the pagination bar or its
        numbered links are absent.
    """
    listing_url = 'https://www.amazon.co.uk/product-reviews/%s/ref=cm_cr_arp_d_viewopt_sr?ie=UTF8&reviewerType=all_reviews&showViewpoints=1&sortBy=recent&pageNumber=1&filterByStar=all_stars' % asin
    user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'
    response = requests.get(listing_url, headers={'User-Agent': user_agent})
    tree = html.fromstring(response.text)

    bars = tree.xpath('//div[@id="cm_cr-pagination_bar"]')
    if not bars:
        # No pagination bar: everything fits on a single page.
        return 1

    page_labels = bars[0].xpath(
        './/li[@data-reftag="cm_cr_arp_d_paging_btm"]/a/text()')
    if not page_labels:
        return 1

    # The last numbered link carries the highest page number.
    return int(page_labels[-1])
Beispiel #7
0
    def get_proxies(self, number_of_proxies):
        """Scrape free-proxy-list.net for validated proxies.

        Table rows are read in page order. A row is considered only when its
        seventh cell contains "yes"; its first two cells are joined into a
        ``"host:port"`` string, which is kept if ``self.valid_proxy`` accepts
        it. Scanning stops once enough proxies were collected.

        @arg number_of_proxies upper bound on how many proxies are returned
        @return set of ``"host:port"`` strings, possibly smaller than
                requested when the page does not offer enough valid proxies
        """
        page = requests.get('https://free-proxy-list.net/')
        tree = fromstring(page.text)

        found = set()
        for row in tree.xpath('//tbody/tr'):
            if len(found) >= number_of_proxies:
                break
            if not row.xpath('.//td[7][contains(text(),"yes")]'):
                continue

            # First cell holds the IP, second cell the matching port.
            ip = row.xpath('.//td[1]/text()')[0]
            port = row.xpath('.//td[2]/text()')[0]
            candidate = ":".join([ip, port])

            # Only keep proxies that pass the instance's own check.
            if self.valid_proxy(candidate):
                found.add(candidate)
        return found