Esempio n. 1
0
    def set_proxy(self, sc_obj):
        """Install a randomly chosen proxy as the session proxy of ``sc_obj``.

        The proxy comes from ``random_proxy()``, which yields
        ``(ip, port, user, password)``. Credentials are only passed to
        ``Proxy`` when a user name is present, so an unauthenticated proxy
        is not given the bogus credential string ``"None:None"``.

        Args:
            sc_obj: Scraper-like object exposing
                ``proxy_manager.session_proxy``.
        """
        proxy_ip, proxy_port, proxy_user, proxy_pass = random_proxy()

        if proxy_user is not None:
            auth_str = "{}:{}".format(proxy_user, proxy_pass)
            proxy = Proxy(proxy_ip, proxy_port, auth_str)
        else:
            proxy = Proxy(proxy_ip, proxy_port)

        sc_obj.proxy_manager.session_proxy = proxy
def start_scraping(threads_number):
    """Scrape every TropicAir route listed in the airport relationship CSV.

    Rows of ``config.AIRPORT_RELATIONSHIP_FILE`` (header skipped) whose third
    column equals ``config.CLASS_TYPE_TROPICAIR_STR`` define the routes. For
    each route the entries returned by ``date_thread_list(threads_number)``
    are handed to ``TropicAir.parse_website`` worker threads, at most
    ``threads_number`` at a time. Each entry is a dict whose ``"status"`` is
    ``"none"`` (waiting), ``"pending"`` (claimed here) or ``"complete"``, and
    which carries a ``"no_result"`` counter summed for the summary row.

    After each route a summary row is saved through ``global_sc_obj.save``;
    once all routes are done the result file is uploaded.

    Args:
        threads_number: Maximum number of concurrent worker threads, and the
            number of ``Scraper`` objects created for them.
    """
    website_type = config.CLASS_TYPE_TROPICAIR

    # Only used to write the per-route summary rows.
    global_sc_obj = Scraper(
        use_cache=False,
        retries=3,
        use_default_logging=False
    )

    tropicair_depart_arrival_list = []

    try:
        with open(config.AIRPORT_RELATIONSHIP_FILE) as csvfile:
            reader = csv.reader(csvfile)

            for i, item in enumerate(reader):
                # Row 0 is the header; rows without both airports are skipped.
                if i > 0 and item[0] != "" and item[1] != "":
                    obj = {
                        "Departure": item[0],
                        "Arrival": item[1],
                        "Type": item[2],
                    }

                    if obj["Type"] == config.CLASS_TYPE_TROPICAIR_STR:
                        tropicair_depart_arrival_list.append(obj)

    except Exception as e:
        print(e)
        return

    # One scraper per worker slot.
    sc_obj_list = [
        Scraper(
            use_cache=False,
            retries=3,
            timeout=60,
            use_default_logging=False
        )
        for _ in range(threads_number)
    ]

    tz = pytz.timezone('America/Los_Angeles')

    depart_arrival_list = tropicair_depart_arrival_list

    if len(depart_arrival_list) == 0:
        print('None depart arrival info')
        return

    filename = "{}.csv".format(common_lib.get_webiste_str(website_type))
    for i, depart_arrival_info in enumerate(depart_arrival_list):
        threads = []

        currentdate = datetime.now(tz)
        print("Current Date & Time: {} , {}".format(
            currentdate.strftime('%Y-%m-%d'), currentdate.strftime('%H:%M')))

        departure = depart_arrival_info["Departure"]
        arrival = depart_arrival_info["Arrival"]

        # The airport code is the part after the first "-" in the CSV value.
        departure_abbr = departure.split("-")[1].strip()
        arrival_abbr = arrival.split("-")[1].strip()

        date_list = date_thread_list(threads_number)

        while len(date_list) > 0:
            if len(threads) < threads_number:
                start_date = None
                start_index = 0

                all_complete = True
                for date_index, date in enumerate(date_list):
                    if date["status"] != "complete":
                        all_complete = False

                    if date["status"] == "none":
                        start_date = date
                        start_index = date_index
                        start_date["status"] = "pending"
                        break

                if all_complete:
                    break

                if start_date is None:
                    # Everything left is already claimed by a worker.
                    continue

                print("++++++++++++++++++++++++++++++")
                print("Depart List = " + str(len(depart_arrival_list)) + " Index =" + str(i))
                print(departure_abbr + "," + arrival_abbr)
                print("++++++++++++++++++++++++++++++")

                sleep(config.DRIVER_SHORT_WAITING_SECONDS)
                proxy_ip, proxy_port, proxy_user, proxy_pass = random_proxy()

                if proxy_user is not None:
                    auth_str = "{}:{}".format(proxy_user, proxy_pass)
                    proxy = Proxy(proxy_ip, proxy_port, auth_str)
                else:
                    proxy = Proxy(proxy_ip, proxy_port)

                # date_list never shrinks in this function, so indexing by
                # len(date_list) always selected the same scraper and every
                # concurrent worker shared it (and overwrote its proxy).
                # Index by the entry's position instead.
                s = sc_obj_list[start_index % threads_number]
                s.proxy_manager.session_proxy = proxy

                class_obj = TropicAir(s, start_date, departure, arrival, currentdate, tz,
                                      departure_abbr, arrival_abbr)

                thread_obj = threading.Thread(target=class_obj.parse_website,
                                              args=(config.DRIVER_VALUE_PHANTOMJS,))

                threads.append(thread_obj)
                thread_obj.start()

            # Reap finished workers. The list is rebuilt rather than calling
            # remove() while iterating, which skips the following element.
            still_running = []
            for thread in threads:
                if thread.is_alive():
                    still_running.append(thread)
                else:
                    thread.join()
            threads = still_running

        no_result = sum(item["no_result"] for item in date_list)

        stopdate = datetime.now(tz)
        print("Finish Date & Time: {} , {}".format(
            stopdate.strftime('%Y-%m-%d'), stopdate.strftime('%H:%M')))

        global_sc_obj.save([
            "Departure", departure,
            "Arrival", arrival,
            "No Result", no_result,
            "File Name", filename,
            "Start", currentdate.strftime('%Y-%m-%d %H:%M'),
            "Finish", stopdate.strftime('%Y-%m-%d %H:%M')
            ], "output/output_{}.csv".format(website_type))

        print("*************************")

    try:
        common_lib.upload_file(filename, "output/")
        print("Upload")
    except Exception as e:
        # Best effort: report the failure (with its cause) and carry on.
        print("Error while upload :" + filename)
        print(e)
Esempio n. 3
0
    def parse_website(self, selenium_driver_type):
        """Scrape MayaislandAir round-trip fares for each date in ``self.date_list["date"]``.

        For every date a fresh random proxy is installed and cookies are
        cleared, then the booking flow is replayed over HTTP: load the
        requirements page to obtain ``VarsSessionID``, post the availability
        search (return date = departure + 1 day), load the flight selection
        page, and add the fares found by ``parse_round_trip`` to the basket to
        read the grand total. One row per flight is stored via ``save_item``.

        A date is retried until it succeeds or its error count reaches
        ``config.MAYAISLAND_SCRAPING_MAX_COUNT``. Dates that give up because no
        fare ids were found count as "no result"; scraping stops early once
        ``config.MAX_NO_RESULT_COUNT`` such dates have accumulated. Finally
        ``self.date_list["status"]`` is set to ``"complete"`` and a summary row
        is written under ``lock``.

        Any exception escaping the flow is printed and ends the method.

        Args:
            selenium_driver_type: Not used by this method.
        """
        self.class_type = config.CLASS_TYPE_MAYAISLANDAIR_STR

        self.parent_url = "https://booking.mayaislandair.com/VARS/Public/CustomerPanels/Requirements.aspx"

        try:
            no_result = 0
            stop_day = ""

            for date_item in self.date_list["date"]:
                print("Start Date {}".format(date_item))
                print("No Result= {}".format(no_result))

                self.start_date = {
                    "date": date_item,
                    "status": "none",
                    "error_count": 0
                }
                self.end_date = {
                    'date': self.start_date["date"] + timedelta(days=1),
                    'status': 'pending'
                }

                stop_day = date_item
                if no_result >= config.MAX_NO_RESULT_COUNT:
                    print("--------------------------")
                    print("No result any more")
                    print("--------------------------")
                    break

                proxy_ip, proxy_port, proxy_user, proxy_pass = random_proxy()

                if proxy_user is not None:
                    auth_str = "{}:{}".format(proxy_user, proxy_pass)
                    proxy = Proxy(proxy_ip, proxy_port, auth_str)
                else:
                    proxy = Proxy(proxy_ip, proxy_port)

                self.scrape_obj.proxy_manager.session_proxy = proxy
                self.scrape_obj.clear_cookies()

                # Retry loop for the current date.
                bStop = False
                while not bStop:

                    print('loading parent page... MayaislandAir')
                    html = self.scrape_obj.load(self.parent_url,
                                                use_cache=False)
                    session_id = html.x("//input[@id='VarsSessionID']/@value"
                                        ).encode('utf8').strip()

                    start_date_str = self.start_date["date"].strftime(
                        '%d-%b-%Y')
                    end_date_str = self.end_date["date"].strftime('%d-%b-%Y')

                    print("++++++++++++++++++++++")
                    print("Start Date = {}".format(start_date_str))
                    print("End Date = {}".format(end_date_str))
                    print("++++++++++++++++++++++")

                    payload = {}
                    form_data = {}
                    form_data["Origin"] = [self.departure_abbr]
                    form_data["VarsSessionID"] = session_id
                    form_data["Destination"] = [self.arrival_abbr]
                    form_data["DepartureDate"] = [start_date_str]
                    form_data["ReturnDate"] = [end_date_str]
                    form_data["Adults"] = "1"
                    form_data["Children"] = "0"
                    form_data["SmallChildren"] = 0
                    form_data["Seniors"] = "0"
                    form_data["Students"] = "0"
                    form_data["Infants"] = "0"
                    form_data["Youths"] = "0"
                    form_data["Teachers"] = "0"
                    form_data["SeatedInfants"] = "0"
                    form_data["EVoucher"] = ""
                    form_data["SearchUser"] = "******"
                    form_data["SearchSource"] = "refine"

                    payload["FormData"] = form_data
                    payload["IsMMBChangeFlightMode"] = False

                    url = "https://booking.mayaislandair.com/VARS/Public/WebServices/AvailabilityWS.asmx/GetFlightAvailability?VarsSessionID={}".format(
                        session_id)

                    headers = {
                        "Content-Type": "application/json",
                    }

                    payload = json.dumps(payload)

                    print('loading get availability page... MayaislandAir')
                    self.scrape_obj.load(url,
                                         post=payload,
                                         headers=headers,
                                         use_cache=False)

                    url = "https://booking.mayaislandair.com/VARS/Public/FlightSelect.aspx"
                    payload = {}
                    payload["VarsSessionID"] = session_id

                    print('loading get info page... MayaislandAir')
                    html = self.scrape_obj.load(url,
                                                post=payload,
                                                use_cache=False)

                    print("Call Parse Round Trip Function")

                    self.wait()
                    departure_fare_id, arrival_fare_id, saved_item_list = self.parse_round_trip(
                        html)

                    print("Departure ID= {}".format(departure_fare_id))
                    print("Arrival ID= {}".format(arrival_fare_id))

                    if departure_fare_id != "" and arrival_fare_id != "":
                        url = "https://booking.mayaislandair.com/vars/public/WebServices/AvailabilityWS.asmx/AddFlightToBasket?VarsSessionID={}".format(
                            session_id)

                        payload = {}
                        form_data = {}

                        form_data["VarsSessionID"] = session_id
                        form_data["fareData"] = [
                            departure_fare_id, arrival_fare_id
                        ]
                        form_data["Zone"] = "PUBLIC"
                        payload["addFlightRequest"] = form_data
                        payload = json.dumps(payload)

                        headers = {
                            "Content-Type": "application/json",
                        }

                        self.wait()
                        json_value = self.scrape_obj.load_json(url,
                                                               post=payload,
                                                               use_cache=False,
                                                               headers=headers)
                        html_content = Doc(html=json_value["d"]["Data"])
                        total_price_str = html_content.x(
                            "//td[@class='BasketGrandTotalPrice']").strip()

                        try:
                            total_fare = re.search("([0-9.]+)",
                                                   total_price_str,
                                                   re.I | re.S | re.M).group(1)
                            currency = re.search("([A-Z]+)", total_price_str,
                                                 re.I | re.S | re.M).group(1)
                        except Exception:
                            # The basket total could not be parsed. The
                            # "none" status means "retry", so stay in the
                            # retry loop rather than returning from the whole
                            # method: returning skipped every remaining date
                            # and left date_list["status"] un-completed.
                            self.start_date["error_count"] += 1
                            if self.start_date[
                                    "error_count"] >= config.MAYAISLAND_SCRAPING_MAX_COUNT:
                                self.start_date["status"] = "complete"
                                bStop = True
                            else:
                                self.start_date["status"] = "none"

                            continue

                        item_list = []
                        saved_time = datetime.now(
                            self.tz).strftime('%Y-%m-%d %H:%M')
                        for save_item in saved_item_list:

                            item = [
                                "Search Start Date",
                                save_item["Search_Start_Date"],
                                "Search End Date",
                                save_item["Search_End_Date"], "Departure Date",
                                save_item["Departure_Date"], "Origin",
                                save_item["Origin"], "Destination",
                                save_item["Destination"], "Leave Time",
                                save_item["Leave_Time"], "Arrive Time",
                                save_item["Arrive_Time"], "Duration",
                                save_item["Duration"], "Flight Number",
                                save_item["Flight_Number"], "Airline Fare",
                                save_item["Fare"], "Fare", total_fare,
                                "Airline", self.class_type, "Currency",
                                currency, "Capture Time", saved_time
                            ]

                            item_list.append(item)

                        self.save_item(item_list)

                        self.start_date["status"] = "complete"
                    else:
                        print("***************ERROR**************")
                        print(self.start_date)
                        print("***************ERROR**************")
                        self.start_date["error_count"] += 1
                        if self.start_date[
                                "error_count"] >= config.MAYAISLAND_SCRAPING_MAX_COUNT:
                            # Give up on this date and count it as empty.
                            self.start_date["status"] = "complete"
                            self.date_list["no_result"] += 1
                            no_result += 1
                        else:
                            self.start_date["status"] = "none"

                    if self.start_date["status"] == "complete":
                        bStop = True

            def day_str(day):
                # "" stands in for a missing day (empty date list).
                return day.strftime("%Y-%m-%d") if day else ""

            scraped_dates = self.date_list["date"]
            start_day = scraped_dates[0] if scraped_dates else ""
            end_day = scraped_dates[-1] if scraped_dates else ""

            self.date_list["status"] = "complete"

            lock.acquire()
            try:
                global_sc_obj.save([
                    "Departure",
                    self.departure,
                    "Arrival",
                    self.arrival,
                    "Start",
                    day_str(start_day),
                    "End",
                    day_str(end_day),
                    "Stop",
                    day_str(stop_day),
                    "No Result",
                    no_result,
                ], "export_{}.csv".format(self.class_type))
            finally:
                # Always release, otherwise a failing save would block every
                # other worker waiting on the lock.
                lock.release()
        except Exception as e:
            print(e)
def start_scraping(threads_number, website_type):
    """Scrape all routes of one airline website, one thread per travel date.

    Routes are read from ``config.AIRPORT_RELATIONSHIP_FILE`` (header row
    skipped) and split by their third column into MayaislandAir and TropicAir
    lists; any other type aborts the run. For each route of the selected
    ``website_type`` a list of ``config.DAYS_TO_BE_SCRAPED`` consecutive dates
    is built and each date is handed to a ``MayaislandAir`` or ``TropicAir``
    worker thread, at most ``threads_number`` at a time. Workers report
    through the date dict's ``"status"`` and the shared ``no_result_info``
    counter; the route is abandoned once that counter exceeds
    ``config.MAX_NO_RESULT_COUNT``. A summary row per route is saved through
    ``global_sc_obj.save``.

    Args:
        threads_number: Maximum number of concurrent worker threads, and the
            number of ``Scraper`` objects created for them.
        website_type: ``config.CLASS_TYPE_MAYAISLANDAIR`` or
            ``config.CLASS_TYPE_TROPICAIR``; any other value finds no routes
            and returns early.
    """
    # Only used to write the per-route summary rows.
    global_sc_obj = Scraper(
        use_cache=False,
        retries=3,
    )

    tropicair_depart_arrival_list = []
    mayaislandair_depart_arrival_list = []

    try:
        with open(config.AIRPORT_RELATIONSHIP_FILE) as csvfile:
            reader = csv.reader(csvfile)

            for i, item in enumerate(reader):
                # Row 0 is the header; rows without both airports are skipped.
                if i > 0 and item[0] != "" and item[1] != "":
                    obj = {
                        "Departure": item[0],
                        "Arrival": item[1],
                        "Type": item[2],
                    }

                    if obj["Type"] == config.CLASS_TYPE_MAYAISLANDAIR_STR:
                        mayaislandair_depart_arrival_list.append(obj)
                    elif obj["Type"] == config.CLASS_TYPE_TROPICAIR_STR:
                        tropicair_depart_arrival_list.append(obj)
                    else:
                        raise Exception("Invalid content in relatin csv file")

    except Exception as e:
        print(e)
        return

    # One scraper per worker slot.
    sc_obj_list = [
        Scraper(
            use_cache=False,
            retries=3,
            timeout=300,
        )
        for _ in range(threads_number)
    ]

    tz = pytz.timezone('America/Los_Angeles')

    depart_arrival_list = []
    if website_type == config.CLASS_TYPE_MAYAISLANDAIR:
        depart_arrival_list = mayaislandair_depart_arrival_list
    elif website_type == config.CLASS_TYPE_TROPICAIR:
        depart_arrival_list = tropicair_depart_arrival_list

    if len(depart_arrival_list) == 0:
        print('None depart arrival info')
        return

    # Shared across routes: workers of a previous route may still be running.
    threads = []

    for i, depart_arrival_info in enumerate(depart_arrival_list):

        currentdate = datetime.now(tz)
        print("Current Date & Time: {} , {}".format(
            currentdate.strftime('%Y-%m-%d'), currentdate.strftime('%H:%M')))

        departure = depart_arrival_info["Departure"]
        arrival = depart_arrival_info["Arrival"]

        departure_abbr = ""
        arrival_abbr = ""

        start_step = 0
        if website_type == config.CLASS_TYPE_MAYAISLANDAIR:
            # Airport code is the text inside the first pair of parentheses.
            departure_abbr = re.search(r"\((.*?)\)", departure,
                                       re.I | re.S | re.M).group(1).strip()
            arrival_abbr = re.search(r"\((.*?)\)", arrival,
                                     re.I | re.S | re.M).group(1).strip()
            start_step = 1  # This website has no data for today; start at +1.
        elif website_type == config.CLASS_TYPE_TROPICAIR:
            # Airport code is the part after the first "-".
            departure_abbr = departure.split("-")[1].strip()
            arrival_abbr = arrival.split("-")[1].strip()

        no_result_info = {"Count": 0}

        date_list = [
            {
                "date": datetime.now(tz) + timedelta(days=step),
                "status": "none",
                "error_count": 0
            }
            for step in range(start_step,
                              start_step + config.DAYS_TO_BE_SCRAPED)
        ]

        start_date_str = ""
        while len(date_list) > 0:
            if len(threads) < threads_number:
                if no_result_info["Count"] > config.MAX_NO_RESULT_COUNT:
                    print("--------------------------")
                    print("No result any more")
                    print("--------------------------")
                    break

                # Drop finished dates by rebuilding the list; removing items
                # while iterating over the same list skips entries.
                date_list = [
                    date for date in date_list if date["status"] != "complete"
                ]

                if len(date_list) == 0:
                    break

                start_date = None
                for date in date_list:
                    if date["status"] == "none":
                        start_date = date
                        start_date["status"] = "pending"
                        break

                if start_date is None:
                    # Everything left is already claimed by a worker.
                    continue

                print("++++++++++++++++++++++++++++++")
                print("Depart List = " + str(len(depart_arrival_list)) +
                      " Index =" + str(i))
                print(departure_abbr + "," + arrival_abbr)
                print(start_date)
                print("++++++++++++++++++++++++++++++")

                start_date_str = start_date["date"].strftime('%Y-%m-%d')

                sleep(config.DRIVER_SHORT_WAITING_SECONDS)
                proxy_ip, proxy_port, proxy_user, proxy_pass = random_proxy()

                if proxy_user is not None:
                    auth_str = "{}:{}".format(proxy_user, proxy_pass)
                    proxy = Proxy(proxy_ip, proxy_port, auth_str)
                else:
                    proxy = Proxy(proxy_ip, proxy_port)

                s = sc_obj_list[len(date_list) % threads_number]
                s.proxy_manager.session_proxy = proxy

                # Compare against the same CLASS_TYPE_* constant used for
                # website_type everywhere else in this function.
                if website_type == config.CLASS_TYPE_MAYAISLANDAIR:
                    class_obj = MayaislandAir(s, start_date, departure,
                                              arrival, currentdate, tz,
                                              departure_abbr, arrival_abbr,
                                              no_result_info)
                else:
                    class_obj = TropicAir(s, start_date, departure, arrival,
                                          currentdate, tz, departure_abbr,
                                          arrival_abbr, no_result_info)

                thread_obj = threading.Thread(
                    target=class_obj.parse_website,
                    args=(config.DRIVER_VALUE_PHANTOMJS, ))

                threads.append(thread_obj)
                thread_obj.start()

            # Reap finished workers without mutating the list being iterated.
            still_running = []
            for thread in threads:
                if thread.is_alive():
                    still_running.append(thread)
                else:
                    thread.join()
            threads = still_running

        print("*************************")
        print(len(date_list))
        print(no_result_info)
        filename = "{}_{}_{}_{}.csv".format(
            common_lib.get_webiste_str(website_type), departure_abbr,
            arrival_abbr, currentdate.strftime('%Y-%b-%d %H'))

        # NOTE: the upload itself is disabled here
        # (common_lib.upload_file(filename, "output/")); only the marker
        # line is printed.
        print("Upload")

        global_sc_obj.save([
            "Departure", departure, "Arrival", arrival, "Date Len",
            len(date_list), "No Result", no_result_info["Count"], "File Name",
            filename, "Start Date", start_date_str
        ], "export_{}.csv".format(website_type))

        print("*************************")
Esempio n. 5
0
def start_scraping(threads_number):
    """Scrape every TropicAir route using a pool of PhantomJS drivers.

    Routes are the rows of ``config.AIRPORT_RELATIONSHIP_FILE`` (header
    skipped) whose third column equals ``config.CLASS_TYPE_TROPICAIR_STR``.
    ``threads_number`` drivers are created up front through
    ``common_lib.create_phantomjs_driver``. For each route a list of
    ``config.DAYS_TO_BE_SCRAPED`` consecutive dates is built and each date is
    handed, together with a free driver slot, to a ``TropicAir`` worker
    thread. A driver slot is free while its ``"status"`` is ``"none"``; this
    function sets it to ``"pending"`` when handing it out (presumably the
    worker resets it when done - confirm in ``TropicAir``).

    The route is abandoned once ``no_result_info["Count"]`` exceeds
    ``config.MAX_NO_RESULT_COUNT``. A summary row per route is saved through
    ``global_sc_obj.save`` and the shared result file is uploaded at the end.

    Args:
        threads_number: Maximum number of concurrent worker threads, and the
            number of PhantomJS drivers created.
    """
    website_type = config.CLASS_TYPE_TROPICAIR

    # Only used to write the per-route summary rows.
    global_sc_obj = Scraper(
        use_cache=False,
        retries=3,
        use_default_logging=False)

    tropicair_depart_arrival_list = []

    try:
        with open(config.AIRPORT_RELATIONSHIP_FILE) as csvfile:
            reader = csv.reader(csvfile)

            for i, item in enumerate(reader):
                # Row 0 is the header; rows without both airports are skipped.
                if i > 0 and item[0] != "" and item[1] != "":
                    obj = {
                        "Departure": item[0],
                        "Arrival": item[1],
                        "Type": item[2],
                    }

                    if obj["Type"] == config.CLASS_TYPE_TROPICAIR_STR:
                        tropicair_depart_arrival_list.append(obj)

    except Exception as e:
        print(e)
        return

    driver_list = []

    for _ in range(threads_number):
        # Only the driver itself is kept; the other returned values are unused.
        driver, _user_agent, _proxy, _screen_resolution = common_lib.create_phantomjs_driver(
        )
        driver_list.append({"driver": driver, "status": "none"})

    tz = pytz.timezone('America/Los_Angeles')

    depart_arrival_list = tropicair_depart_arrival_list

    if len(depart_arrival_list) == 0:
        print('None depart arrival info')
        return

    # Shared across routes: workers of a previous route may still be running.
    threads = []

    file_currentdate = datetime.now(tz)
    filename = "{}_{}.csv".format(common_lib.get_webiste_str(website_type),
                                  file_currentdate.strftime('%Y-%m-%d %H:%M'))

    for i, depart_arrival_info in enumerate(depart_arrival_list):
        currentdate = datetime.now(tz)
        print("Current Date & Time: {} , {}".format(
            currentdate.strftime('%Y-%m-%d'), currentdate.strftime('%H:%M')))

        departure = depart_arrival_info["Departure"]
        arrival = depart_arrival_info["Arrival"]

        # The airport code is the part after the first "-" in the CSV value.
        departure_abbr = departure.split("-")[1].strip()
        arrival_abbr = arrival.split("-")[1].strip()

        no_result_info = {"Count": 0}

        date_list = [
            {
                "date": datetime.now(tz) + timedelta(days=step),
                "status": "none",
                "error_count": 0
            }
            for step in range(config.DAYS_TO_BE_SCRAPED)
        ]

        stop_date_str = ""
        start_date_str = currentdate.strftime('%Y-%m-%d %H:%M')

        print("************************************")
        print("{} {} {}".format(len(date_list), departure, arrival))
        print("************************************")

        while len(date_list) > 0:
            if len(threads) < threads_number:
                if no_result_info["Count"] > config.MAX_NO_RESULT_COUNT:
                    print("--------------------------")
                    print("No result any more")
                    print("--------------------------")
                    break

                # Drop finished dates by rebuilding the list; removing items
                # while iterating over the same list skips entries.
                date_list = [
                    date for date in date_list if date["status"] != "complete"
                ]

                if len(date_list) == 0:
                    break

                start_date = None
                for date in date_list:
                    if date["status"] == "none":
                        start_date = date
                        start_date["status"] = "pending"
                        break

                if start_date is None:
                    # Everything left is already claimed by a worker.
                    continue

                phantom_obj = None
                for driver in driver_list:
                    if driver["status"] == "none":
                        phantom_obj = driver
                        driver["status"] = "pending"
                        break

                if phantom_obj is None:
                    # No free driver: hand the date back. Leaving it
                    # "pending" would mean it is never scraped and never
                    # completes, so this loop could not finish.
                    start_date["status"] = "none"
                    continue

                print("++++++++++++++++++++++++++++++")
                print("Depart List = " + str(len(depart_arrival_list)) +
                      " Index =" + str(i))
                print(departure_abbr + "," + arrival_abbr)
                print(start_date)
                print("++++++++++++++++++++++++++++++")

                stop_date_str = start_date["date"].strftime('%Y-%m-%d %H:%M')

                sleep(config.DRIVER_SHORT_WAITING_SECONDS)

                class_obj = TropicAir(phantom_obj, start_date, departure,
                                      arrival, currentdate, tz, departure_abbr,
                                      arrival_abbr, no_result_info, filename)

                thread_obj = threading.Thread(
                    target=class_obj.parse_website,
                    args=(config.DRIVER_VALUE_PHANTOMJS, ))

                threads.append(thread_obj)
                thread_obj.start()

            # Reap finished workers without mutating the list being iterated.
            still_running = []
            for thread in threads:
                if thread.is_alive():
                    still_running.append(thread)
                else:
                    thread.join()
            threads = still_running

        print("*************************")
        print(len(date_list))
        print(no_result_info)

        finishdate = datetime.now(tz)
        finish_date_str = finishdate.strftime('%Y-%m-%d %H:%M')

        global_sc_obj.save([
            "Departure",
            departure,
            "Arrival",
            arrival,
            "Date Len",
            len(date_list),
            "No Result",
            no_result_info["Count"],
            "File Name",
            filename,
            "Start Date",
            start_date_str,
            "Finish",
            stop_date_str,
            "Capture Date",
            finish_date_str,
        ], "output/output_{}.csv".format(website_type))

        print("*************************")

    try:
        common_lib.upload_file(filename, "output/")
        print("Upload {} {}".format(departure, arrival))
    except Exception as e:
        # Best effort: report the failure (with its cause) and carry on.
        print("Error while upload :" + filename)
        print(e)