def set_proxy(self, sc_obj):
    proxy_ip, proxy_port, proxy_user, proxy_pass = random_proxy()
    auth_str = "{}:{}".format(proxy_user, proxy_pass)
    proxy = Proxy(proxy_ip, proxy_port, auth_str)
    sc_obj.proxy_manager.session_proxy = proxy
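# set_proxy() / get_proxy() / create_phantomjs_driver() all rely on a random_proxy()
# helper that is not shown in this section. Below is a minimal sketch of the assumed
# contract: it returns an (ip, port, user, password) tuple, where user/password may
# be None for unauthenticated proxies (callers check `proxy_user is not None`).
# The PROXY_LIST pool and its entries are hypothetical, for illustration only.
import random

PROXY_LIST = [
    ("127.0.0.1", "8080", None, None),           # unauthenticated proxy
    ("127.0.0.2", "3128", "scraper", "secret"),  # authenticated proxy
]

def random_proxy():
    # Pick one proxy tuple at random from the configured pool.
    return random.choice(PROXY_LIST)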
def get_proxy():
    proxy_ip, proxy_port, proxy_user, proxy_pass = random_proxy()
    proxies = {
        'http': 'http://{}:{}@{}:{}'.format(proxy_user, proxy_pass, proxy_ip, proxy_port),
        'https': 'http://{}:{}@{}:{}'.format(proxy_user, proxy_pass, proxy_ip, proxy_port),
    }
    return proxies
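# Usage sketch for get_proxy(): the dict it builds matches the `proxies` mapping
# expected by the requests library, so it can be passed straight to requests.get().
# The target URL here is just an example.
import requests

proxies = get_proxy()
response = requests.get("https://httpbin.org/ip", proxies=proxies, timeout=30)
print(response.text)  # should report the proxy's IP, not the local one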
def create_phantomjs_driver():
    ua = ["D", "W", agent.firefox]
    # if ua == None:
    #     ua = random_agent()
    #     # e.g. ["M", "A", "Mozilla/5.0 (Android; Mobile; rv:40.0) Gecko/40.0 Firefox/40.0"]
    # if ua[0] == "M":  # Mobile
    #     screen_resolution = random.choice(config.MOBILE_SC)
    # elif ua[0] == "T":  # Tablet
    #     screen_resolution = random.choice(config.TABLET_SC)
    # elif ua[0] == "D":  # Desktop
    #     screen_resolution = random.choice(config.DESKTOP_SC)
    screen_resolution = [1366, 768]

    proxy_ip, proxy_port, proxy_user, proxy_pass = random_proxy()

    user_agent = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0"
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = user_agent
    dcap["phantomjs.page.settings.loadImages"] = True
    dcap["phantomjs.page.settings.resourceTimeout"] = 60000
    dcap["phantomjs.page.customHeaders.User-Agent"] = user_agent
    # dcap['phantomjs.page.customHeaders.Accept'] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
    # dcap['phantomjs.page.customHeaders.Accept-Encoding'] = "gzip, deflate, *"
    accept_language = "en-US,en;q=0.5"
    # if accept_language != "":
    #     dcap['phantomjs.page.customHeaders.Accept-Language'] = accept_language

    proxy_str = "{}:{}".format(proxy_ip, proxy_port)
    auth_str = "{}:{}".format(proxy_user, proxy_pass)
    c_type = 'http'
    service_args = [
        '--proxy=%s' % proxy_str,
        '--proxy-type=%s' % c_type,
        '--proxy-auth=%s' % auth_str,
        '--ignore-ssl-errors=true',
        '--ssl-protocol=any',
        '--web-security=false'
    ]
    print(service_args)

    driver = None
    try:
        driver = webdriver.PhantomJS(desired_capabilities=dcap, service_args=service_args)
        # driver = webdriver.PhantomJS(desired_capabilities=dcap)
        # driver = webdriver.PhantomJS()
    except WebDriverException as e:
        print("webdriver.PhantomJS WebDriverException -> %s" % str(e))
        try:
            if driver:
                phantom_Quit(driver)
        except Exception:
            pass
        # Was `return None, ua, proxy, ...`; `proxy` is never defined in this
        # function, so return proxy_ip to match the success path below.
        return None, ua, proxy_ip, screen_resolution

    driver.set_window_size(screen_resolution[0], screen_resolution[1])
    driver.implicitly_wait(config.DRIVER_WAITING_SECONDS)
    driver.set_page_load_timeout(config.DRIVER_WAITING_SECONDS)
    return driver, ua, proxy_ip, screen_resolution
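# Usage sketch for create_phantomjs_driver(): the first element of the returned
# tuple is None when PhantomJS failed to start, so callers should check it before
# navigating, and quit the driver when done to avoid orphaned phantomjs processes.
driver, ua, proxy_ip, screen_resolution = create_phantomjs_driver()
if driver is not None:
    try:
        driver.get("https://example.com/")
        print(driver.title)
    finally:
        driver.quit()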
def start_scraping(threads_number):
    global config
    website_type = config.CLASS_TYPE_TROPICAIR
    global_sc_obj = Scraper(
        use_cache=False,  # no global caching
        retries=3,
        use_default_logging=False
    )

    tropicair_depart_arrival_list = []
    try:
        with open(config.AIRPORT_RELATIONSHIP_FILE) as csvfile:
            reader = csv.reader(csvfile)
            for i, item in enumerate(reader):
                if i > 0 and item[0] != "" and item[1] != "":
                    obj = {}
                    obj["Departure"] = item[0]
                    obj["Arrival"] = item[1]
                    obj["Type"] = item[2]
                    if obj["Type"] == config.CLASS_TYPE_TROPICAIR_STR:
                        tropicair_depart_arrival_list.append(obj)
    except Exception as e:
        print(e)
        return

    sc_obj_list = []
    for i in range(0, threads_number):
        sc_obj = Scraper(
            use_cache=False,  # no global caching
            retries=3,
            timeout=60,
            use_default_logging=False
        )
        sc_obj_list.append(sc_obj)

    tz = pytz.timezone('America/Los_Angeles')
    depart_arrival_list = tropicair_depart_arrival_list
    if len(depart_arrival_list) == 0:
        print('No depart/arrival info')
        return

    filename = "{}.csv".format(common_lib.get_webiste_str(website_type))
    for i, depart_arrival_info in enumerate(depart_arrival_list):
        threads = []
        currentdate = datetime.now(tz)
        print("Current Date & Time: {} , {}".format(currentdate.strftime('%Y-%m-%d'),
                                                    currentdate.strftime('%H:%M')))
        departure = depart_arrival_info["Departure"]
        arrival = depart_arrival_info["Arrival"]
        start_step = 0
        departure_abbr = departure.split("-")[1].strip()
        arrival_abbr = arrival.split("-")[1].strip()

        # for step in range(start_step, start_step + config.DAYS_TO_BE_SCRAPED):
        #     date_list.append({"date": datetime.now(tz) + timedelta(days=step),
        #                       "status": "none", "error_count": 0})
        # date_thread_list() is defined elsewhere; its assumed shape (including the
        # per-date "no_result" counter summed below) is sketched after this function.
        date_list = date_thread_list(threads_number)
        while len(date_list) > 0:
            if len(threads) < threads_number:
                start_date = None
                bStop = True
                for date in date_list:
                    if date["status"] != "complete":
                        bStop = False
                    if date["status"] == "none":
                        start_date = date
                        start_date["status"] = "pending"
                        break
                if bStop:
                    break
                if start_date is None:
                    continue

                print("++++++++++++++++++++++++++++++")
                print("Depart List = " + str(len(depart_arrival_list)) + " Index =" + str(i))
                # print(depart_arrival_info)
                print(departure_abbr + "," + arrival_abbr)
                print("++++++++++++++++++++++++++++++")

                sleep(config.DRIVER_SHORT_WAITING_SECONDS)
                proxy_ip, proxy_port, proxy_user, proxy_pass = random_proxy()
                if proxy_user is not None:
                    auth_str = "{}:{}".format(proxy_user, proxy_pass)
                    proxy = Proxy(proxy_ip, proxy_port, auth_str)
                else:
                    proxy = Proxy(proxy_ip, proxy_port)
                s = sc_obj_list[len(date_list) % threads_number]
                s.proxy_manager.session_proxy = proxy

                class_obj = TropicAir(s, start_date, departure, arrival, currentdate,
                                      tz, departure_abbr, arrival_abbr)
                thread_obj = threading.Thread(target=class_obj.parse_website,
                                              args=(config.DRIVER_VALUE_PHANTOMJS,))
                # args=(config.DRIVER_VALUE_CHROME,))
                threads.append(thread_obj)
                thread_obj.start()

            for thread in threads[:]:  # iterate over a copy: removing from a live list skips entries
                if not thread.is_alive():
                    thread.join()
                    threads.remove(thread)

        # filename = "{}_{}_{}_{}.csv".format(common_lib.get_webiste_str(website_type),
        #                                     departure_abbr, arrival_abbr,
        #                                     currentdate.strftime('%Y-%b-%d %H'))
        no_result = 0
        for item in date_list:
            no_result += item["no_result"]
        stopdate = datetime.now(tz)
        print("Finish Date & Time: {} , {}".format(stopdate.strftime('%Y-%m-%d'),
                                                   stopdate.strftime('%H:%M')))
        global_sc_obj.save([
            "Departure", departure,
            "Arrival", arrival,
            "No Result", no_result,
            "File Name", filename,
            "Start", currentdate.strftime('%Y-%m-%d %H:%M'),
            "Finish", stopdate.strftime('%Y-%m-%d %H:%M')
        ], "output/output_{}.csv".format(website_type))
        print("*************************")
        # break

    try:
        common_lib.upload_file(filename, "output/")
        print("Upload")
    except:
        print("Error while upload :" + filename)
def start_scraping(threads_number, website_type):
    global config
    global_sc_obj = Scraper(
        use_cache=False,  # no global caching
        retries=3,
    )
    logger = global_sc_obj.logger

    tropicair_depart_arrival_list = []
    mayaislandair_depart_arrival_list = []
    try:
        with open(config.AIRPORT_RELATIONSHIP_FILE) as csvfile:
            reader = csv.reader(csvfile)
            for i, item in enumerate(reader):
                if i > 0 and item[0] != "" and item[1] != "":
                    obj = {}
                    obj["Departure"] = item[0]
                    obj["Arrival"] = item[1]
                    obj["Type"] = item[2]
                    if obj["Type"] == config.CLASS_TYPE_MAYAISLANDAIR_STR:
                        mayaislandair_depart_arrival_list.append(obj)
                    elif obj["Type"] == config.CLASS_TYPE_TROPICAIR_STR:
                        tropicair_depart_arrival_list.append(obj)
                    else:
                        raise Exception("Invalid content in relation csv file")
    except Exception as e:
        print(e)
        return

    sc_obj_list = []
    for i in range(0, threads_number):
        sc_obj = Scraper(
            use_cache=False,  # no global caching
            retries=3,
            timeout=300,
            # log_file='logs/{}_log_{}.txt'.format(website_type, i)
        )
        sc_obj_list.append(sc_obj)

    tz = pytz.timezone('America/Los_Angeles')
    depart_arrival_list = []
    if website_type == config.CLASS_TYPE_MAYAISLANDAIR:
        depart_arrival_list = mayaislandair_depart_arrival_list
    elif website_type == config.CLASS_TYPE_TROPICAIR:
        depart_arrival_list = tropicair_depart_arrival_list
    if len(depart_arrival_list) == 0:
        print('No depart/arrival info')
        return
    # depart_arrival_list = [depart_arrival_list[0]]

    threads = []
    for i, depart_arrival_info in enumerate(depart_arrival_list):
        currentdate = datetime.now(tz)
        print("Current Date & Time: {} , {}".format(currentdate.strftime('%Y-%m-%d'),
                                                    currentdate.strftime('%H:%M')))
        departure = depart_arrival_info["Departure"]
        arrival = depart_arrival_info["Arrival"]
        departure_abbr = ""
        arrival_abbr = ""
        start_step = 0
        if website_type == config.CLASS_TYPE_MAYAISLANDAIR:
            departure_abbr = re.search(r"\((.*?)\)", departure, re.I | re.S | re.M).group(1).strip()
            arrival_abbr = re.search(r"\((.*?)\)", arrival, re.I | re.S | re.M).group(1).strip()
            start_step = 1  # this website does not serve same-day data, so start at +1
        elif website_type == config.CLASS_TYPE_TROPICAIR:
            departure_abbr = departure.split("-")[1].strip()
            arrival_abbr = arrival.split("-")[1].strip()

        date_list = []
        no_result_info = {"Count": 0}
        for step in range(start_step, start_step + config.DAYS_TO_BE_SCRAPED):
            date_list.append({
                "date": datetime.now(tz) + timedelta(days=step),
                "status": "none",
                "error_count": 0
            })

        start_date_str = ""
        while len(date_list) > 0:
            if len(threads) < threads_number:
                start_date = None
                if no_result_info["Count"] > config.MAX_NO_RESULT_COUNT:
                    print("--------------------------")
                    print("No result any more")
                    print("--------------------------")
                    break
                for date in date_list[:]:  # iterate over a copy: removing from a live list skips entries
                    if date["status"] == "complete":
                        # print("Remove Date")
                        # print(date)
                        date_list.remove(date)
                    elif date["status"] == "none":
                        start_date = date
                        start_date["status"] = "pending"
                        break
                if len(date_list) == 0:
                    break
                if start_date is None:
                    continue

                print("++++++++++++++++++++++++++++++")
                print("Depart List = " + str(len(depart_arrival_list)) + " Index =" + str(i))
                # print(depart_arrival_info)
                print(departure_abbr + "," + arrival_abbr)
                print(start_date)
                print("++++++++++++++++++++++++++++++")
                start_date_str = start_date["date"].strftime('%Y-%m-%d')

                sleep(config.DRIVER_SHORT_WAITING_SECONDS)
                proxy_ip, proxy_port, proxy_user, proxy_pass = random_proxy()
                if proxy_user is not None:
                    auth_str = "{}:{}".format(proxy_user, proxy_pass)
                    proxy = Proxy(proxy_ip, proxy_port, auth_str)
                else:
                    proxy = Proxy(proxy_ip, proxy_port)
                s = sc_obj_list[len(date_list) % threads_number]
                s.proxy_manager.session_proxy = proxy

                # Was `config.WEBSITE_TYPE_MAYAISLANDAIR`; every other check in this
                # function uses the CLASS_TYPE_* constants, so that looks like a typo.
                if website_type == config.CLASS_TYPE_MAYAISLANDAIR:
                    class_obj = MayaislandAir(s, start_date, departure, arrival, currentdate,
                                              tz, departure_abbr, arrival_abbr, no_result_info)
                else:
                    class_obj = TropicAir(s, start_date, departure, arrival, currentdate,
                                          tz, departure_abbr, arrival_abbr, no_result_info)
                thread_obj = threading.Thread(target=class_obj.parse_website,
                                              args=(config.DRIVER_VALUE_PHANTOMJS,))
                # args=(config.DRIVER_VALUE_CHROME,))
                threads.append(thread_obj)
                thread_obj.start()

            for thread in threads[:]:  # iterate over a copy so removal is safe
                if not thread.is_alive():
                    thread.join()
                    threads.remove(thread)

        print("*************************")
        print(len(date_list))
        print(no_result_info)
        filename = "{}_{}_{}_{}.csv".format(common_lib.get_webiste_str(website_type),
                                            departure_abbr, arrival_abbr,
                                            currentdate.strftime('%Y-%b-%d %H'))
        try:
            # common_lib.upload_file(filename, "output/")
            print("Upload")
        except:
            print("Error while upload :" + filename)
        global_sc_obj.save([
            "Departure", departure,
            "Arrival", arrival,
            "Date Len", len(date_list),
            "No Result", no_result_info["Count"],
            "File Name", filename,
            "Start Date", start_date_str
        ], "export_{}.csv".format(website_type))
        print("*************************")
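# Hypothetical invocation of the two-argument variant above: one call per site
# type, with an arbitrary example thread count of 5.
if __name__ == "__main__":
    start_scraping(5, config.CLASS_TYPE_TROPICAIR)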
def parse_website(self, selenium_driver_type):
    self.class_type = config.CLASS_TYPE_MAYAISLANDAIR_STR
    self.parent_url = "https://booking.mayaislandair.com/VARS/Public/CustomerPanels/Requirements.aspx"
    try:
        no_result = 0
        stop_day = ""
        for ind in range(0, len(self.date_list["date"])):
            date_item = self.date_list["date"][ind]
            print("Start Date {}".format(date_item))
            print("No Result= {}".format(no_result))
            self.start_date = {
                "date": date_item,
                "status": "none",
                "error_count": 0
            }
            self.end_date = {
                "date": self.start_date["date"] + timedelta(days=1),
                "status": "pending"
            }
            stop_day = date_item
            if no_result >= config.MAX_NO_RESULT_COUNT:
                print("--------------------------")
                print("No result any more")
                print("--------------------------")
                break

            proxy_ip, proxy_port, proxy_user, proxy_pass = random_proxy()
            if proxy_user is not None:
                auth_str = "{}:{}".format(proxy_user, proxy_pass)
                proxy = Proxy(proxy_ip, proxy_port, auth_str)
            else:
                proxy = Proxy(proxy_ip, proxy_port)
            self.scrape_obj.proxy_manager.session_proxy = proxy
            self.scrape_obj.clear_cookies()

            bStop = False
            while not bStop:
                print('loading parent page... MayaislandAir')
                html = self.scrape_obj.load(self.parent_url, use_cache=False)
                session_id = html.x("//input[@id='VarsSessionID']/@value").encode('utf8').strip()

                start_date_str = self.start_date["date"].strftime('%d-%b-%Y')
                end_date_str = self.end_date["date"].strftime('%d-%b-%Y')
                print("++++++++++++++++++++++")
                print("Start Date = {}".format(start_date_str))
                print("End Date = {}".format(end_date_str))
                print("++++++++++++++++++++++")

                payload = {}
                form_data = {}
                form_data["Origin"] = [self.departure_abbr]
                form_data["VarsSessionID"] = session_id
                form_data["Destination"] = [self.arrival_abbr]
                form_data["DepartureDate"] = [start_date_str]
                form_data["ReturnDate"] = [end_date_str]
                form_data["Adults"] = "1"
                form_data["Children"] = "0"
                form_data["SmallChildren"] = "0"  # was int 0; every other passenger count is a string
                form_data["Seniors"] = "0"
                form_data["Students"] = "0"
                form_data["Infants"] = "0"
                form_data["Youths"] = "0"
                form_data["Teachers"] = "0"
                form_data["SeatedInfants"] = "0"
                form_data["EVoucher"] = ""
                form_data["SearchUser"] = "******"
                form_data["SearchSource"] = "refine"
                payload["FormData"] = form_data
                payload["IsMMBChangeFlightMode"] = False

                url = ("https://booking.mayaislandair.com/VARS/Public/WebServices/"
                       "AvailabilityWS.asmx/GetFlightAvailability?VarsSessionID={}").format(session_id)
                headers = {
                    "Content-Type": "application/json",
                    # "accept-encoding": "gzip, deflate",
                    # "accept-language": "en-US,en;q=0.8",
                    # "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36",
                }
                payload = json.dumps(payload)
                # print(url)
                # print(payload)
                print('loading get availability page... MayaislandAir')
                self.scrape_obj.load(url, post=payload, headers=headers, use_cache=False)

                url = "https://booking.mayaislandair.com/VARS/Public/FlightSelect.aspx"
                payload = {}
                payload["VarsSessionID"] = session_id
                print('loading get info page... MayaislandAir')
                html = self.scrape_obj.load(url, post=payload, use_cache=False)
                # with open("response.html", 'w') as f:
                #     f.write(html.encode('utf-8'))

                print("Call Parse Round Trip Function")
                departure_fare_id = ""
                arrival_fare_id = ""
                saved_item_list = []
                self.wait()
                departure_fare_id, arrival_fare_id, saved_item_list = self.parse_round_trip(html)
                print("Departure ID= {}".format(departure_fare_id))
                print("Arrival ID= {}".format(arrival_fare_id))

                if departure_fare_id != "" and arrival_fare_id != "":
                    url = ("https://booking.mayaislandair.com/vars/public/WebServices/"
                           "AvailabilityWS.asmx/AddFlightToBasket?VarsSessionID={}").format(session_id)
                    payload = {}
                    form_data = {}
                    form_data["VarsSessionID"] = session_id
                    form_data["fareData"] = [departure_fare_id, arrival_fare_id]
                    form_data["Zone"] = "PUBLIC"
                    payload["addFlightRequest"] = form_data
                    payload = json.dumps(payload)
                    headers = {
                        "Content-Type": "application/json",
                    }
                    self.wait()
                    json_value = self.scrape_obj.load_json(url, post=payload,
                                                           use_cache=False, headers=headers)
                    html_content = Doc(html=json_value["d"]["Data"])
                    total_price_str = html_content.x("//td[@class='BasketGrandTotalPrice']").strip()
                    try:
                        total_fare = re.search("([0-9.]+)", total_price_str, re.I | re.S | re.M).group(1)
                        currency = re.search("([A-Z]+)", total_price_str, re.I | re.S | re.M).group(1)
                    except:
                        self.start_date["error_count"] += 1
                        if self.start_date["error_count"] >= config.MAYAISLAND_SCRAPING_MAX_COUNT:
                            self.start_date["status"] = "complete"
                        else:
                            self.start_date["status"] = "none"
                        return

                    item_list = []
                    saved_time = datetime.now(self.tz).strftime('%Y-%m-%d %H:%M')
                    for save_item in saved_item_list:
                        item = [
                            "Search Start Date", save_item["Search_Start_Date"],
                            "Search End Date", save_item["Search_End_Date"],
                            "Departure Date", save_item["Departure_Date"],
                            "Origin", save_item["Origin"],
                            "Destination", save_item["Destination"],
                            "Leave Time", save_item["Leave_Time"],
                            "Arrive Time", save_item["Arrive_Time"],
                            "Duration", save_item["Duration"],
                            "Flight Number", save_item["Flight_Number"],
                            "Airline Fare", save_item["Fare"],
                            "Fare", total_fare,
                            "Airline", self.class_type,
                            "Currency", currency,
                            "Capture Time", saved_time
                        ]
                        item_list.append(item)
                    self.save_item(item_list)
                    self.start_date["status"] = "complete"
                else:
                    print("***************ERROR**************")
                    print(self.start_date)
                    print("***************ERROR**************")
                    self.start_date["error_count"] += 1
                    if self.start_date["error_count"] >= config.MAYAISLAND_SCRAPING_MAX_COUNT:
                        self.start_date["status"] = "complete"
                        self.date_list["no_result"] += 1
                        no_result += 1
                    else:
                        self.start_date["status"] = "none"

                if self.start_date["status"] == "complete":
                    bStop = True

        start_day = self.date_list["date"][0]
        try:
            end_day = self.date_list["date"][-1]
        except:
            end_day = ""
        self.date_list["status"] = "complete"

        lock.acquire()
        global_sc_obj.save([
            "Departure", self.departure,
            "Arrival", self.arrival,
            "Start", start_day.strftime("%Y-%m-%d"),
            "End", end_day.strftime("%Y-%m-%d"),
            "Stop", stop_day.strftime("%Y-%m-%d"),
            "No Result", no_result,
        ], "export_{}.csv".format(self.class_type))
        lock.release()
    except Exception as e:
        print(e)
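# parse_website() serializes its summary CSV write through a module-level `lock`
# and a shared `global_sc_obj`, neither of which appears in this section. A sketch
# of the assumed module-level setup (Scraper arguments mirror the start_scraping()
# variants above):
lock = threading.Lock()
global_sc_obj = Scraper(use_cache=False, retries=3)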
def start_scraping(threads_number):
    global config
    website_type = config.CLASS_TYPE_TROPICAIR
    global_sc_obj = Scraper(
        use_cache=False,  # no global caching
        retries=3,
        use_default_logging=False)

    tropicair_depart_arrival_list = []
    try:
        with open(config.AIRPORT_RELATIONSHIP_FILE) as csvfile:
            reader = csv.reader(csvfile)
            for i, item in enumerate(reader):
                if i > 0 and item[0] != "" and item[1] != "":
                    obj = {}
                    obj["Departure"] = item[0]
                    obj["Arrival"] = item[1]
                    obj["Type"] = item[2]
                    if obj["Type"] == config.CLASS_TYPE_TROPICAIR_STR:
                        tropicair_depart_arrival_list.append(obj)
    except Exception as e:
        print(e)
        return

    driver_list = []
    for i in range(0, threads_number):
        # PHANTOMJS PART: this variant pre-builds one driver per worker thread.
        driver, user_agent, proxy, screen_resolution = common_lib.create_phantomjs_driver()
        driver_list.append({"driver": driver, "status": "none"})

    tz = pytz.timezone('America/Los_Angeles')
    depart_arrival_list = tropicair_depart_arrival_list
    if len(depart_arrival_list) == 0:
        print('No depart/arrival info')
        return

    threads = []
    file_currentdate = datetime.now(tz)
    filename = "{}_{}.csv".format(common_lib.get_webiste_str(website_type),
                                  file_currentdate.strftime('%Y-%m-%d %H:%M'))
    for i, depart_arrival_info in enumerate(depart_arrival_list):
        currentdate = datetime.now(tz)
        print("Current Date & Time: {} , {}".format(currentdate.strftime('%Y-%m-%d'),
                                                    currentdate.strftime('%H:%M')))
        departure = depart_arrival_info["Departure"]
        arrival = depart_arrival_info["Arrival"]
        departure_abbr = ""
        arrival_abbr = ""
        start_step = 0
        if website_type == config.CLASS_TYPE_TROPICAIR:
            departure_abbr = departure.split("-")[1].strip()
            arrival_abbr = arrival.split("-")[1].strip()

        date_list = []
        no_result_info = {"Count": 0}
        for step in range(start_step, start_step + config.DAYS_TO_BE_SCRAPED):
            date_list.append({
                "date": datetime.now(tz) + timedelta(days=step),
                "status": "none",
                "error_count": 0
            })
        stop_date_str = ""
        start_date_str = currentdate.strftime('%Y-%m-%d %H:%M')
        print("************************************")
        print("{} {} {}".format(len(date_list), departure, arrival))
        print("************************************")

        while len(date_list) > 0:
            if len(threads) < threads_number:
                start_date = None
                phantom_obj = None
                if no_result_info["Count"] > config.MAX_NO_RESULT_COUNT:
                    print("--------------------------")
                    print("No result any more")
                    print("--------------------------")
                    break
                # print("+++++++++++++++++++++++++++++++++")
                # print(driver_list)
                # print("+++++++++++++++++++++++++++++++++")
                for date in date_list[:]:  # iterate over a copy: removing from a live list skips entries
                    if date["status"] == "complete":
                        date_list.remove(date)
                    elif date["status"] == "none":
                        start_date = date
                        start_date["status"] = "pending"
                        break
                if len(date_list) == 0:
                    break
                if start_date is None:
                    continue
                for driver in driver_list:
                    if driver["status"] == "none":
                        phantom_obj = driver
                        driver["status"] = "pending"
                        break
                if phantom_obj is None:
                    continue

                print("++++++++++++++++++++++++++++++")
                print("Depart List = " + str(len(depart_arrival_list)) + " Index =" + str(i))
                # print(depart_arrival_info)
                print(departure_abbr + "," + arrival_abbr)
                print(start_date)
                # print(driver_list)
                print("++++++++++++++++++++++++++++++")
                stop_date_str = start_date["date"].strftime('%Y-%m-%d %H:%M')

                sleep(config.DRIVER_SHORT_WAITING_SECONDS)
                proxy_ip, proxy_port, proxy_user, proxy_pass = random_proxy()
                if proxy_user is not None:
                    auth_str = "{}:{}".format(proxy_user, proxy_pass)
                    proxy = Proxy(proxy_ip, proxy_port, auth_str)
                else:
                    proxy = Proxy(proxy_ip, proxy_port)

                class_obj = TropicAir(phantom_obj, start_date, departure, arrival,
                                      currentdate, tz, departure_abbr, arrival_abbr,
                                      no_result_info, filename)
                thread_obj = threading.Thread(target=class_obj.parse_website,
                                              args=(config.DRIVER_VALUE_PHANTOMJS,))
                threads.append(thread_obj)
                thread_obj.start()

            for thread in threads[:]:  # iterate over a copy so removal is safe
                if not thread.is_alive():
                    thread.join()
                    threads.remove(thread)

        print("*************************")
        print(len(date_list))
        print(no_result_info)
        finishdate = datetime.now(tz)
        finish_date_str = finishdate.strftime('%Y-%m-%d %H:%M')
        global_sc_obj.save([
            "Departure", departure,
            "Arrival", arrival,
            "Date Len", len(date_list),
            "No Result", no_result_info["Count"],
            "File Name", filename,
            "Start Date", start_date_str,
            "Finish", stop_date_str,
            "Capture Date", finish_date_str,
        ], "output/output_{}.csv".format(website_type))
        print("*************************")

        try:
            common_lib.upload_file(filename, "output/")
            print("Upload {} {}".format(departure, arrival))
        except:
            print("Error while upload :" + filename)
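# Hypothetical entry point showing how the driver-based start_scraping() variant
# above would be launched; the thread count of 5 is an arbitrary example value.
if __name__ == "__main__":
    start_scraping(5)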