Example No. 1
def checkTimeOutPut(args):
    t = None
    global currCommandProcess
    global stde
    global stdo
    stde = None
    stdo = None
    def executeCommand():
        global currCommandProcess
        global stdo
        global stde
        try:
            stdo, stde = currCommandProcess.communicate()
            printLog('stdout:\n'+str(stdo))
            printLog('stderr:\n'+str(stde))
        except Exception:
            printLog("ERROR: UNKNOWN Exception - checkTimeOutPut()::executeCommand()")

    currCommandProcess = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE,shell=True)
    thread = Thread(target=executeCommand)
    thread.start()
    thread.join(TIMOUT_VAL) #wait for the thread to complete 
    if thread.is_alive():
        printLog('ERROR: Killing the process - terminating thread because it is taking too much of time to execute')
        currCommandProcess.kill()
        printLog('ERROR: Timed out exception')
        raise errorHandler.ApplicationException(__file__, errorHandler.TIME_OUT)
    if stdo == "" or stdo is None:
        errCode = currCommandProcess.poll()
        printLog('ERROR: @@@@@Raising Called processor exception')
        raise subprocess.CalledProcessError(errCode, args, output=stde)
    return stdo
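
Both variants of checkTimeOutPut implement "run a command with a timeout" by calling communicate() in a worker thread and joining it with a timeout. For reference, the sketch below shows the same idea on Python 3.5+ using subprocess.run(), which handles the timeout and the kill internally; the function name and the default timeout are illustrative, not part of the example above.

import subprocess

def check_output_with_timeout(args, timeout=60):
    # run() raises subprocess.TimeoutExpired if the command does not finish
    # in time; the child process is killed before the exception propagates.
    completed = subprocess.run(args, shell=True,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
                               timeout=timeout)
    if not completed.stdout:
        # mirror the example above: empty stdout is treated as a failure
        raise subprocess.CalledProcessError(completed.returncode, args,
                                            output=completed.stderr)
    return completed.stdout
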
Example No. 2
def checkTimeOutPut(args):
    t = None
    global currCommandProcess
    global stde
    global stdo
    stde = None
    stdo = None
    def executeCommand():
        global currCommandProcess
        global stdo
        global stde
        try:
            stdo, stde = currCommandProcess.communicate()
            printLog('stdout:\n'+str(stdo))
            printLog('stderr:\n'+str(stde))
        except Exception:
            printLog("ERROR: UNKNOWN Exception - checkTimeOutPut()::executeCommand()")

    currCommandProcess = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    thread = Thread(target=executeCommand)
    thread.start()
    thread.join(TIMOUT_VAL) #wait for the thread to complete 
    if thread.is_alive():
        printLog('ERROR: Killing the process - terminating thread because it is taking too much of time to execute')
        currCommandProcess.kill()
        printLog('ERROR: Timed out exception')
        raise errorHandler.ApplicationException(__file__, errorHandler.TIME_OUT)
    if stdo == "" or stdo is None:
        errCode = currCommandProcess.poll()
        printLog('ERROR: @@@@@Raising Called processor exception')
        raise subprocess.CalledProcessError(errCode, args, output=stde)
    return stdo
Example No. 3
def MoveToJointPositions(limb, moves, queue, write = True):
    try:
        for move in moves:
            thread = threading.Thread(
                target=move_thread,
                args=(limb,move, queue, write)
            )
            if (move.values()):
                thread.daemon = True
                thread.start()
                baxter_dataflow.wait_for(
                    lambda: not (thread.is_alive()),
                    timeout=20.0,
                    timeout_msg=("Timeout while waiting for %s move thread"
                                 " to finish" % limb.name),
                    rate=10,
                )
                thread.join()
                result = queue.get()
                if result is not None:
                    raise result
                rospy.sleep(1.0)
    except Exception, exception:
        queue.put(traceback.format_exc())
        queue.put(exception)
Example No. 4
    def do_GET(self):
        print 'do_GET: ', self.path
        self.path = self.path.split("?")[0]
        if self.path == "/":
            self.path = "/index.html"

        try:
            #Check the file extension required and
            #set the right mime type
            sendReply = False
            if self.path.endswith(".html"):
                mimetype = 'text/html'
                sendReply = True
            if self.path.endswith(".jpg"):
                mimetype = 'image/jpg'
                sendReply = True
            if self.path.endswith(".gif"):
                mimetype = 'image/gif'
                sendReply = True
            if self.path.endswith(".js"):
                mimetype = 'application/javascript'
                sendReply = True
            if self.path.endswith(".css"):
                mimetype = 'text/css'
                sendReply = True

            if sendReply == True:
                #Open the static file requested and send it
                print 'serve file:', curdir + sep + self.path
                f = open(curdir + sep + self.path)
                self.send_response(200)
                self.send_header('Content-type', mimetype)
                self.end_headers()
                self.wfile.write(f.read())
                f.close()

            #print 'Create new thread..'
            print threading.active_count()
            thread = None
            #if not thread or not thread.is_alive():
            if threading.active_count() <= 1:
                print 'start new...'
                thread = Thread(target=self.capture_img)
                thread.start()
            else:
                print 'running...'

            return

        except IOError:
            self.send_error(404, 'File Not Found: %s' % self.path)
            thread = None
            if not thread or not thread.is_alive():
                print 'start new...'
                thread = Thread(target=self.capture_img)
                thread.start()
            else:
                print 'running...'
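
The handler above uses threading.active_count() <= 1 to avoid launching a second capture thread while one is still running. A more direct guard, sketched below under the assumption that a single module-level worker is acceptable, keeps a reference to the thread and checks is_alive() before starting a new one; start_capture_once and its target argument are illustrative names, not part of the original handler.

import threading

_capture_thread = None  # handle to the single background worker

def start_capture_once(target):
    # Start `target` in a background thread unless the previous one
    # is still alive.
    global _capture_thread
    if _capture_thread is None or not _capture_thread.is_alive():
        _capture_thread = threading.Thread(target=target)
        _capture_thread.daemon = True
        _capture_thread.start()
        return True
    return False
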
Example No. 5
    def __is_task_done(self, threads):
        finishThreadNum = 0
        for thread in threads:
            if not thread.is_alive():
                finishThreadNum += 1
        print "finish " + str(finishThreadNum)
        if finishThreadNum == len(threads):
            return True
        else:
            return False
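
The same completion check can be collapsed into a single expression; the helper below is just a restatement of the method above.

def is_task_done(threads):
    # all threads are done when none of them is still alive
    return all(not thread.is_alive() for thread in threads)
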
Example No. 6
    def run(self, timeout):
        print "running " + self.cmd
        def target():
            self.process = subprocess.Popen(self.cmd, shell=True)
            self.process.communicate()

        thread = threading.Thread(target=target)
        thread.start()

        thread.join(timeout)
        if thread.is_alive():
            print 'Terminating process'
            self.process.terminate()
            thread.join()
Example No. 7
    def run(self, timeout):
        print "running " + self.cmd

        def target():
            self.process = subprocess.Popen(self.cmd, shell=True)
            self.process.communicate()

        thread = threading.Thread(target=target)
        thread.start()

        thread.join(timeout)
        if thread.is_alive():
            print 'Terminating process'
            self.process.terminate()
            thread.join()
Example No. 8
    def run(self, timeout=0):
        def target():
            print 'Thread started'
            self.process = subprocess.Popen(self.cmd, shell=True)
            self.process.communicate()
            print 'Thread finished'
        thread = threading.Thread(target=target)
        thread.start()
        if timeout == 0:
            return
        thread.join(timeout)
        if thread.is_alive():
            print 'Terminating process'
            self.process.terminate()
            thread.join()
        print self.process.returncode
Example No. 9
    def run(self, timeout=0):
        def target():
            print 'Thread started'
            self.process = subprocess.Popen(self.cmd, shell=True)
            self.process.communicate()
            print 'Thread finished'

        thread = threading.Thread(target=target)
        thread.start()
        if timeout == 0:
            return
        thread.join(timeout)
        if thread.is_alive():
            print 'Terminating process'
            self.process.terminate()
            thread.join()
        print self.process.returncode
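
Examples 6 through 9 show only the run() method of a small command-runner class. The self-contained sketch below fills in the assumed surrounding class (the Command name and the self.cmd attribute are inferred, not taken from the original sources) so the pattern can be tried end to end.

import subprocess
import threading

class Command(object):
    def __init__(self, cmd):
        self.cmd = cmd
        self.process = None

    def run(self, timeout):
        def target():
            self.process = subprocess.Popen(self.cmd, shell=True)
            self.process.communicate()

        thread = threading.Thread(target=target)
        thread.start()
        thread.join(timeout)
        if thread.is_alive():
            # command ran too long: kill it and wait for the thread to exit
            self.process.terminate()
            thread.join()
        return self.process.returncode

# e.g. Command("sleep 10").run(timeout=2) terminates the child after about 2 seconds
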
Example No. 10
def start_scraping(threads_number):
    global config
    website_type = config.CLASS_TYPE_TROPICAIR

    global_sc_obj = Scraper(
        use_cache=False, #enable cache globally
        retries=3, 
        use_default_logging = False
    )

    tropicair_depart_arrival_list = []

    try:
        with open(config.AIRPORT_RELATIONSHIP_FILE) as csvfile:
            reader = csv.reader(csvfile)
            
            for i, item in enumerate(reader):
                if i > 0 and item[0] != "" and item[1] != "":
                    obj = {}
                    obj["Departure"] = item[0]
                    obj["Arrival"] = item[1]
                    obj["Type"] = item[2]

                    if obj["Type"] == config.CLASS_TYPE_TROPICAIR_STR:
                        tropicair_depart_arrival_list.append(obj)

    except Exception as e:
        print (e)
        return

    sc_obj_list = []

    for i in range(0, threads_number):
        sc_obj = Scraper(
            use_cache=False, #enable cache globally
            retries=3, 
            timeout=60,
            use_default_logging = False
            )
        sc_obj_list.append(sc_obj)

    tz = pytz.timezone('America/Los_Angeles')

    depart_arrival_list = tropicair_depart_arrival_list

    if len(depart_arrival_list) == 0:
        print ('None depart arrival info')
        return

    filename = "{}.csv".format(common_lib.get_webiste_str(website_type))
    for i, depart_arrival_info in enumerate(depart_arrival_list):
        threads = []

        currentdate = datetime.now(tz)
        print ("Current Date & Time: {} , {}".format(currentdate.strftime('%Y-%m-%d'), currentdate.strftime('%H:%M')))

        departure =  depart_arrival_info["Departure"]
        arrival =  depart_arrival_info["Arrival"]

        departure_abbr = ""
        arrival_abbr = ""

        start_step = 0

        departure_abbr =  departure.split("-")[1].strip()
        arrival_abbr = arrival.split("-")[1].strip()
    
        
        # for step in range(start_step, start_step + config.DAYS_TO_BE_SCRAPED):
        #     date_list.append({"date":datetime.now(tz) + timedelta(days=step), "status":"none", "error_count":0})

        date_list = date_thread_list(threads_number)

        while len(date_list) > 0:
            if len(threads) < threads_number:
                start_date = None
                
                bStop = True
                for date in date_list:
                    if date["status"] != "complete":
                        bStop = False

                    if date["status"] == "none":
                        start_date = date
                        start_date["status"] = "pending"
                        break

                if bStop == True:
                    break

                if start_date == None:
                    continue

                print ("++++++++++++++++++++++++++++++")
                print ("Depart List = " + str(len(depart_arrival_list)) + " Index =" + str(i))
                # print (depart_arrival_info)
                print (departure_abbr + "," + arrival_abbr)
                print ("++++++++++++++++++++++++++++++")

                sleep(config.DRIVER_SHORT_WAITING_SECONDS)
                proxy_ip, proxy_port, proxy_user, proxy_pass = random_proxy()
                
                if proxy_user != None:
                    auth_str = "{}:{}".format(proxy_user, proxy_pass)
                    proxy = Proxy(proxy_ip, proxy_port, auth_str)
                else:
                    proxy = Proxy(proxy_ip, proxy_port)

                s = sc_obj_list[len(date_list) % threads_number]
                s.proxy_manager.session_proxy = proxy

                class_obj = TropicAir(s, start_date, departure, arrival, currentdate, tz, 
                    departure_abbr, arrival_abbr)
                
                thread_obj = threading.Thread(target=class_obj.parse_website,
                                              args=(config.DRIVER_VALUE_PHANTOMJS,))
                                            # args=(config.DRIVER_VALUE_CHROME,))

                threads.append(thread_obj)
                thread_obj.start()
            
            for thread in threads:
                if not thread.is_alive():
                    
                    thread.join()
                    threads.remove(thread)

            
        # filename = "{}_{}_{}_{}.csv".format(common_lib.get_webiste_str(website_type), departure_abbr, arrival_abbr, currentdate.strftime('%Y-%b-%d %H'))
        
        no_result = 0
        for item in date_list:
            no_result += item["no_result"] 
        
        stopdate = datetime.now(tz)
        print ("Finish Date & Time: {} , {}".format(stopdate.strftime('%Y-%m-%d'), stopdate.strftime('%H:%M')))

        global_sc_obj.save([
            "Departure", departure,
            "Arrival", arrival,
            "No Result", no_result,
            "File Name", filename,
            "Start", currentdate.strftime('%Y-%m-%d %H:%M'),
            "Finish", stopdate.strftime('%Y-%m-%d %H:%M')
            ], "output/output_{}.csv".format(website_type))

        print ( "*************************")
        # break

    try:
        common_lib.upload_file(filename, "output/")
        print "Upload"
    except:
        print ( "Error while upload :" + filename)
Example No. 11
def start_scraping(threads_number, website_type):
    global config

    global_sc_obj = Scraper(
        use_cache=False,  #enable cache globally
        retries=3,
    )

    logger = global_sc_obj.logger

    tropicair_depart_arrival_list = []
    mayaislandair_depart_arrival_list = []

    try:
        with open(config.AIRPORT_RELATIONSHIP_FILE) as csvfile:
            reader = csv.reader(csvfile)

            for i, item in enumerate(reader):
                if i > 0 and item[0] != "" and item[1] != "":
                    obj = {}
                    obj["Departure"] = item[0]
                    obj["Arrival"] = item[1]
                    obj["Type"] = item[2]

                    if obj["Type"] == config.CLASS_TYPE_MAYAISLANDAIR_STR:
                        mayaislandair_depart_arrival_list.append(obj)
                    elif obj["Type"] == config.CLASS_TYPE_TROPICAIR_STR:
                        tropicair_depart_arrival_list.append(obj)
                    else:
                        raise Exception("Invalid content in relatin csv file")

    except Exception as e:
        print(e)
        return

    sc_obj_list = []

    for i in range(0, threads_number):
        sc_obj = Scraper(
            use_cache=False,  #enable cache globally
            retries=3,
            timeout=300,
            #log_file='logs/{}_log_{}.txt'.format(website_type, i)
        )
        sc_obj_list.append(sc_obj)

    tz = pytz.timezone('America/Los_Angeles')

    depart_arrival_list = []
    if website_type == config.CLASS_TYPE_MAYAISLANDAIR:
        depart_arrival_list = mayaislandair_depart_arrival_list
    elif website_type == config.CLASS_TYPE_TROPICAIR:
        depart_arrival_list = tropicair_depart_arrival_list

    if len(depart_arrival_list) == 0:
        print('None depart arrival info')
        return

    #depart_arrival_list = [depart_arrival_list[0]]
    threads = []

    for i, depart_arrival_info in enumerate(depart_arrival_list):

        currentdate = datetime.now(tz)
        print("Current Date & Time: {} , {}".format(
            currentdate.strftime('%Y-%m-%d'), currentdate.strftime('%H:%M')))

        departure = depart_arrival_info["Departure"]
        arrival = depart_arrival_info["Arrival"]

        departure_abbr = ""
        arrival_abbr = ""

        start_step = 0
        if website_type == config.CLASS_TYPE_MAYAISLANDAIR:
            departure_abbr = re.search("\((.*?)\)", departure,
                                       re.I | re.S | re.M).group(1).strip()
            arrival_abbr = re.search("\((.*?)\)", arrival,
                                     re.I | re.S | re.M).group(1).strip()
            start_step = 1  # this website does not offer today's data, so start with +1
        elif website_type == config.CLASS_TYPE_TROPICAIR:
            departure_abbr = departure.split("-")[1].strip()
            arrival_abbr = arrival.split("-")[1].strip()

        date_list = []

        no_result_info = {"Count": 0}

        for step in range(start_step, start_step + config.DAYS_TO_BE_SCRAPED):
            date_list.append({
                "date": datetime.now(tz) + timedelta(days=step),
                "status": "none",
                "error_count": 0
            })

        start_date_str = ""
        while len(date_list) > 0:
            if len(threads) < threads_number:
                start_date = None

                if no_result_info["Count"] > config.MAX_NO_RESULT_COUNT:
                    print("--------------------------")
                    print("No result any more")
                    print("--------------------------")
                    break

                for date in date_list:
                    if date["status"] == "complete":
                        # print ("Remove Date")
                        # print (date)

                        date_list.remove(date)
                    elif date["status"] == "none":
                        start_date = date
                        start_date["status"] = "pending"
                        break

                if len(date_list) == 0:
                    break

                if start_date == None:
                    continue

                print("++++++++++++++++++++++++++++++")
                print("Depart List = " + str(len(depart_arrival_list)) +
                      " Index =" + str(i))
                # print (depart_arrival_info)
                print(departure_abbr + "," + arrival_abbr)
                print(start_date)
                print("++++++++++++++++++++++++++++++")

                start_date_str = start_date["date"].strftime('%Y-%m-%d')

                sleep(config.DRIVER_SHORT_WAITING_SECONDS)
                proxy_ip, proxy_port, proxy_user, proxy_pass = random_proxy()

                if proxy_user != None:
                    auth_str = "{}:{}".format(proxy_user, proxy_pass)
                    proxy = Proxy(proxy_ip, proxy_port, auth_str)
                else:
                    proxy = Proxy(proxy_ip, proxy_port)

                s = sc_obj_list[len(date_list) % threads_number]
                s.proxy_manager.session_proxy = proxy

                class_obj = None
                if website_type == config.WEBSITE_TYPE_MAYAISLANDAIR:
                    class_obj = MayaislandAir(s, start_date, departure,
                                              arrival, currentdate, tz,
                                              departure_abbr, arrival_abbr,
                                              no_result_info)
                else:
                    class_obj = TropicAir(s, start_date, departure, arrival,
                                          currentdate, tz, departure_abbr,
                                          arrival_abbr, no_result_info)

                thread_obj = threading.Thread(
                    target=class_obj.parse_website,
                    args=(config.DRIVER_VALUE_PHANTOMJS, ))
                # args=(config.DRIVER_VALUE_CHROME,))

                threads.append(thread_obj)
                thread_obj.start()

            for thread in threads:
                if not thread.is_alive():

                    thread.join()
                    threads.remove(thread)

        print("*************************")
        print(len(date_list))
        print(no_result_info)
        filename = "{}_{}_{}_{}.csv".format(
            common_lib.get_webiste_str(website_type), departure_abbr,
            arrival_abbr, currentdate.strftime('%Y-%b-%d %H'))
        try:
            #common_lib.upload_file(filename, "output/")
            print "Upload"
        except:
            print("Error while upload :" + filename)

        global_sc_obj.save([
            "Departure", departure, "Arrival", arrival, "Date Len",
            len(date_list), "No Result", no_result_info["Count"], "File Name",
            filename, "Start Date", start_date_str
        ], "export_{}.csv".format(website_type))

        print("*************************")
Example No. 12
    def poll(self):
        if not self.lock_poll.acquire( 0 ):
            print "Loop::poll() could not acquire lock"
            return

        try:
            # check for completed threads
            for thread in self.threads:
                if not thread:
                    self.threads.remove( thread )
                elif not thread.is_alive():
                    station = thread.get_station()
                    if station:
                        station._log( "Wrapping up thread (" + str(len(self.threads) - 1) + " threads remaining)" )
                        if action == 'check':
                            self.record( station )
                    self.threads.remove( thread )

            # start new threads while there is spare capacity
            while len(self.threads) < self.max_threads:
                # check for stations in original list
                if len(self.stations):
                    station_info = self.stations.pop(0)
                # once the original list is exhausted, check for retry stations
                elif len(self.stations_retry):
                    station_info = self.stations_retry.pop(0)
                # if there are no threads remaining all stations have been checked
                elif not len(self.threads):
                    self.summarize()
                    self.poller.stop()
                    self.done = True
                    #self.lock_poll.release()
                    raise ExLoopDone, "No stations remaining."
                else:
                    # This occurs when we have no stations
                    # in either the default or retry queues, and
                    # we have at least one but less than 
                    # self.max_threads still running.
                    break

                station = None
                try:
                    if ( station_info.has_key('disabled') and (station_info['disabled'] == 'true') ):
                        raise ExStationDisabled, "Station is disabled"
                    if ( station_info['type'] == 'Q680' ):
                        if self.continuity_only:
                            raise Exception("Continuity checks, Q680s not supported")
                        if self.versions_only:
                            raise Exception("Software version checks, Q680s not supported")
                        station = Station680()
                    elif ( station_info['type'] == 'Q330' ):
                        station = Station330(legacy=False, continuity_only=self.continuity_only, versions_only=self.versions_only)
                        station.set_version_queue(self.version_queue)
                        station.set_version_files(self.version_files)
                    elif ( station_info['type'] == 'Q330C' ):
                        station = Station330(legacy=True, continuity_only=self.continuity_only, versions_only=self.versions_only)
                        station.set_version_queue(self.version_queue)
                        station.set_version_files(self.version_files)
                    else:
                        raise ExStationTypeNotRecognized, "Station type not recognized"
                    self.prep_station(station, station_info)
                    self.find_proxies(station, station_info)

                    permissions = stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IRGRP | stat.S_IWGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH
                    date = time.gmtime()

                    dir = self.output_directory + '/' + station.name
                    try:
                        if not os.path.exists(dir): os.makedirs(dir)
                        if os.stat(dir).st_mode != permissions: os.chmod(dir, permissions)
                    except:
                        raise Exception, "CheckLoop::init_dir() could not create directory: %s" % dir

                    dir += time.strftime("/%Y", date)
                    try:
                        if not os.path.exists(dir): os.makedirs(dir)
                        if os.stat(dir).st_mode != permissions: os.chmod(dir, permissions)
                    except:
                        raise Exception, "CheckLoop::init_dir() could not create directory: %s" % dir

                    dir += time.strftime("/%j", date)
                    try:
                        if not os.path.exists(dir): os.makedirs(dir)
                        if os.stat(dir).st_mode != permissions: os.chmod(dir, permissions)
                    except:
                        raise Exception, "CheckLoop::init_dir() could not create directory: %s" % dir

                    file = "%s/%s.log" % (dir, action)

                    station.log_file_name(file)
                    station.log_to_file()
                    station.log_to_screen()
                    station.set_output_directory(self.output_directory + '/' + station.name)


                    if not station.min_info():
                        self.stations_partial.append(station_info)
                    thread = ThreadStation()
                    thread.set_station( station )
                    thread.set_info( station_info )
                    station._log( "Starting thread" )
                    thread.start()
                    self.threads.append(thread)
                except Exception, e:
                    if station:
                        print "Loop::poll() failed to create thread. Exception: %s" % str(e)
                    else:
                        print "Loop::poll() failed to create station object. Exception: %s" % str(e)
        except ExLoopDone, e:
            print "All stations have been processed"
Example No. 13
def start_scraping(threads_number):
    global config
    website_type = config.CLASS_TYPE_TROPICAIR

    global_sc_obj = Scraper(
        use_cache=False,  #enable cache globally
        retries=3,
        use_default_logging=False)

    tropicair_depart_arrival_list = []

    try:
        with open(config.AIRPORT_RELATIONSHIP_FILE) as csvfile:
            reader = csv.reader(csvfile)

            for i, item in enumerate(reader):
                if i > 0 and item[0] != "" and item[1] != "":
                    obj = {}
                    obj["Departure"] = item[0]
                    obj["Arrival"] = item[1]
                    obj["Type"] = item[2]

                    if obj["Type"] == config.CLASS_TYPE_TROPICAIR_STR:
                        tropicair_depart_arrival_list.append(obj)

    except Exception as e:
        print(e)
        return

    sc_obj_list = []
    driver_list = []

    for i in range(0, threads_number):
        driver, user_agent, proxy, screen_resolution = common_lib.create_phantomjs_driver(
        )  # PHANTOMJS PART
        driver_list.append({"driver": driver, "status": "none"})

    tz = pytz.timezone('America/Los_Angeles')

    depart_arrival_list = tropicair_depart_arrival_list

    if len(depart_arrival_list) == 0:
        print('None depart arrival info')
        return

    threads = []

    file_currentdate = datetime.now(tz)
    filename = "{}_{}.csv".format(common_lib.get_webiste_str(website_type),
                                  file_currentdate.strftime('%Y-%m-%d %H:%M'))

    for i, depart_arrival_info in enumerate(depart_arrival_list):
        currentdate = datetime.now(tz)
        print("Current Date & Time: {} , {}".format(
            currentdate.strftime('%Y-%m-%d'), currentdate.strftime('%H:%M')))

        departure = depart_arrival_info["Departure"]
        arrival = depart_arrival_info["Arrival"]

        departure_abbr = ""
        arrival_abbr = ""

        start_step = 0
        if website_type == config.CLASS_TYPE_TROPICAIR:
            departure_abbr = departure.split("-")[1].strip()
            arrival_abbr = arrival.split("-")[1].strip()

        date_list = []

        no_result_info = {"Count": 0}

        for step in range(start_step, start_step + config.DAYS_TO_BE_SCRAPED):
            date_list.append({
                "date": datetime.now(tz) + timedelta(days=step),
                "status": "none",
                "error_count": 0
            })

        stop_date_str = ""
        start_date_str = currentdate.strftime('%Y-%m-%d %H:%M')

        print "************************************"
        print len(date_list), departure, arrival
        print "************************************"

        while len(date_list) > 0:
            if len(threads) < threads_number:
                start_date = None
                phantom_obj = None

                if no_result_info["Count"] > config.MAX_NO_RESULT_COUNT:
                    print("--------------------------")
                    print("No result any more")
                    print("--------------------------")
                    break

                # print "+++++++++++++++++++++++++++++++++"
                # print driver_list
                # print "+++++++++++++++++++++++++++++++++"

                for date in date_list:
                    if date["status"] == "complete":
                        date_list.remove(date)
                    elif date["status"] == "none":
                        start_date = date
                        start_date["status"] = "pending"
                        break

                if len(date_list) == 0:
                    break

                if start_date == None:
                    continue

                for driver in driver_list:
                    if driver["status"] == "none":
                        phantom_obj = driver
                        driver["status"] = "pending"
                        break

                if phantom_obj == None:
                    continue

                print("++++++++++++++++++++++++++++++")
                print("Depart List = " + str(len(depart_arrival_list)) +
                      " Index =" + str(i))
                # print (depart_arrival_info)
                print(departure_abbr + "," + arrival_abbr)
                print(start_date)
                # print driver_list
                print("++++++++++++++++++++++++++++++")

                stop_date_str = start_date["date"].strftime('%Y-%m-%d %H:%M')

                sleep(config.DRIVER_SHORT_WAITING_SECONDS)
                proxy_ip, proxy_port, proxy_user, proxy_pass = random_proxy()

                if proxy_user != None:
                    auth_str = "{}:{}".format(proxy_user, proxy_pass)
                    proxy = Proxy(proxy_ip, proxy_port, auth_str)
                else:
                    proxy = Proxy(proxy_ip, proxy_port)

                class_obj = None
                class_obj = TropicAir(phantom_obj, start_date, departure,
                                      arrival, currentdate, tz, departure_abbr,
                                      arrival_abbr, no_result_info, filename)

                thread_obj = threading.Thread(
                    target=class_obj.parse_website,
                    args=(config.DRIVER_VALUE_PHANTOMJS, ))

                threads.append(thread_obj)
                thread_obj.start()

            for thread in threads:
                if not thread.is_alive():

                    thread.join()
                    threads.remove(thread)

        print("*************************")
        print(len(date_list))
        print(no_result_info)

        finishdate = datetime.now(tz)
        finish_date_str = finishdate.strftime('%Y-%m-%d %H:%M')

        global_sc_obj.save([
            "Departure",
            departure,
            "Arrival",
            arrival,
            "Date Len",
            len(date_list),
            "No Result",
            no_result_info["Count"],
            "File Name",
            filename,
            "Start Date",
            start_date_str,
            "Finish",
            stop_date_str,
            "Capture Date",
            finish_date_str,
        ], "output/output_{}.csv".format(website_type))

        print("*************************")

    try:
        common_lib.upload_file(filename, "output/")
        print "Upload", departure, arrival
    except:
        print("Error while upload :" + filename)
Example No. 14
def threaded_get(url=None, urls=None, url_iter=None, num_threads=10, dl=None, cb=None, depth=True, **kwargs):
    """Download these urls in parallel

    url:
        the webpage to download
    urls:
        the webpages to download
    num_threads:
        the number of threads to download urls with
    cb:
        Called after each download with the HTML of the download. 
        The arguments are the url and downloaded html.
        Whatever URLs are returned are added to the crawl queue.
    dl:
        A callback for customizing the download.
        Takes the download object and url and should return the HTML.
    depth:
        True for depth first search
    """
    # use a one-element list so the worker threads can clear the flag
    # (a nested function cannot rebind this local in Python 2)
    running = [True]
    lock = threading.Lock()
    def add_iter_urls():
        if lock.acquire(False):
            for url in url_iter or []:
                download_queue.append(url)
                break
            lock.release()


    def process_queue():
        """Thread for downloading webpages
        """
        D = Download(**kwargs)

        while True:
            try:
                url = download_queue.pop() if depth else download_queue.popleft()

            except IndexError:
                add_iter_urls()
                break

            else:
                # download this url
                html = dl(D, url, **kwargs) if dl else D.get(url, **kwargs)
                if cb:
                    try:
                        # use callback to process downloaded HTML
                        result = cb(D, url, html)

                    except StopCrawl:
                        common.logger.info('Stopping crawl signal')
                        running[0] = False

                    except Exception:
                        # catch any callback error to avoid losing thread
                        common.logger.exception('\nIn callback for: ' + str(url))

                    else:
                        # add these URL's to crawl queue
                        for link in result or []:
                            download_queue.append(urlparse.urljoin(url, link))
                                        
                # update the crawler state
                # no download or error so must have read from cache
                num_caches = 0 if D.num_downloads or D.num_errors else 1
                state.update(num_downloads=D.num_downloads, num_errors=D.num_errors, num_caches=num_caches, queue_size=len(download_queue))

    download_queue = collections.deque()
    if urls:
        download_queue.extend(urls)
    if url:
        download_queue.append(url)
    common.logger.debug('Start new crawl')

    # initiate the state file with the number of URL's already in the queue
    state = State()
    state.update(queue_size=len(download_queue))

    # wait for all download threads to finish
    threads = []
    while running[0] and (threads or download_queue):
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < num_threads and download_queue:
            # can start more threads
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True) # set daemon so main thread can exit when receives ctrl-c
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
    # save the final state after threads finish
    state.save()
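
A hypothetical call to threaded_get, based only on the parameters documented in its docstring (the callback name and URL below are made up for illustration):

def print_page(D, url, html):
    # callback invoked after each download; returning URLs here would
    # extend the crawl queue
    print('downloaded %s (%d bytes)' % (url, len(html or '')))
    return []

threaded_get(url='http://example.com/', num_threads=5, cb=print_page)
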
Example No. 15
def threaded_get(url=None, urls=None, num_threads=10, dl=None, cb=None, depth=None, wait_finish=True, reuse_queue=False, max_queue=1000, **kwargs):
    """Download these urls in parallel

    url:
        the webpage to download
    urls:
        the webpages to download
    num_threads:
        the number of threads to download urls with
    cb:
        Called after each download with the HTML of the download. 
        The arguments are the url and downloaded html.
        Whatever URLs are returned are added to the crawl queue.
    dl:
        A callback for customizing the download.
        Takes the download object and url and should return the HTML.
    depth:
        Deprecated - will be removed in later version
    wait_finish:
        whether to wait until all download threads have finished before returning
    reuse_queue:
        Whether to continue the queue from the previous run.
    max_queue:
        The maximum number of queued URLs to keep in memory.
        The rest will be in the cache.
    """
    if kwargs.pop('cache', None):
        common.logger.debug('threaded_get does not support cache flag')
    lock = threading.Lock()


    class DownloadThread(threading.Thread):
        """Thread for downloading webpages
        """
        processing = collections.deque() # to track whether threads are still downloading
        discovered = {} # the URL's that have been discovered

        def __init__(self):
            threading.Thread.__init__(self)

        def run(self):
            D = Download(**kwargs)
            queue = pdict.Queue(settings.queue_file)

            while seed_urls or DownloadThread.processing:
                # record that this thread is processing a url
                DownloadThread.processing.append(1) 
                try:
                    url = seed_urls.pop()

                except IndexError:
                    # currently no urls to process
                    DownloadThread.processing.popleft()
                    # so check again later
                    time.sleep(SLEEP_TIME)

                else:
                    try:
                        # download this url
                        html = dl(D, url, **kwargs) if dl else D.get(url, **kwargs)
                        if cb:
                            try:
                                # use callback to process downloaded HTML
                                result = cb(D, url, html)

                            except Exception, e:
                                # catch any callback error to avoid losing thread
                                common.logger.exception('\nIn callback for: ' + str(url))

                            else:
                                # add these URL's to crawl queue
                                for link in result or []:
                                    cb_url = urlparse.urljoin(url, link)
                                    if isinstance(result, dict):
                                        DownloadThread.discovered[cb_url] = result[link]
                                    else:
                                        DownloadThread.discovered[cb_url] = DEFAULT_PRIORITY
                                            
                                if len(seed_urls) < max_queue:
                                    # need to request more queue
                                    if DownloadThread.discovered or len(queue) > 0:
                                        # there are outstanding in the queue
                                        if lock.acquire(False):
                                            # no other thread is downloading
                                            common.logger.debug('Loading from queue: %d' % len(seed_urls))
                                            discovered = []
                                            while DownloadThread.discovered:
                                                discovered.append(DownloadThread.discovered.popitem())
                                            queue.push(discovered)
                                            # get next batch of URLs from cache
                                            seed_urls.extend(queue.pull(limit=max_queue))
                                            lock.release()
                    finally:
                        # have finished processing
                        # make sure this is called even on exception to avoid eternal loop
                        DownloadThread.processing.pop()
                    # update the crawler state
                    # no download or error so must have read from cache
                    num_caches = 0 if D.num_downloads or D.num_errors else 1
                    state.update(num_downloads=D.num_downloads, num_errors=D.num_errors, num_caches=num_caches, queue_size=len(queue))


    queue = pdict.Queue(settings.queue_file)
    if reuse_queue:
        # command line flag to enable queue
        queued_urls = queue.pull(limit=max_queue)
    else:
        queued_urls = []
    if queued_urls:
        # continue the previous crawl
        seed_urls = collections.deque(queued_urls)
        common.logger.debug('Loading crawl queue')
    else:
        # remove any queued URL's so can crawl again
        queue.clear()
        urls = urls or []
        if url:
            urls.append(url)
        queue.push([(url, DEFAULT_PRIORITY) for url in urls])
        # put urls into thread safe queue
        seed_urls = collections.deque(queue.pull(limit=max_queue))
        common.logger.debug('Start new crawl')

    # initiate the state file with the number of URL's already in the queue
    state = State()
    state.update(queue_size=len(queue))

    # start the download threads
    threads = [DownloadThread() for i in range(num_threads)]
    for thread in threads:
        thread.setDaemon(True) # set daemon so main thread can exit when receives ctrl-c
        thread.start()

    # Wait for all download threads to finish
    while threads and wait_finish:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        time.sleep(SLEEP_TIME)
    # save the final state after threads finish
    state.save()
Example No. 16
def threaded_get(url=None,
                 urls=None,
                 url_iter=None,
                 num_threads=10,
                 dl=None,
                 cb=None,
                 depth=True,
                 **kwargs):
    """Download these urls in parallel

    url:
        the webpage to download
    urls:
        the webpages to download
    num_threads:
        the number of threads to download urls with
    cb:
        Called after each download with the HTML of the download. 
        The arguments are the url and downloaded html.
        Whatever URLs are returned are added to the crawl queue.
    dl:
        A callback for customizing the download.
        Takes the download object and url and should return the HTML.
    depth:
        True for depth first search
    """
    # use a one-element list so the worker threads can clear the flag
    # (a nested function cannot rebind this local in Python 2)
    running = [True]
    lock = threading.Lock()

    def add_iter_urls():
        if lock.acquire(False):
            for url in url_iter or []:
                download_queue.append(url)
                break
            lock.release()

    def process_queue():
        """Thread for downloading webpages
        """
        D = Download(**kwargs)

        while True:
            try:
                url = download_queue.pop(
                ) if depth else download_queue.popleft()

            except IndexError:
                add_iter_urls()
                break

            else:
                # download this url
                html = dl(D, url, **kwargs) if dl else D.get(url, **kwargs)
                if cb:
                    try:
                        # use callback to process downloaded HTML
                        result = cb(D, url, html)

                    except StopCrawl:
                        common.logger.info('Stopping crawl signal')
                        running[0] = False

                    except Exception:
                        # catch any callback error to avoid losing thread
                        common.logger.exception('\nIn callback for: ' +
                                                str(url))

                    else:
                        # add these URL's to crawl queue
                        for link in result or []:
                            download_queue.append(link)

                # update the crawler state
                # no download or error so must have read from cache
                num_caches = 0 if D.num_downloads or D.num_errors else 1
                state.update(num_downloads=D.num_downloads,
                             num_errors=D.num_errors,
                             num_caches=num_caches,
                             queue_size=len(download_queue))

    download_queue = collections.deque()
    if urls:
        download_queue.extend(urls)
    if url:
        download_queue.append(url)
    common.logger.debug('Start new crawl')

    # initiate the state file with the number of URL's already in the queue
    state = State()
    state.update(queue_size=len(download_queue))

    # wait for all download threads to finish
    threads = []
    while running[0] and (threads or download_queue):
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < num_threads and download_queue:
            # can start more threads
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(
                True
            )  # set daemon so main thread can exit when receives ctrl-c
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
    # save the final state after threads finish
    state.save()
Example No. 17
def threaded_get(url=None,
                 urls=None,
                 num_threads=10,
                 dl=None,
                 cb=None,
                 depth=None,
                 wait_finish=True,
                 reuse_queue=False,
                 max_queue=1000,
                 **kwargs):
    """Download these urls in parallel

    url:
        the webpage to download
    urls:
        the webpages to download
    num_threads:
        the number of threads to download urls with
    cb:
        Called after each download with the HTML of the download. 
        The arguments are the url and downloaded html.
        Whatever URLs are returned are added to the crawl queue.
    dl:
        A callback for customizing the download.
        Takes the download object and url and should return the HTML.
    depth:
        Deprecated - will be removed in later version
    wait_finish:
        whether to wait until all download threads have finished before returning
    reuse_queue:
        Whether to continue the queue from the previous run.
    max_queue:
        The maximum number of queued URLs to keep in memory.
        The rest will be in the cache.
    """
    if kwargs.pop('cache', None):
        common.logger.debug('threaded_get does not support cache flag')
    lock = threading.Lock()

    class DownloadThread(threading.Thread):
        """Thread for downloading webpages
        """
        processing = collections.deque()  # to track whether threads are still downloading
        discovered = {}  # the URL's that have been discovered

        def __init__(self):
            threading.Thread.__init__(self)

        def run(self):
            D = Download(**kwargs)
            queue = pdict.Queue(settings.queue_file)

            while seed_urls or DownloadThread.processing:
                # record that this thread is processing a url
                DownloadThread.processing.append(1)
                try:
                    url = seed_urls.pop()

                except IndexError:
                    # currently no urls to process
                    DownloadThread.processing.popleft()
                    # so check again later
                    time.sleep(SLEEP_TIME)

                else:
                    try:
                        # download this url
                        html = dl(D, url, **kwargs) if dl else D.get(
                            url, **kwargs)
                        if cb:
                            try:
                                # use callback to process downloaded HTML
                                cb_urls = cb(D, url, html)

                            except Exception, e:
                                # catch any callback error to avoid losing thread
                                common.logger.error('in callback for: ' +
                                                    str(url) + '\n' +
                                                    traceback.format_exc())

                            else:
                                # add these URL's to crawl queue
                                for cb_url in cb_urls or []:
                                    if isinstance(cb_urls, dict):
                                        DownloadThread.discovered[
                                            cb_url] = cb_urls[cb_url]
                                    else:
                                        DownloadThread.discovered[
                                            cb_url] = DEFAULT_PRIORITY

                                if len(seed_urls) < max_queue:
                                    # need to request more queue
                                    if DownloadThread.discovered or len(
                                            queue) > 0:
                                        # there are outstanding in the queue
                                        if lock.acquire(False):
                                            # no other thread is downloading
                                            common.logger.debug(
                                                'Loading from queue: %d' %
                                                len(seed_urls))
                                            discovered = []
                                            while DownloadThread.discovered:
                                                discovered.append(
                                                    DownloadThread.discovered.
                                                    popitem())
                                            queue.push(discovered)
                                            # get next batch of URLs from cache
                                            seed_urls.extend(
                                                queue.pull(limit=max_queue))
                                            lock.release()
                                """
                                for cb_url in cb_urls or []:
                                    if cb_url not in DownloadThread.discovered:
                                        DownloadThread.discovered[cb_url] = 1
                                        seed_urls.append(cb_url)
                                """
                    finally:
                        # have finished processing
                        # make sure this is called even on exception to avoid eternal loop
                        DownloadThread.processing.pop()
                    # update the crawler state
                    # no download or error so must have read from cache
                    num_caches = 0 if D.num_downloads or D.num_errors else 1
                    state.update(num_downloads=D.num_downloads,
                                 num_errors=D.num_errors,
                                 num_caches=num_caches,
                                 queue_size=len(queue))

    queue = pdict.Queue(settings.queue_file)
    if reuse_queue:
        # command line flag to enable queue
        queued_urls = queue.pull(limit=max_queue)
    else:
        queued_urls = []
    if queued_urls:
        # continue the previous crawl
        seed_urls = collections.deque(queued_urls)
        common.logger.debug('Loading crawl queue')
    else:
        # remove any queued URL's so can crawl again
        queue.clear()
        urls = urls or []
        if url:
            urls.append(url)
        queue.push([(url, DEFAULT_PRIORITY) for url in urls])
        # put urls into thread safe queue
        seed_urls = collections.deque(queue.pull(limit=max_queue))
        common.logger.debug('Start new crawl')

    # initiate the state file with the number of URL's already in the queue
    state = State()
    state.update(queue_size=len(queue))

    # start the download threads
    threads = [DownloadThread() for i in range(num_threads)]
    for thread in threads:
        thread.setDaemon(
            True)  # set daemon so main thread can exit when receives ctrl-c
        thread.start()

    # Wait for all download threads to finish
    while threads and wait_finish:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        time.sleep(SLEEP_TIME)
    # save the final state after threads finish
    state.save()