def crawler_req(visa_type, place):
    try:
        # prepare session
        sess = session_op.get_session(visa_type, place)
        if not sess:
            logger.warning("%s, %s, FAILED, %s" % (visa_type, place, "No Session"))
            return
        cookies = copy.deepcopy(g.COOKIES)
        cookies["sid"] = sess
        # send request
        r = requests.get(g.CANCEL_URI, headers=g.HEADERS, cookies=cookies,
                         proxies=g.value("proxies", None))
        if r.status_code != 200:
            logger.warning("%s, %s, FAILED, %s" % (visa_type, place, "Session Expired"))
            session_op.replace_session(visa_type, place, sess)
            return
        # parse HTML
        page = r.text
        date = get_date(page)
        if not date:
            logger.warning("%s, %s, FAILED, %s" % (visa_type, place, "Session Expired"))
            session_op.replace_session(visa_type, place, sess)
            return
        elif date == (0, 0, 0):
            logger.warning("%s, %s, FAILED, %s" % (visa_type, place, "Date Not Found"))
            last_status = g.value("status_%s_%s" % (visa_type, place), (0, 0, 0))
            if last_status != (0, 0, 0):
                session_op.replace_session(visa_type, place, sess)
        elif not check_alive(page):
            logger.warning("%s, %s, FAILED, %s" % (visa_type, place, "Session Expired"))
            session_op.replace_session(visa_type, place, sess)
            return
        logger.info("%s, %s, SUCCESS, %s" % (visa_type, place, date))
        g.assign("status_%s_%s" % (visa_type, place), date)
    except Exception:
        logger.error(traceback.format_exc())
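These crawler helpers share state through a module-level key-value store `g` (`G` in the newer modules) via `g.value(key, default)` and `g.assign(key, value)`; the store itself is defined outside this section. A minimal thread-safe sketch of the assumed interface (class name and locking are assumptions, not the project's actual implementation):

import threading

class GlobalVar:
    """Sketch of the assumed `g` interface: a process-wide key-value
    store shared by the crawler threads."""

    def __init__(self):
        self._dct = {}
        self._lock = threading.Lock()

    def value(self, key, default=None):
        # return the stored value, or `default` if the key is absent
        with self._lock:
            return self._dct.get(key, default)

    def assign(self, key, val):
        # overwrite (or create) the stored value for `key`
        with self._lock:
            self._dct[key] = val

g = GlobalVar()  # hypothetical module-level instance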
def check_crawler_server_connection():
    """ Check the connection of all the crawler servers.
        Update the current crawler server in use.
    """
    if G.value('checking_crawler_connection', False):
        return
    G.assign('checking_crawler_connection', True)
    crawler_path = G.value('crawler_path', None)
    previous_crawler_node = G.value('current_crawler_node', '')
    if crawler_path is None or not os.path.exists(crawler_path):
        LOGGER.warning('GlobalVar crawler file path is missing or not valid.')
        G.assign('checking_crawler_connection', False)
        return
    with open(crawler_path) as f:
        crawler_server_lst = [line.strip() for line in f.readlines()]
    for crawler_node in crawler_server_lst:
        try:
            res = requests.get(crawler_node, timeout=5)
            if res.status_code == 200:
                # keep the current node if it is still reachable;
                # only reassign (and log) when the node changes
                if previous_crawler_node != crawler_node:
                    G.assign('current_crawler_node', crawler_node)
                    LOGGER.warning('Choose crawler node: %s', crawler_node)
                G.assign('checking_crawler_connection', False)
                return
        except Exception:
            pass
    LOGGER.error('All crawler servers fail!')
    G.assign('checking_crawler_connection', False)
def crawler_req(visa_type, place, start_time, req):
    # `req` is a shared requests.Session (see crawler below); the original
    # parameter shadowed the `requests` module
    try:
        # prepare session
        sess = session_op.get_session(visa_type, place)
        if not sess:
            logger.warning("%s, %s, %s, FAILED, %s" %
                           (start_time, visa_type, place, "No Session"))
            return
        refresh_endpoint = g.value("crawler_node", "") + "/refresh/?session=" + sess
        try:
            r = req.get(refresh_endpoint, timeout=7,
                        proxies=g.value("proxies", None))
        except Exception:
            logger.warning("%s, %s, %s, FAILED, %s" %
                           (start_time, visa_type, place, "Endpoint Timeout"))
            check_crawler_node()
            return
        if r.status_code != 200:
            logger.warning("%s, %s, %s, FAILED, %s" %
                           (start_time, visa_type, place, "Endpoint Inaccessible"))
            check_crawler_node()
            return
        result = r.json()
        if result["code"] > 0:
            logger.warning("%s, %s, %s, FAILED, %s" %
                           (start_time, visa_type, place, "Session Expired"))
            session_op.replace_session(visa_type, place, sess)
            return
        date = tuple(map(int, result["msg"].split("-")))
        logger.info("%s, %s, %s, SUCCESS, %s" %
                    (start_time, visa_type, place, date))
        g.assign("status_%s_%s" % (visa_type, place), date)
    except Exception:
        logger.error(traceback.format_exc())
def crawler(visa_type, places):
    localtime = time.localtime()
    s = {'time': time.strftime('%Y/%m/%d %H:%M:%S', localtime)}
    cur = time.strftime('%Y/%m/%d', time.localtime())
    cur_time = time.strftime('%H:%M:%S', time.localtime())
    pool = []
    req = g.value(visa_type + "_req", requests.Session())
    for place in places:
        t = threading.Thread(target=crawler_req,
                             args=(visa_type, place, cur_time, req))
        t.start()
        pool.append(t)
    for t in pool:
        t.join()
    # write to file
    for place in places:
        n = place + '-' + cur
        n2 = place + '2-' + cur
        y, m, d = g.value("status_%s_%s" % (visa_type, place), (0, 0, 0))
        s[n] = s[n2] = '{}/{}/{}'.format(y, m, d) if y > 0 else "/"
        if s[n] != '/':
            path = visa_type + '/' + n.replace('-', '/')
            os.makedirs('/'.join(path.split('/')[:-1]), exist_ok=True)
            time_hm = time.strftime('%H:%M', localtime)
            open(path, 'a+').write(time_hm + ' ' + s[n] + '\n')
    merge('../visa/visa.json' if visa_type == "F"
          else '../visa/visa-%s.json' % visa_type.lower(), s, cur, visa_type)
def check_crawler_node():
    if g.value("crawler_checking", False):
        return
    g.assign("crawler_checking", True)
    crawler_filepath = g.value("crawler_path", None)
    last_node = g.value("crawler_node", "")
    if not crawler_filepath:
        logger.warning("Crawler file not found")
        g.assign("crawler_checking", False)
        return
    with open(crawler_filepath, "r") as f:
        nodes = list(f.readlines())
    for node in nodes:
        node = node.strip()
        try:
            r = requests.get(node, timeout=5)
            if r.status_code == 200:
                if last_node != node:
                    g.assign("crawler_node", node)
                    logger.warning("Choose Crawler Node: " + node)
                g.assign("crawler_checking", False)
                return
        except Exception:
            pass
    logger.error("All Crawler Nodes Failed")
    g.assign("crawler_checking", False)
def get_session(self, visa_type, place):
    """ Get a session given visa type and place. Return None if failed."""
    session = g.value("session", {})
    if visa_type not in session or place not in session[visa_type]:
        return None
    idx = g.value("idx_%s_%s" % (visa_type, place), 0)
    sess_list = session[visa_type][place]
    if len(sess_list) == 0:
        return None
    sess = sess_list[idx % len(sess_list)]
    logger.debug("session: " + sess)
    g.assign("idx_%s_%s" % (visa_type, place), idx + 1)
    return sess
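get_session hands out sessions round-robin: a per-(visa_type, place) counter is taken modulo the pool size, so repeated fetches rotate through the pool instead of hammering a single session. A tiny self-contained illustration of the same arithmetic (example values only):

sessions = ["sid-a", "sid-b", "sid-c"]  # a hypothetical session pool
for idx in range(5):
    # identical wrap-around indexing to get_session above
    print(idx, sessions[idx % len(sessions)])
# prints: 0 sid-a / 1 sid-b / 2 sid-c / 3 sid-a / 4 sid-b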
def merge(fn, s, cur, visa_type):
    status = g.value("merge_lock" + visa_type, 0)
    if status == 1:
        return
    g.assign("merge_lock" + visa_type, 1)
    orig = json.loads(open(fn).read()) if os.path.exists(fn) else {}
    open(fn.replace('.json', '-last.json'), 'w').write(
        json.dumps(orig, ensure_ascii=False))
    last = copy.deepcopy(orig)
    for k in s:
        if '2-' in k:
            orig[k] = min_date(orig.get(k, '/'), s[k])
        else:
            orig[k] = s[k]
    if cur not in orig.get('index', []):
        orig['index'] = [cur] + orig.get('index', [])
    orig['index'], o = orig['index'][:50], orig['index'][50:]
    rmkeys = [i for i in orig if i.split('-')[-1] in o]
    for r in rmkeys:
        orig.pop(r)
    open(fn, 'w').write(json.dumps(orig, ensure_ascii=False))
    g.assign("merge_lock" + visa_type, 0)
    subprocess.check_call([
        'python3', 'notify.py', '--type', visa_type,
        '--js', json.dumps(orig, ensure_ascii=False),
        '--last_js', json.dumps(last, ensure_ascii=False)
    ])
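merge keeps the earliest appointment seen per day under the `place2-date` keys via min_date, which is defined elsewhere. A sketch of the assumed comparison, inferred from usage ('/' meaning "no data" loses to any real date; this reimplementation is an assumption, not the project's code):

def min_date(a, b):
    # hypothetical: each argument is a 'Y/M/D' string or '/'
    if a == '/':
        return b
    if b == '/':
        return a
    # compare numerically so '2020/9/10' sorts before '2020/10/2'
    return min(a, b, key=lambda s: tuple(map(int, s.split('/'))))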
def set_fetching_interval(visa_type: str, location: str, sys: str,
                          interval_sec: int, first_run: bool = True):
    """ Execute the fetching function every `interval_sec` seconds.
        https://stackoverflow.com/questions/2697039/python-equivalent-of-setinterval
    """
    def function_wrapper():
        set_fetching_interval(visa_type, location, sys, interval_sec,
                              first_run=False)
        VisaFetcher.fetch_visa_status(
            visa_type, location,
            G.value(f'{visa_type}_requests_Session', requests.Session()))

    emb = G.USEmbassy.get_embassy_by_loc(location)
    now_minute = datetime.now().minute
    if (sys == 'cgi' and visa_type == "F" and 47 <= now_minute < 49
            and emb.region == 'DOMESTIC'
            and emb.code not in ['hk', 'hkr', 'tp']):
        interval = 5
    else:
        interval = interval_sec
    fetching_thread = threading.Timer(interval, function_wrapper)
    fetching_thread.start()

    if first_run:  # execute fetching immediately on the first run
        VisaFetcher.fetch_visa_status(
            visa_type, location,
            G.value(f'{visa_type}_requests_Session', requests.Session()))

    return fetching_thread
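The pattern in set_fetching_interval (a threading.Timer whose callback re-arms itself) is the usual Python stand-in for JavaScript's setInterval, per the Stack Overflow link in the docstring. Stripped of the visa-specific logic, the core loop looks like this (a minimal sketch, names are illustrative):

import threading, time

def set_interval(func, sec):
    # re-arm a one-shot Timer on every tick to emulate setInterval
    def wrapper():
        set_interval(func, sec)  # schedule the next tick first
        func()                   # then run the task
    t = threading.Timer(sec, wrapper)
    t.start()
    return t

timer = set_interval(lambda: print(time.strftime('%H:%M:%S')), 2.0)
# timer.cancel() stops the *next* tick; a tick already running finishes normally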
def start_threads():
    """ Start the threads for fetching data from the crawler server."""
    LOGGER.info('Setting up crawler node...')
    VisaFetcher.check_crawler_server_connection()

    LOGGER.info('Starting threads...')
    LOGGER.info('Setting up session update consumer...')
    session_update_consumer = threading.Thread(
        target=VisaFetcher.consume_new_session_request)
    session_update_consumer.start()

    LOGGER.info('Setting interval for fetching visa status...')
    sys = G.value('target_system', None)
    thread_pool = []
    for visa_type, interval_sec in G.FETCH_TIME_INTERVAL[sys].items():
        for location in G.SYS_LOCATION[sys]:
            if location[-1] == 'u' and sys == 'cgi' and visa_type != 'F':
                continue
            thread_pool.append(
                set_fetching_interval(visa_type, location, sys, interval_sec))
    LOGGER.info('Fetching threads start, %s threads in total', len(thread_pool))

    for thread in thread_pool:
        thread.join()
def crawler(visa_type, places):
    open(visa_type + '_state', 'w').write('1')
    localtime = time.localtime()
    s = {'time': time.strftime('%Y/%m/%d %H:%M:%S', localtime)}
    cur = time.strftime('%Y/%m/%d', time.localtime())
    pool = []
    for place in places:
        t = threading.Thread(target=crawler_req, args=(visa_type, place))
        t.start()
        pool.append(t)
    for t in pool:
        t.join()
    # write to file
    for place in places:
        n = place + '-' + cur
        n2 = place + '2-' + cur
        y, m, d = g.value("status_%s_%s" % (visa_type, place), (0, 0, 0))
        s[n] = s[n2] = '{}/{}/{}'.format(y, m, d) if y > 0 else "/"
        if s[n] != '/':
            path = visa_type + '/' + n.replace('-', '/')
            os.makedirs('/'.join(path.split('/')[:-1]), exist_ok=True)
            time_hm = time.strftime('%H:%M', localtime)
            open(path, 'a+').write(time_hm + ' ' + s[n] + '\n')
    merge('../visa/visa.json' if visa_type == "F"
          else '../visa/visa-%s.json' % visa_type.lower(), s, cur)
    open(visa_type + '_state', 'w').write('0')
    os.system('python3 notify.py --type ' + visa_type + ' &')
def get_session_count(self, visa_type, place):
    session_list = g.value("session", {})
    if visa_type not in session_list:
        session_list[visa_type] = {}
    if place not in session_list[visa_type]:
        session_list[visa_type][place] = []
    return len(session_list[visa_type][place])
def get_session_count(visa_type, place):
    session_list = g.value("session", {})
    if visa_type not in session_list:
        session_list[visa_type] = {}
    if place not in session_list[visa_type]:
        session_list[visa_type][place] = []
    return len(session_list[visa_type][place])
def add_session():
    while True:
        visa_type, place, replace = replace_items.get()
        # check if replaced
        if replace:
            session_list = g.value("session", {})
            if visa_type not in session_list:
                session_list[visa_type] = {}
            if place not in session_list[visa_type]:
                session_list[visa_type][place] = []
            if replace not in session_list[visa_type][place]:
                continue
            logger.info("Update session " + replace)
        try:
            cracker = g.value("cracker", None)
            username, passwd, sid = login(cracker, place)
            print(f'Login Info: Username: {username}, Password: {passwd}, '
                  f'Session ID: {sid}')
            date = visa_select(visa_type, place, sid)
            if not date:
                print('date is None from `visa_select`')
                continue
            localtime = time.asctime(time.localtime(time.time()))
            print(f'[ {localtime} ] Earliest appointment for {visa_type} '
                  f'visa at {place}: {date}')
            try:
                session_list = g.value("session", {})
                if visa_type not in session_list:
                    session_list[visa_type] = {}
                if place not in session_list[visa_type]:
                    session_list[visa_type][place] = []
                if replace:
                    idx = session_list[visa_type][place].index(replace)
                    session_list[visa_type][place][idx] = sid
                else:
                    session_list[visa_type][place].append(sid)
                session_file = g.value("session_file", "session.json")
                with open(session_file, "w") as f:
                    f.write(json.dumps(session_list, ensure_ascii=False))
            except Exception:
                logger.error(traceback.format_exc())
        except Exception:
            logger.error(traceback.format_exc())
def add_session():
    while True:
        visa_type, place, replace = replace_items.get()
        # check if replaced
        if replace:
            session_list = g.value("session", {})
            if visa_type not in session_list:
                session_list[visa_type] = {}
            if place not in session_list[visa_type]:
                session_list[visa_type][place] = []
            if replace not in session_list[visa_type][place]:
                continue
            logger.info("Update session " + replace)
        try:
            endpoint = g.value("crawler_node", "") + \
                "/register/?type=%s&place=%s" % (visa_type, place)
            r = requests.get(endpoint, timeout=40,
                             proxies=g.value("proxies", None))
            result = r.json()
            date = tuple(map(int, result["msg"].split("-")))
            sid = result["session"]
            if not date:
                continue
            try:
                session_list = g.value("session", {})
                if visa_type not in session_list:
                    session_list[visa_type] = {}
                if place not in session_list[visa_type]:
                    session_list[visa_type][place] = []
                if replace:
                    idx = session_list[visa_type][place].index(replace)
                    session_list[visa_type][place][idx] = sid
                else:
                    session_list[visa_type][place].append(sid)
                session_file = g.value("session_file", "session.json")
                with open(session_file, "w") as f:
                    f.write(json.dumps(session_list, ensure_ascii=False))
            except Exception:
                logger.error(traceback.format_exc())
        except Exception:
            logger.error(traceback.format_exc())
def init_cache(self):
    session_file = g.value("session_file", "session.json")
    session = {}
    if os.path.exists(session_file):
        with open(session_file, "r") as f:
            try:
                session = json.load(f)
            except Exception:
                pass
    g.assign("session", session)
def crawler_req_ais(visa_type, code, places, start_time, req):
    # `req` is a shared requests.Session
    try:
        # prepare session
        sess, schedule_id = session_op.get_session(visa_type, code)
        if not sess:
            logger.warning("%s, %s, %s, FAILED, %s" %
                           (start_time, visa_type, code, "No Session"))
            return
        refresh_endpoint = g.value("crawler_node", "") + \
            "/ais/refresh/?code=%s&id=%s&session=%s" % (code, schedule_id, sess)
        try:
            r = req.get(refresh_endpoint, timeout=7,
                        proxies=g.value("proxies", None))
        except Exception:
            logger.warning("%s, %s, %s, FAILED, %s" %
                           (start_time, visa_type, code, "Endpoint Timeout"))
            check_crawler_node()
            return
        if r.status_code != 200:
            logger.warning("%s, %s, %s, FAILED, %s" %
                           (start_time, visa_type, code, "Endpoint Inaccessible"))
            check_crawler_node()
            return
        result = r.json()
        if result["code"] > 0:
            logger.warning("%s, %s, %s, FAILED, %s" %
                           (start_time, visa_type, code, "Session Expired"))
            session_op.replace_session(visa_type, code, sess)
            return
        date_list = result["msg"]
        new_sess = result["session"]
        session_op.replace_session_immediate(visa_type, code, sess, new_sess)
        for place, date in date_list:
            if place not in places:
                continue
            logger.info("%s, %s, %s, %s, SUCCESS, %s" %
                        (start_time, visa_type, code, place, date))
            g.assign("status_%s_%s" % (visa_type, place), date)
    except Exception:
        logger.error(traceback.format_exc())
def replace_session_immediate(self, visa_type, place, sess, new_sess):
    ais = "-" in place
    session_list = g.value("session", {})
    if visa_type not in session_list:
        session_list[visa_type] = {}
    if place not in session_list[visa_type]:
        session_list[visa_type][place] = []
    if ais and sess not in [x[0] for x in session_list[visa_type][place]]:
        return
    if not ais and sess not in session_list[visa_type][place]:
        return
    if ais:
        idx = [x[0] for x in session_list[visa_type][place]].index(sess)
        session_list[visa_type][place][idx][0] = new_sess
    else:
        idx = session_list[visa_type][place].index(sess)
        session_list[visa_type][place][idx] = new_sess
    session_file = g.value("session_file", "session.json")
    with open(session_file, "w") as f:
        f.write(json.dumps(session_list, ensure_ascii=False))
def set_session_pool_size(visa_type, place, size):
    session_list = g.value("session", {})
    if visa_type not in session_list:
        session_list[visa_type] = {}
    if place not in session_list[visa_type]:
        session_list[visa_type][place] = []
    cnt = len(session_list[visa_type][place])
    if cnt < size:
        for _ in range(size - cnt):
            rand_str = "".join([chr(np.random.randint(26) + ord('a'))
                                for _ in range(15)])
            session_list[visa_type][place].append("placeholder_" + rand_str)
    elif cnt > size:
        session_list[visa_type][place] = session_list[visa_type][place][:size]
def __init__(self) -> None:
    self.session = defaultdict(lambda: defaultdict(list))
    self.session_idx = defaultdict(lambda: defaultdict(int))
    now = datetime.now()
    self.session_avail = defaultdict(lambda: defaultdict(lambda: now))
    self.logger = logging.getLogger(G.GlobalVar.var_dct['log_name'])

    # read cached session pool (if any)
    sys = G.value('target_system', None)
    session_file = G.value('session_file', 'session.json')
    if sys is None:
        self.logger.error('No target system given')
        raise ValueError('The target system is not set!')
    if os.path.exists(session_file):
        with open(session_file) as f:
            try:
                old_session = json.load(f)
                if not isinstance(old_session, dict):
                    raise TypeError()
            except json.decoder.JSONDecodeError:
                self.logger.debug('session.json is empty or partially written.')
            except TypeError:
                self.logger.debug('session.json doesn\'t store a dictionary.')
            else:
                for visa_type, loc_sess_lst in old_session.items():
                    for loc, sess_lst in loc_sess_lst.items():
                        self.session[visa_type][loc] = [
                            Session(**session) for session in sess_lst
                        ]
                        # set currently used index to 0
                        self.session_idx[visa_type][loc] = 0

    self.session, self.session_idx = self.inititae_session_cache(
        sys, self.session, self.session_idx)
    self.save()
def add_session():
    while True:
        visa_type, place, replace = replace_items.get()
        # check if replaced
        if replace:
            session_list = g.value("session", {})
            if visa_type not in session_list:
                session_list[visa_type] = {}
            if place not in session_list[visa_type]:
                session_list[visa_type][place] = []
            if replace not in session_list[visa_type][place]:
                continue
            logger.info("Update session " + replace)
        try:
            cracker = g.value("cracker", None)
            username, passwd, sid = login(cracker, place)
            date = visa_select(visa_type, place, sid)
            if not date:
                continue
            try:
                session_list = g.value("session", {})
                if visa_type not in session_list:
                    session_list[visa_type] = {}
                if place not in session_list[visa_type]:
                    session_list[visa_type][place] = []
                if replace:
                    idx = session_list[visa_type][place].index(replace)
                    session_list[visa_type][place][idx] = sid
                else:
                    session_list[visa_type][place].append(sid)
                session_file = g.value("session_file", "session.json")
                with open(session_file, "w") as f:
                    f.write(json.dumps(session_list, ensure_ascii=False))
            except Exception:
                logger.error(traceback.format_exc())
        except Exception:
            logger.error(traceback.format_exc())
def save(self):
    """ Write the current session cache to disk."""
    session_file = G.value('session_file', 'session.json')
    with G.LOCK:
        session_json = defaultdict(lambda: defaultdict(list))
        for visa_type, loc_sess_dct in self.session.items():
            for loc, sess_lst in loc_sess_dct.items():
                session_json[visa_type][loc] = [
                    session.to_json() for session in sess_lst
                ]
        with open(session_file, 'w') as f:
            json.dump(dict(session_json), f, indent=4, ensure_ascii=False)
        self.logger.debug('Write session cache into disk: %s', session_file)
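The newer cache code stores Session objects (with .session, .schedule_id, .sys and to_json()) whose class is defined elsewhere. A minimal sketch consistent with the call sites above and below; the unpacking of the (sid, schedule_id) pair for AIS is an assumption inferred from Session((result['session'], result['id']), sys='ais'):

class Session:
    """Sketch of the assumed Session record. For sys='ais' the first
    argument may be a (session_id, schedule_id) pair; for sys='cgi'
    it is the session id alone."""

    def __init__(self, session, sys='cgi', schedule_id=None):
        if sys == 'ais' and isinstance(session, (tuple, list)):
            session, schedule_id = session
        self.session = session
        self.schedule_id = schedule_id
        self.sys = sys

    def to_json(self):
        # shape mirrors what __init__ accepts via Session(**dct)
        return {'session': self.session,
                'schedule_id': self.schedule_id,
                'sys': self.sys}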
def set_session_pool_size(self, visa_type, place, size, ais=False):
    session_list = g.value("session", {})
    if visa_type not in session_list:
        session_list[visa_type] = {}
    if place not in session_list[visa_type]:
        session_list[visa_type][place] = []
    cnt = len(session_list[visa_type][place])
    if cnt < size:
        for _ in range(size - cnt):
            rand_str = "".join([chr(np.random.randint(26) + ord('a'))
                                for _ in range(15)])
            if ais:
                # AIS sessions are stored as [session_id, schedule_id] pairs
                session_list[visa_type][place].append(
                    ["placeholder_" + rand_str, "114514"])
            else:
                session_list[visa_type][place].append("placeholder_" + rand_str)
    elif cnt > size:
        session_list[visa_type][place] = session_list[visa_type][place][:size]
def add_session():
    while True:
        visa_type, place, replace = replace_items.get()
        ais = "-" in place
        # check if replaced
        if replace:
            session_list = g.value("session", {})
            if visa_type not in session_list:
                session_list[visa_type] = {}
            if place not in session_list[visa_type]:
                session_list[visa_type][place] = []
            if ais and replace not in [x[0] for x in
                                       session_list[visa_type][place]]:
                continue
            if not ais and replace not in session_list[visa_type][place]:
                continue
            logger.info("Update session " + replace)
        try:
            if ais:
                endpoint = g.value("crawler_node", "") + \
                    "/ais/register/?code=%s&email=%s&pswd=%s" % (
                        place,
                        g.value("ais_email_" + visa_type, None),
                        g.value("ais_pswd_" + visa_type, None))
            else:
                endpoint = g.value("crawler_node", "") + \
                    "/register/?type=%s&place=%s" % (visa_type, place)
            r = requests.get(endpoint, timeout=40,
                             proxies=g.value("proxies", None))
            result = r.json()
            if ais:
                schedule_id = result["id"]
                date = 1 if len(result["msg"]) > 0 else None
                sid = result["session"]
            else:
                date = tuple(map(int, result["msg"].split("-")))
                sid = result["session"]
            if not date:
                continue
            try:
                session_list = g.value("session", {})
                if visa_type not in session_list:
                    session_list[visa_type] = {}
                if place not in session_list[visa_type]:
                    session_list[visa_type][place] = []
                if replace:
                    if ais:
                        idx = [x[0] for x in
                               session_list[visa_type][place]].index(replace)
                    else:
                        idx = session_list[visa_type][place].index(replace)
                    session_list[visa_type][place][idx] = (
                        [sid, schedule_id] if ais else sid)
                else:
                    session_list[visa_type][place].append(
                        [sid, schedule_id] if ais else sid)
                session_file = g.value("session_file", "session.json")
                with open(session_file, "w") as f:
                    f.write(json.dumps(session_list, ensure_ascii=False))
            except Exception:
                logger.error(traceback.format_exc())
        except Exception:
            logger.error(traceback.format_exc())
def visa_select(visa_type, place, sid):
    proxies = g.value("proxies", None)
    cookies = copy.deepcopy(g.COOKIES)
    cookies["sid"] = sid

    # select immigrant/nonimmigrant visa
    select_visa_type_uri = "https://cgifederal.secure.force.com/selectvisatype"
    r = requests.get(select_visa_type_uri, cookies=cookies, proxies=proxies)
    if r.status_code != 200:
        print('visa_select 1: bad status code', r.status_code)
        return None
    soup = bs(r.text, "html.parser")
    view_state = soup.find(id="com.salesforce.visualforce.ViewState").get("value")
    view_state_version = soup.find(
        id="com.salesforce.visualforce.ViewStateVersion").get("value")
    view_state_mac = soup.find(
        id="com.salesforce.visualforce.ViewStateMAC").get("value")
    view_state_csrf = soup.find(
        id="com.salesforce.visualforce.ViewStateCSRF").get("value")
    data = {
        "j_id0:SiteTemplate:theForm": "j_id0:SiteTemplate:theForm",
        "j_id0:SiteTemplate:theForm:ttip": "Nonimmigrant Visa",
        # "j_id0:SiteTemplate:theForm:j_id176": "继续",
        "j_id0:SiteTemplate:theForm:j_id176": "Continue",
        "com.salesforce.visualforce.ViewState": view_state,
        "com.salesforce.visualforce.ViewStateVersion": view_state_version,
        "com.salesforce.visualforce.ViewStateMAC": view_state_mac,
        "com.salesforce.visualforce.ViewStateCSRF": view_state_csrf
    }
    r = requests.post(select_visa_type_uri, data=data, cookies=cookies,
                      proxies=proxies)
    if r.status_code != 200:
        print('visa_select 2: bad status code', r.status_code)
        return None

    # select place
    # if place != "香港" and place != "台北":
    select_post_uri = "https://cgifederal.secure.force.com/selectpost"
    r = requests.get(select_post_uri, cookies=cookies, proxies=proxies)
    if r.status_code != 200:
        print('visa_select 3: bad status code', r.status_code)
        return None
    soup = bs(r.text, "html.parser")
    view_state = soup.find(id="com.salesforce.visualforce.ViewState").get("value")
    view_state_version = soup.find(
        id="com.salesforce.visualforce.ViewStateVersion").get("value")
    view_state_mac = soup.find(
        id="com.salesforce.visualforce.ViewStateMAC").get("value")
    view_state_csrf = soup.find(
        id="com.salesforce.visualforce.ViewStateCSRF").get("value")
    contact_id = soup.find(id="j_id0:SiteTemplate:j_id112:contactId").get("value")
    # NOTE: Place IDs are not unique; the first ID is simply the first one on the list
    place2id = {
        # "北京": "j_id0:SiteTemplate:j_id112:j_id165:0",
        # "成都": "j_id0:SiteTemplate:j_id112:j_id165:1",
        # "广州": "j_id0:SiteTemplate:j_id112:j_id165:2",
        # "上海": "j_id0:SiteTemplate:j_id112:j_id165:3",
        # "沈阳": "j_id0:SiteTemplate:j_id112:j_id165:4"
        "Melbourne": "j_id0:SiteTemplate:j_id112:j_id165:0",
        "Perth": "j_id0:SiteTemplate:j_id112:j_id165:1",
        "Sydney": "j_id0:SiteTemplate:j_id112:j_id165:2",
    }
    place_code = soup.find(id=place2id[place]).get("value")
    data = {
        "j_id0:SiteTemplate:j_id112": "j_id0:SiteTemplate:j_id112",
        "j_id0:SiteTemplate:j_id112:j_id165": place_code,
        # "j_id0:SiteTemplate:j_id112:j_id169": "继续",
        "j_id0:SiteTemplate:j_id112:j_id169": "Continue",
        "j_id0:SiteTemplate:j_id112:contactId": contact_id,
        "com.salesforce.visualforce.ViewState": view_state,
        "com.salesforce.visualforce.ViewStateVersion": view_state_version,
        "com.salesforce.visualforce.ViewStateMAC": view_state_mac,
        "com.salesforce.visualforce.ViewStateCSRF": view_state_csrf
    }
    r = requests.post(select_post_uri, data=data, cookies=cookies,
                      proxies=proxies)
    if r.status_code != 200:
        print('visa_select 4: bad status code', r.status_code)
        return None

    # select visa category
    select_visa_category_uri = \
        "https://cgifederal.secure.force.com/selectvisacategory"
    r = requests.get(select_visa_category_uri, cookies=cookies, proxies=proxies)
    if r.status_code != 200:
        print('visa_select 5: bad status code', r.status_code)
        return None
    soup = bs(r.text, "html.parser")
    view_state = soup.find(id="com.salesforce.visualforce.ViewState").get("value")
    view_state_version = soup.find(
        id="com.salesforce.visualforce.ViewStateVersion").get("value")
    view_state_mac = soup.find(
        id="com.salesforce.visualforce.ViewStateMAC").get("value")
    view_state_csrf = soup.find(
        id="com.salesforce.visualforce.ViewStateCSRF").get("value")
    contact_id = soup.find(id="j_id0:SiteTemplate:j_id109:contactId").get("value")
    prefix = "j_id0:SiteTemplate:j_id109:j_id162:"
    category2id = {
        # "B": {"北京": 0, "成都": 0, "广州": 0, "上海": 0, "沈阳": 0, "香港": 1, "台北": 1},
        # "F": {"北京": 1, "成都": 1, "广州": 1, "上海": 1, "沈阳": 1, "香港": 0, "台北": 0},
        # "O": {"北京": 4, "成都": 2, "广州": 3, "上海": 4, "沈阳": 2, "香港": 3, "台北": 3},
        # "H": {"北京": 2, "广州": 3, "上海": 2, "香港": 3, "台北": 3},
        # "L": {"北京": 3, "广州": 2, "上海": 3, "香港": 3, "台北": 3}
        "E": {"Melbourne": 5, "Perth": 0, "Sydney": 3},
        "F": {"Melbourne": 1, "Perth": 0, "Sydney": 0},
    }
    category_code = soup.find(
        id=prefix + str(category2id[visa_type][place])).get("value")
    data = {
        "j_id0:SiteTemplate:j_id109": "j_id0:SiteTemplate:j_id109",
        "j_id0:SiteTemplate:j_id109:j_id162": category_code,
        # "j_id0:SiteTemplate:j_id109:j_id166": "继续",
        "j_id0:SiteTemplate:j_id109:j_id166": "Continue",
        "j_id0:SiteTemplate:j_id109:contactId": contact_id,
        "com.salesforce.visualforce.ViewState": view_state,
        "com.salesforce.visualforce.ViewStateVersion": view_state_version,
        "com.salesforce.visualforce.ViewStateMAC": view_state_mac,
        "com.salesforce.visualforce.ViewStateCSRF": view_state_csrf
    }
    r = requests.post(select_visa_category_uri, data=data, cookies=cookies,
                      proxies=proxies)
    if r.status_code != 200:
        print('visa_select 6: bad status code', r.status_code)
        return None

    # select visa type
    select_visa_code_uri = "https://cgifederal.secure.force.com/selectvisacode"
    r = requests.get(select_visa_code_uri, cookies=cookies, proxies=proxies)
    if r.status_code != 200:
        print('visa_select 7: bad status code', r.status_code)
        return None
    soup = bs(r.text, "html.parser")
    view_state = soup.find(id="com.salesforce.visualforce.ViewState").get("value")
    view_state_version = soup.find(
        id="com.salesforce.visualforce.ViewStateVersion").get("value")
    view_state_mac = soup.find(
        id="com.salesforce.visualforce.ViewStateMAC").get("value")
    view_state_csrf = soup.find(
        id="com.salesforce.visualforce.ViewStateCSRF").get("value")
    # Indices of the list of 'selectedVisaClass' values
    if place == 'Sydney':
        F_typecode = 0
        E_typecode = -2
    elif place == 'Perth':
        F_typecode = 3
        E_typecode = -2
    elif place == 'Melbourne':
        E_typecode = 0
        F_typecode = 0
    else:
        print('visa_select 8: unsupported place', place)
        return None
    type2id = {"F": F_typecode, "E": E_typecode}
    inputs = soup.find_all("input")
    type_codes = [x.get("value") for x in inputs
                  if x.get("name") == "selectedVisaClass"]
    type_code = type_codes[type2id[visa_type]]
    data = {
        "j_id0:SiteTemplate:theForm": "j_id0:SiteTemplate:theForm",
        # "j_id0:SiteTemplate:theForm:j_id178": "继续",
        "j_id0:SiteTemplate:theForm:j_id178": "Continue",
        "selectedVisaClass": type_code,
        "com.salesforce.visualforce.ViewState": view_state,
        "com.salesforce.visualforce.ViewStateVersion": view_state_version,
        "com.salesforce.visualforce.ViewStateMAC": view_state_mac,
        "com.salesforce.visualforce.ViewStateCSRF": view_state_csrf
    }
    r = requests.post(select_visa_code_uri, data=data, cookies=cookies,
                      proxies=proxies)
    if r.status_code != 200:
        print('visa_select 9: bad status code', r.status_code)
        return None

    # update data
    update_data_uri = "https://cgifederal.secure.force.com/updatedata"
    r = requests.get(update_data_uri, cookies=cookies, proxies=proxies)
    if r.status_code != 200:
        print('visa_select 10: bad status code', r.status_code)
        return None
    date = get_date(r.text)
    logger.info("%s, %s, SUCCESS_N, %s" % (visa_type, place, date))
    if date:
        g.assign("status_%s_%s" % (visa_type, place), date)
    return date
def visa_select(visa_type, place, sid):
    proxies = g.value("proxies", None)
    cookies = copy.deepcopy(g.COOKIES)
    cookies["sid"] = sid

    # select immigrant/nonimmigrant visa
    select_visa_type_uri = "https://cgifederal.secure.force.com/selectvisatype"
    r = requests.get(select_visa_type_uri, cookies=cookies, proxies=proxies)
    if r.status_code != 200:
        return None
    soup = bs(r.text, "html.parser")
    view_state = soup.find(id="com.salesforce.visualforce.ViewState").get("value")
    view_state_version = soup.find(
        id="com.salesforce.visualforce.ViewStateVersion").get("value")
    view_state_mac = soup.find(
        id="com.salesforce.visualforce.ViewStateMAC").get("value")
    view_state_csrf = soup.find(
        id="com.salesforce.visualforce.ViewStateCSRF").get("value")
    data = {
        "j_id0:SiteTemplate:theForm": "j_id0:SiteTemplate:theForm",
        "j_id0:SiteTemplate:theForm:ttip": "Nonimmigrant Visa",
        "j_id0:SiteTemplate:theForm:j_id176": "继续",  # the "Continue" button
        "com.salesforce.visualforce.ViewState": view_state,
        "com.salesforce.visualforce.ViewStateVersion": view_state_version,
        "com.salesforce.visualforce.ViewStateMAC": view_state_mac,
        "com.salesforce.visualforce.ViewStateCSRF": view_state_csrf
    }
    r = requests.post(select_visa_type_uri, data=data, cookies=cookies,
                      proxies=proxies)
    if r.status_code != 200:
        return None

    # select place (skipped for 香港/Hong Kong, which has no post selection)
    if place != "香港":
        select_post_uri = "https://cgifederal.secure.force.com/selectpost"
        r = requests.get(select_post_uri, cookies=cookies, proxies=proxies)
        if r.status_code != 200:
            return None
        soup = bs(r.text, "html.parser")
        view_state = soup.find(
            id="com.salesforce.visualforce.ViewState").get("value")
        view_state_version = soup.find(
            id="com.salesforce.visualforce.ViewStateVersion").get("value")
        view_state_mac = soup.find(
            id="com.salesforce.visualforce.ViewStateMAC").get("value")
        view_state_csrf = soup.find(
            id="com.salesforce.visualforce.ViewStateCSRF").get("value")
        contact_id = soup.find(
            id="j_id0:SiteTemplate:j_id112:contactId").get("value")
        place2id = {
            "北京": "j_id0:SiteTemplate:j_id112:j_id165:0",   # Beijing
            "成都": "j_id0:SiteTemplate:j_id112:j_id165:1",   # Chengdu
            "广州": "j_id0:SiteTemplate:j_id112:j_id165:2",   # Guangzhou
            "上海": "j_id0:SiteTemplate:j_id112:j_id165:3",   # Shanghai
            "沈阳": "j_id0:SiteTemplate:j_id112:j_id165:4"    # Shenyang
        }
        place_code = soup.find(id=place2id[place]).get("value")
        data = {
            "j_id0:SiteTemplate:j_id112": "j_id0:SiteTemplate:j_id112",
            "j_id0:SiteTemplate:j_id112:j_id165": place_code,
            "j_id0:SiteTemplate:j_id112:j_id169": "继续",  # "Continue"
            "j_id0:SiteTemplate:j_id112:contactId": contact_id,
            "com.salesforce.visualforce.ViewState": view_state,
            "com.salesforce.visualforce.ViewStateVersion": view_state_version,
            "com.salesforce.visualforce.ViewStateMAC": view_state_mac,
            "com.salesforce.visualforce.ViewStateCSRF": view_state_csrf
        }
        r = requests.post(select_post_uri, data=data, cookies=cookies,
                          proxies=proxies)
        if r.status_code != 200:
            return None

    # select visa category
    select_visa_category_uri = \
        "https://cgifederal.secure.force.com/selectvisacategory"
    r = requests.get(select_visa_category_uri, cookies=cookies, proxies=proxies)
    if r.status_code != 200:
        return None
    soup = bs(r.text, "html.parser")
    view_state = soup.find(id="com.salesforce.visualforce.ViewState").get("value")
    view_state_version = soup.find(
        id="com.salesforce.visualforce.ViewStateVersion").get("value")
    view_state_mac = soup.find(
        id="com.salesforce.visualforce.ViewStateMAC").get("value")
    view_state_csrf = soup.find(
        id="com.salesforce.visualforce.ViewStateCSRF").get("value")
    contact_id = soup.find(id="j_id0:SiteTemplate:j_id109:contactId").get("value")
    prefix = "j_id0:SiteTemplate:j_id109:j_id162:"
    category2id = {
        "B": {"北京": 0, "成都": 0, "广州": 0, "上海": 0, "沈阳": 0, "香港": 0},
        "F": {"北京": 1, "成都": 1, "广州": 1, "上海": 1, "沈阳": 1, "香港": 1},
        "O": {"北京": 4, "成都": 2, "广州": 3, "上海": 4, "沈阳": 2, "香港": 3},
        "H": {"北京": 2, "广州": 3, "上海": 2, "香港": 3},
        "L": {"北京": 3, "广州": 2, "上海": 3, "香港": 3}
    }
    category_code = soup.find(
        id=prefix + str(category2id[visa_type][place])).get("value")
    data = {
        "j_id0:SiteTemplate:j_id109": "j_id0:SiteTemplate:j_id109",
        "j_id0:SiteTemplate:j_id109:j_id162": category_code,
        "j_id0:SiteTemplate:j_id109:j_id166": "继续",  # "Continue"
        "j_id0:SiteTemplate:j_id109:contactId": contact_id,
        "com.salesforce.visualforce.ViewState": view_state,
        "com.salesforce.visualforce.ViewStateVersion": view_state_version,
        "com.salesforce.visualforce.ViewStateMAC": view_state_mac,
        "com.salesforce.visualforce.ViewStateCSRF": view_state_csrf
    }
    r = requests.post(select_visa_category_uri, data=data, cookies=cookies,
                      proxies=proxies)
    if r.status_code != 200:
        return None

    # select visa type
    select_visa_code_uri = "https://cgifederal.secure.force.com/selectvisacode"
    r = requests.get(select_visa_code_uri, cookies=cookies, proxies=proxies)
    if r.status_code != 200:
        return None
    soup = bs(r.text, "html.parser")
    view_state = soup.find(id="com.salesforce.visualforce.ViewState").get("value")
    view_state_version = soup.find(
        id="com.salesforce.visualforce.ViewStateVersion").get("value")
    view_state_mac = soup.find(
        id="com.salesforce.visualforce.ViewStateMAC").get("value")
    view_state_csrf = soup.find(
        id="com.salesforce.visualforce.ViewStateCSRF").get("value")
    type2id = {
        "F": 0,
        "B": 2,
        "H": 0,
        "O": 11 if place == "香港" else (7 if place == "广州" else 0),
        "L": 8 if place == "香港" else 2
    }
    inputs = soup.find_all("input")
    type_codes = [x.get("value") for x in inputs
                  if x.get("name") == "selectedVisaClass"]
    type_code = type_codes[type2id[visa_type]]
    data = {
        "j_id0:SiteTemplate:theForm": "j_id0:SiteTemplate:theForm",
        "j_id0:SiteTemplate:theForm:j_id178": "继续",  # "Continue"
        "selectedVisaClass": type_code,
        "com.salesforce.visualforce.ViewState": view_state,
        "com.salesforce.visualforce.ViewStateVersion": view_state_version,
        "com.salesforce.visualforce.ViewStateMAC": view_state_mac,
        "com.salesforce.visualforce.ViewStateCSRF": view_state_csrf
    }
    r = requests.post(select_visa_code_uri, data=data, cookies=cookies,
                      proxies=proxies)
    if r.status_code != 200:
        return None

    # update data
    update_data_uri = "https://cgifederal.secure.force.com/updatedata"
    r = requests.get(update_data_uri, cookies=cookies, proxies=proxies)
    if r.status_code != 200:
        return None
    date = get_date(r.text)
    logger.info("%s, %s, SUCCESS_N, %s" % (visa_type, place, date))
    if date:
        g.assign("status_%s_%s" % (visa_type, place), date)
    return date
def login(cracker, place):
    proxies = g.value("proxies", None)

    # get register page
    REG_URI = ("https://cgifederal.secure.force.com/SiteRegister"
               "?country=China&language=zh_CN")
    REG_HK_URI = ("https://cgifederal.secure.force.com/SiteRegister"
                  "?country=Hong%20Kong&language=zh_CN")
    r = requests.get(REG_HK_URI if place == "香港" else REG_URI, proxies=proxies)
    if r.status_code != 200:
        return None
    # retry until the captcha is accepted
    while True:
        soup = bs(r.text, "html.parser")
        view_state = soup.find(
            id="com.salesforce.visualforce.ViewState").get("value")
        view_state_version = soup.find(
            id="com.salesforce.visualforce.ViewStateVersion").get("value")
        view_state_mac = soup.find(
            id="com.salesforce.visualforce.ViewStateMAC").get("value")
        cookies = r.cookies

        # get recaptcha
        REG_CAPTCHA_URI = (
            "https://cgifederal.secure.force.com/SiteRegister?refURL="
            "https%3A%2F%2Fcgifederal.secure.force.com%2F%3Flanguage%3D"
            "Chinese%2520%28Simplified%29%26country%3DChina")
        REG_CAPTCHA_HK_URI = (
            "https://cgifederal.secure.force.com/SiteRegister?refURL="
            "https%3A%2F%2Fcgifederal.secure.force.com%2F%3Flanguage%3D"
            "Chinese%2520%28Simplified%29%26country%3DHong%20Kong")
        data = {
            "AJAXREQUEST": "_viewRoot",
            "Registration:SiteTemplate:theForm": "Registration:SiteTemplate:theForm",
            "Registration:SiteTemplate:theForm:username": "",
            "Registration:SiteTemplate:theForm:firstname": "",
            "Registration:SiteTemplate:theForm:lastname": "",
            "Registration:SiteTemplate:theForm:password": "",
            "Registration:SiteTemplate:theForm:confirmPassword": "",
            "Registration:SiteTemplate:theForm:response": "",
            "Registration:SiteTemplate:theForm:recaptcha_response_field": "",
            "com.salesforce.visualforce.ViewState": view_state,
            "com.salesforce.visualforce.ViewStateVersion": view_state_version,
            "com.salesforce.visualforce.ViewStateMAC": view_state_mac,
            "Registration:SiteTemplate:theForm:j_id177":
                "Registration:SiteTemplate:theForm:j_id177"
        }
        r = requests.post(REG_CAPTCHA_HK_URI if place == "香港" else REG_CAPTCHA_URI,
                          data=data, cookies=cookies, proxies=proxies)
        if r.status_code != 200:
            return None
        soup = bs(r.text, "html.parser")
        view_state = soup.find(
            id="com.salesforce.visualforce.ViewState").get("value")
        view_state_version = soup.find(
            id="com.salesforce.visualforce.ViewStateVersion").get("value")
        view_state_mac = soup.find(
            id="com.salesforce.visualforce.ViewStateMAC").get("value")
        cookies = r.cookies
        raw = soup.find_all(id='Registration:SiteTemplate:theForm:theId')
        raw = raw[0].attrs['src'].replace('data:image;base64,', '')
        img = base64.b64decode(raw)
        gifname = 'try.gif'
        open(gifname, 'wb').write(img)
        open('gifname', 'w').write(gifname)
        captcha = cracker.solve(img).replace('1', 'l').lower()
        if len(captcha) == 0:
            # message: "The captcha cracking service is down, please open
            # an issue on GitHub"
            open('state', 'w').write(
                '自动识别服务挂掉了,请到<a href="https://github.com/Trinkle23897/'
                'us-visa">GitHub</a>上提issue')
            return None

        # click and register
        username = ''.join([chr(np.random.randint(26) + ord('a'))
                            for _ in range(15)]) + "@gmail.com"
        passwd = ''.join(np.random.permutation(' '.join('12345qwert').split()))
        data = {
            "Registration:SiteTemplate:theForm": "Registration:SiteTemplate:theForm",
            "Registration:SiteTemplate:theForm:username": username,
            "Registration:SiteTemplate:theForm:firstname": "Langpu",
            "Registration:SiteTemplate:theForm:lastname": "Te",
            "Registration:SiteTemplate:theForm:password": passwd,
            "Registration:SiteTemplate:theForm:confirmPassword": passwd,
            "Registration:SiteTemplate:theForm:j_id169": "on",
            "Registration:SiteTemplate:theForm:response": captcha,
            "Registration:SiteTemplate:theForm:recaptcha_response_field": "",
            "Registration:SiteTemplate:theForm:submit": "提交",  # "Submit"
            "com.salesforce.visualforce.ViewState": view_state,
            "com.salesforce.visualforce.ViewStateVersion": view_state_version,
            "com.salesforce.visualforce.ViewStateMAC": view_state_mac
        }
        r = requests.post(REG_CAPTCHA_HK_URI if place == "香港" else REG_CAPTCHA_URI,
                          data=data, cookies=cookies, proxies=proxies)
        if r.status_code != 200:
            return None
        front_door_uri = r.text.split("'")[-2]
        if front_door_uri.startswith("https"):
            break
        else:
            # '无法核实验证码' means "the captcha could not be verified"
            if '无法核实验证码' not in r.text:
                os.system('mv %s log/%s.gif' % (gifname, captcha))
            else:
                if not os.path.exists('fail'):
                    os.makedirs('fail')
                os.system('mv %s fail/%s.gif' % (gifname, captcha))
                if hasattr(cracker, 'wrong'):
                    cracker.wrong()

    # open front door
    r = requests.get(front_door_uri, cookies=cookies, proxies=proxies)
    cookies = r.cookies
    return username, passwd, cookies["sid"]
def crawler(visa_type, places):
    open(visa_type + '_state', 'w').write('1')
    localtime = time.localtime()
    s = {'time': time.strftime('%Y/%m/%d %H:%M', localtime)}
    cur = time.strftime('%Y/%m/%d', time.localtime())
    for place in places:
        try:
            # prepare session
            sess = session_op.get_session(visa_type, place)
            if not sess:
                logger.warning("%s, %s, FAILED, %s" %
                               (visa_type, place, "No Session"))
                continue
            cookies = copy.deepcopy(g.COOKIES)
            cookies["sid"] = sess
            # send request
            r = requests.get(g.HOME_URI, headers=g.HEADERS, cookies=cookies,
                             proxies=g.value("proxies", None))
            if r.status_code != 200:
                logger.warning("%s, %s, FAILED, %s" %
                               (visa_type, place, "Session Expired"))
                session_op.replace_session(visa_type, place, sess)
                continue
            # parse HTML
            page = r.text
            date = get_date(page)
            if not date:
                logger.warning("%s, %s, FAILED, %s" %
                               (visa_type, place, "Session Expired"))
                session_op.replace_session(visa_type, place, sess)
                continue
            elif date == (0, 0, 0):
                logger.warning("%s, %s, FAILED, %s" %
                               (visa_type, place, "Date Not Found"))
                last_status = g.value("status_%s_%s" % (visa_type, place),
                                      (0, 0, 0))
                if last_status != (0, 0, 0):
                    session_op.replace_session(visa_type, place, sess)
                elif random.random() < 0.05:
                    session_op.replace_session(visa_type, place, sess)
                continue
            logger.info("%s, %s, SUCCESS, %s" % (visa_type, place, date))
            g.assign("status_%s_%s" % (visa_type, place), date)
        except Exception:
            logger.error(traceback.format_exc())
    # write to file
    for place in places:
        n = place + '-' + cur
        n2 = place + '2-' + cur
        y, m, d = g.value("status_%s_%s" % (visa_type, place), (0, 0, 0))
        s[n] = s[n2] = '{}/{}/{}'.format(y, m, d) if y > 0 else "/"
        if s[n] != '/':
            path = visa_type + '/' + n.replace('-', '/')
            os.makedirs('/'.join(path.split('/')[:-1]), exist_ok=True)
            open(path, 'a+').write(s['time'].split(' ')[-1] + ' ' + s[n] + '\n')
    merge('../visa/visa.json' if visa_type == "F"
          else '../visa/visa-%s.json' % visa_type.lower(), s, cur)
    open(visa_type + '_state', 'w').write('0')
    os.system('python3 notify.py --type ' + visa_type + ' &')
def fetch_visa_status(cls, visa_type: str, location: str, req: requests.Session):
    """ Fetch the latest visa status available from the crawler server."""
    now = datetime.now().strftime('%H:%M:%S')
    try:
        session = SESSION_CACHE.get_session(visa_type, location)
        if session is None:
            LOGGER.warning('%s, %s, %s, FAILED - No Session', now, visa_type,
                           location)
            return
        if session.sys == 'ais':
            endpoint = G.CRAWLER_API['refresh']['ais'].format(
                location, session.schedule_id, session.session)
        elif session.sys == 'cgi':
            endpoint = G.CRAWLER_API['refresh']['cgi'].format(session.session)
        url = '{}{}'.format(G.value('current_crawler_node', ''), endpoint)

        try:
            res = req.get(url, timeout=G.WAIT_TIME['refresh'],
                          proxies=G.value('proxies', None))
        except requests.exceptions.Timeout:
            LOGGER.warning('%s, %s, %s, FAILED - Endpoint Timeout.', now,
                           visa_type, location)
            cls.save_placeholder_at_exception(visa_type, location)
            cls.check_crawler_server_connection()
            return
        except requests.exceptions.ConnectionError:
            LOGGER.warning('%s, %s, %s, FAILED - Endpoint Connection Aborted.',
                           now, visa_type, location)
            cls.check_crawler_server_connection()
            return
        else:
            if res.status_code != 200:
                LOGGER.warning('%s, %s, %s, FAILED - %d', now, visa_type,
                               location, res.status_code)
                cls.check_crawler_server_connection()
                return

        result = res.json()
        LOGGER.debug('fetch_visa_status - Endpoint: %s | Response json: %s',
                     endpoint, json.dumps(result))
        if result['code'] != 0:  # code == 0 stands for success in crawler api
            LOGGER.warning('%s, %s, %s, FAILED - Session Expired', now,
                           visa_type, location)
            # session expired will trigger a database update using the last
            # successful fetch result
            cls.save_placeholder_at_exception(visa_type, location)
            SESSION_CACHE.produce_new_session_request(visa_type, location,
                                                      session)
            return

        if session.sys == 'cgi':
            dt_segments = [int(dt_seg) for dt_seg in result['msg'].split('-')]
            cls.save_fetched_data(visa_type, location, dt_segments)
            LOGGER.info('%s, %s, %s, SUCCESS - %d/%d/%d', now, visa_type,
                        location, *dt_segments)
        elif session.sys == 'ais':
            date_lst = result['msg']
            for city, dt_segments in date_lst:
                if city in G.AIS_MONITORING_CITY:
                    cls.save_fetched_data(visa_type, city, dt_segments)
                    LOGGER.info('%s, %s, %s, %s, SUCCESS - %d/%d/%d', now,
                                visa_type, location, city, *dt_segments)
            new_session = Session(
                session=(result['session'], session.schedule_id),
                sys=session.sys)
            SESSION_CACHE.replace_session(visa_type, location, session,
                                          new_session)
    except Exception:
        LOGGER.error(traceback.format_exc())
def consume_new_session_request(cls, task_queue: Queue = G.SESSION_UPDATE_QUEUE):
    """ Consume the session update events in the task queue to request new
        sessions from the crawler server.
    """
    LOGGER.info('Listening to session update request task queue...')
    while True:
        visa_type, location, session = task_queue.get()
        LOGGER.debug(
            'Receive new session update request: %s-%s | Current queue size: %d',
            visa_type, location, task_queue.qsize())
        if session is None:  # just in case
            LOGGER.error('A session object from %s-%s is NoneType', visa_type,
                         location)
        if not SESSION_CACHE.contain_session(visa_type, location, session):
            LOGGER.debug('Session %s is no longer in the %s-%s session list.',
                         session, visa_type, location)
            continue
        try:
            if session.sys == 'ais':
                email = G.value(f'ais_email_{visa_type}', None)
                password = G.value(f'ais_pswd_{visa_type}', None)
                LOGGER.debug('Fetching new session for AIS: %s, %s, %s',
                             location, email, password)
                endpoint = G.CRAWLER_API['register']['ais'].format(
                    location, email, password)
                if email is None or password is None:
                    continue
            elif session.sys == 'cgi':
                endpoint = G.CRAWLER_API['register']['cgi'].format(
                    visa_type, location)
            url = '{}{}'.format(G.value('current_crawler_node', ''), endpoint)
            res = requests.get(url, timeout=G.WAIT_TIME['register'],
                               proxies=G.value('proxies', None))
            try:
                result = res.json()
            except ValueError:
                content = res.content.decode()
                if 'Server Error (500)' in content:
                    SESSION_CACHE.mark_unavailable(visa_type, location)
                else:
                    print(time.asctime(), visa_type, location, content)
                continue
            LOGGER.debug(
                'consume_new_session_request - Endpoint: %s | Response json: %s',
                endpoint, json.dumps(result))
            if result['code'] != 0:
                LOGGER.warning('%s, %s, %s, FAILED - %s',
                               datetime.now().strftime('%H:%M:%S'), visa_type,
                               location, result['msg'])
                if result['msg'] == "Network Error":
                    SESSION_CACHE.mark_unavailable(visa_type, location)
                else:
                    cls.check_crawler_server_connection()
                continue

            # generate new session object and update cache
            if session.sys == 'ais':
                new_session = Session((result['session'], result['id']),
                                      sys='ais')
                date_available = bool(len(result['msg']))
            elif session.sys == 'cgi':
                new_session = Session(result['session'], sys='cgi')
                date_available = bool(
                    tuple([dt_seg for dt_seg in
                           result['msg'].split('-')]))  # always True
            if date_available:  # why is this flag needed?
                LOGGER.info(
                    'consume_new_session_request - %s, %s, %s, SUCCESS - %s',
                    datetime.now().strftime('%H:%M:%S'), visa_type, location,
                    result['msg'])
                SESSION_CACHE.replace_session(visa_type, location, session,
                                              new_session)
        except requests.exceptions.ReadTimeout:
            LOGGER.debug(
                'consume_new_session_request - request time out for endpoint:'
                ' %s | %s-%s', endpoint, visa_type, location)
            cls.check_crawler_server_connection()
        except Exception:
            LOGGER.error('An unexpected error occurred: %s',
                         traceback.format_exc())
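fetch_visa_status and consume_new_session_request build their URLs from G.CRAWLER_API templates, which are defined elsewhere. Matching the literal endpoints used by the older crawler_req/crawler_req_ais/add_session versions above, the table presumably looks roughly like this (a sketch inferred from those URLs, not the project's actual constant):

CRAWLER_API = {
    'register': {
        'cgi': '/register/?type={}&place={}',              # visa_type, location
        'ais': '/ais/register/?code={}&email={}&pswd={}',  # location, email, password
    },
    'refresh': {
        'cgi': '/refresh/?session={}',                     # session id
        'ais': '/ais/refresh/?code={}&id={}&session={}',   # location, schedule_id, session id
    },
}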