import logging
import re
import threading
import time
from datetime import datetime, timedelta

# CYBERSYNDROME, SESSIONURL, TWSEREALTIMEURL, MongodbAPI and HtmlRequests are
# project-local constants/helpers assumed to be imported elsewhere in this
# package.


class Crawl_Proxy(object):
    def __init__(self):
        self.source_url = CYBERSYNDROME
        self.mongo = MongodbAPI()

    def start(self):
        # Reuse the cached proxy table when it is less than three hours old
        # and still holds more than 250 entries.
        data = self.mongo.Get_Data_From("proxy", {'_id': 0})
        if (data is not None
                and datetime.now() - timedelta(hours=3) < data["update_date"]
                and len(data["iptable"]) > 250):
            logging.info("Use old proxies")
            return
        logging.info("start crawl proxy")
        self.mongo.DropAll("proxy")
        proxy_ip = self.parse_html()
        if proxy_ip is None:
            logging.error("Fail to crawl proxies")
            return
        self.mongo.Insert_Data_To("proxy", {
            "_id": 0,
            "iptable": proxy_ip,
            "update_date": datetime.now()
        })
        logging.info("add %04d ip" % len(proxy_ip))

    def parse_html(self):
        p = HtmlRequests()
        tree = p.get_html_noproxy(self.source_url)
        _as = []
        _ps_list = []
        if tree is None:
            return None
        # The page hides the IP table in an inline script: two integer
        # arrays plus an arithmetic expression whose value is the rotation
        # offset applied to the first array.
        for i in tree.xpath('//div[@id="content"]/script/text()'):
            result = re.findall(r'\[[0-9 ,]*\]', i)
            _as = result[0].replace("[", '').replace("]", '')
            _ps = result[1].replace("[", '').replace("]", '')
            _as_list = _as.split(',')
            _ps_list = _ps.split(',')
            arithmetic = re.findall(r'\(.*?\)%\d*', i)
            n = self.decode(_ps_list, arithmetic[0])
            _as = _as_list[n:] + _as_list[:n]
            break
        headerlist = []
        for i in tree.xpath('//tr'):
            headers = {}
            for j in i.xpath('td[6]/text()'):
                tmp = j.split(":")
                headers[tmp[0]] = tmp[1]
            headerlist.append(headers)
        return self.getproxy(_as, _ps_list, headerlist)

    def decode(self, ps, string):
        # Evaluate a sum-of-products expression such as "(2*ps3+ps7+5)%256"
        # without eval(): split on '+', expand '*' terms, and substitute any
        # "psN" reference with the N-th entry of the ps array.
        divisor = string.split(')')[1].replace('%', '')
        dividend = string.split(')')[0].replace('(', '')
        num = 0
        for i in dividend.split('+'):
            if "*" in i:
                mult = 1
                for k in i.split('*'):
                    if "ps" in k:
                        count = int(re.search(r'\d+', k).group(0))
                        mult *= int(ps[count])
                    else:
                        mult *= int(k)
                num += mult
            else:
                if "ps" in i:
                    count = int(re.search(r'\d+', i).group(0))
                    num += int(ps[count])
                else:
                    num += int(i)
        return num % int(divisor)

    def getproxy(self, _as, _ps, headerlist):
        # Every four consecutive items of _as form one IPv4 address;
        # _ps[j] is the matching port and headerlist[j] the extra headers.
        proxy_ip = []
        j = 0
        ip = ""
        for i in range(len(_as)):
            if i % 4 == 3:
                ip += _as[i]
                proxy_ip.append({
                    'ip': {'http': ip + ':' + _ps[j]},
                    'headers': headerlist[j]
                })
                j += 1
                ip = ""
                continue
            ip += _as[i] + '.'
        return proxy_ip
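
# Minimal usage sketch (illustrative, not part of the original module). It
# assumes a reachable MongoDB behind MongodbAPI and a valid CYBERSYNDROME
# proxy-list URL.
def _demo_crawl_proxy():
    logging.basicConfig(level=logging.INFO)
    crawler = Crawl_Proxy()
    # Crawls the page and stores one document of the form
    # {'_id': 0, 'iptable': [...], 'update_date': datetime}.
    crawler.start()
    # A second call within three hours finds a fresh table with more than
    # 250 entries, logs "Use old proxies", and returns without re-crawling.
    crawler.start()
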
class TWSE_realtime():
    def __init__(self, stock_num):
        self.mongo = MongodbAPI()
        self.stock_num = stock_num
        self.htmlreq = HtmlRequests()
        self.req = self.htmlreq.get_session(SESSIONURL)
        now = datetime.now()
        # Poll until shortly after the 13:30 market close.
        self.stop_date = datetime(now.year, now.month, now.day, 13, 30, 10)

    def start(self):
        self.crawl()

    def crawl(self):
        now = datetime.now()
        if now < self.stop_date:
            # Re-arm a 5-second timer so crawl() keeps firing until close.
            threading.Timer(5.0, self.crawl).start()
        now_time = int(time.time()) * 1000
        source_url = TWSEREALTIMEURL.format(
            stock_num=self.stock_num, time=now_time)
        json_data = self.htmlreq.get_json(self.req, source_url)
        data = self.parser(json_data)
        if data is None:
            return
        exists = self.mongo.CheckExists('Realtime_data', data.get('_id', None))
        if not exists:
            for _ in range(5):  # retry the insert up to five times
                err = self.mongo.Insert_Data_To("Realtime_data", data)
                if err:
                    logging.info("Insert realtime data to mongo, id:%s" %
                                 data.get("_id"))
                    return
                logging.error("Fail to insert realtime data to mongo, id:%s" %
                              data.get("_id"))

    def parser(self, j: dict):
        # Process the best (first) quote record in the response.
        if len(j['msgArray']) == 0:
            return None
        data = j['msgArray'][0]

        def _split_best(d):
            # Best-five bid/ask fields are underscore-joined strings.
            if d:
                return d.strip('_').split('_')
            return d

        # Use time_str, not `time`, so the `time` module stays available
        # for time.mktime() below.
        time_str = datetime.fromtimestamp(
            int(data['tlong']) / 1000).strftime('%Y-%m-%d %H:%M:%S')
        date = datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S')
        return {
            "_id": str(self.stock_num) + "@" + time_str,
            "code": self.stock_num,
            'ts': int(time.mktime(date.timetuple())),
            "time": date,
            "latest_trade_price": float(data.get('z', None)),
            "trade_volume": float(data.get('tv', None)),
            "accumulate_trade_volume": float(data.get('v', None)),
            "best_bid_price": [float(x) for x in _split_best(data.get('b', None))],
            "best_bid_volume": [float(x) for x in _split_best(data.get('g', None))],
            "best_ask_price": [float(x) for x in _split_best(data.get('a', None))],
            "best_ask_volume": [float(x) for x in _split_best(data.get('f', None))],
            "open": float(data.get('o', None)),
            "high": float(data.get('h', None)),
            "low": float(data.get('l', None))
        }
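
# Minimal usage sketch (illustrative; "2330" is an arbitrary example code
# whose exact format depends on how TWSEREALTIMEURL expects {stock_num},
# and SESSIONURL / TWSEREALTIMEURL must be supplied by the project).
# start() returns immediately: crawl() re-arms a threading.Timer every five
# seconds, so polling continues on timer threads until 13:30:10.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    TWSE_realtime("2330").start()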