def _fetch(self):
    """Fetch the tracking page, parse item rows into TrackData records, and
    hand the list to self._chk_new_items().

    An item whose description contains 'delivered' is flagged as ended.
    If no rows match, checks whether the site served a captcha page and
    logs an error.
    """
    result_list = []
    html = self.http.Get(self.fetch_url).decode(self.charset)
    item_infos = util.reg_helper(html, self.reg_str)
    if not item_infos:
        # No rows matched: the site may be asking for a security code.
        results = util.reg_helper(html, r'captcha\.html\?ts=(?P<random_id>[^"]*)')
        if results:
            self.log.error("[%s] need security code", self.name)

    def _clean(text):
        # BUGFIX: the original chained .replace('\n\r\t','') which only
        # removes the literal three-char sequence, so a lone '\n' was never
        # stripped; remove each control character individually.
        return text.replace('\n', '').replace('\r', '').replace('\t', '').strip()

    for item_info in item_infos:
        item = track_db.TrackData.GetInstance([
            -1,
            self.track_type,
            self.item_id,
            datetime.strptime(item_info[0], "%d/%m/%y %H:%M"),
            _clean(item_info[1]),
            _clean(item_info[2]),
            -1,
            None,
        ])
        if 'delivered' in item.description.lower():
            item.is_ended = 1
        result_list.append(item)
    self._chk_new_items(result_list)
def _fetch(self):
    """Fetch a day-grouped tracking page, parse each day's entries into
    TrackData records, and hand the list to self._chk_new_items().

    An item whose description contains 'delivered' is flagged as ended.
    """
    result_list = []
    html = self.http.Get(self.fetch_url).decode('utf-8')

    def _clean(text):
        # BUGFIX: the original chained .replace('\n\r\t','') which only
        # removes the literal three-char sequence, so a lone '\n' was never
        # stripped; remove each control character individually.
        return text.replace('\n', '').replace('\r', '').replace('\t', '').strip()

    for day in util.reg_helper(html, self.reg_str_1):
        date_str = util.reg_helper(day[0], self.reg_str_date)[0]
        # e.g. 'Friday, November 09, 2012 ' -- note the trailing space in
        # the format string; it is part of the scraped text.
        date = datetime.strptime(date_str, "%A, %B %d, %Y ")
        for info in util.reg_helper(day[0], self.reg_str_tbodys):
            item_info_list = util.reg_helper(info, self.reg_str__info)
            # NOTE(review): "%M:%S" reads the time field as minute:second;
            # if the site actually shows hour:minute this should be "%H:%M"
            # with hours/minutes in the timedelta below -- confirm against
            # a live page before changing.
            ti = datetime.strptime(item_info_list[3], "%M:%S")
            item = track_db.TrackData.GetInstance([
                -1,
                self.track_type,
                self.item_id,
                date + timedelta(seconds=ti.second, minutes=ti.minute),
                _clean(item_info_list[1]),
                _clean(item_info_list[2]),
                -1,
                None,
            ])
            if 'delivered' in item.description.lower():
                item.is_ended = 1
            result_list.append(item)
    self._chk_new_items(result_list)
def _get_flv_location(self, uuid):
    """Resolve the playable FLV URL for *uuid* by following two levels of
    playurl="..." indirection, rewriting rtmp:// links to http://.

    Caches the result on self.flv_location and returns it.
    """
    reg_str = r'playurl="(?P<f>[^"]*)"'
    param = self._get_param(uuid)
    url = r'http://ifenglive.soooner.com/?uuid=%s' % (param)
    html = self.http.get(url).decode(CHARSET)
    self.flv_location = util.reg_helper(html, reg_str)[0]
    # BUGFIX: the original did url.replace("rtmp://", "http://") here --
    # a no-op on the http:// request URL that also discarded the regex
    # result just extracted.  Rewrite the extracted location instead,
    # mirroring the second hop below.
    self.flv_location = self.flv_location.replace("rtmp://", "http://")
    LOG.info("[flv] %s", self.flv_location)
    html = self.http.get(self.flv_location).decode(CHARSET)
    self.flv_location = util.reg_helper(html, reg_str)[0]
    self.flv_location = self.flv_location.replace("rtmp://", "http://")
    return self.flv_location
def _fetch(self):
    """POST to the tracking endpoint, parse the rows into TrackData
    records, and hand the list to self._chk_new_items().

    An item whose description contains 'delivered' is flagged as ended.
    """
    result_list = []
    html = self.http.Post(self.fetch_url, self.post_data).decode(self.charset_1)
    # get item info list from tags
    item_infos = util.reg_helper(html, self.reg_str)

    def _clean(text):
        # BUGFIX: the original chained .replace('\n\r\t','') which only
        # removes the literal three-char sequence, so a lone '\n' was never
        # stripped; remove each control character individually.
        return text.replace('\n', '').replace('\r', '').replace('\t', '').strip()

    # (redundant `if len(item_infos) > 0` guard dropped; the loop handles
    # the empty case.)
    for item_info in item_infos:
        ti_str = item_info[1] + " " + item_info[2]  # e.g. '12/11/2012 15:31'
        item = track_db.TrackData.GetInstance([
            -1,
            self.track_type,
            self.item_id,
            datetime.strptime(ti_str, self.date_format),
            _clean(item_info[0]),
            _clean(item_info[3]),
            -1,
            None,
        ])
        if 'delivered' in item.description.lower():
            item.is_ended = 1
        result_list.append(item)
    self._chk_new_items(result_list)
def get_page_list(self):
    """Download the index page, extract category (id, name) pairs, cache
    them in self.categries, and queue one DB insert per category.

    Returns the self.categries mapping.
    """
    ht = http.HttpUtil(charset=CONFIG.charset, proxy=CONFIG.http_proxy)
    html = ht.get(CONFIG.url_indexpage).decode(CONFIG.charset)
    # Renamed loop variable: the original shadowed the builtin `id`.
    for cat in util.reg_helper(html, CONFIG.regular_index):
        self.categries[int(cat[0])] = cat[1]
        self.dbpool.queue_work(dbutil.InsertCagegory(int(cat[0]), cat[1]))
    return self.categries
def get_channel_info(self):
    """Scrape the live-schedule JS file for the channel table.

    Populates self.channels as {name: {'uuid': ..., 'url': ...}}, keeps the
    raw JSON text in self.schedule_json, and returns both.
    """
    raw = self.http.get(r'http://v.ifeng.com/live/js/scheduleurls.js?37')
    # Pull out the JS object literal assigned to g_scheduelUrl (sic) and
    # re-append the closing brace the regex stopped at.
    blob = util.reg_helper(raw, r'g_scheduelUrl\s=\s(?P<f>.*)}')[0] + '}'
    # Single quotes -> double quotes so the blob parses as JSON.
    # NOTE(review): str.decode and json.loads(encoding=...) are
    # Python-2-era calls -- confirm the interpreter before modernizing.
    blob = blob.replace("\'", "\"").decode(encoding="utf-8")
    parsed = json.loads(s=blob, encoding="utf-8")
    for uuid, channel in parsed.items():
        self.channels[channel['name']] = {'uuid': uuid, 'url': channel['url']}
    self.schedule_json = blob
    return self.channels, self.schedule_json
def filter_process(self, reg_str=""):
    """Filter self.datas in place.

    Replaces self.datas with the concatenation of every regex match of
    *reg_str* across its entries; entries that raise during matching are
    logged and skipped.
    """
    filtered = []
    for entry in self.datas:
        try:
            filtered.extend(reg_helper(entry, reg_str))
        except Exception as e:
            self.log.exception(e)
    self.datas = filtered
def get_channel_info(self):
    """Download the schedule script and build the private channel table.

    Fills self.__channels as {name: {'uuid': ..., 'url': ...}}, caches the
    raw JSON text in self.__schedule_json, and returns both.
    """
    page = self.__http.get(self.SCHEDULE_URL)
    # Extract the object literal assigned to g_scheduelUrl (sic); the
    # regex eats the final brace, so add it back.
    json_text = util.reg_helper(page, r'g_scheduelUrl\s=\s(?P<f>.*)}')[0] + '}'
    # NOTE(review): str.decode / json.loads(encoding=...) are Python-2-era
    # calls -- verify interpreter version before modernizing.
    json_text = json_text.replace("\'", "\"").decode(encoding="utf-8")
    table = json.loads(s=json_text, encoding="utf-8")
    for uuid, channel in table.items():
        name = channel['name']
        self.__channels[name] = {'uuid': uuid, 'url': channel['url']}
    self.__schedule_json = json_text
    return self.__channels, self.__schedule_json
def _get_uuid(self, channel_name):
    """Fetch *channel_name*'s play page and extract its uuid.

    Caches the value on self.uuid and returns it.
    """
    self.get_channel_info()
    url = self.channels[channel_name]['url']
    html = self.http.get(url).decode(CHARSET)
    # BUGFIX: the original tested html.find(r'uuid=') > 0, which misses a
    # match at index 0; substring containment is the intended check.
    if r'uuid=' in html:
        reg_str = r'uuid=(?P<f>[^|]*)'
    else:
        reg_str = r'http://biz.vsdn.tv380.com/playlive.php\?(?P<f>[^|]*)'
    self.uuid = util.reg_helper(html, reg_str)[0]
    LOG.info("[UUID] %s", self.uuid)
    return self.uuid
def get_track_items_from_web(log=None):
    """Fetch the posted-item id list from fastrackexpress.com.au.

    Returns the list of <td> cell values with the first cell dropped
    (presumably a header cell -- confirm against the live page), or None
    if the request or parsing fails; the error is logged when *log* is
    given.  (Unused local `items` from the original removed.)
    """
    from vavava.httpclient import HttpClient
    try:
        client = HttpClient()
        html = client.Get(r"http://fastrackexpress.com.au/test.php?LIST=POST")
        cells = util.reg_helper(html.decode('utf-8'), r'<td>(?P<id>[^<]*)</td>')
        return cells[1:]
    except Exception as e:
        if log:
            log.exception(e)
def getmatches(string):
    """Extract medical-institution records from *string*.

    Each match is a tuple of eight captured fields: name, address,
    district, phone, basic-insurance point, insurance code, rural-coop
    flag, and postcode (labels per the Chinese field names in the pattern).
    """
    # Raw string: the original non-raw literal relied on invalid escape
    # sequences (\s, \<, \>) surviving verbatim, which emits
    # DeprecationWarnings on modern Python.  The literal pattern is
    # byte-identical.
    regstr = r"""机构名称:\s*([^\<]*)[^\<]*\</p>[^\>]+>机构地址:\s*([^\<]*)[^\<]*\</p>[^\>]+>区县:\s*([^\<]*)[^\<]*\</p>[^\>]+>机构电话:\s*([^\<]*)[^\<]*\</p>[^\>]+>基本医保点:\s*([^\<]*)[^\<]*\</p>[^\>]+>医保编码:\s*([^\<]*)[^\<]*\</p>[^\>]+>新农合定点:\s*([^\<]*)[^\<]*\</p>[^\>]+>邮政编码:\s*([^\<]*)[^\<]*\</p>[^\>]+>"""
    return util.reg_helper(string, regstr)
def get_post_time(self, url):
    """Fetch *url* and return its 'Posted:' timestamp, normalized so that
    '--' separators become '-' and '@' becomes a space."""
    html = http.HttpUtil(proxy=CONFIG.http_proxy).get(url).decode(CONFIG.charset)
    # BUGFIX: the original classes [--|-] and [\s|@] were malformed:
    # inside a class '|' is literal, and '[--|-]' contains the range
    # '-'..'|' (which spans digits and letters), so the class matched far
    # more than the intended '-' / '--' separators.  Spell out the
    # intended alternations instead.
    reg = r'Posted:\s*(?P<tt>\d*(?:--|-)\d*(?:--|-)\d*[\s@]\d*:\d*)'
    return util.reg_helper(html, reg)[0].replace('--', '-').replace('@', ' ')