def parse(self):
    """Fetch self.url, extract the quoted image path from the second
    chunk and write it as an absolute URL.

    Fix: `data_str = "".join(data)` was assigned and never used — dead
    code removed.
    """
    data = mylib.myurl(self.url)
    # Normalise first/last chunks in place.  NOTE(review): data[0] is
    # apparently never read afterwards, but the mutation is kept as-is;
    # data[-1] *is* observable (it aliases data[1] when len(data) == 2).
    data[0] = data[0].split("=")[1].strip()
    data[-1] = data[-1].strip(";")
    # Second chunk looks like `"path":...` — take the part before the
    # first ':' and drop the surrounding quotes.
    data_image = data[1].split(":")[0].strip("\"")
    ret = urlparse.urljoin(self.url, data_image)
    self.write(ret)
def parse(self):
    """Write the first fetched line containing "divList", or "" when no
    such line exists.

    Fix: removed `flag = False`, which was assigned and never read.
    """
    data = mylib.myurl(self.url)
    ret = ""
    for line in data:
        if "divList" in line:
            ret = line
            break
    self.write(ret)
def parse(self, write=True):
    """Run every fetched line through a Context for self.url.

    When `write` is true the accumulated result is written via
    self.write (returns None); otherwise the result is returned to the
    caller.
    """
    ctx = Context(self.url)
    for chunk in mylib.myurl(self.url):
        ctx.do(chunk)
    outcome = ctx.result()
    if not write:
        return outcome
    self.write(outcome)
def parse(self):
    """Locate the "USEMAP" line, extract its quoted image path and write
    the absolute image URL.

    Fix: previously raised IndexError when the marker was absent
    (`"".split()[1]`); now writes "" like the sibling parsers do.
    """
    data = mylib.myurl(self.url)
    image_line = ""
    for line in data:
        if "USEMAP" in line:
            image_line = line
            break
    ret = ""
    if image_line:
        # Second whitespace token looks like NAME="path" — take the
        # value after '=' and drop the quotes.
        image_str = image_line.split()[1].split("=")[1].strip("\"")
        ret = urlparse.urljoin(self.url, image_str)
    self.write(ret)
def parse(self):
    """Find the first iframe src on the page and write it as an absolute
    URL ("" when no iframe is present).

    Fix: `.strip("src=")` treats its argument as a character *set*, so it
    could also eat a trailing 's'/'r'/'c'/'=' of the URL itself (e.g. a
    path ending in "abc" lost its 'c').  Replaced with exact prefix
    removal.  Also `!= None` → `is not None`.
    """
    data = mylib.myurl(self.url)
    ret = ""
    for line in data:
        m = re.search("src=.+</iframe>", line)
        if m is not None:
            url = m.group(0).split("?")[0]
            if url.startswith("src="):
                url = url[len("src="):]
            ret = urlparse.urljoin(self.url, url)
            break
    self.write(ret)
def retryUrl(url): cnt = 0 while cnt < 3: try: data = mylib.myurl(url) return data except: print "retry:", cnt pass cnt += 1 return []
def parse(self):
    """Collect a concert dict for every line flagged by
    "galleries-slide-sub-title1", resolve each detail page, and write
    the resulting list.

    Fix: removed `data_ret = ""`, which was assigned and never read.
    """
    data = mylib.myurl(self.url)
    contents = []
    for line in data:
        if "galleries-slide-sub-title1" in line:
            concert = {}
            concert["url_content"] = self.extractURL(line)
            self._parse_content(concert["url_content"], concert)
            contents.append(copy.deepcopy(concert))
    self.write(contents)
def parse(self):
    """Build and write the list of concert entries found on the page.

    Each "galleries-slide-sub-title1" line yields one dict whose detail
    page is parsed by self._parse_content.

    Fix: dropped the unused `data_ret` local.
    """
    data = mylib.myurl(self.url)
    contents = []
    for line in data:
        if "galleries-slide-sub-title1" in line:
            concert = {}
            concert['url_content'] = self.extractURL(line)
            self._parse_content(concert['url_content'], concert)
            contents.append(copy.deepcopy(concert))
    self.write(contents)
def parse(self):
    """Write the page fragment spanning <center> ... </center>.

    Fix: the line containing "</center>" used to be appended twice —
    once by the in-block branch (flag already True) and once by the
    terminating branch.  Each line is now appended at most once.
    """
    data = mylib.myurl(self.url)
    ret = ""
    flag = False
    for line in data:
        if "<center>" in line:
            flag = True
        # Append while inside the block; a stray "</center>" line is
        # still captured and ends the scan even if "<center>" was never
        # seen (preserving the original early-stop behaviour).
        if flag or "</center>" in line:
            ret += line
        if "</center>" in line:
            break
    self.write(ret)
def parse(self): data = mylib.myurl(self.url) s = 0 for i in range(len(data)): if "BoxTable" in data[i]: s = i break data_clean = data[s:] for line in data_clean: print line self.write(data_clean)
def parse(self):
    # Accumulate the fragment from the first "<table" line through the
    # first "</table>" line, echo it, and write it.
    data = mylib.myurl(self.url)
    ret = ""
    flag = False
    for line in data:
        if "<table" in line:
            ret += line
            flag = True
        if "</table>" in line:
            ret += line
            break
        # NOTE(review): on the "<table" line itself, flag is already True
        # here, so that line is appended a second time — confirm whether
        # the duplication is intentional before changing it.
        if flag:
            ret += line
    # Debug echo retained from the original.
    print ret
    self.write(ret)
def parse(self):
    """Parse the show listing page.

    Grabs the first "shows_list" line, builds content records from it
    (poster, price, date) and persists them.

    Returns 0 when something was written, 1 when no content was found.
    """
    listing = ""
    for line in mylib.myurl(self.url):
        if "shows_list" in line:
            listing = line
            break
    contents = self.extractPoster(listing)
    self.extractPrice(listing, contents)
    self.extractDate(listing, contents)
    if not contents:
        return 1
    self.write(contents)
    return 0
def parse(self):
    """Transcode the Big5 page to UTF-8 and write the fragment from the
    first "<table" line through the first "</table>" line.

    Fixes: `encode("utf=8")` was a typo for "utf-8" and raised
    LookupError on the very first line; the "<table" line was also
    appended twice (once directly, once via the flag branch).
    """
    data = mylib.myurl(self.url)
    ret = ""
    flag = False
    for line in data:
        line = line.decode("big5").encode("utf-8")
        if "<table" in line:
            flag = True
        if flag:
            ret += line
            if "</table>" in line:
                break
    self.write(ret)
def parse(self):
    """Build {title: [absolute_url, sub_page_result]} from every
    "News_NewsList" row and write the map.

    Fixes: `lstrip('href="')` / `rstrip("</a>")` treat their argument as
    a character *set*, so they could also eat the leading 'h' of "http"
    or a trailing 'a' of the title — replaced with regex capture groups.
    A missing title match used to raise AttributeError; rows that do not
    match are now skipped.  The unused `flag` local was dropped.
    """
    data = mylib.myurl(self.url)
    url_map = {}
    for line in data:
        if "News_NewsList" in line:
            title = re.search("target=\"\">([^>]*)</a>", line)
            href = re.search("href=\"([^\"]+)\"", line)
            if title is None or href is None:
                continue
            key = title.group(1)
            target_url = self.host + href.group(1)
            # Collapse HTML-escaped ampersands ("&amp;" -> "&").
            url_map[key] = [target_url.replace("amp;", "")]
    for key in url_map:
        kgs = kmdn_gov_sub.Parser({"url": url_map[key][0]})
        # Polite crawl delay before fetching each sub page.
        time.sleep(random.randint(3, 5))
        url_map[key].append(kgs.parse(False))
    self.write(url_map)
def _parse_content(self, url, content): print url data = mylib.myurl(url) data_ret = "" place = 0 price = 0 start_date = 0 start_time = 0 for line in data: if "<title>" in line: searchObj = re.search(r'<title>(.*?)</title>', line, re.M | re.I | re.S) if searchObj: content['title'] = searchObj.group(1) if "alignnone" in line: searchObj = re.search(r'src="(.*?)"', line, re.M | re.I | re.S) if searchObj: content['url_image'] = searchObj.group(1) content['image_id'] = searchObj.group(1).split("/")[-1] if place == 1 and "</p>" in line: content['place'] = line.strip().rstrip("</p>") place = 0 if price == 1 and '</p>' in line: content['price'] = line.strip().rstrip("</p>") price = 0 if start_date == 1 and '</p>' in line: content['start_date'] = line.strip().rstrip("</p>") start_date = 0 if start_time == 1 and '</p>' in line: content['start_time'] = line.strip().rstrip("</p>") start_time = 0 if "演出場地" in line: place = 1 if "演出票價" in line: price = 1 if "演出日期" in line: start_date = 1 if "演出開始" in line: start_time = 1
def _parse_content(self, url, content): print url data = mylib.myurl(url) data_ret = "" place = 0 price = 0 start_date = 0 start_time = 0 for line in data: if "<title>" in line: searchObj = re.search(r"<title>(.*?)</title>", line, re.M | re.I | re.S) if searchObj: content["title"] = searchObj.group(1) if "alignnone" in line: searchObj = re.search(r'src="(.*?)"', line, re.M | re.I | re.S) if searchObj: content["url_image"] = searchObj.group(1) content["image_id"] = searchObj.group(1).split("/")[-1] if place == 1 and "</p>" in line: content["place"] = line.strip().rstrip("</p>") place = 0 if price == 1 and "</p>" in line: content["price"] = line.strip().rstrip("</p>") price = 0 if start_date == 1 and "</p>" in line: content["start_date"] = line.strip().rstrip("</p>") start_date = 0 if start_time == 1 and "</p>" in line: content["start_time"] = line.strip().rstrip("</p>") start_time = 0 if "演出場地" in line: place = 1 if "演出票價" in line: price = 1 if "演出日期" in line: start_date = 1 if "演出開始" in line: start_time = 1
def parse(self):
    # Slice the fetched weather page into chunks: each chunk starts at a
    # "<tbody><tr>" line and ends at a "row 1 start" or "</table>" line.
    data = mylib.myurl(self.url)
    ret_list = []
    inflag = False
    ret = ""
    for line in data:
        """
        if "rowspan=\"2\"" in line:
            pass
            #print line.split(">")[1].split("<")[0].decode("utf8")
        """
        if "<tbody><tr>" in line:
            inflag = True
            ret += line
        # NOTE(review): a "<tbody><tr>" line that is not also a chunk
        # terminator falls through to the elif below with inflag already
        # True, so it is appended twice — confirm whether intentional.
        if "row 1 start" in line or "</table>" in line:
            ret += line
            ret_list.append(ret)
            ret = ""
        elif inflag:
            ret += line
    # head: third "<tbody>"-delimited piece of the first chunk.
    # NOTE(review): assumes at least 3 chunks and at least 3 "<tbody>"
    # occurrences in the first one; raises IndexError otherwise.
    head = ret_list[0].split("<tbody>")[2]
    # Third-from-last chunk, with relative CWB paths made absolute.
    kmdn_data = ret_list[-3].replace("../../", "http://www.cwb.gov.tw//V7/")
    ret = [head, kmdn_data]
    self.write(ret)