Example #1
0
 def parse(self):
     """Resolve the image reference in the payload and write its absolute URL.

     The value returned by ``mylib.myurl(self.url)`` is indexed as a list of
     strings.  The text before the first ``:`` of the second element, with
     surrounding double quotes removed, is joined onto ``self.url`` and
     passed to ``self.write``.

     Raises:
         IndexError: if the first element contains no ``=`` or the payload
             has fewer than two elements.
     """
     data = mylib.myurl(self.url)
     # The in-place clean-up of the first and last elements is kept on
     # purpose: for a two-element payload ``data[-1]`` *is* ``data[1]``, so
     # the ";" strip changes the value parsed below.
     data[0] = data[0].split("=")[1].strip()
     data[-1] = data[-1].strip(";")
     image_ref = data[1].split(":")[0].strip("\"")
     self.write(urlparse.urljoin(self.url, image_ref))
Example #2
0
 def parse(self):
     """Write the first line from ``mylib.myurl(self.url)`` containing ``divList``.

     An empty string is written when no line matches.
     """
     data = mylib.myurl(self.url)
     # next() stops at the first hit, exactly like the loop-and-break form.
     first_match = next((line for line in data if "divList" in line), "")
     self.write(first_match)
Example #3
0
 def parse(self, write=True):
     """Run every line of ``mylib.myurl(self.url)`` through a ``Context``.

     Args:
         write: when truthy, the context's result is passed to
             ``self.write`` and nothing is returned; otherwise the result
             is returned to the caller.
     """
     context = Context(self.url)
     for row in mylib.myurl(self.url):
         context.do(row)

     outcome = context.result()
     if not write:
         return outcome
     self.write(outcome)
Example #4
0
 def parse(self):
     """Write the absolute URL referenced on the first ``USEMAP`` line.

     The second whitespace-separated token of that line is split on ``=``;
     the part after the first ``=``, with double quotes stripped, is joined
     onto ``self.url`` and passed to ``self.write``.

     Raises:
         IndexError: if no line contains ``USEMAP`` or the line does not
             have the expected ``token key=value`` layout.
     """
     rows = mylib.myurl(self.url)
     usemap_row = next((row for row in rows if "USEMAP" in row), "")
     attribute = usemap_row.split()[1]
     image_ref = attribute.split("=")[1].strip("\"")
     self.write(urlparse.urljoin(self.url, image_ref))
Example #5
0
 def parse(self):
     """Write the absolute URL of the first ``src=...</iframe>`` reference.

     For the first line matching ``src=.+</iframe>``, the part of the match
     before the first ``?`` (minus the leading ``src=``) is joined onto
     ``self.url``.  An empty string is written when no line matches.
     """
     data = mylib.myurl(self.url)
     ret = ""
     prefix = "src="
     for line in data:
         m = re.search("src=.+</iframe>", line)
         if m is None:
             continue
         # The match always starts with "src=", so drop exactly that prefix.
         # str.strip("src=") would treat the argument as a set of characters
         # and also eat leading/trailing "s", "r", "c" and "=" from the URL.
         target = m.group(0).split("?")[0][len(prefix):]
         ret = urlparse.urljoin(self.url, target)
         break
     self.write(ret)
Example #6
0
def retryUrl(url):
    """Call ``mylib.myurl(url)``, trying up to three times.

    Returns:
        The value of the first attempt that does not raise, or an empty
        list when all three attempts fail.
    """
    for attempt in range(3):
        try:
            return mylib.myurl(url)
        except Exception:
            # Only ordinary errors trigger a retry; KeyboardInterrupt and
            # SystemExit are allowed to propagate instead of being swallowed.
            print("retry: %s" % attempt)
    return []
Example #7
0
 def parse(self):
     """Build one record per ``galleries-slide-sub-title1`` line and write the list.

     For each matching line, ``self.extractURL`` supplies the record's
     ``url_content`` and ``self._parse_content`` fills in the rest of the
     dict in place.  The list of records is passed to ``self.write``.
     """
     contents = []
     for line in mylib.myurl(self.url):
         if "galleries-slide-sub-title1" not in line:
             continue
         content_url = self.extractURL(line)
         concert = {"url_content": content_url}
         self._parse_content(content_url, concert)
         # Stored as a deep copy, as before, so the saved record shares
         # nothing with the dict handed to _parse_content.
         contents.append(copy.deepcopy(concert))
     self.write(contents)
Example #8
0
 def parse(self):
     """Write a list with one record for every ``galleries-slide-sub-title1`` line.

     Each record starts as ``{'url_content': <self.extractURL(line)>}`` and
     is then populated in place by ``self._parse_content``.
     """
     records = []
     for line in mylib.myurl(self.url):
         if "galleries-slide-sub-title1" in line:
             detail_url = self.extractURL(line)
             concert = {'url_content': detail_url}
             self._parse_content(detail_url, concert)
             # Deep copy kept so the stored record is independent of the
             # dict that was passed to _parse_content.
             records.append(copy.deepcopy(concert))
     self.write(records)
Example #9
0
 def parse(self):
     """Write the lines from the first ``<center>`` through the next ``</center>``.

     Collection starts on the first line containing ``<center>`` and stops
     after the first line containing ``</center>``.  A ``</center>`` line
     seen before any ``<center>`` is still written and ends the scan.  Every
     collected line appears exactly once in the output.
     """
     data = mylib.myurl(self.url)
     chunks = []
     inside = False
     for line in data:
         closes = "</center>" in line
         if inside or closes or "<center>" in line:
             # Single append per line: previously the closing line was added
             # by both the "inside" branch and the "</center>" branch.
             chunks.append(line)
             inside = True
         if closes:
             break
     self.write("".join(chunks))
Example #10
0
    def parse(self):
        """Print and write everything from the first ``BoxTable`` line onward.

        When no line contains ``BoxTable`` the slice starts at index 0, so
        the full result of ``mylib.myurl(self.url)`` is printed and written.
        """
        rows = mylib.myurl(self.url)
        start = next(
            (idx for idx, row in enumerate(rows) if "BoxTable" in row), 0)

        tail = rows[start:]
        for row in tail:
            print(row)

        self.write(tail)
Example #11
0
 def parse(self):
     """Print and write the lines from the first ``<table`` through ``</table>``.

     Collection starts on the first line containing ``<table`` and stops
     after the first line containing ``</table>``.  A ``</table>`` line seen
     before any ``<table`` is still included and ends the scan.  Every
     collected line appears exactly once in the output.
     """
     data = mylib.myurl(self.url)
     chunks = []
     inside = False
     for line in data:
         if "<table" in line:
             inside = True
         closes = "</table>" in line
         if inside or closes:
             # Single append per line: previously the opening line was added
             # by both the "<table" branch and the trailing "if flag" branch.
             chunks.append(line)
         if closes:
             break
     ret = "".join(chunks)
     print(ret)
     self.write(ret)
Example #12
0
    def parse(self):
        """Extract show records from the first ``shows_list`` line and write them.

        The matching line (or an empty string when there is none) is passed
        to ``self.extractPoster``, whose result is then handed to
        ``self.extractPrice`` and ``self.extractDate`` together with the line.

        Returns:
            0 after writing a non-empty result, 1 (nothing written) otherwise.
        """
        rows = mylib.myurl(self.url)
        shows_line = next((row for row in rows if "shows_list" in row), "")

        contents = self.extractPoster(shows_line)
        self.extractPrice(shows_line, contents)
        self.extractDate(shows_line, contents)
        if len(contents) == 0:
            return 1
        self.write(contents)
        return 0
Example #13
0
    def parse(self):
        """Write the show records derived from the first ``shows_list`` line.

        ``self.extractPoster`` produces the records from that line (an empty
        string is used when no line matches); ``self.extractPrice`` and
        ``self.extractDate`` are then called with the same line and records.

        Returns:
            0 when the records are non-empty and were written, else 1.
        """
        candidates = (row for row in mylib.myurl(self.url)
                      if "shows_list" in row)
        source_line = next(candidates, "")

        contents = self.extractPoster(source_line)
        self.extractPrice(source_line, contents)
        self.extractDate(source_line, contents)

        has_records = len(contents) > 0
        if has_records:
            self.write(contents)
        return 0 if has_records else 1
Example #14
0
 def parse(self):
     """Write the first table block, re-encoded from Big5 to UTF-8.

     Each line is decoded as Big5 and encoded as UTF-8 before inspection.
     Collection starts on the first line containing ``<table``; the scan
     stops after the first line containing ``</table>``.  Every collected
     line appears exactly once in the output.
     """
     data = mylib.myurl(self.url)
     chunks = []
     inside = False
     for line in data:
         # Canonical codec name; the previous spelling "utf=8" was a typo.
         line = line.decode("big5").encode("utf-8")
         if "<table" in line:
             inside = True
         if inside:
             # Single append per line: previously the opening line was added
             # by both the "<table" branch and the "if flag" branch.
             chunks.append(line)
         if "</table>" in line:
             break
     self.write("".join(chunks))
Example #15
0
    def parse(self):
        """Map each ``News_NewsList`` title to its URL and sub-page result, then write it.

        For every line containing ``News_NewsList`` that has both a
        ``target="">...</a>`` title and an ``href="..."`` attribute, an entry
        ``title -> [self.host + href]`` is recorded (with ``amp;`` removed
        from the URL).  Each URL is then handed to ``kmdn_gov_sub.Parser``
        after a random 3-5 second pause, and the value of ``parse(False)`` is
        appended to the entry.  The finished mapping is passed to
        ``self.write``.
        """
        data = mylib.myurl(self.url)
        title_tail = "</a"
        href_head = "href=\""
        url_map = {}
        for line in data:
            if "News_NewsList" not in line:
                continue
            title = re.search("target=\"\">.+</a>", line)
            m = re.search("href=\".+\"", line)
            # Skip lines missing either piece instead of failing on
            # ``None.group`` when the title pattern does not match.
            if title is None or m is None:
                continue

            key = title.group(0).split(">")[1]
            # Remove the closing-tag remnant as an exact suffix.
            # rstrip("</a>") strips a *set* of characters and would also
            # eat a trailing "a" or "/" that belongs to the title.
            if key.endswith(title_tail):
                key = key[:-len(title_tail)]

            # The match always starts with 'href="', so slice that prefix
            # off; lstrip('href="') would also eat leading h/r/e/f
            # characters of the path itself.
            href = m.group(0).split()[0][len(href_head):].rstrip("\"")
            target_url = self.host + href
            url_map[key] = [target_url.replace("amp;", "")]

        for key in url_map:
            kgs = kmdn_gov_sub.Parser({"url": url_map[key][0]})
            time.sleep(random.randint(3, 5))
            url_map[key].append(kgs.parse(False))

        self.write(url_map)
Example #16
0
    def _parse_content(self, url, content):
        """Scan the lines of ``mylib.myurl(url)`` and fill *content* in place.

        Keys that may be set: ``title``, ``url_image``, ``image_id``,
        ``place``, ``price``, ``start_date`` and ``start_time``.  The last
        four are taken from the first line containing ``</p>`` that follows
        a line carrying the corresponding label.

        Args:
            url: address passed to ``mylib.myurl``; also printed.
            content: dict that receives the extracted values.
        """
        print(url)
        # (content key, label that arms capture of the value on a later line)
        labelled_fields = (
            ('place', "演出場地"),
            ('price', "演出票價"),
            ('start_date', "演出日期"),
            ('start_time', "演出開始"),
        )
        closing_tag = "</p>"
        pending = set()
        for line in mylib.myurl(url):
            if "<title>" in line:
                searchObj = re.search(r'<title>(.*?)</title>', line,
                                      re.M | re.I | re.S)
                if searchObj:
                    content['title'] = searchObj.group(1)
            if "alignnone" in line:
                searchObj = re.search(r'src="(.*?)"', line, re.M | re.I | re.S)
                if searchObj:
                    image_url = searchObj.group(1)
                    content['url_image'] = image_url
                    content['image_id'] = image_url.split("/")[-1]

            if pending and closing_tag in line:
                value = line.strip()
                # Drop the closing tag as an exact suffix.  rstrip("</p>")
                # strips a *set* of characters and would also eat a trailing
                # "p" or "/" that belongs to the value itself.
                if value.endswith(closing_tag):
                    value = value[:-len(closing_tag)]
                for key, _label in labelled_fields:
                    if key in pending:
                        content[key] = value
                pending.clear()

            # Labels are checked after the capture step, so a label only
            # arms capture for the lines that follow it.
            for key, label in labelled_fields:
                if label in line:
                    pending.add(key)
Example #17
0
    def _parse_content(self, url, content):
        """Populate *content* from the lines returned by ``mylib.myurl(url)``.

        Possible keys: ``title``, ``url_image``, ``image_id``, ``place``,
        ``price``, ``start_date``, ``start_time``.  The last four come from
        the first ``</p>`` line seen after a line containing the matching
        label.

        Args:
            url: address passed to ``mylib.myurl``; also printed.
            content: dict updated in place.
        """
        print(url)
        # label text -> content key whose value follows on a later line
        label_to_key = (
            ("演出場地", "place"),
            ("演出票價", "price"),
            ("演出日期", "start_date"),
            ("演出開始", "start_time"),
        )
        end_tag = "</p>"
        awaiting = set()
        for line in mylib.myurl(url):
            if "<title>" in line:
                searchObj = re.search(r"<title>(.*?)</title>", line, re.M | re.I | re.S)
                if searchObj:
                    content["title"] = searchObj.group(1)
            if "alignnone" in line:
                searchObj = re.search(r'src="(.*?)"', line, re.M | re.I | re.S)
                if searchObj:
                    content["url_image"] = searchObj.group(1)
                    content["image_id"] = searchObj.group(1).split("/")[-1]

            if awaiting and end_tag in line:
                text = line.strip()
                # Remove "</p>" only as a literal suffix: rstrip("</p>")
                # works on a character set and truncated values that end in
                # "p" or "/".
                if text.endswith(end_tag):
                    text = text[:-len(end_tag)]
                for _label, key in label_to_key:
                    if key in awaiting:
                        content[key] = text
                awaiting.clear()

            # Arm capture last, so the value is read from a following line
            # rather than from the label's own line.
            for label, key in label_to_key:
                if label in line:
                    awaiting.add(key)
Example #18
0
 def parse(self):
     """Split the lines of ``mylib.myurl(self.url)`` into segments and write two of them.

     A segment is closed by any line containing ``row 1 start`` or
     ``</table>`` (that line is included in the segment).  Other lines are
     accumulated only after the first ``<tbody><tr>`` line has been seen.

     Writes ``[head, kmdn_data]`` where ``head`` is the third piece of the
     first segment split on ``<tbody>`` and ``kmdn_data`` is the
     third-from-last segment with ``../../`` replaced by an absolute prefix.

     Raises:
         IndexError: if fewer than three segments were collected or the
             first segment contains fewer than two ``<tbody>`` occurrences.
     """
     data = mylib.myurl(self.url)
     ret_list = []  # completed segments, in page order
     inflag = False  # becomes True at the first "<tbody><tr>" line
     ret = ""  # segment currently being accumulated
     for line in data:
         # The bare string below is a no-op expression statement holding
         # disabled code; it has no effect at runtime.
         """
         if "rowspan=\"2\"" in line:
             pass
             #print line.split(">")[1].split("<")[0].decode("utf8")
         """
         if "<tbody><tr>" in line:
             inflag = True
             ret += line
         # A "<tbody><tr>" line is appended a second time by one of the two
         # branches below (inflag is already True at this point).
         # NOTE(review): the split("<tbody>")[2] lookup after the loop looks
         # like it relies on that repetition - confirm before de-duplicating.
         if "row 1 start" in line or "</table>" in line:
             # Delimiter line: close the current segment and start a new one.
             ret += line
             ret_list.append(ret)
             ret = ""
         elif inflag:
             ret += line
     head = ret_list[0].split("<tbody>")[2]
     # Rewrite relative "../../" references to the absolute site prefix.
     kmdn_data = ret_list[-3].replace("../../", "http://www.cwb.gov.tw//V7/")
     ret = [head, kmdn_data]
     self.write(ret)