import json
import logging
import re
import time

import lxml.html
from lxml.etree import fromstring

# type_of() and time_convert() are small helpers defined elsewhere in the
# project (not part of this listing); roughly, they map a URL to a file
# extension and a raw duration value to a readable time string.


def downloadReq(self, view, download):
    """
    Signal called on right click, save-something.
    """
    uri = download.get_uri()
    self.opener.downloadFile(uri, "unknown", "", type_of(uri), uri, uri)
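# Illustrative sketch only: how a handler with this signature is typically
# attached, assuming pywebkitgtk is in use (its "download-requested" signal
# passes the WebView and a WebKitDownload, which matches downloadReq above)
# and that downloadReq is a method of the object passed in as `owner`.
# The _attach_download_handler and owner names are assumptions for
# illustration, not part of the project.
def _attach_download_handler(owner):
    import webkit  # pywebkitgtk; assumed to be available
    view = webkit.WebView()
    view.connect("download-requested", owner.downloadReq)
    return view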
def seeHTMLElement(self, element):
    """Recursively scan an HTML element and its children for iTunes
    preview/download attributes, adding any media items found."""
    if isinstance(element.tag, str):  # normal element
        if (element.get("comparison") == "lt" or
                (element.get("comparison") and element.get("comparison").find("less") > -1)):
            return  # Ignore child nodes.
        if element.tag == "tr" and element.get("dnd-clipboard-data"):
            data = json.loads(element.get("dnd-clipboard-data"))
            itemid = ""
            title = ""
            artist = ""
            duration = ""
            url = ""
            gotou = ""
            price = "0"
            comment = ""
            if ('itemName' in data):
                title = data['itemName']
            if ('artistName' in data):
                artist = data['artistName']
            if ('duration' in data):
                duration = time_convert(data['duration'])
            if ('preview-url' in data):
                url = data['preview-url']
            if ('playlistName' in data):
                comment = data['playlistName']
            if ('url' in data):
                gotou = data['url']
            if ('price' in data):
                price = data['price']
            if ('itemId' in data):
                itemid = data['itemId']
            self.addItem(title, artist, duration, type_of(url), comment,
                         "", "", gotou, url, price, itemid)
        elif (element.get("audio-preview-url") or element.get("video-preview-url")):
            if element.get("video-preview-url"):
                url = element.get("video-preview-url")
            else:
                url = element.get("audio-preview-url")
            title = ""
            if element.get("preview-title"):
                title = element.get("preview-title")
            author = ""
            if element.get("preview-artist"):
                author = element.get("preview-artist")
            duration = ""
            if element.get("preview-duration"):
                duration = time_convert(element.get("preview-duration"))
            logging.debug("preview-url adding row")
            self.addItem(title, author, duration, type_of(url), "", "", "", "", url, "", "")
        elif (element.tag == "button" and element.get("anonymous-download-url") and
              element.get("kind") and
              (element.get("title") or element.get("item-name"))):  # Added for epub feature
            logging.debug("button row adding")
            title = ""
            artist = ""
            if element.get("title"):
                title = element.get("title")
            if element.get("item-name"):
                title = element.get("item-name")
            if element.get("preview-artist"):
                artist = element.get("preview-artist")
            self.addItem(title, artist, "",
                         type_of(element.get("anonymous-download-url")),
                         "", "", "", element.get("anonymous-download-url"),
                         "", "", element.get("adam-id"))
        elif (element.tag == "button" and element.get("episode-url")):
            title = ""
            artist = ""
            url = ""
            itemid = ""
            if element.get("aria-label"):
                title = element.get("aria-label")
                if title.startswith("Free Episode, "):
                    title = title[14:]
            if element.get("artist-name"):
                artist = element.get("artist-name")
            if element.get("episode-url"):
                url = element.get("episode-url")
            mytype = type_of(url)
            if element.get("disabled") is not None:
                mytype = ".zip"  # wrong ext. fix it.
            self.addItem(title, artist, "", mytype, "", "", "", "", url, "", itemid)
        else:
            # go through the childnodes.
            for i in element:
                self.seeHTMLElement(i)
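# Illustrative sketch only: a standalone demo of the "tr with
# dnd-clipboard-data" branch above, which decodes track metadata embedded as
# JSON in that attribute. The sample payload below is invented; the key names
# are the ones seeHTMLElement actually reads.
def _demo_clipboard_row():
    sample_attr = ('{"itemName": "Sample Lecture", "artistName": "Sample University",'
                   ' "duration": 215000, "preview-url": "http://example.com/lecture.mp3",'
                   ' "price": "0", "itemId": "12345"}')
    data = json.loads(sample_attr)
    # Return the same title/artist/preview fields the parser would pass on.
    return (data.get("itemName", ""), data.get("artistName", ""),
            data.get("preview-url", ""))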
class ParserBase(object):
    """Parses an iTunes Store page (iTMS XML or plain HTML), collecting the
    media items, description HTML, title and location info for display."""

    def __init__(self, url, contentType, source):
        # Initialized each time...
        self.Redirect = ""  # URL to redirect to.
        self.Title = ""
        self.HTML = ""  # The description-html, top panel.
        self.itemId = ""  # The specific item selected.
        self.singleItem = False
        self.NormalPage = False
        self.podcast = ""
        self.bgcolor = ""
        self.mediaItems = []  # List of items to place in Liststore.
        self.tabMatches = []
        self.tabLinks = []
        self.last_text = ""  # prevent duplicates from shadow-text.
        self.url = url
        self.contentType = contentType
        self.source = source
        sttime = time.time()
        try:
            # parse as xml
            # Remove bad XML. See:
            # http://stackoverflow.com/questions/1016910/how-can-i-strip-invalid-xml-characters-from-strings-in-perl
            bad = "[^\x09\x0A\x0D\x20-\xD7FF\xE000-\xFFFD]"
            self.source = re.sub(bad, " ", self.source)
            # now it should be valid xml.
            source_cleaned = self.source.replace('xmlns="http://www.apple.com/itms/"', '')
            # (this xmlns causes problems with xpath)
            dom = fromstring(source_cleaned)
            if dom.tag.find("html") > -1 or dom.tag == "{http://www.w3.org/2005/Atom}feed":
                # Don't want normal pages/atom pages, those are for the web browser!
                raise Exception
            elif dom.tag == "rss":
                # rss files are added
                self.HTML += "<p>This is a podcast feed, click Add to Podcast manager button on the toolbar to subscribe.</p>"
                items = dom.xpath("//item")
                logging.debug("rss: " + str(len(items)))
                for item in items:
                    title = ""
                    author = ""
                    linkurl = ""
                    duration = ""
                    url = ""
                    description = ""
                    pubdate = ""
                    for i in item:
                        if i.tag == "title":
                            title = i.text
                        elif i.tag == "author" or i.tag.endswith("author"):
                            author = i.text
                        elif i.tag == "link":
                            linkurl = i.text
                        elif i.tag == "description":
                            description = i.text
                        elif i.tag == "pubDate":
                            pubdate = i.text
                        elif i.tag == "enclosure":
                            url = i.get("url")
                        elif i.tag.endswith("duration"):
                            duration = i.text
                    self.addItem(title, author, duration, type_of(url), description,
                                 pubdate, "", linkurl, url, "", "")
            else:
                self.seeXMLElement(dom)
        except Exception, e:
            logging.debug("ERR: " + str(e))
            logging.debug("Parsing as HTML, not as XML.")
            ustart = self.source.find("<body onload=\"return open('")
            if ustart > -1:
                # This is a redirect-page.
                newU = self.source[ustart+27:self.source.find("'", ustart+27)]
                self.Redirect = newU
            logging.debug("Parsing HTML")
            self.HTML = self.source
            source_cleaned = self.source.replace('<html xmlns="http://www.apple.com/itms/"', '<html')
            dom = lxml.html.document_fromstring(source_cleaned)
            self.seeHTMLElement(dom)

        items = []
        arr = self.getItemsArray(dom)  # get the tracks list element
        keys = dom.xpath("//key")  # important parts of document! this is only calculated once to save time

        # Now get location path:
        # location description and links and last location in location bar.
        location = []
        locationLinks = []
        lastloc = ""
        locationelements = dom.xpath("//Path")
        if len(locationelements) > 0:
            for i in locationelements[0]:
                if (type(i).__name__ == '_Element' and i.tag == "PathElement"):
                    location.append(i.get("displayName"))
                    locationLinks.append(i.text)
        if location == ["iTunes U"]:
            section = dom.xpath("//HBoxView")  # looking for first section with location info.
            if len(section) > 0:  # may be out of range
                section = section[0]
                for i in section:
                    if (type(i).__name__ == '_Element'):
                        for j in i:
                            if type(j).__name__ == '_Element' and j.tag == "GotoURL":
                                location.append(j.text.strip())
                                locationLinks.append(j.get("url"))
                                logging.debug(j.text.strip() + j.get("url"))
                                lastloc = j.get("url")
                if self.textContent(section).find(">") > -1:
                    section.getparent().remove(section)  # redundant section > section ... info is removed.

        if arr is None:
            ks = dom.xpath("/Document/Protocol/plist/dict/array/dict")
            if len(ks):
                arr = ks
                logging.debug("Special end page after html link?" + str(len(ks)))
                if (len(ks) == 1 and dom.get("disableNavigation") == "true"
                        and dom.get("disableHistory") == "true"):
                    self.singleItem = True
        logging.debug("tag " + dom.tag)

        if arr is None:
            # No tracklisting.
            hasmedia = False
            if len(self.mediaItems) == 0:
                logging.debug("nothing here!")
        else:
            # add the tracks:
            # TODO: Add XML page's elements to the top panel, so the bottom panel isn't necessary.
            hasmedia = True
            # for each item...
            for i in arr:
                if type(i).__name__ == '_Element' and i.tag == "dict":
                    # for each <dict> track info....</dict> get this information:
                    name = ""
                    artist = ""
                    duration = ""
                    comments = ""
                    rtype = ""
                    url = ""
                    directurl = ""
                    releaseDate = ""
                    modifiedDate = ""
                    id = ""
                    for j in i:
                        if j.tag == "key":
                            # get each piece of data:
                            if j.text in ["songName", "itemName"]:
                                t = j.getnext().text
                                if t:
                                    name = t
                            elif j.text == "artistName":
                                t = j.getnext().text
                                if t:
                                    artist = t
                            elif j.text == "duration":
                                t = j.getnext().text
                                if t:
                                    duration = t
                            elif j.text in ["comments", "description", "longDescription"]:
                                t = j.getnext().text
                                if t:
                                    comments = t
                            elif j.text == "url":
                                t = j.getnext().text
                                if t:
                                    url = t
                            # Added Capital "URL", for the special case end page after html link.
                            elif j.text in ["URL", "previewURL", "episodeURL", "preview-url"]:
                                t = j.getnext().text
                                if t:
                                    directurl = t
                            elif j.text == "explicit":
                                el = j.getnext()
                                if el.text == "1":
                                    rtype = "[Explicit] "
                                if el.text == "2":
                                    rtype = "[Clean] "
                            elif j.text == "releaseDate":
                                t = j.getnext().text
                                if t:
                                    releaseDate = t
                            elif j.text == "dateModified":
                                t = j.getnext().text
                                if t:
                                    modifiedDate = t
                            elif j.text == "itemId":
                                t = j.getnext().text
                                if t:
                                    id = t
                            elif j.text == "metadata":
                                # for the special case end page after html link:
                                # look inside this <dict><key></key><string></string>... also.
                                i.extend(j.getnext().getchildren())
                    self.addItem(name, artist, time_convert(duration), type_of(directurl),
                                 rtype + comments, self.formatTime(releaseDate),
                                 self.formatTime(modifiedDate), url, directurl, "", id)

        # Now put page details in the detail-box on top.
        if dom.tag == "rss":
            out = ""
            image = dom.xpath("/rss/channel/image/url")
            if len(image) > 0:
                # get recommended width, height:
                w, h = None, None
                try:
                    w = dom.xpath("/rss/channel/image/width")[0].text
                    h = dom.xpath("/rss/channel/image/height")[0].text
                except:
                    pass
                self.HTML += self.imgText(image[0].text, h, w)
            #else: # TODO: fix this namespace problem
            #    image = dom.xpath("/rss/channel/itunes:image", namespaces={'itunes': 'http://www.itunes.com/DTDs/Podcast-1.0.dtd'})[0]
            #    if len(image)>0...
            channel = dom.xpath("/rss/channel")
            if len(channel):
                for i in channel[0]:
                    if not(image) and i.tag == "{http://www.itunes.com/dtds/podcast-1.0.dtd}image":
                        self.HTML += self.imgText(i.get("href"), None, None)
                for i in channel[0]:
                    if i.text and i.text.strip() != "" and isinstance(i.tag, str):
                        thisname = "".join(i.tag.replace("{", "}").split("}")[::2])  # remove {....dtd} from tag
                        self.HTML += "<b>%s:</b> %s\n<br>" % (thisname, i.text)
            try:
                self.Title = (dom.xpath("/rss/channel/title")[0].text)
            except IndexError, e:
                logging.warn('Error using index ' + str(e))
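# Illustrative sketch only: a minimal run of the RSS branch of
# ParserBase.__init__ against an invented podcast feed. The real class relies
# on helpers and methods defined elsewhere in the project (type_of,
# time_convert, addItem, seeXMLElement, getItemsArray, imgText, textContent,
# formatTime); the stand-ins below fake just enough of them for this demo.
if __name__ == "__main__":
    def type_of(u):
        # Stand-in for the project's real type_of() helper.
        import os
        return os.path.splitext(u or "")[1]

    class _SketchParser(ParserBase):
        def addItem(self, *fields):
            self.mediaItems.append(fields)

        def getItemsArray(self, dom):
            return None

        # Not reached for this feed, but defined for completeness:
        def seeXMLElement(self, element):
            pass

        def imgText(self, url, height, width):
            return ""

        def textContent(self, element):
            return ""

        def formatTime(self, text):
            return text

    SAMPLE_RSS = ('<rss version="2.0"><channel><title>Example Feed</title>'
                  '<item><title>Episode 1</title>'
                  '<enclosure url="http://example.com/ep1.mp3" type="audio/mpeg"/>'
                  '</item></channel></rss>')

    parser = _SketchParser("http://example.com/feed", "text/xml", SAMPLE_RSS)
    # Expected: one row for "Episode 1" carrying the ".mp3" enclosure URL.
    for row in parser.mediaItems:
        print row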