def get_episodes(url, series):
    """Scrape the episode entries for a single series page.

    Parameters:
        url: page URL listing the series' episode thumbnails.
        series: series number attached to every returned episode dict.

    Returns:
        list of dicts with keys ``title``, ``uri``, ``s``, ``e``.
    """
    episodes = []
    doc = util.get_url_html(url)
    # Nested 'video' divs: outer container plus one inner div per episode.
    divs = doc.xpath(
        "//div[contains(@class, 'video')]//div[contains(@class, 'video')]")
    for div in divs:
        title = div.xpath('.//span[@class="title"]')[0].text
        link = div.xpath('.//a')[0].get("href")
        episode_text = div.xpath('.//span[@class="episode"]')[0].text
        episode = {}
        episode["title"] = title
        episode["uri"] = link
        episode["s"] = series
        episode["e"] = 0
        matches = re.search(r"(\d+)", episode_text)
        if matches:
            # int() keeps the type consistent with the default 0
            # (previously left as the raw regex string).
            episode["e"] = int(matches.group(1))
        print(episode["s"], episode["e"], episode["title"])
        episodes.append(episode)
    return episodes
def get_listings():
    """Build the show listings across every channel in ``DATA_CHANNELS``.

    Returns:
        list of dicts with keys ``title`` and ``episodes``.
    """
    shows = []
    for channel in DATA_CHANNELS:
        print("CHANNEL: ", channel)
        channel_url = DATA_URL % channel
        print(channel_url)
        page = util.get_url_html(channel_url)
        containers = page.xpath("//div[contains(@class, 'show')]")
        print(len(containers))
        for container in containers:
            anchor = container.xpath(".//li[contains(@class, 'show')]/a")[0]
            href = anchor.get("href")
            # The newest episode only appears inline in the HTML (not in
            # the XML feed), so seed the list with it first.
            first = {
                "show": clean_show_name(anchor.text.strip()),
                "uri": WEBSITE + href,
                "type": container.xpath(
                    ".//li[contains(@class, 'type')]")[0].text.strip(),
                "date": container.xpath(
                    ".//li[contains(@class, 'date')]")[0].text.strip(),
                "s": 0,
                "e": 0,
            }
            details = container.xpath(
                ".//li[contains(@class, 'details')]")[0].text
            found = re.search(r"[^\d]+(\d+)[^\d]+(\d+)", details)
            if found:
                first["s"] = found.group(1)
                first["e"] = found.group(2)
            print(first["s"], first["e"], first["show"])
            shows.append({
                "title": clean_show_name(
                    container.xpath(".//h5/a")[0].text.strip()),
                "episodes": [first] + get_episodes(href),
            })
    return shows
def get_listings():
    """Scrape the landing page and return the show listings.

    Returns:
        list of dicts with keys ``title``, ``image``, ``episodes``.
        Shows whose episode list comes back empty are skipped.
    """
    doc = util.get_url_html(DATA_URL)
    divs = doc.xpath("//div[contains(@class, 'video')]")
    shows = []
    for div in divs:
        title = div.xpath('.//span[@class="title"]')[0].text
        link = div.xpath('.//a')[0].get("href")
        series_text = div.xpath('.//strong')[0].text
        episode_text = div.xpath('.//span[@class="episode"]')[0].text
        print(title)
        episode_count = 1
        matches = re.search(r"(\d+)", episode_text)
        if matches:
            episode_count = int(matches.group(1))
        # NOTE(review): raises AttributeError when the <strong> text has
        # no digits -- confirm the page always includes a series number.
        series = re.search(r"(\d+)", series_text).group(1)
        show = {}
        if episode_count > 1:
            episodes = get_episodes(link, series)
        else:
            # A single-episode entry links directly to the episode itself.
            episode = {}
            episode["title"] = title
            episode["uri"] = link
            episode["s"] = series
            episode["e"] = episode_count
            print(episode["s"], episode["e"], episode["title"])
            episodes = [episode]
        # Fetch the thumbnail once (was previously fetched twice with
        # slightly different xpaths; only this value was ever used).
        image = div.xpath('.//img')[0].get("src")
        if episodes:
            show["title"] = title
            show["image"] = image
            show["episodes"] = episodes
            shows.append(show)
    return shows
def get_listings():
    """Collect show listings for every configured channel.

    Returns:
        list of dicts with keys ``title`` and ``episodes``.
    """
    all_shows = []
    for channel in DATA_CHANNELS:
        print("CHANNEL: ", channel)
        listing_url = DATA_URL % channel
        print(listing_url)
        document = util.get_url_html(listing_url)
        show_nodes = document.xpath("//div[contains(@class, 'show')]")
        print(len(show_nodes))
        for node in show_nodes:
            show_link = node.xpath(".//li[contains(@class, 'show')]/a")[0]
            show_href = show_link.get("href")
            # The most recent episode lives only in the HTML markup, not
            # in the feed, so build it by hand before merging the feed.
            inline = {}
            inline["show"] = clean_show_name(show_link.text.strip())
            inline["uri"] = WEBSITE + show_href
            inline["type"] = node.xpath(
                ".//li[contains(@class, 'type')]")[0].text.strip()
            inline["date"] = node.xpath(
                ".//li[contains(@class, 'date')]")[0].text.strip()
            inline["s"] = 0
            inline["e"] = 0
            detail_text = node.xpath(
                ".//li[contains(@class, 'details')]")[0].text
            season_ep = re.search(r"[^\d]+(\d+)[^\d]+(\d+)", detail_text)
            if season_ep:
                inline["s"] = season_ep.group(1)
                inline["e"] = season_ep.group(2)
            print(inline["s"], inline["e"], inline["show"])
            merged = [inline] + get_episodes(show_href)
            entry = {}
            entry["title"] = clean_show_name(
                node.xpath(".//h5/a")[0].text.strip())
            entry["episodes"] = merged
            all_shows.append(entry)
    return all_shows
def get_listings():
    """Scrape the front-page grid and return shows with their episodes.

    Returns:
        list of dicts with keys ``title``, ``image``, ``episodes``;
        grid cells with no episodes are dropped.
    """
    doc = util.get_url_html(DATA_URL)
    shows = []
    for cell in doc.xpath("//div[contains(@class, 'grid_2')]"):
        anchor = cell.xpath('.//p[@class="artTitle"]/a')[0]
        # Strip a trailing season suffix such as " S3" from the title.
        name = re.sub(r' S\d+\b', '', anchor.text.strip())
        print(name)
        found = get_episodes(anchor.get("href"))
        thumb = cell.xpath('.//img')[0].get("src")
        if found:
            shows.append({"title": name, "image": thumb, "episodes": found})
    return shows
def get_listings():
    """Return a list of show dicts scraped from the grid layout page.

    Returns:
        list of dicts with keys ``title``, ``image``, ``episodes``;
        cells whose episode lookup comes back empty are skipped.
    """
    listing_url = DATA_URL
    document = util.get_url_html(listing_url)
    grid_cells = document.xpath("//div[contains(@class, 'grid_2')]")
    results = []
    for cell in grid_cells:
        title_link = cell.xpath('.//p[@class="artTitle"]/a')[0]
        # Drop the season marker (e.g. " S2") from the display title.
        show_title = re.sub(r' S\d+\b', '', title_link.text.strip())
        print(show_title)
        episode_list = get_episodes(title_link.get("href"))
        artwork = cell.xpath('.//img')[0].get("src")
        if not episode_list:
            continue
        results.append({
            "title": show_title,
            "image": artwork,
            "episodes": episode_list,
        })
    return results
def get_episodes(href):
    """Fetch the episode list for a show identified by its page href.

    The second-to-last path segment of *href* (with dashes replaced by
    underscores) keys the episode-feed URL template.

    Returns:
        list of episode dicts (``show``, ``uri``, ``date``, ``s``, ``e``).
        Returns an empty list when the href is unusable or the fetch or
        parse fails, so callers can safely concatenate the result.
    """
    parts = list(filter(None, href.split("/")))
    # parts[-2] below needs at least two segments (was only guarded
    # against zero, which allowed an IndexError on one-segment hrefs).
    if len(parts) < 2:
        return []
    url = EPISODE_TEMPLATE % (parts[-2].replace("-", "_"))
    episodes = []
    try:
        doc = util.get_url_html(url)
        all_media = doc.xpath("//div[contains(@class, 'listWrapper')]")
        print(url)
        for media in all_media:
            anchors = media.xpath(".//a")
            # Header rows carry no links; skip them.
            if not anchors:
                continue
            episode = {}
            episode["show"] = anchors[0].text
            episode["uri"] = anchors[0].get("href")
            episode["date"] = media.xpath(
                ".//div[contains(@class, 'epDetailDate')]")[0].text
            episode["s"] = 0
            episode["e"] = 0
            details = anchors[0].text
            matches = re.search(r"[^\d]+(\d+)[^\d]+(\d+)", details)
            if matches:
                episode["s"] = matches.group(1)
                episode["e"] = matches.group(2)
            print(episode["s"], episode["e"], episode["show"])
            episodes.append(episode)
        return episodes
    except Exception:
        # Was a bare `except:` returning None; returning [] keeps callers
        # that do `episodes + get_episodes(url)` from crashing.
        print("Error getting episode listing")
        print(url)
        return []
def get_episodes(href):
    """Return the episode feed entries for a show page href.

    Builds the feed URL from the second-to-last path segment of *href*
    (dashes turned into underscores).

    Returns:
        list of episode dicts (``show``, ``uri``, ``date``, ``s``, ``e``);
        an empty list when the href is too short or fetching/parsing
        fails, so the caller's list concatenation never sees None.
    """
    segments = list(filter(None, href.split("/")))
    # segments[-2] requires at least two entries (the old `== 0` guard
    # still allowed an IndexError on single-segment hrefs).
    if len(segments) < 2:
        return []
    feed_url = EPISODE_TEMPLATE % (segments[-2].replace("-", "_"))
    episodes = []
    try:
        doc = util.get_url_html(feed_url)
        rows = doc.xpath("//div[contains(@class, 'listWrapper')]")
        print(feed_url)
        for row in rows:
            links = row.xpath(".//a")
            # Section headers contain no anchors; skip those rows.
            if not links:
                continue
            entry = {}
            entry["show"] = links[0].text
            entry["uri"] = links[0].get("href")
            entry["date"] = row.xpath(
                ".//div[contains(@class, 'epDetailDate')]")[0].text
            entry["s"] = 0
            entry["e"] = 0
            numbers = re.search(r"[^\d]+(\d+)[^\d]+(\d+)", links[0].text)
            if numbers:
                entry["s"] = numbers.group(1)
                entry["e"] = numbers.group(2)
            print(entry["s"], entry["e"], entry["show"])
            episodes.append(entry)
        return episodes
    except Exception:
        # Previously a bare `except:` that returned None; [] keeps the
        # return type consistent for concatenating callers.
        print("Error getting episode listing")
        print(feed_url)
        return []
def get_episodes(product):
    """Resolve a product record into its episode list and a content kind.

    Parameters:
        product: mapping with at least ``scodes``, ``episodeCount``,
            ``seriesNumber``, ``productTitle``, ``priceCode`` keys
            (schema inferred from the accesses below -- confirm against
            the caller).

    Returns:
        (None, None) for sport products;
        (episodes, "tv") for products with episodes;
        ([episode], "movie") for everything else.
    """
    # ignore sports
    if product["scodes"] == "Sport":
        return None, None
    if product["episodeCount"] > 0 or product["seriesNumber"]:
        url = EPISODE_TEMPLATE % (getValue(product, "productID"))
        doc = util.get_url_html(url)
        scripts = doc.xpath("//script")
        episodes = []
        for script in scripts:
            # Episode data is embedded in an inline bootstrap script as
            # JSON after this assignment prefix.
            key = "isky.bootstrap.episodes = "
            if str(script.text).startswith(key):
                episodes_data = json.loads(str(script.text)[len(key):])
                if getValue(episodes_data, "productID"):
                    # A single episode object rather than a list.
                    episodes.append(get_details(episodes_data))
                else:
                    # A list of episode objects; keep only entries that
                    # carry a productID.
                    for data in episodes_data:
                        if getValue(data, "productID"):
                            episodes.append(get_details(data))
        return episodes, "tv"
    # non tv shows as they have no episodes
    episode = {}
    episode["show"] = product["productTitle"]
    episode["title"] = product["productTitle"]
    episode["uri"] = EPISODE_TEMPLATE % (getValue(product, "productID"))
    #episode["type"] = media.xpath(".//type")[0].text
    # NOTE(review): "date" is filled with the title, not a date --
    # looks intentional as a display fallback, but confirm.
    episode["date"] = product["productTitle"]
    episode["s"] = 0
    episode["e"] = 0
    if product["priceCode"]:
        episode["price"] = product["priceCode"]
    return [episode], "movie"