def get_episodes(url, series):
    """Scrape the episode entries for a single series page.

    Parameters:
        url: page URL listing the series' episode thumbnails.
        series: series number attached to every returned episode dict.

    Returns:
        list of dicts with keys ``title``, ``uri``, ``s``, ``e``.
    """
    episodes = []
    doc = util.get_url_html(url)
    # Nested 'video' divs: outer container plus one inner div per episode.
    divs = doc.xpath(
        "//div[contains(@class, 'video')]//div[contains(@class, 'video')]")
    for div in divs:
        title = div.xpath('.//span[@class="title"]')[0].text
        link = div.xpath('.//a')[0].get("href")
        episode_text = div.xpath('.//span[@class="episode"]')[0].text
        episode = {}
        episode["title"] = title
        episode["uri"] = link
        episode["s"] = series
        episode["e"] = 0
        matches = re.search(r"(\d+)", episode_text)
        if matches:
            # int() keeps the type consistent with the default 0
            # (previously left as the raw regex string).
            episode["e"] = int(matches.group(1))
        print(episode["s"], episode["e"], episode["title"])
        episodes.append(episode)
    return episodes
def get_listings():
    """Build the show listings across every channel in ``DATA_CHANNELS``.

    Returns:
        list of dicts with keys ``title`` and ``episodes``.
    """
    shows = []
    for channel in DATA_CHANNELS:
        print("CHANNEL: ", channel)
        channel_url = DATA_URL % channel
        print(channel_url)
        page = util.get_url_html(channel_url)
        containers = page.xpath("//div[contains(@class, 'show')]")
        print(len(containers))
        for container in containers:
            anchor = container.xpath(".//li[contains(@class, 'show')]/a")[0]
            href = anchor.get("href")
            # The newest episode only appears inline in the HTML (not in
            # the XML feed), so seed the list with it first.
            first = {
                "show": clean_show_name(anchor.text.strip()),
                "uri": WEBSITE + href,
                "type": container.xpath(
                    ".//li[contains(@class, 'type')]")[0].text.strip(),
                "date": container.xpath(
                    ".//li[contains(@class, 'date')]")[0].text.strip(),
                "s": 0,
                "e": 0,
            }
            details = container.xpath(
                ".//li[contains(@class, 'details')]")[0].text
            found = re.search(r"[^\d]+(\d+)[^\d]+(\d+)", details)
            if found:
                first["s"] = found.group(1)
                first["e"] = found.group(2)
            print(first["s"], first["e"], first["show"])
            shows.append({
                "title": clean_show_name(
                    container.xpath(".//h5/a")[0].text.strip()),
                "episodes": [first] + get_episodes(href),
            })
    return shows
def get_listings():
    """Scrape the landing page and return the show listings.

    Returns:
        list of dicts with keys ``title``, ``image``, ``episodes``.
        Shows whose episode list comes back empty are skipped.
    """
    doc = util.get_url_html(DATA_URL)
    divs = doc.xpath("//div[contains(@class, 'video')]")
    shows = []
    for div in divs:
        title = div.xpath('.//span[@class="title"]')[0].text
        link = div.xpath('.//a')[0].get("href")
        series_text = div.xpath('.//strong')[0].text
        episode_text = div.xpath('.//span[@class="episode"]')[0].text
        print(title)
        episode_count = 1
        matches = re.search(r"(\d+)", episode_text)
        if matches:
            episode_count = int(matches.group(1))
        # NOTE(review): raises AttributeError when the <strong> text has
        # no digits -- confirm the page always includes a series number.
        series = re.search(r"(\d+)", series_text).group(1)
        show = {}
        if episode_count > 1:
            episodes = get_episodes(link, series)
        else:
            # A single-episode entry links directly to the episode itself.
            episode = {}
            episode["title"] = title
            episode["uri"] = link
            episode["s"] = series
            episode["e"] = episode_count
            print(episode["s"], episode["e"], episode["title"])
            episodes = [episode]
        # Fetch the thumbnail once (was previously fetched twice with
        # slightly different xpaths; only this value was ever used).
        image = div.xpath('.//img')[0].get("src")
        if episodes:
            show["title"] = title
            show["image"] = image
            show["episodes"] = episodes
            shows.append(show)
    return shows
def get_listings():
    """Collect show listings for every configured channel.

    Returns:
        list of dicts with keys ``title`` and ``episodes``.
    """
    all_shows = []
    for channel in DATA_CHANNELS:
        print("CHANNEL: ", channel)
        listing_url = DATA_URL % channel
        print(listing_url)
        document = util.get_url_html(listing_url)
        show_nodes = document.xpath("//div[contains(@class, 'show')]")
        print(len(show_nodes))
        for node in show_nodes:
            show_link = node.xpath(".//li[contains(@class, 'show')]/a")[0]
            show_href = show_link.get("href")
            # The most recent episode lives only in the HTML markup, not
            # in the feed, so build it by hand before merging the feed.
            inline = {}
            inline["show"] = clean_show_name(show_link.text.strip())
            inline["uri"] = WEBSITE + show_href
            inline["type"] = node.xpath(
                ".//li[contains(@class, 'type')]")[0].text.strip()
            inline["date"] = node.xpath(
                ".//li[contains(@class, 'date')]")[0].text.strip()
            inline["s"] = 0
            inline["e"] = 0
            detail_text = node.xpath(
                ".//li[contains(@class, 'details')]")[0].text
            season_ep = re.search(r"[^\d]+(\d+)[^\d]+(\d+)", detail_text)
            if season_ep:
                inline["s"] = season_ep.group(1)
                inline["e"] = season_ep.group(2)
            print(inline["s"], inline["e"], inline["show"])
            merged = [inline] + get_episodes(show_href)
            entry = {}
            entry["title"] = clean_show_name(
                node.xpath(".//h5/a")[0].text.strip())
            entry["episodes"] = merged
            all_shows.append(entry)
    return all_shows
def get_listings():
    """Scrape the front-page grid and return shows with their episodes.

    Returns:
        list of dicts with keys ``title``, ``image``, ``episodes``;
        grid cells with no episodes are dropped.
    """
    doc = util.get_url_html(DATA_URL)
    shows = []
    for cell in doc.xpath("//div[contains(@class, 'grid_2')]"):
        anchor = cell.xpath('.//p[@class="artTitle"]/a')[0]
        # Strip a trailing season suffix such as " S3" from the title.
        name = re.sub(r' S\d+\b', '', anchor.text.strip())
        print(name)
        found = get_episodes(anchor.get("href"))
        thumb = cell.xpath('.//img')[0].get("src")
        if found:
            shows.append({"title": name, "image": thumb, "episodes": found})
    return shows
def get_listings():
    """Return a list of show dicts scraped from the grid layout page.

    Returns:
        list of dicts with keys ``title``, ``image``, ``episodes``;
        cells whose episode lookup comes back empty are skipped.
    """
    listing_url = DATA_URL
    document = util.get_url_html(listing_url)
    grid_cells = document.xpath("//div[contains(@class, 'grid_2')]")
    results = []
    for cell in grid_cells:
        title_link = cell.xpath('.//p[@class="artTitle"]/a')[0]
        # Drop the season marker (e.g. " S2") from the display title.
        show_title = re.sub(r' S\d+\b', '', title_link.text.strip())
        print(show_title)
        episode_list = get_episodes(title_link.get("href"))
        artwork = cell.xpath('.//img')[0].get("src")
        if not episode_list:
            continue
        results.append({
            "title": show_title,
            "image": artwork,
            "episodes": episode_list,
        })
    return results
def get_episodes(href):
    """Fetch the episode list for a show identified by its page href.

    The second-to-last path segment of *href* (with dashes replaced by
    underscores) keys the episode-feed URL template.

    Returns:
        list of episode dicts (``show``, ``uri``, ``date``, ``s``, ``e``).
        Returns an empty list when the href is unusable or the fetch or
        parse fails, so callers can safely concatenate the result.
    """
    parts = list(filter(None, href.split("/")))
    # parts[-2] below needs at least two segments (was only guarded
    # against zero, which allowed an IndexError on one-segment hrefs).
    if len(parts) < 2:
        return []
    url = EPISODE_TEMPLATE % (parts[-2].replace("-", "_"))
    episodes = []
    try:
        doc = util.get_url_html(url)
        all_media = doc.xpath("//div[contains(@class, 'listWrapper')]")
        print(url)
        for media in all_media:
            anchors = media.xpath(".//a")
            # Header rows carry no links; skip them.
            if not anchors:
                continue
            episode = {}
            episode["show"] = anchors[0].text
            episode["uri"] = anchors[0].get("href")
            episode["date"] = media.xpath(
                ".//div[contains(@class, 'epDetailDate')]")[0].text
            episode["s"] = 0
            episode["e"] = 0
            details = anchors[0].text
            matches = re.search(r"[^\d]+(\d+)[^\d]+(\d+)", details)
            if matches:
                episode["s"] = matches.group(1)
                episode["e"] = matches.group(2)
            print(episode["s"], episode["e"], episode["show"])
            episodes.append(episode)
        return episodes
    except Exception:
        # Was a bare `except:` returning None; returning [] keeps callers
        # that do `episodes + get_episodes(url)` from crashing.
        print("Error getting episode listing")
        print(url)
        return []
def get_episodes(href):
    """Return the episode feed entries for a show page href.

    Builds the feed URL from the second-to-last path segment of *href*
    (dashes turned into underscores).

    Returns:
        list of episode dicts (``show``, ``uri``, ``date``, ``s``, ``e``);
        an empty list when the href is too short or fetching/parsing
        fails, so the caller's list concatenation never sees None.
    """
    segments = list(filter(None, href.split("/")))
    # segments[-2] requires at least two entries (the old `== 0` guard
    # still allowed an IndexError on single-segment hrefs).
    if len(segments) < 2:
        return []
    feed_url = EPISODE_TEMPLATE % (segments[-2].replace("-", "_"))
    episodes = []
    try:
        doc = util.get_url_html(feed_url)
        rows = doc.xpath("//div[contains(@class, 'listWrapper')]")
        print(feed_url)
        for row in rows:
            links = row.xpath(".//a")
            # Section headers contain no anchors; skip those rows.
            if not links:
                continue
            entry = {}
            entry["show"] = links[0].text
            entry["uri"] = links[0].get("href")
            entry["date"] = row.xpath(
                ".//div[contains(@class, 'epDetailDate')]")[0].text
            entry["s"] = 0
            entry["e"] = 0
            numbers = re.search(r"[^\d]+(\d+)[^\d]+(\d+)", links[0].text)
            if numbers:
                entry["s"] = numbers.group(1)
                entry["e"] = numbers.group(2)
            print(entry["s"], entry["e"], entry["show"])
            episodes.append(entry)
        return episodes
    except Exception:
        # Previously a bare `except:` that returned None; [] keeps the
        # return type consistent for concatenating callers.
        print("Error getting episode listing")
        print(feed_url)
        return []
def get_episodes(product):
    """Resolve a product record into its episode list and a content kind.

    Parameters:
        product: mapping with at least ``scodes``, ``episodeCount``,
            ``seriesNumber``, ``productTitle``, ``priceCode`` keys
            (schema inferred from the accesses below -- confirm against
            the caller).

    Returns:
        (None, None) for sport products;
        (episodes, "tv") for products with episodes;
        ([episode], "movie") for everything else.
    """
    # ignore sports
    if product["scodes"] == "Sport":
        return None, None
    if product["episodeCount"] > 0 or product["seriesNumber"]:
        url = EPISODE_TEMPLATE % (getValue(product, "productID"))
        doc = util.get_url_html(url)
        scripts = doc.xpath("//script")
        episodes = []
        for script in scripts:
            # Episode data is embedded in an inline bootstrap script as
            # JSON after this assignment prefix.
            key = "isky.bootstrap.episodes = "
            if str(script.text).startswith(key):
                episodes_data = json.loads(str(script.text)[len(key):])
                if getValue(episodes_data, "productID"):
                    # A single episode object rather than a list.
                    episodes.append(get_details(episodes_data))
                else:
                    # A list of episode objects; keep only entries that
                    # carry a productID.
                    for data in episodes_data:
                        if getValue(data, "productID"):
                            episodes.append(get_details(data))
        return episodes, "tv"
    # non tv shows as they have no episodes
    episode = {}
    episode["show"] = product["productTitle"]
    episode["title"] = product["productTitle"]
    episode["uri"] = EPISODE_TEMPLATE % (getValue(product, "productID"))
    #episode["type"] = media.xpath(".//type")[0].text
    # NOTE(review): "date" is filled with the title, not a date --
    # looks intentional as a display fallback, but confirm.
    episode["date"] = product["productTitle"]
    episode["s"] = 0
    episode["e"] = 0
    if product["priceCode"]:
        episode["price"] = product["priceCode"]
    return [episode], "movie"